refactor folder of torch 3x API and refine RTN (#1560)
Signed-off-by: xin3he <xin3.he@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
xin3he authored Jan 30, 2024
1 parent 02233fb commit 7bf89eb
Showing 34 changed files with 702 additions and 770 deletions.
2 changes: 1 addition & 1 deletion .azure-pipelines/scripts/ut/3x/collect_log_3x.sh
@@ -1,6 +1,6 @@
source /neural-compressor/.azure-pipelines/scripts/change_color.sh

set -xe
set -e
pip install coverage
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/3x/coverage.${1}
coverage_log="/neural-compressor/log_dir/coverage_log"
3 changes: 2 additions & 1 deletion .azure-pipelines/scripts/ut/3x/run_3x_ort.sh
@@ -28,7 +28,8 @@ cp .coverage ${LOG_DIR}/.coverage

echo "------UT end -------"

if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] \
|| [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "ImportError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
echo "Find errors in UT test, please check the output..."
exit 1
fi
52 changes: 41 additions & 11 deletions .azure-pipelines/scripts/ut/3x/run_3x_pt.sh
@@ -11,25 +11,55 @@ pip list

export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/3x/coverage.3x_pt
inc_path=$(python -c 'import neural_compressor; print(neural_compressor.__path__[0])')
cd /neural-compressor/test || exit 1
find ./3x/torch/* -name "test*.py" | sed 's,\.\/,coverage run --source='"${inc_path}"' --append ,g' | sed 's/$/ --verbose/'> run.sh
find ./3x/common/* -name "test*.py" | sed 's,\.\/,coverage run --source='"${inc_path}"' --append ,g' | sed 's/$/ --verbose/'>> run.sh
cd /neural-compressor/test/3x || exit 1
grep -lrv "import pytest" --include="test*.py" ./torch | sed 's,\.\/,coverage run --source='"${inc_path}"' --append ,g' | sed 's/$/ --verbose/'> run_unittest.sh
grep -lrv "import pytest" --include="test*.py" ./common | sed 's,\.\/,coverage run --source='"${inc_path}"' --append ,g' | sed 's/$/ --verbose/'>> run_unittest.sh
grep -lr "import pytest" --include="test*.py" ./torch | sed 's,\.\/,coverage run --source='"${inc_path}"' --append -m pytest --disable-warnings -v ,g' > run_pytest.sh
grep -lr "import pytest" --include="test*.py" ./common | sed 's,\.\/,coverage run --source='"${inc_path}"' --append -m pytest --disable-warnings -v ,g'>> run_pytest.sh

LOG_DIR=/neural-compressor/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut_3x_pt.log

echo "cat run.sh..."
sort run.sh -o run.sh
cat run.sh | tee ${ut_log_name}
echo "------UT start-------"
bash -x run.sh 2>&1 | tee -a ${ut_log_name}
# unittest and pytest has some incompatible issue, so separate the test.
echo "cat run_unittest.sh..."
sort run_unittest.sh -o run_unittest.sh
cat run_unittest.sh | tee ${ut_log_name}
echo "------unittest start-------"
bash -x run_unittest.sh 2>&1 | tee -a ${ut_log_name}
echo "------unittest end -------"

if [ -s run_pytest.sh ]; then
echo "cat run_pytest.sh..."
sort run_pytest.sh -o run_pytest.sh
cat run_pytest.sh | tee -a ${ut_log_name}
echo "------pytest start-------"
bash -x run_pytest.sh 2>&1 | tee -a ${ut_log_name}
echo "------pytest end -------"
fi

cp .coverage ${LOG_DIR}/.coverage

echo "------UT end -------"
ut_status="passed"
# check unittest issue
if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] \
|| [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "ImportError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
echo "Find errors in unittest case, please check the output..."
echo "Please search for 'FAILED' or 'core dumped' or 'ModuleNotFoundError:' or 'ImportError:'"
ut_status="failed"
fi

if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
echo "Find errors in UT test, please check the output..."
# check pytest issue
if [ -s run_pytest.sh ]; then
if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c 'passed,' ${ut_log_name}) == 0 ]; then
echo "Find errors in pytest case, please check the output..."
echo "Please search for '== FAILURES ==' or '== ERRORS =='"
ut_status="failed"
fi
fi

if [ "$ut_status" = "failed" ]; then
exit 1
fi

echo "UT finished successfully! "
3 changes: 2 additions & 1 deletion .azure-pipelines/scripts/ut/3x/run_3x_tf.sh
@@ -28,7 +28,8 @@ cp .coverage ${LOG_DIR}/.coverage

echo "------UT end -------"

if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] \
|| [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "ImportError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
echo "Find errors in UT test, please check the output..."
exit 1
fi
@@ -48,7 +48,7 @@
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
### INC code ###
from neural_compressor.torch.quantization.layers import Matmul, BatchMatmul, Autocast
from neural_compressor.torch.quantization.modules import Matmul, BatchMatmul, Autocast

# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
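The hunk above only moves an import: the Matmul, BatchMatmul and Autocast wrapper modules now live in neural_compressor.torch.quantization.modules instead of neural_compressor.torch.quantization.layers. A minimal compatibility sketch for downstream model code, assuming both package layouts may be encountered (the try/except fallback is an assumption, not part of this commit):

try:
    # new location after this refactor
    from neural_compressor.torch.quantization.modules import Matmul, BatchMatmul, Autocast
except ImportError:
    # pre-refactor location, kept only as a fallback for older neural-compressor builds
    from neural_compressor.torch.quantization.layers import Matmul, BatchMatmul, Autocast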
@@ -161,8 +161,8 @@ def itrex_bootstrap_stderr(f, xs, iters):
if args.approach in ["dynamic", "static"]:
print("device:", next(user_model.parameters()).device)
from neural_compressor.torch.quantization.config import FP8QConfig, get_default_fp8_qconfig
from neural_compressor.torch.quantization.fp8 import quantize_dynamic
from neural_compressor.torch.quantization import quantize, quantize_dynamic
from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic
from neural_compressor.torch.quantization import quantize
if args.precision == "fp8_e4m3":
dtype = torch.float8_e4m3fn
else:
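For the Habana FP8 example above, the dynamic-quantization entry point moves as well: quantize_dynamic is now imported from the algorithm package rather than re-exported next to quantize. A short sketch of the updated imports exactly as this hunk uses them (a Gaudi/HPU build of neural-compressor is assumed for the habana_fp8 package):

from neural_compressor.torch.quantization.config import FP8QConfig, get_default_fp8_qconfig
from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic  # moved out of neural_compressor.torch.quantization.fp8
from neural_compressor.torch.quantization import quantize                   # no longer re-exports quantize_dynamic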
18 changes: 0 additions & 18 deletions neural_compressor/torch/__init__.py
@@ -11,21 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from neural_compressor.torch.utils.utility import register_algo
from neural_compressor.torch.algorithms import rtn_quantize_entry, gptq_quantize_entry

from neural_compressor.torch.quantization import (
quantize,
RTNConfig,
get_default_rtn_config,
GPTQConfig,
get_default_gptq_config,
StaticQuantConfig,
get_default_static_config,
SmoothQuantConfig,
get_default_sq_config,
)

from neural_compressor.common.base_tuning import TuningConfig
from neural_compressor.torch.quantization.autotune import autotune, get_all_config_set
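Since neural_compressor/torch/__init__.py no longer re-exports the quantization API, callers import from the subpackages directly. A hedged sketch of the post-refactor import style for RTN, using only names that the deleted re-export block itself pulled from these subpackages (the commented-out call is illustrative; its exact keyword arguments are not shown in this diff):

from neural_compressor.torch.quantization import quantize, RTNConfig, get_default_rtn_config
from neural_compressor.torch.quantization.autotune import autotune, get_all_config_set

quant_config = get_default_rtn_config()
# q_model = quantize(model, quant_config)  # `model` is a placeholder fp32 torch.nn.Module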
4 changes: 0 additions & 4 deletions neural_compressor/torch/algorithms/__init__.py
@@ -11,7 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from neural_compressor.torch.algorithms.weight_only_algos import rtn_quantize_entry
from neural_compressor.torch.algorithms.weight_only_algos import gptq_quantize_entry
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .quantization_impl import quantize_dynamic, quantize
from .fp8_quant import quantize_dynamic, quantize, white_list
@@ -22,9 +22,9 @@
from deepspeed.module_inject.layers import LmHeadLinearAllreduce

from neural_compressor.common.utils import FP8_QUANT
from neural_compressor.torch.quantization.modules import Autocast, BatchMatmul, Matmul
from neural_compressor.torch.utils.utility import fetch_module, logger, register_algo, set_module

from ..layers import Autocast, BatchMatmul, Matmul
from .modules import (
FP8BatchMatmul,
FP8Cast,
@@ -198,7 +198,6 @@ def convert(model, qconfig_mapping):
return model


@register_algo(name=FP8_QUANT)
def quantize(model, qconfig_mapping, run_fn=None, run_args=None, inplace=True):
q_model = model if inplace else copy.deepcopy(model)
q_model = prepare(q_model, qconfig_mapping)
@@ -209,7 +208,3 @@ def quantize(model, qconfig_mapping, run_fn=None, run_args=None, inplace=True):
run_fn(q_model)
q_model = convert(q_model, qconfig_mapping)
return q_model


# def autotune(fp32_model, quant_config, tune_config, eval_func, ...):
# pass
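The FP8 quantize entry above is now a plain function (the @register_algo(name=FP8_QUANT) decorator is dropped) that prepares the model, optionally runs a calibration function on it, and converts it. A hedged usage sketch, assuming the package-level export shown in the habana_fp8 __init__.py hunk and leaving qconfig_mapping construction abstract:

from neural_compressor.torch.algorithms.habana_fp8 import quantize  # exported via fp8_quant

def calib_fn(prepared_model):
    # feed a few calibration batches through the prepared model (placeholder body)
    ...

# q_model = quantize(user_model, qconfig_mapping, run_fn=calib_fn, inplace=False)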
53 changes: 11 additions & 42 deletions neural_compressor/torch/algorithms/weight_only/gptq.py
@@ -757,13 +757,14 @@ def find_params(self, x, weight=False):
if self.wdtype != "int":
from .utility import quant_tensor

tmp = x.clone() # make sure x is not replaced
tmp = x.clone() # tmp will be replaced after quant_tensor

_, scale, zero = quant_tensor(
tmp,
self.wbits,
self.group_size,
scheme=self.scheme,
dtype=self.wdtype,
bits=self.wbits,
group_size=self.group_size,
scheme=self.scheme,
quantile=1.0,
return_int=True,
full_range=False,
@@ -854,10 +855,10 @@ def find_params(self, x, weight=False):
self.scale = self.scale.reshape(1, -1)
quant_tensor(
self.scale,
self.double_quant_bits,
self.double_quant_group_size,
scheme=self.double_quant_scheme,
dtype=self.double_quant_dtype,
bits=self.double_quant_bits,
group_size=self.double_quant_group_size,
scheme=self.double_quant_scheme,
quantile=1.0,
return_int=False,
full_range=False,
@@ -879,8 +880,7 @@ def quantize(self, x, scale, zero, maxq):
if self.wdtype != "int":
from .utility import quantize_4bit

tmp = x.clone()

tmp = x.clone() # tmp will be replaced after quant_tensor
return quantize_4bit(tmp, dtype=self.wdtype, scale=scale)
else:
if maxq < 0:
@@ -893,12 +893,7 @@ def ready(self):


# TODO (Yi) remove it after unifying the algo config parser
from typing import Callable, Dict, Tuple

from neural_compressor.torch.quantization.config import GPTQConfig


def gptq_config_mapping(configs_mapping: Dict[Tuple[str, Callable], GPTQConfig]):
def gptq_config_mapping(configs_mapping):
# convert GPTQ_CONFIG to gptq_quantize's weight config
# convert tune_cfg to gptq_quantize's weight config
# for layer_wise quant mode
@@ -950,33 +945,7 @@ def gptq_config_mapping(configs_mapping: Dict[Tuple[str, Callable], GPTQConfig])
return weight_config, nsamples, use_max_length, pad_max_length, device, dataloader_len


def gptq_quantize(
model,
weight_config={},
dataloader=None,
nsamples=128,
use_max_length=True,
pad_max_length=2048,
device=None,
layer_wise=False,
model_path=None,
):
"""Run weight-only quantization with."""
# TODO: unify weight_config keys, add docstring, and support default config
assert isinstance(model, torch.nn.Module), "only support torch module"
if layer_wise:
assert model_path is not None, "model_path should not be None when use layer_wise mode"
from .gptq import GPTQuantizer

gptq_quantizer = GPTQuantizer(
model, weight_config, dataloader, nsamples, use_max_length, pad_max_length, device, layer_wise=layer_wise
)
fp32_modified_model, gptq_config = gptq_quantizer.execute_quantization(model_path=model_path)
logger.info("GPTQ quantizing done.")
return fp32_modified_model, gptq_config


def apply_gptq_quantize(model, configs_mapping, *args, **kwargs):
def gptq_quantize(model, configs_mapping, *args, **kwargs):
"""Apply gptq."""
# TODO: unify weight_config keys, add docstring, and support default config
weight_config, nsamples, use_max_length, pad_max_length, device, dataloader_len = gptq_config_mapping(
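Two changes in gptq.py are worth calling out: quant_tensor is now called with explicit keyword arguments (dtype, bits, group_size, scheme, ...) instead of positionals, and the public entry point is simplified to gptq_quantize(model, configs_mapping, *args, **kwargs), absorbing the old apply_gptq_quantize. A minimal sketch of the keyword-style quant_tensor call, mirroring the arguments in the hunk; the absolute import path is assumed from the relative `.utility` import above, and the concrete bits/group_size/scheme values are illustrative only:

import torch

from neural_compressor.torch.algorithms.weight_only.utility import quant_tensor

w = torch.randn(32, 64)
tmp = w.clone()  # quant_tensor overwrites its input, as the "tmp will be replaced" comments note
_, scale, zero = quant_tensor(
    tmp,
    dtype="int",
    bits=4,
    group_size=32,
    scheme="asym",
    quantile=1.0,
    return_int=True,
    full_range=False,
)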