Cherry pick Habana software 1.18.0 update (#2025)
Signed-off-by: xinhe3 <xinhe3@habana.ai>
Signed-off-by: Yi Liu <yiliu4@habana.ai>
Signed-off-by: Sun, Xuehao <xuehao.sun@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: yan tomsinsky <ytomsinsky@habana.ai>
Co-authored-by: Uri Livne <ulivne@habana.ai>
Co-authored-by: Dudi Lester <dlester@habana.ai>
Co-authored-by: Danny <dsemiat@habana.ai>
Co-authored-by: Tomer Gafni <tgafni@habana.ai>
Co-authored-by: Eran Geva <egeva@habana.ai>
Co-authored-by: Daniel Ohayon <danielohayon444@gmail.com>
Co-authored-by: Roi Tiefenbrunn <rtiefenbrunn@habana.ai>
Co-authored-by: Kamil Felskowski <kfelskowskix@habana.ai>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
11 people authored Oct 17, 2024
1 parent d6149aa commit 5fb2184
Showing 67 changed files with 99,756 additions and 945 deletions.
@@ -88,7 +88,7 @@ elif [ "${mode}" == "tuning" ]; then
cd ${WORK_SOURCE_DIR}/${model_src_dir}
# for int4 models add "--accuracy" to run tuning after quantize
if [[ "${model}" == *"int4"* ]]; then
sed -i "s|--quantize|--quantize --accuracy --int8|g" run_quant.sh
sed -i "s|--quantize|--quantize --accuracy --load|g" run_quant.sh
fi

$BOLD_YELLOW && echo "workspace ${WORK_SOURCE_DIR}/${model_src_dir}" && $RESET
1 change: 1 addition & 0 deletions .azure-pipelines/scripts/ut/3x/coverage.3x_pt
@@ -7,6 +7,7 @@ include =
*/neural_compressor/torch/*
omit =
*/neural_compressor/torch/algorithms/fp8_quant/*
*/neural_compressor/torch/algorithms/mixed_low_precision/*
*/neural_compressor/torch/amp/*
exclude_lines =
pragma: no cover
1 change: 1 addition & 0 deletions .azure-pipelines/scripts/ut/3x/coverage.3x_pt_fp8
@@ -4,6 +4,7 @@ branch = True
[report]
include =
*/neural_compressor/torch/algorithms/fp8_quant/*
*/neural_compressor/torch/algorithms/mixed_low_precision/*
exclude_lines =
pragma: no cover
raise NotImplementedError
2 changes: 1 addition & 1 deletion .azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh
@@ -10,7 +10,6 @@ sed -i '/^intel_extension_for_pytorch/d' /neural-compressor/test/3x/torch/requir
sed -i '/^auto_round/d' /neural-compressor/test/3x/torch/requirements.txt
cat /neural-compressor/test/3x/torch/requirements.txt
pip install -r /neural-compressor/test/3x/torch/requirements.txt
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0
pip install pytest-cov
pip install pytest-html
pip install pytest-html-merger
@@ -27,6 +26,7 @@ pytest --cov="${inc_path}" -vs --disable-warnings --html=report_1.html --self-co
pytest --cov="${inc_path}" -vs --disable-warnings --html=report_2.html --self-contained-html torch/quantization/weight_only/test_rtn.py 2>&1 | tee -a ${ut_log_name}
# pytest --cov="${inc_path}" -vs --disable-warnings --html=report_3.html --self-contained-html torch/quantization/weight_only/test_autoround.py 2>&1 | tee -a ${ut_log_name}
pytest --cov="${inc_path}" -vs --disable-warnings --html=report_4.html --self-contained-html torch/quantization/fp8_quant 2>&1 | tee -a ${ut_log_name}
pytest --cov="${inc_path}" -vs --disable-warnings --html=report_5.html --self-contained-html torch/algorithms/fp8_quant 2>&1 | tee -a ${ut_log_name}

mkdir -p report && mv *.html report
pytest_html_merger -i ./report -o ./report.html
4 changes: 2 additions & 2 deletions .azure-pipelines/template/docker-template.yml
@@ -74,7 +74,7 @@ steps:
- ${{ if eq(parameters.imageSource, 'pull') }}:
- script: |
docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
displayName: "Pull habana docker image"
- script: |
@@ -95,7 +95,7 @@ steps:
else
docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \
--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host \
-v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
-v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
fi
echo "Show the container list after docker run ... "
docker ps -a
11 changes: 1 addition & 10 deletions docs/source/3x/PT_FP8Quant.md
@@ -20,15 +20,6 @@ Intel Neural Compressor provides general quantization APIs to leverage HPU FP8 c

## Supported Parameters

<style type="text/css">
.tg {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-fymr{border-color:inherit;font-weight:bold;text-align:left;vertical-align:top}
.tg .tg-0pky{border-color:inherit;text-align:left;vertical-align:top}
</style>
<table class="tg"><thead>
<tr>
<th class="tg-fymr">Attribute</th>
@@ -74,7 +65,7 @@ Intel Neural Compressor provides general quantization APIs to leverage HPU FP8 c
<tr>
<td class="tg-0pky">scale_method</td>
<td class="tg-0pky">The method for calculating the scale from the measurement.</td>
<td class="tg-0pky">- without_scale - Convert to/from FP8 without scaling.<br>- unit_scale - Always use scale of 1.<br>- maxabs_hw (default) - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then aligned to the corresponding HW accelerated scale.<br>- maxabs_pow2 - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then rounded to the power of 2.<br>- maxabs_hw_opt_weight - Scale of model params (weights) is chosen as the scale that provides minimal mean-square-error between quantized and non-quantized weights, from all possible HW accelerated scales. Scale of activations is calculated the same as maxabs_hw.<br>- act_maxabs_pow2_weights_pcs_opt_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_hw_opt_weight. Scale of activations is calculated the same as maxabs_pow2.<br>- act_maxabs_hw_weights_pcs_maxabs_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_pow2. Scale of activations is calculated the same as maxabs_hw.</td>
<td class="tg-0pky">- unit_scale - Always use scale of 1.<br>- hw_aligned_single_scale - Always use scale that's aligned to the corresponding HW accelerated scale.<br>- maxabs_hw (default) - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then aligned to the corresponding HW accelerated scale.<br>- maxabs_pow2 - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then rounded to the power of 2.<br>- maxabs_hw_opt_weight - Scale of model params (weights) is chosen as the scale that provides minimal mean-square-error between quantized and non-quantized weights, from all possible HW accelerated scales. Scale of activations is calculated the same as maxabs_hw.<br>- act_maxabs_pow2_weights_pcs_opt_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_hw_opt_weight. Scale of activations is calculated the same as maxabs_pow2.<br>- act_maxabs_hw_weights_pcs_maxabs_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_pow2. Scale of activations is calculated the same as maxabs_hw.</td>
</tr>
<tr>
<td class="tg-0pky">measure_exclude</td>
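To show how the attributes above are typically wired together, here is a minimal two-run sketch (measure, then quantize) on Gaudi. It assumes the `FP8Config`/`prepare`/`convert`/`finalize_calibration` API described elsewhere in PT_FP8Quant.md; the toy model, calibration data, and output path are placeholders rather than code from this commit.

```python
# Hedged sketch of the two-run FP8 flow on Gaudi (HPU). Keyword names mirror
# the attribute table above; the toy model, data, and paths are placeholders.
import torch
import habana_frameworks.torch.core as htcore  # noqa: F401 - registers the "hpu" device
from neural_compressor.torch.quantization import (
    FP8Config,
    convert,
    finalize_calibration,
    prepare,
)

def build_model():
    return torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU()).to("hpu")

calib_data = [torch.randn(4, 32).to("hpu") for _ in range(8)]

# Run 1: measurement - collect maxabs statistics on a few calibration batches.
measure_cfg = FP8Config(fp8_config="E4M3", mode="MEASURE", observer="maxabs",
                        dump_stats_path="./hqt_output/measure")
model = prepare(build_model(), measure_cfg)
for x in calib_data:
    model(x)
finalize_calibration(model)  # writes the measurement files to dump_stats_path

# Run 2 (typically a separate process): rebuild the model and quantize it
# using the dumped statistics and the chosen scale_method.
quant_cfg = FP8Config(fp8_config="E4M3", mode="QUANTIZE",
                      scale_method="maxabs_hw",  # default, per the table above
                      dump_stats_path="./hqt_output/measure")
model = convert(build_model(), quant_cfg)
print(model)
```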
@@ -11,3 +11,4 @@ neural-compressor
lm_eval==0.4.3
peft
optimum-intel
intel_extension_for_pytorch
@@ -217,7 +217,6 @@ def eval_func(model):


if args.load:
# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
if args.int8 or args.int8_bf16_mixed:
print("load int8 model")
from neural_compressor.torch.quantization import load
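The diff view truncates this branch; a typical continuation simply restores the model that the quantization path saved with `user_model.save(args.output_dir)`. The call below is an illustrative assumption, not code taken from this commit:

```python
# Hypothetical continuation of the truncated branch above (not taken from this
# diff): restore the quantized model saved via user_model.save(args.output_dir).
from neural_compressor.torch.quantization import load

user_model = load(args.output_dir)  # assumed: load() accepts the saved directory
user_model.eval()
```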
@@ -10,3 +10,4 @@ einops
neural-compressor
lm_eval==0.4.3
peft
intel_extension_for_pytorch
@@ -198,7 +198,6 @@ def run_fn(model):
user_model.save(args.output_dir)

if args.load:
# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
if args.int8 or args.int8_bf16_mixed:
print("load int8 model")
from neural_compressor.torch.quantization import load
@@ -13,3 +13,5 @@ tiktoken #qwen
einops #qwen
auto_round
lm-eval==0.4.3
numba
tbb
@@ -13,3 +13,5 @@ einops #qwen
auto_round
lm-eval==0.4.3
huggingface_hub
numba
tbb
@@ -1,179 +1,96 @@
Step-by-Step
============
This document describes the step-by-step instructions to run large language models (LLMs) on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch.
Weight-only quantization
===============

The script `run_clm_no_trainer.py` currently supports quantization of `GPTJ`, `OPT`, `LLaMA2`, `BLOOM`, and `Falcon`, and validates last-word-prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git); more models are being added.

# Prerequisite
## 1. Create Environment
## Prerequisite
```
# Installation
pip install -r requirements.txt
```

# Run
## Support status on HPU

Here is how to run the scripts:
Below is the current support status on Intel Gaudi AI Accelerator with PyTorch.

**Causal Language Modeling (CLM)**
| woq_algo | Status |
|--------------|----------|
| GPTQ | &#10004;|

`run_clm_no_trainer.py` quantizes large language models using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset for calibration and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag`, and other datasets provided by lm_eval; an example command is as follows.
### GPT-J-6b
> We validated the typical LLMs such as: `meta-llama/Llama-2-7b-hf`, `EleutherAI/gpt-j-6B`, `facebook/opt-125m`.
#### Quantization
## Support status on CPU

```bash
# "--woq_algo GPTQ" is used to enable GPTQ algorithms
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4" \
--output_dir saved_results
Below is the current support status on Intel® Xeon® Scalable Processor with PyTorch.

# "--woq_algo RTN" is used to enable RTN algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo RTN \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4"
--output_dir saved_results

# "--woq_algo AWQ" is used to enable AWQ algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AWQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--calib_iters 128
| woq_algo | Status |
|--------------|----------|
| RTN | &#10004; |
| GPTQ | &#10004; |
| AutoRound| &#10004; |
| AWQ | &#10004; |
| TEQ | &#10004; |

# "--woq_algo AutoRound" is used to enable AutoRound algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AutoRound \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128
> We validated the typical LLMs such as: `meta-llama/Llama-2-7b-hf`, `EleutherAI/gpt-j-6B`, `facebook/opt-125m`.
# "--accuracy" for eval
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--int8 \
--accuracy \
--tasks "lambada_openai" \
--output_dir saved_results
```
**Notes**: Weight-only quantization based on fake quantization is supported as a preview and covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API supports various CLMs, including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with another model to quantize a different CLM with GPTQ.

## Run

### OPT-125m
`run_clm_no_trainer.py` quantizes large language models using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset for calibration and validates accuracy on datasets provided by lm_eval; an example command is as follows.

#### Quantization
### Quantization

```bash
# "--woq_algo GPTQ" is used to enable GPTQ algorithms
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--model meta-llama/Llama-2-7b-hf \
--dataset NeelNanda/pile-10k \
--quantize \
--batch_size 8 \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4"

# "--woq_algo RTN" is used to enable RTN algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo RTN \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4"

# "--woq_algo AWQ" is used to enable AWQ algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AWQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--calib_iters 128
--output_dir saved_results
```
### Evaluation

# "--woq_algo AutoRound" is used to enable AutoRound algorithms
```bash
# original model
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AutoRound \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128
--model meta-llama/Llama-2-7b-hf \
--accuracy \
--batch_size 8 \
--tasks "lambada_openai,wikitext" \
--output_dir saved_results

# "--accuracy" for eval
# quantized model
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--int8 \
--model meta-llama/Llama-2-7b-hf \
--load \
--accuracy \
--tasks "lambada_openai" \
--batch_size 8 \
--tasks "lambada_openai,wikitext" \
--output_dir saved_results
```

### LLAMA2-7b/13b/70b
#### Quantization
### Benchmark

```bash
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
# "--woq_algo GPTQ" is used to enable GPTQ algorithms
# original model
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4"
--performance \
--batch_size 8 \
--output_dir saved_results

# "--woq_algo RTN" is used to enable RTN algorithms
# quantized model
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo RTN \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4"
--load \
--performance \
--batch_size 8 \
--output_dir saved_results
```


[1]. Frantar, Elias, et al. "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers." arXiv preprint arXiv:2210.17323 (2022).
[2]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
For more information about parameter usage, please refer to [PT_WeightOnlyQuant.md](https://github.com/intel/neural-compressor/blob/master/docs/source/3x/PT_WeightOnlyQuant.md)
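For readers who prefer to call the 3.x API directly rather than through `run_clm_no_trainer.py`, below is a minimal GPTQ weight-only sketch. It assumes the `GPTQConfig`/`prepare`/`convert` interface documented in PT_WeightOnlyQuant.md; the model name, calibration text, and settings are illustrative and mirror the script's `--woq_bits`, `--woq_scheme`, and `--woq_group_size` flags.

```python
# Minimal GPTQ weight-only sketch (assumed 3.x PyTorch API; see
# PT_WeightOnlyQuant.md). Values map to --woq_bits/--woq_scheme/--woq_group_size.
from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.torch.quantization import GPTQConfig, convert, prepare

model_name = "facebook/opt-125m"  # small model chosen for a quick trial
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit, asymmetric, group size 128 - the same settings as the commands above.
quant_config = GPTQConfig(bits=4, use_sym=False, group_size=128)
model = prepare(model, quant_config)

# Calibration: feed a handful of samples through the prepared model.
calib_texts = ["Intel Neural Compressor supports weight-only quantization."] * 8
for text in calib_texts:
    model(**tokenizer(text, return_tensors="pt"))

model = convert(model)  # produces the GPTQ-quantized model
```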
@@ -11,4 +11,5 @@ neural-compressor
lm_eval==0.4.3
peft
auto_round
intel_extension_for_pytorch
numba
tbb
@@ -66,7 +66,7 @@ function run_benchmark {
fi

if [[ ${int8} == "true" ]]; then
extra_cmd=$extra_cmd" --int8"
extra_cmd=$extra_cmd" --load"
fi
echo $extra_cmd
