
Commit

Remove the xla_gpu_simplify_all_fp_conversions flag: it has been removed from XLA and its behavior is now enabled by default
nouiz committed Aug 19, 2024
1 parent 3f35bf3 commit b7cfe94
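Newer XLA builds reject unknown entries in `XLA_FLAGS`, so a wrapper environment that still injects the retired flag would break them. A hypothetical clean-up for such environments (not part of this commit):

```
# Hypothetical, not part of this commit: strip the retired flag from an
# inherited XLA_FLAGS so newer XLA builds do not abort on an unknown flag.
export XLA_FLAGS="${XLA_FLAGS//--xla_gpu_simplify_all_fp_conversions/}"
```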
Showing 10 changed files with 9 additions and 12 deletions.
@@ -662,7 +662,7 @@ index 89974dd..388d2ec 100755
# If true, this will duplicate the last checkpoint in MODEL_DIR and add a date/time string. It will finetune on this directory. Useful if running many experiments on the same pretrained checkpoint.
MAKE_FT_DIR=${9:-false} # 'true' or 'false'.

-export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
-export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
+: ${WITH_NSYS:=0}

case $MAKE_FT_DIR in
@@ -771,7 +771,7 @@ index 18bb722..f807105 100755
+CHECKPOINT_DISABLE=${CHECKPOINT_DISABLE:=0}

-# Setting XLA flags
-export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
-export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
+MODEL_DIR_LOCAL=${MODEL_DIR:=model_dir}
+MODEL_DIR=${T5X_WORKSPACE_DIR}/${MODEL_DIR_LOCAL}

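The replacement lines in the patch above (`: ${WITH_NSYS:=0}`, `CHECKPOINT_DISABLE=${CHECKPOINT_DISABLE:=0}`) rely on the shell's default-assignment expansion. A standalone sketch of the idiom, with the variable name reused purely for illustration:

```
# ":" is the shell no-op; the ${VAR:=default} expansion inside it assigns
# the default only when VAR is unset or empty.
: ${WITH_NSYS:=0}
echo "$WITH_NSYS"    # 0, unless WITH_NSYS was already set in the environment

WITH_NSYS=1
: ${WITH_NSYS:=0}    # no effect; the existing value wins
echo "$WITH_NSYS"    # 1
```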
1 change: 0 additions & 1 deletion rosetta/docs/PGLE.md
@@ -62,7 +62,6 @@ In order to get the best performance with PGLE, here is a list of all recommended
```
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
--xla_gpu_enable_triton_gemm=false
--xla_gpu_simplify_all_fp_conversions
--xla_gpu_graph_level=0
--xla_gpu_enable_highest_priority_async_stream=true
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
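With the retired flag dropped, the surviving recommendation can be exported the same way the project scripts below do, appending any pre-existing `XLA_FLAGS`. A sketch using only the flags visible in this excerpt (the full recommended list in PGLE.md continues past what the hunk shows):

```
# Sketch only: PGLE.md's recommended list is longer than this excerpt.
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
 --xla_gpu_graph_level=0
 --xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 ${XLA_FLAGS}"
```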
@@ -37,7 +37,7 @@ TRAIN_GPUS=$((NUM_GPUS - INF_SERV_CT))
echo "Please make sure ${NUM_GPUS} is the number of visible CUDA devices you have"

# Setting XLA flags
export XLA_FLAGS='--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880'
export XLA_FLAGS='--xla_gpu_all_reduce_combine_threshold_bytes=136314880'

# Global batch size
BSIZE=$(( TRAIN_GPUS * BSIZE_PER_GPU ))
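The global batch size in these scripts is simply the product of GPU count and per-GPU batch size. For instance, with illustrative numbers not taken from the diff:

```
TRAIN_GPUS=8          # illustrative
BSIZE_PER_GPU=32      # illustrative
BSIZE=$(( TRAIN_GPUS * BSIZE_PER_GPU ))
echo "$BSIZE"         # 256
```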
1 change: 0 additions & 1 deletion rosetta/rosetta/projects/maxtext/README.md
@@ -71,7 +71,6 @@ XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
--xla_gpu_enable_async_all_gather=true
--xla_gpu_enable_async_reduce_scatter=true
--xla_gpu_enable_triton_gemm=false
--xla_gpu_simplify_all_fp_conversions
--xla_gpu_graph_level=0
--xla_gpu_enable_async_all_reduce=true
--xla_gpu_enable_highest_priority_async_stream=true
3 changes: 1 addition & 2 deletions rosetta/rosetta/projects/maxtext/scripts/example_slurm.sub
@@ -56,7 +56,6 @@ export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
--xla_gpu_enable_async_all_gather=true
--xla_gpu_enable_async_reduce_scatter=true
--xla_gpu_enable_triton_gemm=false
--xla_gpu_simplify_all_fp_conversions
--xla_gpu_graph_level=0
--xla_gpu_enable_async_all_reduce=true
--xla_gpu_enable_highest_priority_async_stream=true
@@ -114,4 +113,4 @@ mkdir -p "${BASE_WORKSPACE_DIR}/${OUTPUT_DIR}/${RUN_NAME}"
OUTFILE="${BASE_WORKSPACE_DIR}/${OUTPUT_DIR}/${RUN_NAME}/output-%j-%n-%t.txt"

echo $cmd
srun -o $OUTFILE -e $OUTFILE --container-image="$CONTAINER" $MOUNTS $EXPORTS bash -c "${cmd}"
srun -o $OUTFILE -e $OUTFILE --container-image="$CONTAINER" $MOUNTS $EXPORTS bash -c "${cmd}"
2 changes: 1 addition & 1 deletion rosetta/rosetta/projects/pax/README.md
@@ -139,7 +139,7 @@ For the 126M model, we recommend setting `--xla_gpu_all_reduce_combine_threshold_bytes`

```
BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
--xla_gpu_simplify_all_fp_conversions --xla_gpu_enable_async_all_gather=true
--xla_gpu_enable_async_all_gather=true
--xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
--xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=33554432
--xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true" bash run_pile_multinode.sh ...
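Note the invocation pattern in the hunk above: prefixing `bash run_pile_multinode.sh` with `BASE_XLA_FLAGS="..."` exports the variable only into that command's environment, not the calling shell. A minimal demonstration of the mechanism (flag value is just an example):

```
# VAR=value cmd makes VAR visible to cmd but not to the surrounding shell.
BASE_XLA_FLAGS="--xla_gpu_graph_level=0" bash -c 'echo "child sees: $BASE_XLA_FLAGS"'
echo "parent sees: ${BASE_XLA_FLAGS:-<unset>}"
```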
@@ -38,7 +38,7 @@ TRAIN_GPUS=$(( NUM_GPUS * SLURM_JOB_NUM_NODES ))
echo "Please make sure ${TRAIN_GPUS} is the number of visible CUDA devices you have"

# Setting XLA flags
export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.9

# Global batch size
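These scripts also pin `XLA_PYTHON_CLIENT_MEM_FRACTION=0.9`, which makes JAX's GPU client preallocate 90% of each device's memory rather than its smaller default. A sketch of dialing it down when the GPU is shared (the value is illustrative):

```
# Illustrative: preallocate only half of device memory, e.g. when another
# process must co-exist on the same GPU.
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.5
```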
@@ -38,7 +38,7 @@ TRAIN_GPUS=$(( NUM_GPUS * SLURM_JOB_NUM_NODES ))
echo "Please make sure ${TRAIN_GPUS} is the number of visible CUDA devices you have"

# Setting XLA flags
export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.9

# Global batch size
@@ -38,7 +38,7 @@ TRAIN_GPUS=${NUM_GPUS}
echo "Please make sure ${TRAIN_GPUS} is the number of visible CUDA devices you have"

# Setting XLA flags
export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.9

# Global batch size
@@ -38,7 +38,7 @@ TRAIN_GPUS=${NUM_GPUS}
echo "Please make sure ${TRAIN_GPUS} is the number of visible CUDA devices you have"

# Setting XLA flags
export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.9

# Global batch size
