
Commit

Remove the xla_gpu_simplify_all_fp_conversions flag: it has been removed from XLA and its behavior is now enabled by default
nouiz committed Aug 19, 2024
1 parent 3f35bf3 commit b7cfe94
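Newer XLA builds reject unknown entries in `XLA_FLAGS`, so a wrapper environment that still injects the retired flag would break them. A hypothetical clean-up for such environments (not part of this commit):

```
# Hypothetical, not part of this commit: strip the retired flag from an
# inherited XLA_FLAGS so newer XLA builds do not abort on an unknown flag.
export XLA_FLAGS="${XLA_FLAGS//--xla_gpu_simplify_all_fp_conversions/}"
```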
Showing 10 changed files with 9 additions and 12 deletions.
@@ -662,7 +662,7 @@ index 89974dd..388d2ec 100755
# If true, this will duplicate the last checkpoint in MODEL_DIR and add a date/time string. It will finetune on this directory. Useful if running many experiments on the same pretrained checkpoint.
MAKE_FT_DIR=${9:-false} # 'true' or 'false'.

-export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
-export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
+: ${WITH_NSYS:=0}

case $MAKE_FT_DIR in
@@ -771,7 +771,7 @@ index 18bb722..f807105 100755
+CHECKPOINT_DISABLE=${CHECKPOINT_DISABLE:=0}

-# Setting XLA flags
-export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
-export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
+MODEL_DIR_LOCAL=${MODEL_DIR:=model_dir}
+MODEL_DIR=${T5X_WORKSPACE_DIR}/${MODEL_DIR_LOCAL}

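The replacement lines in the patch above (`: ${WITH_NSYS:=0}`, `CHECKPOINT_DISABLE=${CHECKPOINT_DISABLE:=0}`) rely on the shell's default-assignment expansion. A standalone sketch of the idiom, with the variable name reused purely for illustration:

```
# ":" is the shell no-op; the ${VAR:=default} expansion inside it assigns
# the default only when VAR is unset or empty.
: ${WITH_NSYS:=0}
echo "$WITH_NSYS"    # 0, unless WITH_NSYS was already set in the environment

WITH_NSYS=1
: ${WITH_NSYS:=0}    # no effect; the existing value wins
echo "$WITH_NSYS"    # 1
```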
1 change: 0 additions & 1 deletion rosetta/docs/PGLE.md
@@ -62,7 +62,6 @@ In order to get the best performance with PGLE, here is a list of all recommended
```
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
--xla_gpu_enable_triton_gemm=false
--xla_gpu_simplify_all_fp_conversions
--xla_gpu_graph_level=0
--xla_gpu_enable_highest_priority_async_stream=true
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
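With the retired flag dropped, the surviving recommendation can be exported the same way the project scripts below do, appending any pre-existing `XLA_FLAGS`. A sketch using only the flags visible in this excerpt (the full recommended list in PGLE.md continues past what the hunk shows):

```
# Sketch only: PGLE.md's recommended list is longer than this excerpt.
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
 --xla_gpu_graph_level=0
 --xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 ${XLA_FLAGS}"
```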
@@ -37,7 +37,7 @@ TRAIN_GPUS=$((NUM_GPUS - INF_SERV_CT))
echo "Please make sure ${NUM_GPUS} is the number of visible CUDA devices you have"

# Setting XLA flags
export XLA_FLAGS='--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880'
export XLA_FLAGS='--xla_gpu_all_reduce_combine_threshold_bytes=136314880'

# Global batch size
BSIZE=$(( TRAIN_GPUS * BSIZE_PER_GPU ))
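The global batch size in these scripts is simply the product of GPU count and per-GPU batch size. For instance, with illustrative numbers not taken from the diff:

```
TRAIN_GPUS=8          # illustrative
BSIZE_PER_GPU=32      # illustrative
BSIZE=$(( TRAIN_GPUS * BSIZE_PER_GPU ))
echo "$BSIZE"         # 256
```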
1 change: 0 additions & 1 deletion rosetta/rosetta/projects/maxtext/README.md
@@ -71,7 +71,6 @@ XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
--xla_gpu_enable_async_all_gather=true
--xla_gpu_enable_async_reduce_scatter=true
--xla_gpu_enable_triton_gemm=false
--xla_gpu_simplify_all_fp_conversions
--xla_gpu_graph_level=0
--xla_gpu_enable_async_all_reduce=true
--xla_gpu_enable_highest_priority_async_stream=true
3 changes: 1 addition & 2 deletions rosetta/rosetta/projects/maxtext/scripts/example_slurm.sub
@@ -56,7 +56,6 @@ export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
--xla_gpu_enable_async_all_gather=true
--xla_gpu_enable_async_reduce_scatter=true
--xla_gpu_enable_triton_gemm=false
--xla_gpu_simplify_all_fp_conversions
--xla_gpu_graph_level=0
--xla_gpu_enable_async_all_reduce=true
--xla_gpu_enable_highest_priority_async_stream=true
@@ -114,4 +113,4 @@ mkdir -p "${BASE_WORKSPACE_DIR}/${OUTPUT_DIR}/${RUN_NAME}"
OUTFILE="${BASE_WORKSPACE_DIR}/${OUTPUT_DIR}/${RUN_NAME}/output-%j-%n-%t.txt"

echo $cmd
srun -o $OUTFILE -e $OUTFILE --container-image="$CONTAINER" $MOUNTS $EXPORTS bash -c "${cmd}"
srun -o $OUTFILE -e $OUTFILE --container-image="$CONTAINER" $MOUNTS $EXPORTS bash -c "${cmd}"
2 changes: 1 addition & 1 deletion rosetta/rosetta/projects/pax/README.md
@@ -139,7 +139,7 @@ For the 126M model, we recommend setting `--xla_gpu_all_reduce_combine_threshold_bytes`

```
BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
--xla_gpu_simplify_all_fp_conversions --xla_gpu_enable_async_all_gather=true
--xla_gpu_enable_async_all_gather=true
--xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
--xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=33554432
--xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true" bash run_pile_multinode.sh ...
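Note the invocation pattern in the hunk above: prefixing `bash run_pile_multinode.sh` with `BASE_XLA_FLAGS="..."` exports the variable only into that command's environment, not the calling shell. A minimal demonstration of the mechanism (flag value is just an example):

```
# VAR=value cmd makes VAR visible to cmd but not to the surrounding shell.
BASE_XLA_FLAGS="--xla_gpu_graph_level=0" bash -c 'echo "child sees: $BASE_XLA_FLAGS"'
echo "parent sees: ${BASE_XLA_FLAGS:-<unset>}"
```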
@@ -38,7 +38,7 @@ TRAIN_GPUS=$(( NUM_GPUS * SLURM_JOB_NUM_NODES ))
echo "Please make sure ${TRAIN_GPUS} is the number of visible CUDA devices you have"

# Setting XLA flags
export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.9

# Global batch size
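These scripts also pin `XLA_PYTHON_CLIENT_MEM_FRACTION=0.9`, which makes JAX's GPU client preallocate 90% of each device's memory rather than its smaller default. A sketch of dialing it down when the GPU is shared (the value is illustrative):

```
# Illustrative: preallocate only half of device memory, e.g. when another
# process must co-exist on the same GPU.
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.5
```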
@@ -38,7 +38,7 @@ TRAIN_GPUS=$(( NUM_GPUS * SLURM_JOB_NUM_NODES ))
echo "Please make sure ${TRAIN_GPUS} is the number of visible CUDA devices you have"

# Setting XLA flags
export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.9

# Global batch size
@@ -38,7 +38,7 @@ TRAIN_GPUS=${NUM_GPUS}
echo "Please make sure ${TRAIN_GPUS} is the number of visible CUDA devices you have"

# Setting XLA flags
export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.9

# Global batch size
@@ -38,7 +38,7 @@ TRAIN_GPUS=${NUM_GPUS}
echo "Please make sure ${TRAIN_GPUS} is the number of visible CUDA devices you have"

# Setting XLA flags
export XLA_FLAGS="--xla_gpu_simplify_all_fp_conversions --xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_FLAGS="--xla_gpu_all_reduce_combine_threshold_bytes=136314880 ${XLA_FLAGS}"
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.9

# Global batch size
