Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update EFA/AWS-OFI-NCCL installation recipe #1116

Merged
merged 1 commit into from
Oct 24, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 31 additions & 29 deletions .github/container/install-efa.sh
Original file line number Diff line number Diff line change
@@ -1,35 +1,37 @@
#!/bin/bash
#
# Installs the Amazon EFA software and the AWS OFI NCCL plugin.

# -e: stop at the first failing command; -x: trace each command as it runs.
set -e -x

# Refresh the package index, then install curl, which the download steps
# further down rely on.
apt-get update
apt-get install --yes curl

# Purge any previously installed EFA, libfabric, Open MPI and RDMA/InfiniBand
# packages so that they cannot conflict with the ones shipped by the Amazon EFA
# installer.
stale_packages=(
  efa-config efa-profile libfabric openmpi
  ibacm ibverbs-providers ibverbs-utils infiniband-diags
  libibmad-dev libibmad5 libibnetdisc-dev libibnetdisc5
  libibumad-dev libibumad3 libibverbs-dev libibverbs1 librdmacm-dev
  librdmacm1 rdma-core rdmacm-utils
)
dpkg --purge "${stale_packages[@]}"

# Download the Amazon EFA installer bundle into a scratch directory and run it.
EFA_INSTALLER_VERSION=latest
WORKDIR=$(mktemp -d)

pushd "${WORKDIR}"

AMAZON_EFA_LINK="https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz"
# --fail: treat an HTTP error as a failed download instead of saving the
# server's error page under the tarball's name.
curl --fail -O "${AMAZON_EFA_LINK}"
# Kept as two statements: `set -e` does not abort on a failing command that is
# followed by `&&`, so a broken archive would otherwise only surface later as a
# missing installer script.
tar -xf "aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz"
cd aws-efa-installer
./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify

# Versions and install prefix used by the EFA and aws-ofi-nccl steps.
EFA_INSTALLER_VERSION=1.34.0 # or: latest
AWS_OFI_NCCL_PREFIX=/opt/aws-ofi-nccl
AWS_OFI_NCCL_VERSION=1.11.0

# apt-get rather than apt: apt's command-line interface is aimed at interactive
# use and warns when it is driven from a script.
apt-get update

# Fetch and unpack the EFA installer in a throwaway directory.
EFA_TMP=$(mktemp -d)
pushd "${EFA_TMP}"
# --fail: an HTTP error aborts the script here instead of leaving the server's
# error page on disk under the tarball's name.
curl --fail -O "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz"
tar -xf "aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz"
cd aws-efa-installer
rm -v DEBS/UBUNTU2204/x86_64/{libpmix,openmpi,prrte}* # block installation of MPI components
apt-get purge -y ibverbs-providers libibverbs-dev libibverbs1 libibumad-dev libibumad3 librdmacm1 librdmacm-dev ibverbs-utils
# A pipeline's exit status is that of its last command, so without pipefail a
# failing installer would be hidden behind tee's success. pipefail stays
# enabled for the remainder of the script.
set -o pipefail
./efa_installer.sh -g -y --skip-kmod --skip-limit-conf --no-verify |& tee install.log
# Keep the installer log next to the installed EFA files.
mv -v install.log /opt/amazon/efa/install.log
popd
rm -rf -- "${EFA_TMP}"

# Build the aws-ofi-nccl plugin from its release tarball in a throwaway
# directory and install it under ${AWS_OFI_NCCL_PREFIX}.
AWS_OFI_NCCL_TMP=$(mktemp -d)
pushd "${AWS_OFI_NCCL_TMP}"
apt-get install -y libhwloc-dev
# --fail: an HTTP error (e.g. a version that does not exist) aborts the script
# here instead of saving an error page as the tarball; -L follows redirects.
curl --fail -OL "https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}-aws/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws.tar.gz"
tar -xf "aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws.tar.gz"
cd "aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws"
./configure --prefix="${AWS_OFI_NCCL_PREFIX}" --with-libfabric=/opt/amazon/efa --with-cuda=/usr/local/cuda --with-mpi=/usr/local/mpi
make -j"$(nproc)" install
popd
rm -rf -- "${AWS_OFI_NCCL_TMP}"

# Housekeeping: empty apt's package cache, then discard the downloaded package
# index files together with the temporary download directory.
apt-get clean
rm -rf /var/lib/apt/lists/* ${WORKDIR}

# Register the plugin's library directory with the dynamic linker and refresh
# the linker cache. This entry ranks higher than HPC-X (presumably thanks to
# the 000_ file-name prefix), so the newly installed libnccl-net.so becomes the
# default.
printf '%s\n' "${AWS_OFI_NCCL_PREFIX}/lib" > /etc/ld.so.conf.d/000_aws_ofi_nccl.conf
ldconfig
Loading