Skip to content

Commit

Permalink
Revert unintended update of configs/sites/aws-pcluster/README.md
Browse files Browse the repository at this point in the history
  • Loading branch information
climbfuji committed Jan 25, 2024
1 parent 55411a1 commit dc789a6
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 54 deletions.
176 changes: 123 additions & 53 deletions configs/sites/aws-pcluster/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

### Base instance
Choose a basic AMI from the Community AMIs tab that matches your desired OS and parallelcluster version. Select an instance type of the same family that you are planning to use for the head and the compute nodes, and enough storage for a swap file and a spack-stack installation. For example:
- AMI ID: ami-07410779598773e7d (aws-parallelcluster-3.8.0-ubuntu-2204-lts-hvm-x86_64-202312160956 2023-12-16T10-00-45.861Z)
- Instance hpc7a.96xlarge
- Use 500GB of gp3 storage as /
- AMI ID: ami-093dab62f7840644b
- Instance hpc6a.48xlarge
- Use 350GB of gp3 storage as /

### Prerequisites
1. As `root`:
Expand Down Expand Up @@ -47,7 +47,8 @@ tar -xvf Lmod-8.7.tar.bz2
cd Lmod-8.7
# Note the weird prefix, lmod installs in PREFIX/lmod/X.Y automatically
./configure --prefix=/opt/ \
--with-lmodConfigDir=/opt/lmod/8.7/config 2>&1 | tee log.config
--with-lmodConfigDir=/opt/lmod/8.7/config \
2>&1 | tee log.config
make install 2>&1 | tee log.install
ln -sf /opt/lmod/lmod/init/profile /etc/profile.d/z00_lmod.sh
ln -sf /opt/lmod/lmod/init/cshrc /etc/profile.d/z00_lmod.csh
Expand All @@ -56,19 +57,19 @@ ln -sf /opt/lmod/lmod/init/profile.fish /etc/profile.d/z00_lmod.fish
# Add custom module locations and fix existing modules
#
# intelmpi
echo "conflict openmpi" >> /opt/intel/mpi/2021.9.0/modulefiles/intelmpi
echo 'if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.19.0amzn4.0 ] } {' >> /opt/intel/mpi/2021.9.0/modulefiles/intelmpi
echo ' module load libfabric-aws/1.19.0amzn4.0' >> /opt/intel/mpi/2021.9.0/modulefiles/intelmpi
echo '}' >> /opt/intel/mpi/2021.9.0/modulefiles/intelmpi
echo "conflict openmpi" >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi
echo 'if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.16.0~amzn4.0 ] } {' >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi
echo ' module load libfabric-aws/1.16.0~amzn4.0' >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi
echo '}' >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi
# openmpi
echo "conflict intelmpi" >> /usr/share/modules/modulefiles/openmpi/4.1.6
echo 'if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.19.0amzn4.0 ] } {' >> /usr/share/modules/modulefiles/openmpi/4.1.6
echo ' module load libfabric-aws/1.19.0amzn4.0' >> /usr/share/modules/modulefiles/openmpi/4.1.6
echo '}' >> /usr/share/modules/modulefiles/openmpi/4.1.6
echo "conflict intelmpi" >> /usr/share/modules/modulefiles/openmpi/4.1.4
echo 'if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.16.0~amzn4.0 ] } {' >> /usr/share/modules/modulefiles/openmpi/4.1.4
echo ' module load libfabric-aws/1.16.0~amzn4.0' >> /usr/share/modules/modulefiles/openmpi/4.1.4
echo '}' >> /usr/share/modules/modulefiles/openmpi/4.1.4
#
echo "module use /usr/share/modules/modulefiles" >> /etc/profile.d/z01_lmod.sh
### NO NOT ANY MORE ### echo "module use /opt/intel/mpi/2021.9.0/modulefiles" >> /etc/profile.d/z01_lmod.sh
### NO NOT ANY MORE ### echo "module use /home/ubuntu/jedi/modulefiles" >> /etc/profile.d/z01_lmod.sh
echo "module use /opt/intel/mpi/2021.6.0/modulefiles" >> /etc/profile.d/z01_lmod.sh
echo "module use /home/ubuntu/jedi/modulefiles" >> /etc/profile.d/z01_lmod.sh
#
# Log out completely, ssh back into the instance and check if lua modules work
exit
Expand All @@ -77,10 +78,10 @@ exit
ssh ...
# Now user ubuntu
module av
module load libfabric-aws/1.19.0amzn4.0
module load openmpi/4.1.6
module load libfabric-aws/1.16.0~amzn4.0
module load openmpi/4.1.4
module list
module unload openmpi/4.1.6
module unload openmpi/4.1.4
module load intelmpi
module list
module purge
Expand All @@ -102,18 +103,29 @@ apt install -y unzip
apt install -y automake
apt install -y xterm
apt install -y texlive
apt install -y cmake
# This is for ecflow
apt install -y qtcreator qtbase5-dev qt5-qmake
apt install -y libqt5widgets5
apt install -y qt5-default
apt install -y libqt5svg5-dev
apt install -y qt5dxcb-plugin
# For mysql
apt install -y mysql-server
# Test
mysql -u root
### # Remove AWS openmpi
### apt remove -y openmpi40-aws
# This is because boost doesn't work with the Intel compiler
apt install -y libboost1.71-dev
apt install -y libboost-chrono1.71-dev
apt install -y libboost-date-time1.71-dev
apt install -y libboost-exception1.71-dev
apt install -y libboost-filesystem1.71-dev
apt install -y libboost-program-options1.71-dev
apt install -y libboost-python1.71-dev
apt install -y libboost-regex1.71-dev
apt install -y libboost-serialization1.71-dev
apt install -y libboost-system1.71-dev
apt install -y libboost-test1.71-dev
apt install -y libboost-thread1.71-dev
apt install -y libboost-timer1.71-dev
# Python
apt install -y python3-dev python3-pip
Expand All @@ -122,16 +134,7 @@ apt install -y python3-dev python3-pip
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
apt-get update
apt-get install -y intel-hpckit-2024.0/all
exit
# As ubuntu
/opt/intel/modulefiles-setup.sh
# Back to root
sudo su
mv /home/ubuntu/modulefiles /opt/intel/modulefiles
echo "module unuse /opt/intel/mpi/2021.9.0/modulefiles" >> /etc/profile.d/z01_lmod.sh
echo "module use /opt/intel/modulefiles" >> /etc/profile.d/z01_lmod.sh
apt-get install -y intel-hpckit-2022.2.0/all
# Docker
# See https://docs.docker.com/engine/install/ubuntu/
Expand All @@ -155,7 +158,15 @@ service sshd restart
cd /usr/lib64/
ln -sf /usr/lib/x86_64-linux-gnu/libcrypt.so .
cd /usr/include
ln -sf python3.10/pyconfig.h .
ln -sf python3.8/pyconfig.h .
# Create swapfile - 100GB
dd if=/dev/zero of=/swapfile bs=128M count=800
chmod 600 /swapfile
mkswap /swapfile
swapon /swapfile
swapon -s
echo "/swapfile swap swap defaults 0 0" >> /etc/fstab
# Exit root session
exit
Expand All @@ -166,12 +177,73 @@ git config --global credential.helper cache

2. Log out and back in to enable X11 forwarding

3. Create directory for spack-stack external packages
3. Build ecflow outside of spack to be able to link against OS boost
```
mkdir -p /home/ubuntu/spack-stack/external
mkdir -p /home/ubuntu/jedi/ecflow-5.8.4/src
cd /home/ubuntu/jedi/ecflow-5.8.4/src
wget https://confluence.ecmwf.int/download/attachments/8650755/ecFlow-5.8.4-Source.tar.gz?api=v2
mv ecFlow-5.8.4-Source.tar.gz\?api\=v2 ecFlow-5.8.4-Source.tar.gz
tar -xvzf ecFlow-5.8.4-Source.tar.gz
export WK=/home/ubuntu/jedi/ecflow-5.8.4/src/ecFlow-5.8.4-Source
export BOOST_ROOT=/usr
# Build ecFlow
cd $WK
mkdir build
cd build
cmake .. -DPython3_EXECUTABLE=/usr/bin/python3 -DENABLE_STATIC_BOOST_LIBS=OFF -DCMAKE_INSTALL_PREFIX=/home/ubuntu/jedi/ecflow-5.8.4 2>&1 | tee log.cmake
make -j4 2>&1 | tee log.make
make install 2>&1 | tee log.install
# Create a modulefiles directory with the following ecflow/5.8.4 module in it (w/o the '%%%%...' lines):
mkdir -p /home/ubuntu/jedi/modulefiles/ecflow
vi /home/ubuntu/jedi/modulefiles/ecflow/5.8.4
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%Module1.0
module-whatis "Provides an ecflow-5.8.4 server+ui installation for use with spack."
conflict ecflow
proc ModulesHelp { } {
puts stderr "Provides an ecflow-5.8.4 server+ui installation for use with spack."
}
# Set this value
set ECFLOW_PATH "/home/ubuntu/jedi/ecflow-5.8.4"
prepend-path PATH "${ECFLOW_PATH}/bin"
prepend-path LD_LIBRARY_PATH "${ECFLOW_PATH}/lib"
prepend-path LIBRARY_PATH "${ECFLOW_PATH}/lib"
prepend-path CPATH "${ECFLOW_PATH}/include"
prepend-path CMAKE_PREFIX_PATH "${ECFLOW_PATH}"
prepend-path PYTHONPATH "${ECFLOW_PATH}/lib/python3.8/site-packages"
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
```

4. Install MySQL community server
```
cd /home/ubuntu/jedi
mkdir -p mysql-8.0.31/src
cd mysql-8.0.31/src
wget https://dev.mysql.com/get/Downloads/MySQL-8.0/mysql-server_8.0.32-1ubuntu20.04_amd64.deb-bundle.tar
tar -xvf mysql-server_8.0.32-1ubuntu20.04_amd64.deb-bundle.tar
# Switch to root
sudo su
dpkg -i *.deb
apt --fix-broken install
dpkg -i *.deb
# Use an empty password for root, choose legacy authentication method; test connection
mysql -u root
show databases;
# exit mysql
exit
# exit root session
exit
rm *.deb
```

4. Option 1: Testing existing site config in spack-stack (skip steps 5-7 afterwards)
5. Option 1: Testing existing site config in spack-stack (skip steps 6-8 afterwards)
```
mkdir -p /home/ubuntu/sandpit
cd /home/ubuntu/sandpit
Expand All @@ -188,7 +260,7 @@ spack module lmod refresh
spack stack setup-meta-modules
```

5. Option 2: Test configuring site from scratch
6. Option 2: Test configuring site from scratch
```
mkdir /home/ubuntu/jedi && cd /home/ubuntu/jedi
git clone -b develop --recursive https://github.com/jcsda/spack-stack spack-stack
Expand All @@ -199,41 +271,39 @@ spack env activate -p envs/unified-env
export SPACK_SYSTEM_CONFIG_PATH=/home/ubuntu/jedi/spack-stack/envs/unified-env/site
spack external find --scope system \
--exclude bison --exclude cmake \
--exclude curl --exclude openssl \
--exclude openssh
spack external find --scope system
spack external find --scope system perl
spack external find --scope system python
spack external find --scope system wget
spack external find --scope system mysql
spack external find --scope system texlive
spack external find --scope system sed
spack external find --scope system mysql
# No external find for pre-installed intel-oneapi-mpi (from pcluster AMI),
# and no way to add object entry to list using "spack config add".
echo " intel-oneapi-mpi:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " externals:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - spec: intel-oneapi-mpi@2021.9.0%intel@2022.1.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - spec: intel-oneapi-mpi@2021.6.0%intel@2022.1.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " prefix: /opt/intel" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " modules:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - libfabric-aws/1.19.0amzn4.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - libfabric-aws/1.16.0~amzn4.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - intelmpi" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
# Add external openmpi
echo " openmpi:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " externals:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - spec: openmpi@4.1.6%gcc@9.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - spec: openmpi@4.1.4%gcc@9.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " fabrics=ofi schedulers=slurm" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " prefix: /opt/amazon/openmpi" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " modules:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - libfabric-aws/1.19.0amzn4.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - openmpi/4.1.6" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - libfabric-aws/1.16.0~amzn4.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - openmpi/4.1.4" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
# Can't find qt5 because qtpluginfo is broken,
# and no way to add object entry to list using "spack config add".
echo " qt:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " buildable: False" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " externals:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - spec: qt@5.15.3" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - spec: qt@5.12.8" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " prefix: /usr" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
# Add external boost
Expand All @@ -255,15 +325,15 @@ spack compiler find --scope system
export -n SPACK_SYSTEM_CONFIG_PATH
spack config add "packages:mpi:buildable:False"
spack config add "packages:all:providers:mpi:[intel-oneapi-mpi@2021.9.0, openmpi@4.1.6]"
spack config add "packages:all:providers:mpi:[intel-oneapi-mpi@2021.6.0, openmpi@4.1.4]"
spack config add "packages:all:compiler:[intel@2022.1.0, gcc@9.4.0]"
# edit envs/unified-env/site/compilers.yaml and replace the following line in the **Intel** compiler section:
# environment: {}
# -->
# environment:
# prepend_path:
# LD_LIBRARY_PATH: '/opt/intel/oneapi/compiler/2021.9.0/linux/compiler/lib/intel64_lin'
# LD_LIBRARY_PATH: '/opt/intel/oneapi/compiler/2021.6.0/linux/compiler/lib/intel64_lin'
# set:
# I_MPI_PMI_LIBRARY: '/opt/slurm/lib/libpmi.so'
```
Expand Down
2 changes: 1 addition & 1 deletion spack

0 comments on commit dc789a6

Please sign in to comment.