-
Notifications
You must be signed in to change notification settings - Fork 30
Expand file tree
/
Copy pathcreate_derecho_env.sh
More file actions
executable file
·115 lines (96 loc) · 3.69 KB
/
create_derecho_env.sh
File metadata and controls
executable file
·115 lines (96 loc) · 3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/bin/bash
# Stop at the first command that exits with a non-zero status.
set -o errexit
# ml conda
# ENV_NAME="credit-derecho"
# CURR_DIR=`pwd`
# WHEEL_DIR="/glade/work/dgagne/credit-pytorch-envs/derecho-pytorch-mpi/wheels"
# echo $CURR_DIR
# conda create -n $ENV_NAME python=3.11
# conda init
# conda activate $ENV_NAME
# cd /glade/work/dgagne/credit-pytorch-envs/derecho-pytorch-mpi
# ./embed_nccl_vars_conda.sh
# cd $CURR_DIR
# pip install ${WHEEL_DIR}/torch-2.5.1+derecho.gcc.12.4.0.cray.mpich.8.1.29-cp311-cp311-linux_x86_64.whl
# pip install ${WHEEL_DIR}/torchvision-0.20.1+derecho.gcc.12.4.0-cp311-cp311-linux_x86_64.whl
# pip install -e .
#-----------------------------------------------------------
# set up an initial conda environment named ${CREDIT_ENV_NAME}
# containing Derecho-specific torch & MPI bits.
# (install torchmetrics at this point too; installing it later
# via pip risks an undesirable torch update.)
# Load the required modules one at a time, in a fixed order, then print
# the resulting module list for the log.
for required_module in ncarenv/24.12 gcc conda mkl; do
  module load "${required_module}"
done
module list
# Path to the ESMF makefile fragment; exported so child processes see it.
ESMFMKFILE="/glade/work/dgagne/esmf-8.9.1/lib/libO/Linux.gfortran.64.mpiuni.default/esmf.mk"
export ESMFMKFILE
# Repository root; the temporary environment spec is created inside it.
topdir=$(git rev-parse --show-toplevel)
# Name of the conda environment to create. A value already present in the
# caller's environment takes precedence over the default.
CREDIT_ENV_NAME=${CREDIT_ENV_NAME:-"credit-derecho"}
# Uniquely named scratch file that receives the environment specification.
# The path is quoted so a repository location containing spaces still works.
yml=$(mktemp --tmpdir="${topdir}" credit-derecho-tmp-XXXXXXXXXX.yml)
echo "${yml}"
#yml=derecho.yml
echo "Creating conda env \"${CREDIT_ENV_NAME}\""
# Write the environment spec. The delimiter is quoted so the text is emitted
# literally (no shell expansion), and the list items are indented so that
# "pipdeptree" is nested under the "pip:" entry, i.e. installed through pip,
# instead of being parsed as a sibling of "pip:".
# NOTE: the "name:" field differs from CREDIT_ENV_NAME; the create step in
# this script passes --name explicitly.
cat <<'EOF' > "${yml}"
name: credit
channels:
  - file:///glade/work/benkirk/consulting/conda-recipes/output
  - conda-forge/label/mpi-external
  - conda-forge
dependencies:
  - python=3.11
  - conda-tree
  - mpi4py =*=derecho*
  - pip
  - pytorch ==2.8.0=derecho*2000
  - torchvision =*=derecho*
  - torchmetrics
  - pip:
    - pipdeptree
EOF
# create the environment
# CONDA_VERBOSITY is read by conda (presumably equivalent to one -v flag;
# confirm against the conda configuration docs). TIMEFORMAT controls what
# the bash `time` keyword prints: elapsed real time with 3 decimals.
export CONDA_VERBOSITY=1
export TIMEFORMAT=$'--> Real time: %3R seconds'
# On failure: show the spec that was used, remove the temporary file so it
# is not left behind, report on stderr, and abort.
time conda env create \
  --name "${CREDIT_ENV_NAME}" \
  --file "${yml}" \
  || {
    cat "${yml}" >&2
    rm -f -- "${yml}"
    echo "ERROR Creating Conda env!" >&2
    exit 1
  }
rm -f -- "${yml}"
#-----------------------------------------------------------
# activate & fix the environment
conda activate "${CREDIT_ENV_NAME}"
echo "NCCLs - before cleanup:"
find "${CONDA_PREFIX}" -name "libnccl.*"
# remove PIP NCCL, if any.
# (echo-opt -> xgboost -> nvidia-nccl-cu12 -> problem.)
# Collect the matching package names first so `pip uninstall` is only run
# when there is something to remove; calling it with no package names is an
# error. The cleanup stays best-effort: a failed uninstall is reported on
# stderr but does not abort the script.
mapfile -t nccl_pip_pkgs < <(pip list | awk '/nvidia-nccl/ {print $1}')
if (( ${#nccl_pip_pkgs[@]} > 0 )); then
  pip uninstall -y "${nccl_pip_pkgs[@]}" \
    || echo "WARNING: failed to uninstall: ${nccl_pip_pkgs[*]}" >&2
else
  echo "No pip-installed nvidia-nccl packages found; nothing to remove."
fi
#-----------------------------------------------------------
# install credit (editable) with constraints to prevent pip
# from overwriting the conda-installed torch/torchvision/torchmetrics.
constraint_file=$(mktemp --tmpdir="${topdir}" credit-constraints-XXXXXXXXXX.txt)
# Pin the versions that are installed right now. grep exits non-zero when
# nothing matches; treat that as a hard error with a clear message (and
# remove the temporary file) instead of letting the script die silently.
if ! pip list --format=freeze \
  | grep -iE "^(torch|torchvision|torchmetrics)==" > "${constraint_file}"; then
  rm -f -- "${constraint_file}"
  echo "ERROR: no installed torch/torchvision/torchmetrics found to pin; refusing to run pip install unconstrained." >&2
  exit 1
fi
echo "Using pip constraints:"
cat "${constraint_file}"
# Record pip's exit status so the temporary file is always removed before
# a failure is propagated.
install_status=0
pip install --constraint "${constraint_file}" -e . || install_status=$?
rm -f -- "${constraint_file}"
if (( install_status != 0 )); then
  echo "ERROR: editable pip install failed (exit status ${install_status})" >&2
  exit "${install_status}"
fi
#----------------------------------------------------------------------
# Add activation file with appropriate modules to conda activation path
# Refuse to continue without an active environment: with an empty
# CONDA_PREFIX the path below would resolve to /etc/conda/activate.d.
: "${CONDA_PREFIX:?CONDA_PREFIX is not set; activate the conda environment first}"
activate_dir="${CONDA_PREFIX}/etc/conda/activate.d"
activate_shell="${activate_dir}/esmf_activate.sh"
# The activate.d directory is not guaranteed to exist in an environment,
# so create it before writing the hook.
mkdir -p -- "${activate_dir}"
# Quoted delimiter: the hook is written literally, with no expansion here.
cat << 'EOF' > "${activate_shell}"
module load ncarenv/24.12
module load gcc
module load mkl
export ESMFMKFILE="/glade/work/dgagne/esmf-8.9.1/lib/libO/Linux.gfortran.64.mpiuni.default/esmf.mk"
EOF
# esmpy is installed from the source tree of DJ's ESMF install (the esmf
# provided through the module system wasn't working properly); xesmf is
# then installed by name.
esmf_python_targets=(
  /glade/work/dgagne/esmf-8.9.1/src/addon/esmpy/
  xesmf
)
for pip_target in "${esmf_python_targets[@]}"; do
  pip install "${pip_target}"
done
# Print the dependency trees as seen by conda-tree and by pipdeptree.
conda-tree deptree --small
pipdeptree --depth 3
echo "NCCLs - after cleanup:"
find "${CONDA_PREFIX}" -name "libnccl.*"
# Smoke tests: torch must import (its version and CUDA availability are
# printed), and the credit package must import.
python -c "import torch; print('torch version:', torch.__version__); print('CUDA available:', torch.cuda.is_available())"
python -c "import credit"
echo
# Expand CONDA_PREFIX so the message shows the actual install location
# rather than the bare variable name.
echo "\"${CREDIT_ENV_NAME}\" conda environment for Derecho successfully installed into ${CONDA_PREFIX}"
echo "use \"conda activate ${CREDIT_ENV_NAME}\" to activate"