Commit b8f849b
Fix slurm installation. Moved config files from /common to /common/.slurm dir. Don't delete config files after installation, since they are linked to the target location. (#3328)

SilinPavel authored Aug 8, 2023
1 parent 639c44b commit b8f849b
Showing 2 changed files with 23 additions and 20 deletions.
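
The idea behind the change: the master generates the Slurm configuration once into a shared directory, and both master and workers consume it through symlinks, so the shared copies have to survive setup. Below is a minimal sketch of that flow, not the script itself; it assumes $SHARED_FOLDER is a filesystem mounted on the master and all workers, and uses /etc/slurm as a stand-in for the config location that the real script resolves at runtime.

#!/bin/bash
# Sketch only: illustrates the shared-config layout this commit moves to.
# Assumptions: $SHARED_FOLDER is shared storage visible to every node;
# /etc/slurm stands in for the resolved Slurm config location.

SLURM_COMMON_CONFIG_DIR="$SHARED_FOLDER/.slurm"
SLURM_CONFIG_LOCATION="/etc/slurm"

# Master: generate the config once into the shared directory.
mkdir -p "$SLURM_COMMON_CONFIG_DIR"
cat > "$SLURM_COMMON_CONFIG_DIR/slurm.conf" <<EOL
ControlMachine=$HOSTNAME
EOL

# Master and every worker: consume the shared copy through a symlink.
ln -s "$SLURM_COMMON_CONFIG_DIR/slurm.conf" "$SLURM_CONFIG_LOCATION/slurm.conf"

# Removing the shared files after setup (the old "rm -rf /common/slurm.conf ..."
# step) would leave this symlink dangling, which is why the cleanup was dropped.
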
36 changes: 19 additions & 17 deletions workflows/pipe-common/shell/slurm_setup_master
@@ -14,12 +14,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+SLURM_COMMON_CONFIG_DIR="$SHARED_FOLDER/.slurm"
SLURM_MASTER_SETUP_TASK="SLURMMasterSetup"
SLURM_MASTER_SETUP_TASK_WORKERS="SLURMMasterSetupWorkers"
CURRENT_PID=$$

configure_slurm() {

+mkdir -p ${SLURM_COMMON_CONFIG_DIR}

mkdir /var/spool/slurmctld
chown slurm: /var/spool/slurmctld
chmod 755 /var/spool/slurmctld
@@ -28,14 +31,14 @@ configure_slurm() {
touch /var/log/slurm_jobacct.log /var/log/slurm_jobcomp.log
chown slurm: /var/log/slurm_jobacct.log /var/log/slurm_jobcomp.log

-cat > /common/cgroup.conf <<EOL
+cat > ${SLURM_COMMON_CONFIG_DIR}/cgroup.conf <<EOL
CgroupAutomount=yes
CgroupMountpoint=/cgroup
ConstrainCores=no
ConstrainRAMSpace=no
EOL

-cat > /common/slurm.conf <<EOL
+cat > ${SLURM_COMMON_CONFIG_DIR}/slurm.conf <<EOL
ControlMachine=$HOSTNAME
#
#MailProg=/bin/mail
@@ -75,7 +78,7 @@ EOL

if (( _NODE_GPUS_COUNT > 0 ))
then
echo "GresTypes=gpu" >> /common/slurm.conf
echo "GresTypes=gpu" >> ${SLURM_COMMON_CONFIG_DIR}/slurm.conf
for device in $(ls /dev/ | grep -E "nvidia[0-9]+") ; do
echo "Name=gpu File=/dev/$device" >> /$_SLURM_CONFIG_LOCATION/gres.conf
done
@@ -90,23 +93,23 @@ EOL
for _NODE in ${_NODE_NAMES[*]} ; do
if (( _NODE_GPUS_COUNT > 0 ))
then
echo "NodeName=$_NODE NodeHostname=$_NODE NodeAddr=$(getent hosts $_NODE | awk '{ print $1 }') CPUs=$_WORKER_CORES RealMemory=$_NODE_RAM_COUNT Gres=gpu:$_NODE_GPUS_COUNT State=UNKNOWN" >> /common/slurm.conf
echo "NodeName=$_NODE NodeHostname=$_NODE NodeAddr=$(getent hosts $_NODE | awk '{ print $1 }') CPUs=$_WORKER_CORES RealMemory=$_NODE_RAM_COUNT Gres=gpu:$_NODE_GPUS_COUNT State=UNKNOWN" >> ${SLURM_COMMON_CONFIG_DIR}/slurm.conf
else
echo "NodeName=$_NODE NodeHostname=$_NODE NodeAddr=$(getent hosts $_NODE | awk '{ print $1 }') CPUs=$_WORKER_CORES RealMemory=$_NODE_RAM_COUNT State=UNKNOWN" >> /common/slurm.conf
echo "NodeName=$_NODE NodeHostname=$_NODE NodeAddr=$(getent hosts $_NODE | awk '{ print $1 }') CPUs=$_WORKER_CORES RealMemory=$_NODE_RAM_COUNT State=UNKNOWN" >> ${SLURM_COMMON_CONFIG_DIR}/slurm.conf
fi
done
echo "PartitionName=main.q Nodes=ALL Default=YES MaxTime=INFINITE State=UP" >> /common/slurm.conf
echo "PartitionName=main.q Nodes=ALL Default=YES MaxTime=INFINITE State=UP" >> ${SLURM_COMMON_CONFIG_DIR}/slurm.conf

if [ ! -z $CP_SLURM_LICENSES ]; then
echo "Licenses=$CP_SLURM_LICENSES" >> /common/slurm.conf
echo "Licenses=$CP_SLURM_LICENSES" >> ${SLURM_COMMON_CONFIG_DIR}/slurm.conf
fi

if [ ! -z "${CP_CAP_AUTOSCALE}" ]; then
echo "" >> /common/slurm.conf
echo "#" >> /common/slurm.conf
echo "#DYNAMIC CLUSTER" >> /common/slurm.conf
echo "MaxNodeCount=$(( ${CP_CAP_AUTOSCALE_WORKERS:-0} + ${node_count:-0} + 1 ))" >> /common/slurm.conf
echo "TreeWidth=65533" >> /common/slurm.conf
echo "" >> ${SLURM_COMMON_CONFIG_DIR}/slurm.conf
echo "#" >> ${SLURM_COMMON_CONFIG_DIR}/slurm.conf
echo "#DYNAMIC CLUSTER" >> ${SLURM_COMMON_CONFIG_DIR}/slurm.conf
echo "MaxNodeCount=$(( ${CP_CAP_AUTOSCALE_WORKERS:-0} + ${node_count:-0} + 1 ))" >> ${SLURM_COMMON_CONFIG_DIR}/slurm.conf
echo "TreeWidth=65533" >> ${SLURM_COMMON_CONFIG_DIR}/slurm.conf
fi
}

@@ -196,14 +199,14 @@ fi
_SLURM_CONFIG_LOCATION=$( slurm_config_location )
configure_slurm

-dd if=/dev/urandom bs=1 count=1024 > /common/munge.key
-cp /common/munge.key /etc/munge/
+dd if=/dev/urandom bs=1 count=1024 > ${SLURM_COMMON_CONFIG_DIR}/munge.key
+cp ${SLURM_COMMON_CONFIG_DIR}/munge.key /etc/munge/
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
su -c /usr/sbin/munged -s /bin/bash munge

-ln -s /common/slurm.conf "$_SLURM_CONFIG_LOCATION"
-ln -s /common/cgroup.conf "$_SLURM_CONFIG_LOCATION"
+ln -s ${SLURM_COMMON_CONFIG_DIR}/slurm.conf "$_SLURM_CONFIG_LOCATION"
+ln -s ${SLURM_COMMON_CONFIG_DIR}/cgroup.conf "$_SLURM_CONFIG_LOCATION"

slurmctld && slurmd

@@ -245,7 +248,6 @@ else
fi
done
pipe_log_success "All SLURM hosts are connected" "$SLURM_MASTER_SETUP_TASK_WORKERS"
-rm -rf /common/slurm.conf /common/cgroup.conf /common/munge.key
fi

if [ ! -z "${CP_CAP_AUTOSCALE}" ]; then
7 changes: 4 additions & 3 deletions workflows/pipe-common/shell/slurm_setup_worker
@@ -14,18 +14,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+SLURM_COMMON_CONFIG_DIR="$SHARED_FOLDER/.slurm"
SLURM_WORKER_SETUP_TASK="SLURMWorkerSetup"
CURRENT_PID=$$

add_worker() {

-cp /common/munge.key /etc/munge/
+cp ${SLURM_COMMON_CONFIG_DIR}/munge.key /etc/munge/
chown munge: /etc/munge/munge.key
chmod 0700 /etc/munge/munge.key
su -c /usr/sbin/munged -s /bin/bash munge

-ln -s /common/slurm.conf "$_SLURM_CONFIG_LOCATION"
-ln -s /common/cgroup.conf "$_SLURM_CONFIG_LOCATION"
+ln -s ${SLURM_COMMON_CONFIG_DIR}/slurm.conf "$_SLURM_CONFIG_LOCATION"
+ln -s ${SLURM_COMMON_CONFIG_DIR}/cgroup.conf "$_SLURM_CONFIG_LOCATION"
mkdir /var/spool/slurmd
chown slurm: /var/spool/slurmd
chmod 755 /var/spool/slurmd
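
As a side note, a quick way to sanity-check a node after setup is to confirm that the local config entries are symlinks resolving into the shared directory. This is a hypothetical check, assuming /etc/slurm is the resolved config location on the node:

ls -l /etc/slurm/slurm.conf /etc/slurm/cgroup.conf   # should show symlinks into $SHARED_FOLDER/.slurm
readlink -f /etc/slurm/slurm.conf                    # resolves to the shared copy
sinfo                                                # nodes should come up against the shared config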
