From dfa3b8a5aa053a218568e5129c1a5f00351b629e Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Wed, 13 Jan 2021 07:19:43 +0530 Subject: [PATCH 1/2] vdev_id: Support daisy-chained JBODs in multipath mode Within function sas_handler() userspace commands like '/usr/sbin/multipath' have been replaced with sourcing device details from within sysfs, which significantly reduced overhead and processing time. Multiple JBOD enclosures and their order are sourced from the bsg driver (/sys/class/enclosure) to isolate chassis top-level expanders, which are then dynamically indexed based on the host channel of the multipath subordinate disk member device being processed. Additionally, this change adds a "mixed" mode for slot identification, for environments where a ZFS server system may contain SAS disk slots with no expander (direct connect to HBA) while an attached external JBOD with an expander uses a different slot identifier method. How Has This Been Tested? ~~~~~~~~~~~~~~~~~~~~~~ Testing was performed on an AMD EPYC based dual-server high-availability multipath environment with multiple HBAs per ZFS server and four SAS JBODs. The two primary JBODs were multipath/cross-connected between the two ZFS-HA servers. The secondary JBODs were daisy-chained off of the primary JBODs using aligned SAS expander channels (JBOD-0 expanderA--->JBOD-1 expanderA, JBOD-0 expanderB--->JBOD-1 expanderB, etc). Pools were created, exported and re-imported, and imported globally with 'zpool import -a -d /dev/disk/by-vdev'. Low-level udev debug outputs were traced to isolate and resolve errors. Result: ~~~~~~~ Initial testing of a previous version of this change showed how the overhead of relying on userspace utilities like '/usr/sbin/multipath' and '/usr/bin/lsscsi' was exacerbated by increasing numbers of disks and JBODs. With four 60-disk SAS JBODs and 240 disks the time to process a udevadm trigger was 3 minutes 30 seconds, during which nearly all CPU cores were above 80% utilization. 
By switching reliance on userspace utilities to sysfs in this version, the udevadm trigger processing time was reduced to 12.2 seconds and negligible CPU load. This patch also fixes few shellcheck complains. Signed-off-by: Jeff Johnson Signed-off-by: Arshad Hussain --- cmd/vdev_id/vdev_id | 296 +++++++++++++++++++++++++++++++------------- 1 file changed, 208 insertions(+), 88 deletions(-) diff --git a/cmd/vdev_id/vdev_id b/cmd/vdev_id/vdev_id index fc79d22e9927..0df17fee5fc9 100755 --- a/cmd/vdev_id/vdev_id +++ b/cmd/vdev_id/vdev_id @@ -79,6 +79,34 @@ # channel 86:00.0 1 A # channel 86:00.0 0 B +# # +# # Example vdev_id.conf - multipath / multijbod-daisychaining +# # +# +# multipath yes +# multijbod yes +# +# # PCI_ID HBA PORT CHANNEL NAME +# channel 85:00.0 1 A +# channel 85:00.0 0 B +# channel 86:00.0 1 A +# channel 86:00.0 0 B + +# # +# # Example vdev_id.conf - multipath / mixed +# # +# +# multipath yes +# slot mix +# +# # PCI_ID HBA PORT CHANNEL NAME +# channel 85:00.0 3 A +# channel 85:00.0 2 B +# channel 86:00.0 3 A +# channel 86:00.0 2 B +# channel af:00.0 0 C +# channel af:00.0 1 C + # # # # Example vdev_id.conf - alias # # @@ -92,7 +120,6 @@ PATH=/bin:/sbin:/usr/bin:/usr/sbin CONFIG=/etc/zfs/vdev_id.conf PHYS_PER_PORT= DEV= -MULTIPATH= TOPOLOGY= BAY= @@ -107,6 +134,7 @@ Usage: vdev_id [-h] -e Create enclose device symlinks only (/dev/by-enclosure) -g Storage network topology [default="$TOPOLOGY"] -m Run in multipath mode + -j Run in multijbod mode -p number of phy's per switch port [default=$PHYS_PER_PORT] -h show this summary EOF @@ -114,11 +142,12 @@ EOF } map_slot() { - LINUX_SLOT=$1 - CHANNEL=$2 + local LINUX_SLOT=$1 + local CHANNEL=$2 + local MAPPED_SLOT= - MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} && \ - \\$4 ~ /^${CHANNEL}$|^$/ { print \\$3; exit }" $CONFIG` + MAPPED_SLOT=$(awk '$1 == "slot" && $2 == "${LINUX_SLOT}" && \ + $4 ~ /^${CHANNEL}$|^$/ { print $3; exit}' $CONFIG) if [ -z "$MAPPED_SLOT" ] ; then MAPPED_SLOT=$LINUX_SLOT fi 
@@ -126,46 +155,80 @@ map_slot() { } map_channel() { - MAPPED_CHAN= - PCI_ID=$1 - PORT=$2 + local MAPPED_CHAN= + local PCI_ID=$1 + local PORT=$2 case $TOPOLOGY in "sas_switch") - MAPPED_CHAN=`awk "\\$1 == \"channel\" && \\$2 == ${PORT} \ - { print \\$3; exit }" $CONFIG` + MAPPED_CHAN=$(awk -v port=$PORT \ + '$1 == "channel" && $2 == ${PORT} \ + { print $3; exit }' $CONFIG) ;; "sas_direct"|"scsi") - MAPPED_CHAN=`awk "\\$1 == \"channel\" && \ - \\$2 == \"${PCI_ID}\" && \\$3 == ${PORT} \ - { print \\$4; exit }" $CONFIG` + MAPPED_CHAN=$(awk -v pciID=$PCI_ID -v port=$PORT \ + '$1 == "channel" && $2 == pciID && $3 == port \ + {print $4}' $CONFIG) ;; esac printf "%s" ${MAPPED_CHAN} } +# map_jbod explainer: The bsg driver knows the difference between a SAS +# expander and fanout expander. Use hostX instance along with top-level +# (whole enclosure) expander instances in /sys/class/enclosure and +# matching a field in an array of expanders, using the index of the +# matched array field as the enclosure instance, thereby making jbod IDs +# dynamic. Avoids reliance on high overhead userspace commands like +# multipath and lsscsi and instead uses existing sysfs data. $HOSTCHAN +# variable derived from devpath gymnastics in sas_handler() function. +map_jbod() { + local DEV=$1 + DEVEXP=$(ls -l /sys/block/$DEV/device/ | grep enclos | awk -F/ '{print $(NF-1) }') + # Use "set --" to create index values (Arrays) + set -- $(ls -l /sys/class/enclosure | grep host${HOSTCHAN}/port-${HOSTCHAN}:${PORT} | awk '{print $9}') + # Push whole array set into JBOD. Now JBOD will work like an + # array + JBOD=$@ + + for i in "${JBOD}" + do + if [ "$i" = "$DEVEXP" ] ; then + MAPPED_JBOD=$i + break + fi + done + printf "%d" ${MAPPED_JBOD} +} + sas_handler() { if [ -z "$PHYS_PER_PORT" ] ; then - PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \ - {print \\$2; exit}" $CONFIG` + PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \ + {print $2; exit}' $CONFIG) fi PHYS_PER_PORT=${PHYS_PER_PORT:-4} + if ! 
echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" exit 1 fi if [ -z "$MULTIPATH_MODE" ] ; then - MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \ - {print \\$2; exit}" $CONFIG` + MULTIPATH_MODE=$(awk '$1 == "multipath" \ + {print $2; exit}' $CONFIG) + fi + + if [ -z "$MULTIJBOD_MODE" ] ; then + MULTIJBOD_MODE=$(awk '$1 == "multijbod" \ + {print $2; exit}' $CONFIG) fi # Use first running component device if we're handling a dm-mpath device if [ "$MULTIPATH_MODE" = "yes" ] ; then # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper if [ -z "$DM_NAME" ] ; then - DM_NAME=`ls -l --full-time /dev/mapper | - awk "/\/$DEV$/{print \\$9}"` + DM_NAME=$(ls -l --full-time /dev/mapper | + awk "/\/$DEV$/{print $9}") fi # For raw disks udev exports DEVTYPE=partition when @@ -175,19 +238,31 @@ sas_handler() { # we have to append the -part suffix directly in the # helper. if [ "$DEVTYPE" != "partition" ] ; then - PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + PART=$(echo $DM_NAME | awk -Fp '/p/{print "-part"$2}') fi # Strip off partition information. - DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + DM_NAME=$(echo $DM_NAME | sed 's/p[0-9][0-9]*$//') if [ -z "$DM_NAME" ] ; then return fi - # Get the raw scsi device name from multipath -ll. Strip off - # leading pipe symbols to make field numbering consistent. - DEV=`multipath -ll $DM_NAME | - awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + # Utilize DM device name to gather subordinate block devices + # using sysfs to avoid userspace utilities + DMDEV=$(echo $DM_NAME |awk '{gsub("../", " "); print $NF}') + + # Use sysfs pointers in /sys/block/dm-X/slaves because using + # userspace tools creates lots of overhead and should be avoided + # whenever possible. Use awk to isolate lowest instance of + # sd device member in dm device group regardless of string + # length. 
+ DEV=$(ls /sys/block/$DMDEV/slaves | awk ' + { len=sprintf ("%20s",length($0)); gsub(/ /,0,str); a[NR]=len "_" $0; } + END { + asort(a) + print substr(a[1],22) + }') + if [ -z "$DEV" ] ; then return fi @@ -196,7 +271,7 @@ sas_handler() { if echo $DEV | grep -q ^/devices/ ; then sys_path=$DEV else - sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + sys_path=$(udevadm info -q path -p /sys/block/$DEV 2>/dev/null) fi # Use positional parameters as an ad-hoc array @@ -206,44 +281,51 @@ sas_handler() { # Get path up to /sys/.../hostX i=1 + while [ $i -le $num_dirs ] ; do d=$(eval echo \${$i}) scsi_host_dir="$scsi_host_dir/$d" echo $d | grep -q -E '^host[0-9]+$' && break - i=$(($i + 1)) + i=$((i + 1)) done + # Lets grab the SAS host channel number and save it for JBOD sorting later + HOSTCHAN=$(echo $d | awk -F/ '{ gsub("host","",$NF); print $NF}') + if [ $i = $num_dirs ] ; then return fi - PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + PCI_ID=$(eval echo \${$((i -1))} | awk -F: '{print $2":"$3}') # In sas_switch mode, the directory four levels beneath # /sys/.../hostX contains symlinks to phy devices that reveal # the switch port number. In sas_direct mode, the phy links one # directory down reveal the HBA port. port_dir=$scsi_host_dir + case $TOPOLOGY in - "sas_switch") j=$(($i + 4)) ;; - "sas_direct") j=$(($i + 1)) ;; + "sas_switch") j=$((i + 4)) ;; + "sas_direct") j=$((i + 1)) ;; esac - i=$(($i + 1)) + i=$((i + 1)) + while [ $i -le $j ] ; do port_dir="$port_dir/$(eval echo \${$i})" - i=$(($i + 1)) + i=$((i + 1)) done - PHY=`ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}'` + PHY=$(ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}') if [ -z "$PHY" ] ; then PHY=0 fi - PORT=$(( $PHY / $PHYS_PER_PORT )) + PORT=$((PHY / PHYS_PER_PORT)) # Look in /sys/.../sas_device/end_device-X for the bay_identifier # attribute. 
end_device_dir=$port_dir + while [ $i -lt $num_dirs ] ; do d=$(eval echo \${$i}) end_device_dir="$end_device_dir/$d" @@ -251,37 +333,50 @@ sas_handler() { end_device_dir="$end_device_dir/sas_device/$d" break fi - i=$(($i + 1)) + i=$((i + 1)) done + # Add 'mix' slot type for environments where dm-multipath devices + # include end-devices connected via SAS expanders or direct connection + # to SAS HBA. A mixed connectivity environment such as # pool disk + # contained in a SAS JBOD and spare drives or log devices directly + # connected in a server backplane without expanders in the I/O path. SLOT= + case $BAY in "bay") - SLOT=`cat $end_device_dir/bay_identifier 2>/dev/null` + SLOT=$(cat $end_device_dir/bay_identifier 2>/dev/null) + ;; + "mix") + if [[ $(cat $end_device_dir/bay_identifier 2>/dev/null) ]] ; then + SLOT=$(cat $end_device_dir/bay_identifier 2>/dev/null) + else + SLOT=$(cat $end_device_dir/phy_identifier 2>/dev/null) + fi ;; "phy") - SLOT=`cat $end_device_dir/phy_identifier 2>/dev/null` + SLOT=$(cat $end_device_dir/phy_identifier 2>/dev/null) ;; "port") d=$(eval echo \${$i}) - SLOT=`echo $d | sed -e 's/^.*://'` + SLOT=$(echo $d | sed -e 's/^.*://') ;; "id") - i=$(($i + 1)) + i=$((i + 1)) d=$(eval echo \${$i}) - SLOT=`echo $d | sed -e 's/^.*://'` + SLOT=$(echo $d | sed -e 's/^.*://') ;; "lun") - i=$(($i + 2)) + i=$((i + 2)) d=$(eval echo \${$i}) - SLOT=`echo $d | sed -e 's/^.*://'` + SLOT=$(echo $d | sed -e 's/^.*://') ;; "ses") # look for this SAS path in all SCSI Enclosure Services # (SES) enclosures - sas_address=`cat $end_device_dir/sas_address 2>/dev/null` - enclosures=`lsscsi -g | \ - sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p'` + sas_address=$(cat $end_device_dir/sas_address 2>/dev/null) + enclosures=$(lsscsi -g | \ + sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p') for enclosure in $enclosures; do set -- $(sg_ses -p aes $enclosure | \ awk "/device slot number:/{slot=\$12} \ @@ -298,42 +393,55 @@ sas_handler() { return fi - CHAN=`map_channel 
$PCI_ID $PORT` - SLOT=`map_slot $SLOT $CHAN` - if [ -z "$CHAN" ] ; then - return + if [ "$MULTIJBOD_MODE" = "yes" ] ; then + CHAN=$(map_channel $PCI_ID $PORT) + SLOT=$(map_slot $SLOT $CHAN) + JBOD=$(map_jbod $DEV) + + if [ -z "$CHAN" ] ; then + return + fi + echo ${CHAN}-${JBOD}-${SLOT}${PART} + else + CHAN=$(map_channel $PCI_ID $PORT) + SLOT=$(map_slot $SLOT $CHAN) + + if [ -z "$CHAN" ] ; then + return + fi + echo ${CHAN}${SLOT}${PART} fi - echo ${CHAN}${SLOT}${PART} } scsi_handler() { if [ -z "$FIRST_BAY_NUMBER" ] ; then - FIRST_BAY_NUMBER=`awk "\\$1 == \"first_bay_number\" \ - {print \\$2; exit}" $CONFIG` + FIRST_BAY_NUMBER=$(awk '$1 == "first_bay_number" \ + {print $2; exit}' $CONFIG) fi FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0} if [ -z "$PHYS_PER_PORT" ] ; then - PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \ - {print \\$2; exit}" $CONFIG` + PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \ + {print $2; exit}' $CONFIG) fi PHYS_PER_PORT=${PHYS_PER_PORT:-4} + if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" exit 1 fi if [ -z "$MULTIPATH_MODE" ] ; then - MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \ - {print \\$2; exit}" $CONFIG` + MULTIPATH_MODE=$(awk '$1 == "multipath" \ + {print $2; exit}' $CONFIG) fi # Use first running component device if we're handling a dm-mpath device if [ "$MULTIPATH_MODE" = "yes" ] ; then # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper if [ -z "$DM_NAME" ] ; then - DM_NAME=`ls -l --full-time /dev/mapper | - awk "/\/$DEV$/{print \\$9}"` + DM_NAME=$(ls -l --full-time /dev/mapper | + awk "/\/$DEV$/{print \\$9}") fi # For raw disks udev exports DEVTYPE=partition when @@ -343,19 +451,19 @@ scsi_handler() { # we have to append the -part suffix directly in the # helper. if [ "$DEVTYPE" != "partition" ] ; then - PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + PART=$(echo $DM_NAME | awk -Fp '/p/{print "-part"$2}') fi # Strip off partition information. 
- DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + DM_NAME=$(echo $DM_NAME | sed 's/p[0-9][0-9]*$//') if [ -z "$DM_NAME" ] ; then return fi # Get the raw scsi device name from multipath -ll. Strip off # leading pipe symbols to make field numbering consistent. - DEV=`multipath -ll $DM_NAME | - awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + DEV=$(multipath -ll $DM_NAME | + awk '/running/{gsub("^[|]"," "); print $3 ; exit}') if [ -z "$DEV" ] ; then return fi @@ -364,7 +472,7 @@ scsi_handler() { if echo $DEV | grep -q ^/devices/ ; then sys_path=$DEV else - sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + sys_path=$(udevadm info -q path -p /sys/block/$DEV 2>/dev/null) fi # expect sys_path like this, for example: @@ -377,40 +485,43 @@ scsi_handler() { # Get path up to /sys/.../hostX i=1 + while [ $i -le $num_dirs ] ; do d=$(eval echo \${$i}) scsi_host_dir="$scsi_host_dir/$d" + echo $d | grep -q -E '^host[0-9]+$' && break - i=$(($i + 1)) + i=$((i + 1)) done if [ $i = $num_dirs ] ; then return fi - PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + PCI_ID=$(eval echo \${$((i -1))} | awk -F: '{print $2":"$3}') # In scsi mode, the directory two levels beneath # /sys/.../hostX reveals the port and slot. port_dir=$scsi_host_dir - j=$(($i + 2)) + j=$((i + 2)) - i=$(($i + 1)) + i=$((i + 1)) while [ $i -le $j ] ; do port_dir="$port_dir/$(eval echo \${$i})" - i=$(($i + 1)) + i=$((i + 1)) done set -- $(echo $port_dir | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/') PORT=$1 - SLOT=$(($2 + $FIRST_BAY_NUMBER)) + SLOT=$(($2 + FIRST_BAY_NUMBER)) if [ -z "$SLOT" ] ; then return fi - CHAN=`map_channel $PCI_ID $PORT` - SLOT=`map_slot $SLOT $CHAN` + CHAN=$(map_channel $PCI_ID $PORT) + SLOT=$(map_slot $SLOT $CHAN) + if [ -z "$CHAN" ] ; then return fi @@ -450,8 +561,8 @@ enclosure_handler () { PCI_ID=$(echo "$PCI_ID_LONG" | sed -r 's/^[0-9]+://g') # Name our device according to vdev_id.conf (like "L0" or "U1"). 
- NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \ - \$3 == \"$PORT_ID\") {print \$4int(count[\$4])}; count[\$4]++}" $CONFIG) + NAME=$(awk "/channel/{if ($1 == "channel" && $2 == "$PCI_ID" && \ + $3 == "$PORT_ID") {print ${4}int(count[$4])}; count[$4]++}" $CONFIG) echo "${NAME}" } @@ -489,7 +600,7 @@ alias_handler () { DM_PART= if echo $DM_NAME | grep -q -E 'p[0-9][0-9]*$' ; then if [ "$DEVTYPE" != "partition" ] ; then - DM_PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + DM_PART=$(echo $DM_NAME | awk -Fp '/p/{print "-part"$2}') fi fi @@ -497,21 +608,25 @@ alias_handler () { for link in $DEVLINKS ; do # Remove partition information to match key of top-level device. if [ -n "$DM_PART" ] ; then - link=`echo $link | sed 's/p[0-9][0-9]*$//'` + link=$(echo $link | sed 's/p[0-9][0-9]*$//') fi # Check both the fully qualified and the base name of link. - for l in $link `basename $link` ; do - alias=`awk "\\$1 == \"alias\" && \\$3 == \"${l}\" \ - { print \\$2; exit }" $CONFIG` - if [ -n "$alias" ] ; then - echo ${alias}${DM_PART} - return + for l in $link $(basename $link) ; do + if [ ! -z $l ]; then + alias=$(awk -v var=$l '($1 == "alias") && \ + ($3 == var) \ + { print $2; exit }' $CONFIG) + if [ -n "$alias" ] ; then + echo ${alias}${DM_PART} + return + fi fi done done } -while getopts 'c:d:eg:mp:h' OPTION; do +# main +while getopts 'c:d:eg:jmp:h' OPTION; do case ${OPTION} in c) CONFIG=${OPTARG} @@ -524,7 +639,9 @@ while getopts 'c:d:eg:mp:h' OPTION; do # create the enclosure device symlinks only. We also need # "enclosure_symlinks yes" set in vdev_id.config to actually create the # symlink. 
- ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") print $2}' $CONFIG) + ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") \ + print $2}' $CONFIG) + if [ "$ENCLOSURE_MODE" != "yes" ] ; then exit 0 fi @@ -535,6 +652,9 @@ while getopts 'c:d:eg:mp:h' OPTION; do p) PHYS_PER_PORT=${OPTARG} ;; + j) + MULTIJBOD_MODE=yes + ;; m) MULTIPATH_MODE=yes ;; @@ -555,11 +675,11 @@ if [ -z "$DEV" ] && [ -z "$ENCLOSURE_MODE" ] ; then fi if [ -z "$TOPOLOGY" ] ; then - TOPOLOGY=`awk "\\$1 == \"topology\" {print \\$2; exit}" $CONFIG` + TOPOLOGY=$(awk '($1 == "topology") {print $2; exit}' $CONFIG) fi if [ -z "$BAY" ] ; then - BAY=`awk "\\$1 == \"slot\" {print \\$2; exit}" $CONFIG` + BAY=$(awk '($1 == "slot") {print $2; exit}' $CONFIG) fi TOPOLOGY=${TOPOLOGY:-sas_direct} @@ -582,16 +702,16 @@ if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then fi # First check if an alias was defined for this device. -ID_VDEV=`alias_handler` +ID_VDEV=$(alias_handler) if [ -z "$ID_VDEV" ] ; then BAY=${BAY:-bay} case $TOPOLOGY in sas_direct|sas_switch) - ID_VDEV=`sas_handler` + ID_VDEV=$(sas_handler) ;; scsi) - ID_VDEV=`scsi_handler` + ID_VDEV=$(scsi_handler) ;; *) echo "Error: unknown topology $TOPOLOGY" From d2a1141e82f248bf8850566d8494459fdc7c9a38 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Wed, 13 Jan 2021 07:19:43 +0530 Subject: [PATCH 2/2] vdev_id: Support daisy-chained JBODs in multipath mode Within function sas_handler() userspace commands like '/usr/sbin/multipath' have been replaced with sourcing device details from within sysfs which reduced a significant amount of overhead and processing time. Multiple JBOD enclosures and their order are sourced from the bsg driver (/sys/class/enclosure) to isolate chassis top-level expanders, which are then dynamically indexed based on host channel of the multipath subordinate disk member device being processed. 
Additionally, this change adds a "mixed" mode for slot identification, for environments where a ZFS server system may contain SAS disk slots with no expander (direct connect to HBA) while an attached external JBOD with an expander uses a different slot identifier method. How Has This Been Tested? ~~~~~~~~~~~~~~~~~~~~~~ Testing was performed on an AMD EPYC based dual-server high-availability multipath environment with multiple HBAs per ZFS server and four SAS JBODs. The two primary JBODs were multipath/cross-connected between the two ZFS-HA servers. The secondary JBODs were daisy-chained off of the primary JBODs using aligned SAS expander channels (JBOD-0 expanderA--->JBOD-1 expanderA, JBOD-0 expanderB--->JBOD-1 expanderB, etc). Pools were created, exported and re-imported, and imported globally with 'zpool import -a -d /dev/disk/by-vdev'. Low-level udev debug outputs were traced to isolate and resolve errors. Result: ~~~~~~~ Initial testing of a previous version of this change showed how the overhead of relying on userspace utilities like '/usr/sbin/multipath' and '/usr/bin/lsscsi' was exacerbated by increasing numbers of disks and JBODs. With four 60-disk SAS JBODs and 240 disks the time to process a udevadm trigger was 3 minutes 30 seconds, during which nearly all CPU cores were above 80% utilization. By switching reliance on userspace utilities to sysfs in this version, the udevadm trigger processing time was reduced to 12.2 seconds with negligible CPU load. This patch also fixes a few shellcheck complaints. 
Signed-off-by: Jeff Johnson Signed-off-by: Arshad Hussain --- cmd/vdev_id/vdev_id | 287 +++++++++++++++++++++++++++++++------------- 1 file changed, 204 insertions(+), 83 deletions(-) diff --git a/cmd/vdev_id/vdev_id b/cmd/vdev_id/vdev_id index fc79d22e9927..660af88c1d83 100755 --- a/cmd/vdev_id/vdev_id +++ b/cmd/vdev_id/vdev_id @@ -79,6 +79,34 @@ # channel 86:00.0 1 A # channel 86:00.0 0 B +# # +# # Example vdev_id.conf - multipath / multijbod-daisychaining +# # +# +# multipath yes +# multijbod yes +# +# # PCI_ID HBA PORT CHANNEL NAME +# channel 85:00.0 1 A +# channel 85:00.0 0 B +# channel 86:00.0 1 A +# channel 86:00.0 0 B + +# # +# # Example vdev_id.conf - multipath / mixed +# # +# +# multipath yes +# slot mix +# +# # PCI_ID HBA PORT CHANNEL NAME +# channel 85:00.0 3 A +# channel 85:00.0 2 B +# channel 86:00.0 3 A +# channel 86:00.0 2 B +# channel af:00.0 0 C +# channel af:00.0 1 C + # # # # Example vdev_id.conf - alias # # @@ -92,7 +120,6 @@ PATH=/bin:/sbin:/usr/bin:/usr/sbin CONFIG=/etc/zfs/vdev_id.conf PHYS_PER_PORT= DEV= -MULTIPATH= TOPOLOGY= BAY= @@ -107,6 +134,7 @@ Usage: vdev_id [-h] -e Create enclose device symlinks only (/dev/by-enclosure) -g Storage network topology [default="$TOPOLOGY"] -m Run in multipath mode + -j Run in multijbod mode -p number of phy's per switch port [default=$PHYS_PER_PORT] -h show this summary EOF @@ -116,9 +144,10 @@ EOF map_slot() { LINUX_SLOT=$1 CHANNEL=$2 + MAPPED_SLOT= - MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} && \ - \\$4 ~ /^${CHANNEL}$|^$/ { print \\$3; exit }" $CONFIG` + MAPPED_SLOT=$(awk '$1 == "slot" && $2 == "${LINUX_SLOT}" && \ + $4 ~ /^${CHANNEL}$|^$/ { print $3; exit}' $CONFIG) if [ -z "$MAPPED_SLOT" ] ; then MAPPED_SLOT=$LINUX_SLOT fi @@ -132,40 +161,75 @@ map_channel() { case $TOPOLOGY in "sas_switch") - MAPPED_CHAN=`awk "\\$1 == \"channel\" && \\$2 == ${PORT} \ - { print \\$3; exit }" $CONFIG` + MAPPED_CHAN=$(awk -v port=$PORT \ + '$1 == "channel" && $2 == ${PORT} \ + { print $3; exit }' 
$CONFIG) ;; "sas_direct"|"scsi") - MAPPED_CHAN=`awk "\\$1 == \"channel\" && \ - \\$2 == \"${PCI_ID}\" && \\$3 == ${PORT} \ - { print \\$4; exit }" $CONFIG` + MAPPED_CHAN=$(awk -v pciID=$PCI_ID -v port=$PORT \ + '$1 == "channel" && $2 == pciID && $3 == port \ + {print $4}' $CONFIG) ;; esac printf "%s" ${MAPPED_CHAN} } +# map_jbod explainer: The bsg driver knows the difference between a SAS +# expander and fanout expander. Use hostX instance along with top-level +# (whole enclosure) expander instances in /sys/class/enclosure and +# matching a field in an array of expanders, using the index of the +# matched array field as the enclosure instance, thereby making jbod IDs +# dynamic. Avoids reliance on high overhead userspace commands like +# multipath and lsscsi and instead uses existing sysfs data. $HOSTCHAN +# variable derived from devpath gymnastics in sas_handler() function. +map_jbod() { + DEV=$1 + DEVEXP=$(ls -l /sys/block/$DEV/device/ | grep enclos | awk -F/ '{print $(NF-1) }') + + # Use "set --" to create index values (Arrays) + set -- $(ls -l /sys/class/enclosure | grep host${HOSTCHAN}/port-${HOSTCHAN}:${PORT} | awk '{print $9}') + # Push whole array set into JBOD. Now JBOD will work like an + # array + JBOD=$@ + + for i in "${JBOD}" + do + if [ "$i" = "$DEVEXP" ] ; then + MAPPED_JBOD=$i + break + fi + done + printf "%d" ${MAPPED_JBOD} +} + sas_handler() { if [ -z "$PHYS_PER_PORT" ] ; then - PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \ - {print \\$2; exit}" $CONFIG` + PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \ + {print $2; exit}' $CONFIG) fi PHYS_PER_PORT=${PHYS_PER_PORT:-4} + if ! 
echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" exit 1 fi if [ -z "$MULTIPATH_MODE" ] ; then - MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \ - {print \\$2; exit}" $CONFIG` + MULTIPATH_MODE=$(awk '$1 == "multipath" \ + {print $2; exit}' $CONFIG) + fi + + if [ -z "$MULTIJBOD_MODE" ] ; then + MULTIJBOD_MODE=$(awk '$1 == "multijbod" \ + {print $2; exit}' $CONFIG) fi # Use first running component device if we're handling a dm-mpath device if [ "$MULTIPATH_MODE" = "yes" ] ; then # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper if [ -z "$DM_NAME" ] ; then - DM_NAME=`ls -l --full-time /dev/mapper | - awk "/\/$DEV$/{print \\$9}"` + DM_NAME=$(ls -l --full-time /dev/mapper | + awk "/\/$DEV$/{print $9}") fi # For raw disks udev exports DEVTYPE=partition when @@ -175,19 +239,31 @@ sas_handler() { # we have to append the -part suffix directly in the # helper. if [ "$DEVTYPE" != "partition" ] ; then - PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + PART=$(echo $DM_NAME | awk -Fp '/p/{print "-part"$2}') fi # Strip off partition information. - DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + DM_NAME=$(echo $DM_NAME | sed 's/p[0-9][0-9]*$//') if [ -z "$DM_NAME" ] ; then return fi - # Get the raw scsi device name from multipath -ll. Strip off - # leading pipe symbols to make field numbering consistent. - DEV=`multipath -ll $DM_NAME | - awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + # Utilize DM device name to gather subordinate block devices + # using sysfs to avoid userspace utilities + DMDEV=$(echo $DM_NAME |awk '{gsub("../", " "); print $NF}') + + # Use sysfs pointers in /sys/block/dm-X/slaves because using + # userspace tools creates lots of overhead and should be avoided + # whenever possible. Use awk to isolate lowest instance of + # sd device member in dm device group regardless of string + # length. 
+ DEV=$(ls /sys/block/$DMDEV/slaves | awk ' + { len=sprintf ("%20s",length($0)); gsub(/ /,0,str); a[NR]=len "_" $0; } + END { + asort(a) + print substr(a[1],22) + }') + if [ -z "$DEV" ] ; then return fi @@ -196,7 +272,7 @@ sas_handler() { if echo $DEV | grep -q ^/devices/ ; then sys_path=$DEV else - sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + sys_path=$(udevadm info -q path -p /sys/block/$DEV 2>/dev/null) fi # Use positional parameters as an ad-hoc array @@ -206,44 +282,51 @@ sas_handler() { # Get path up to /sys/.../hostX i=1 + while [ $i -le $num_dirs ] ; do d=$(eval echo \${$i}) scsi_host_dir="$scsi_host_dir/$d" echo $d | grep -q -E '^host[0-9]+$' && break - i=$(($i + 1)) + i=$((i + 1)) done + # Lets grab the SAS host channel number and save it for JBOD sorting later + HOSTCHAN=$(echo $d | awk -F/ '{ gsub("host","",$NF); print $NF}') + if [ $i = $num_dirs ] ; then return fi - PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + PCI_ID=$(eval echo \${$((i -1))} | awk -F: '{print $2":"$3}') # In sas_switch mode, the directory four levels beneath # /sys/.../hostX contains symlinks to phy devices that reveal # the switch port number. In sas_direct mode, the phy links one # directory down reveal the HBA port. port_dir=$scsi_host_dir + case $TOPOLOGY in - "sas_switch") j=$(($i + 4)) ;; - "sas_direct") j=$(($i + 1)) ;; + "sas_switch") j=$((i + 4)) ;; + "sas_direct") j=$((i + 1)) ;; esac - i=$(($i + 1)) + i=$((i + 1)) + while [ $i -le $j ] ; do port_dir="$port_dir/$(eval echo \${$i})" - i=$(($i + 1)) + i=$((i + 1)) done - PHY=`ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}'` + PHY=$(ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}') if [ -z "$PHY" ] ; then PHY=0 fi - PORT=$(( $PHY / $PHYS_PER_PORT )) + PORT=$((PHY / PHYS_PER_PORT)) # Look in /sys/.../sas_device/end_device-X for the bay_identifier # attribute. 
end_device_dir=$port_dir + while [ $i -lt $num_dirs ] ; do d=$(eval echo \${$i}) end_device_dir="$end_device_dir/$d" @@ -251,37 +334,50 @@ sas_handler() { end_device_dir="$end_device_dir/sas_device/$d" break fi - i=$(($i + 1)) + i=$((i + 1)) done + # Add 'mix' slot type for environments where dm-multipath devices + # include end-devices connected via SAS expanders or direct connection + # to SAS HBA. A mixed connectivity environment such as # pool disk + # contained in a SAS JBOD and spare drives or log devices directly + # connected in a server backplane without expanders in the I/O path. SLOT= + case $BAY in "bay") - SLOT=`cat $end_device_dir/bay_identifier 2>/dev/null` + SLOT=$(cat $end_device_dir/bay_identifier 2>/dev/null) + ;; + "mix") + if [ $(cat $end_device_dir/bay_identifier 2>/dev/null) ] ; then + SLOT=$(cat $end_device_dir/bay_identifier 2>/dev/null) + else + SLOT=$(cat $end_device_dir/phy_identifier 2>/dev/null) + fi ;; "phy") - SLOT=`cat $end_device_dir/phy_identifier 2>/dev/null` + SLOT=$(cat $end_device_dir/phy_identifier 2>/dev/null) ;; "port") d=$(eval echo \${$i}) - SLOT=`echo $d | sed -e 's/^.*://'` + SLOT=$(echo $d | sed -e 's/^.*://') ;; "id") - i=$(($i + 1)) + i=$((i + 1)) d=$(eval echo \${$i}) - SLOT=`echo $d | sed -e 's/^.*://'` + SLOT=$(echo $d | sed -e 's/^.*://') ;; "lun") - i=$(($i + 2)) + i=$((i + 2)) d=$(eval echo \${$i}) - SLOT=`echo $d | sed -e 's/^.*://'` + SLOT=$(echo $d | sed -e 's/^.*://') ;; "ses") # look for this SAS path in all SCSI Enclosure Services # (SES) enclosures - sas_address=`cat $end_device_dir/sas_address 2>/dev/null` - enclosures=`lsscsi -g | \ - sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p'` + sas_address=$(cat $end_device_dir/sas_address 2>/dev/null) + enclosures=$(lsscsi -g | \ + sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p') for enclosure in $enclosures; do set -- $(sg_ses -p aes $enclosure | \ awk "/device slot number:/{slot=\$12} \ @@ -298,42 +394,55 @@ sas_handler() { return fi - CHAN=`map_channel 
$PCI_ID $PORT` - SLOT=`map_slot $SLOT $CHAN` - if [ -z "$CHAN" ] ; then - return + if [ "$MULTIJBOD_MODE" = "yes" ] ; then + CHAN=$(map_channel $PCI_ID $PORT) + SLOT=$(map_slot $SLOT $CHAN) + JBOD=$(map_jbod $DEV) + + if [ -z "$CHAN" ] ; then + return + fi + echo ${CHAN}-${JBOD}-${SLOT}${PART} + else + CHAN=$(map_channel $PCI_ID $PORT) + SLOT=$(map_slot $SLOT $CHAN) + + if [ -z "$CHAN" ] ; then + return + fi + echo ${CHAN}${SLOT}${PART} fi - echo ${CHAN}${SLOT}${PART} } scsi_handler() { if [ -z "$FIRST_BAY_NUMBER" ] ; then - FIRST_BAY_NUMBER=`awk "\\$1 == \"first_bay_number\" \ - {print \\$2; exit}" $CONFIG` + FIRST_BAY_NUMBER=$(awk '$1 == "first_bay_number" \ + {print $2; exit}' $CONFIG) fi FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0} if [ -z "$PHYS_PER_PORT" ] ; then - PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \ - {print \\$2; exit}" $CONFIG` + PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \ + {print $2; exit}' $CONFIG) fi PHYS_PER_PORT=${PHYS_PER_PORT:-4} + if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" exit 1 fi if [ -z "$MULTIPATH_MODE" ] ; then - MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \ - {print \\$2; exit}" $CONFIG` + MULTIPATH_MODE=$(awk '$1 == "multipath" \ + {print $2; exit}' $CONFIG) fi # Use first running component device if we're handling a dm-mpath device if [ "$MULTIPATH_MODE" = "yes" ] ; then # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper if [ -z "$DM_NAME" ] ; then - DM_NAME=`ls -l --full-time /dev/mapper | - awk "/\/$DEV$/{print \\$9}"` + DM_NAME=$(ls -l --full-time /dev/mapper | + awk "/\/$DEV$/{print \\$9}") fi # For raw disks udev exports DEVTYPE=partition when @@ -343,19 +452,19 @@ scsi_handler() { # we have to append the -part suffix directly in the # helper. if [ "$DEVTYPE" != "partition" ] ; then - PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + PART=$(echo $DM_NAME | awk -Fp '/p/{print "-part"$2}') fi # Strip off partition information. 
- DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + DM_NAME=$(echo $DM_NAME | sed 's/p[0-9][0-9]*$//') if [ -z "$DM_NAME" ] ; then return fi # Get the raw scsi device name from multipath -ll. Strip off # leading pipe symbols to make field numbering consistent. - DEV=`multipath -ll $DM_NAME | - awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + DEV=$(multipath -ll $DM_NAME | + awk '/running/{gsub("^[|]"," "); print $3 ; exit}') if [ -z "$DEV" ] ; then return fi @@ -364,7 +473,7 @@ scsi_handler() { if echo $DEV | grep -q ^/devices/ ; then sys_path=$DEV else - sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + sys_path=$(udevadm info -q path -p /sys/block/$DEV 2>/dev/null) fi # expect sys_path like this, for example: @@ -377,40 +486,43 @@ scsi_handler() { # Get path up to /sys/.../hostX i=1 + while [ $i -le $num_dirs ] ; do d=$(eval echo \${$i}) scsi_host_dir="$scsi_host_dir/$d" + echo $d | grep -q -E '^host[0-9]+$' && break - i=$(($i + 1)) + i=$((i + 1)) done if [ $i = $num_dirs ] ; then return fi - PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + PCI_ID=$(eval echo \${$((i -1))} | awk -F: '{print $2":"$3}') # In scsi mode, the directory two levels beneath # /sys/.../hostX reveals the port and slot. port_dir=$scsi_host_dir - j=$(($i + 2)) + j=$((i + 2)) - i=$(($i + 1)) + i=$((i + 1)) while [ $i -le $j ] ; do port_dir="$port_dir/$(eval echo \${$i})" - i=$(($i + 1)) + i=$((i + 1)) done set -- $(echo $port_dir | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/') PORT=$1 - SLOT=$(($2 + $FIRST_BAY_NUMBER)) + SLOT=$(($2 + FIRST_BAY_NUMBER)) if [ -z "$SLOT" ] ; then return fi - CHAN=`map_channel $PCI_ID $PORT` - SLOT=`map_slot $SLOT $CHAN` + CHAN=$(map_channel $PCI_ID $PORT) + SLOT=$(map_slot $SLOT $CHAN) + if [ -z "$CHAN" ] ; then return fi @@ -450,8 +562,8 @@ enclosure_handler () { PCI_ID=$(echo "$PCI_ID_LONG" | sed -r 's/^[0-9]+://g') # Name our device according to vdev_id.conf (like "L0" or "U1"). 
- NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \ - \$3 == \"$PORT_ID\") {print \$4int(count[\$4])}; count[\$4]++}" $CONFIG) + NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \ + \$3 == \"$PORT_ID\") {print \$4int(count[\$4])}; count[\$4]++}" "$CONFIG") echo "${NAME}" } @@ -489,7 +601,7 @@ alias_handler () { DM_PART= if echo $DM_NAME | grep -q -E 'p[0-9][0-9]*$' ; then if [ "$DEVTYPE" != "partition" ] ; then - DM_PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + DM_PART=$(echo $DM_NAME | awk -Fp '/p/{print "-part"$2}') fi fi @@ -497,21 +609,25 @@ alias_handler () { for link in $DEVLINKS ; do # Remove partition information to match key of top-level device. if [ -n "$DM_PART" ] ; then - link=`echo $link | sed 's/p[0-9][0-9]*$//'` + link=$(echo $link | sed 's/p[0-9][0-9]*$//') fi # Check both the fully qualified and the base name of link. - for l in $link `basename $link` ; do - alias=`awk "\\$1 == \"alias\" && \\$3 == \"${l}\" \ - { print \\$2; exit }" $CONFIG` - if [ -n "$alias" ] ; then - echo ${alias}${DM_PART} - return + for l in $link $(basename $link) ; do + if [ ! -z $l ]; then + alias=$(awk -v var=$l '($1 == "alias") && \ + ($3 == var) \ + { print $2; exit }' $CONFIG) + if [ -n "$alias" ] ; then + echo ${alias}${DM_PART} + return + fi fi done done } -while getopts 'c:d:eg:mp:h' OPTION; do +# main +while getopts 'c:d:eg:jmp:h' OPTION; do case ${OPTION} in c) CONFIG=${OPTARG} @@ -524,7 +640,9 @@ while getopts 'c:d:eg:mp:h' OPTION; do # create the enclosure device symlinks only. We also need # "enclosure_symlinks yes" set in vdev_id.config to actually create the # symlink.
- ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") print $2}' $CONFIG) + ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") \ + print $2}' $CONFIG) + if [ "$ENCLOSURE_MODE" != "yes" ] ; then exit 0 fi @@ -535,6 +653,9 @@ while getopts 'c:d:eg:mp:h' OPTION; do p) PHYS_PER_PORT=${OPTARG} ;; + j) + MULTIJBOD_MODE=yes + ;; m) MULTIPATH_MODE=yes ;; @@ -555,11 +676,11 @@ if [ -z "$DEV" ] && [ -z "$ENCLOSURE_MODE" ] ; then fi if [ -z "$TOPOLOGY" ] ; then - TOPOLOGY=`awk "\\$1 == \"topology\" {print \\$2; exit}" $CONFIG` + TOPOLOGY=$(awk '($1 == "topology") {print $2; exit}' $CONFIG) fi if [ -z "$BAY" ] ; then - BAY=`awk "\\$1 == \"slot\" {print \\$2; exit}" $CONFIG` + BAY=$(awk '($1 == "slot") {print $2; exit}' $CONFIG) fi TOPOLOGY=${TOPOLOGY:-sas_direct} @@ -582,16 +703,16 @@ if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then fi # First check if an alias was defined for this device. -ID_VDEV=`alias_handler` +ID_VDEV=$(alias_handler) if [ -z "$ID_VDEV" ] ; then BAY=${BAY:-bay} case $TOPOLOGY in sas_direct|sas_switch) - ID_VDEV=`sas_handler` + ID_VDEV=$(sas_handler) ;; scsi) - ID_VDEV=`scsi_handler` + ID_VDEV=$(scsi_handler) ;; *) echo "Error: unknown topology $TOPOLOGY"