iomonitor – wrapper script for ioping
Link to it on my github because formatting is screwed up here
This is a wrapper script for ioping. It can be run from a cron job (e.g. with https://healthchecks.io) or as an NRPE command for Nagios. Use --nagios-perfdata to generate perfdata for Nagios to consume.
I needed a way to track I/O latency on a VM hypervisor node (oVirt) because one oVirt node of three kept reporting latency to storage, but it was the only one reporting it (and it was guaranteed not to be a config issue). I set this up in Nagios to run every minute with a count of 15 runs, which usually takes ~15 seconds.
This is what it looks like inside NagiosXI
#!/usr/bin/env bash
#
# iomonitor - wrapper script for ioping.
#
# Can be run from a cron job or as an NRPE command for Nagios. Use
# --nagios-perfdata to generate perfdata for Nagios to consume.
#
# I needed a way to track I/O latency on a VM hypervisor node (ovirt)
# because the ovirt engine kept reporting latencies but it was the only one
# reporting it (and guaranteed not a config issue). I set this up in Nagios
# to run every minute and run for 15 runs which is usually ~15 seconds.
#
# It is suggested to first get a baseline for what your system looks like by
# running the script with all zeros for crit/warn, then using the "raw data"
# line to generate some values you consider warn/critical. I used a count of
# 120 (2 minutes), then min/max/avg * 1.5 for warning and * 2.5 for critical.
#
# * While running this I did the following on my home directory
#
#     while [ true ]; do ls -alhtrR $HOME; done
#
#   to generate some I/O without using dd; figured all the stat() calls
#   would be better geared towards real use.
#
# Example:
#
#   ./iomonitor --directory /tmp --min-warn 0 --min-crit 0 --max-warn 0 \
#     --max-crit 0 --avg-warn 0 --avg-crit 0 --count 120
#
# Options:
#   -d, --directory DIR    directory to run ioping in (if omitted, the
#                          current directory is used)
#   -c, --count N          number of ioping requests (default: 15)
#   --min-warn SECONDS     warning threshold for the minimum latency
#   --min-crit SECONDS     critical threshold for the minimum latency
#   --max-warn SECONDS     warning threshold for the maximum latency
#   --max-crit SECONDS     critical threshold for the maximum latency
#   --avg-warn SECONDS     warning threshold for the average latency
#   --avg-crit SECONDS     critical threshold for the average latency
#   --nagios-perfdata      append " | min=.. max=.. avg=.." perfdata
#   -v, --verbose          print debug lines (prefixed with "* ")
#
# All thresholds are compared against latencies converted to seconds. A
# threshold that is not given is simply not checked. Unknown arguments are
# ignored.
#
# Exit status:
#   0    OK
#   1    at least one warning threshold exceeded (and no critical one)
#   2    at least one critical threshold exceeded
#   245  the ioping output could not be interpreted
#   254  missing dependency, or could not cd to the directory

# Check dependencies
for dependency in ioping bc; do
  if ! command -v "${dependency}" > /dev/null 2>&1; then
    echo "* ERROR: Cannot find ${dependency} command"
    exit 254
  fi
done

# Print a debug line, but only when -v/--verbose was given (dbg non-empty).
debug_write() {
  if [[ -n "${dbg}" ]]; then
    echo "* $*"
  fi
}

# Collect arguments into the global option variables.
# Arguments: the script's command line ("$@").
setargs() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --min-warn)
        shift
        min_warn=$1
        ;;
      --min-crit)
        shift
        min_crit=$1
        ;;
      --max-warn)
        shift
        max_warn=$1
        ;;
      --max-crit)
        shift
        max_crit=$1
        ;;
      --avg-warn)
        shift
        avg_warn=$1
        ;;
      --avg-crit)
        shift
        avg_crit=$1
        ;;
      -c | --count)
        shift
        count=$1
        ;;
      -d | --directory)
        shift
        directory=$1
        ;;
      --nagios-perfdata)
        perfdata=1
        ;;
      -v | --verbose)
        dbg=1
        ;;
      *)
        # Unknown arguments are ignored.
        ;;
    esac
    shift
  done
}

# Print the number of seconds in one ioping time unit.
# Arguments: $1 - unit suffix taken from the ioping summary line
# Returns:   non-zero (printing nothing) when the unit is not recognised
seconds_per_unit() {
  case "$1" in
    ns) echo "0.000000001" ;;
    us) echo "0.000001" ;;
    ms) echo "0.001" ;;
    s) echo "1" ;;
    # The long spellings are accepted as aliases of the short ones.
    m | min) echo "60" ;;
    h | hour) echo "3600" ;;
    *) return 1 ;;
  esac
}

# Append text to the human-readable status output.
append() {
  output="${output}$*"
}

# Append one label=value pair (plus a separating space) to the perfdata.
perfdata_append() {
  perfdataoutput="${perfdataoutput}$* "
}

# Use bc to evaluate a float comparison; prints 1 (true) or 0 (false).
comp() {
  bc <<< "$*"
}

# Succeed when a threshold is set and the value is strictly above it.
# Arguments: $1 - measured value in seconds, $2 - threshold (may be empty)
exceeds() {
  local value=$1
  local threshold=$2

  # An unset threshold disables the check instead of feeding bc a broken
  # expression.
  [[ -n "${threshold}" ]] || return 1
  [[ "$(comp "${value} > ${threshold}")" == "1" ]]
}

# Print the collected output (plus perfdata when requested) and exit.
# Arguments: $1 - exit status to terminate with
report_and_exit() {
  echo -e "${output}"
  if [[ -n "${perfdata}" ]]; then
    echo -e " | ${perfdataoutput}"
  fi
  exit "$1"
}

setargs "$@"

# Startup
debug_write "min_warn=${min_warn}"
debug_write "min_crit=${min_crit}"
debug_write "max_warn=${max_warn}"
debug_write "max_crit=${max_crit}"
debug_write "avg_warn=${avg_warn}"
debug_write "avg_crit=${avg_crit}"
debug_write "count=${count}"
debug_write "directory=${directory}"

# If count is empty, default to 15
count=${count:-15}

# Move in to the directory for ioping to run
if [[ -n "${directory}" ]]; then
  if ! cd -- "${directory}"; then
    echo "* ERROR: Failed to CD to ${directory} to run ioping test. Exiting"
    exit 254
  fi
fi

debug_write "Current directory - $(pwd)"

# Run ioping
debug_write "Running ${count} times"
ioping_output=$(ioping -c "${count}" .)
debug_write "output: ${ioping_output}"

# Grep the line we care about, e.g.
#   min/avg/max/mdev = 93.6 us / 131.5 us / 203.9 us / 34.5 us
line=$(grep "^min/avg/max/mdev" <<< "${ioping_output}")
debug_write "line: '${line}'"

if [[ -z "${line}" ]]; then
  echo "* ERROR: ioping did not print a min/avg/max/mdev summary line"
  exit 245
fi

# Conversions: split the summary into four "value unit" pairs (min, avg,
# max, mdev - in that order) and convert each one to seconds.
# TODO: Make what to convert to an argument. We default now to seconds;
# people may want to monitor at ms level.
readonly number_re='^[0-9]+([.][0-9]+)?$'
declare -a data=()
while read -r value unit; do
  if ! conversion=$(seconds_per_unit "${unit}"); then
    echo "* ERROR: Received unit we could not convert. Got ${unit}"
    exit 245
  fi
  # bc would silently treat a non-numeric token as a variable equal to 0.
  if [[ ! "${value}" =~ ${number_re} ]]; then
    echo "* ERROR: Received value we could not convert. Got ${value}"
    exit 245
  fi

  debug_write "(${unit}) - ${value} * ${conversion}"
  converted=$(bc <<< "scale=6; ${value} * ${conversion}" \
    | awk '{ printf "%f", $0 }')
  data+=("${converted}")
done < <(awk '{ print $3, $4; print $6, $7; print $9, $10; print $12, $13 }' \
  <<< "${line}")

min=${data[0]}
avg=${data[1]}
max=${data[2]}
mdev=${data[3]}

debug_write "Converted to seconds: $min / $avg / $max / $mdev"

# Now check warn/crit
exit_crit=0
exit_warn=0
output=""
perfdataoutput=""

# Iterate the metrics by name. ${!metric} expands to the measured value and
# ${!warn_name} / ${!crit_name} to the matching threshold given on the
# command line (e.g. metric=min -> min, min_warn, min_crit).
for metric in min max avg; do
  measured="${!metric}"
  warn_name="${metric}_warn"
  crit_name="${metric}_crit"
  warn_limit="${!warn_name}"
  crit_limit="${!crit_name}"

  debug_write "${measured} > ${warn_limit}"
  debug_write "${measured} > ${crit_limit}"

  # Critical wins over warning. Anything above the warning threshold that
  # is not critical is a warning, including a value that is exactly equal
  # to the critical threshold.
  if exceeds "${measured}" "${crit_limit}"; then
    append " * CRITICAL: '$directory' storage latency ${metric} response time ${measured} > ${crit_limit}\n"
    exit_crit=1
  elif exceeds "${measured}" "${warn_limit}"; then
    append " * WARNING: '$directory' storage latency ${metric} response time ${measured} > ${warn_limit}\n"
    exit_warn=1
  fi

  perfdata_append "${metric}=${measured}"
done

# May as well print the raw data when we print anything else or the OK
append "raw data: ${line}"

# Warn / crit / OK logic
if [[ ${exit_crit} -eq 1 ]]; then
  report_and_exit 2
fi

if [[ ${exit_warn} -eq 1 ]]; then
  report_and_exit 1
fi

# Else OK: everything goes on a single line.
echo -e "OK - ${directory} latency - ${output}" | tr -d '\n'
if [[ -n "${perfdata}" ]]; then
  echo -e " | ${perfdataoutput}"
else
  # Terminate the status line when no perfdata follows it.
  echo
fi
exit 0