# debian.ansible.runchecks/runchecks.yml
# 2022-09-28 14:13:44 +02:00
#
# 396 lines
# 12 KiB
# YAML

---
# Play "Checks": applies to every inventory host (hosts: all); the task list follows.
- name: Checks
hosts: all
tasks:
# Install the packages that provide the tools called by disks.check
# (smartctl) and raid-mdadm.check (mdadm). The apt cache is not refreshed
# and recommended packages are not pulled in.
- name: Install Packages
  ansible.builtin.apt:
    install_recommends: false
    update_cache: false
    name:
      - smartmontools
      - mdadm
# Root-only (0700) directory that runchecks.sh scans for *.check files.
- name: Create checks dir /usr/local/sbin/runchecks.d
  ansible.builtin.file:
    state: directory
    path: /usr/local/sbin/runchecks.d
    mode: "0700"
    owner: root
    group: root
# Main loop script. After a 300 s sleep it sources every *.check file found
# in /usr/local/sbin/runchecks.d in sorted order, then repeats forever.
# The g_* helpers and $g_tmp come from /etc/bash/gaboshlib.include, which is
# not managed by this playbook (NOTE(review): confirm it is installed).
- name: /usr/local/sbin/runchecks.sh
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.sh
    create: true
    owner: root
    group: root
    mode: '0500'
    backup: true
    # Syntax-check the assembled script before it replaces the current file.
    validate: /bin/bash -n %s
    marker: '# {mark} ANSIBLE MANAGED BLOCK'
    block: |
      . /etc/bash/gaboshlib.include
      g_lockfile
      g_nice
      g_all-to-syslog
      g_echo_ok "Starting $0"
      while true
      do
      g_echo "Waiting 5min"
      sleep 300
      g_echo "Next Loop"
      find /usr/local/sbin/runchecks.d -name "*.check" -type f | sort | while read check
      do
      g_echo "Running: $check"
      . "$check"
      done
      done
  notify:
    - Restart runchecks
# Seed the list of process names that services.check looks for in `ps aux`.
# The line is appended at end of file if it is not already present.
- name: /usr/local/sbin/runchecks.d/services
  ansible.builtin.lineinfile:
    path: /usr/local/sbin/runchecks.d/services
    line: 'cron rsyslog sshd'
    insertbefore: EOF
    create: true
# Ensure the interpreter line is the first line of the script, i.e. above
# the managed block's BEGIN marker.
- name: /usr/local/sbin/runchecks.sh shebang
  ansible.builtin.lineinfile:
    path: /usr/local/sbin/runchecks.sh
    line: '#!/bin/bash'
    insertbefore: BOF
# systemd unit that runs runchecks.sh as root. A change notifies the
# "Restart systemd" handler so the new unit text is re-read.
- name: /etc/systemd/system/runchecks.service
  ansible.builtin.blockinfile:
    path: /etc/systemd/system/runchecks.service
    create: true
    owner: root
    group: root
    # Quoted so the octal mode is never subject to YAML number parsing.
    mode: "0444"
    marker: '# {mark} ANSIBLE MANAGED BLOCK'
    block: |
      [Unit]
      Description=checks
      After=syslog.target network.target
      [Service]
      Type=simple
      User=root
      ExecStart=/usr/local/sbin/runchecks.sh
      Restart=on-abort
      [Install]
      WantedBy=multi-user.target
  notify:
    - Restart systemd
# SMART health check for every /dev/sd? and /dev/nvme? node.
#
# A disk is only queried when its I/O counters in /sys/block/<dev>/stat have
# changed since the previous loop, so an idle or suspended disk is not woken
# up by smartctl. The original test was inverted: it ran SMART only when the
# counters were unchanged and refreshed the baseline only in that branch, so
# a disk that had done any I/O was never checked again.
# Nodes without a /sys/block/<dev>/stat file are always checked.
- name: /usr/local/sbin/runchecks.d/disks.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/disks.check
    mode: "0400"
    owner: root
    group: root
    create: true
    marker: "# {mark} ANSIBLE MANAGED BLOCK"
    block: |
      for disk in $(find /dev/ \( -name "nvme?" -o -name "sd?" \) \( -type b -o -type c \))
      do
        dev=$(basename $disk)
        # Activity gate: only possible when the kernel exposes I/O counters
        # for this node (presumably absent for the nvme? character nodes).
        if [ -r /sys/block/${dev}/stat ]
        then
          cat /sys/block/${dev}/stat | perl -pe 's/ +/ /g' | cut -d" " -f1,2,3,4,5,6,7,8 > ${g_tmp}/${dev}-stat-now
          # First sighting: the current counters become the baseline.
          [ -f ${g_tmp}/${dev}-stat ] || cat ${g_tmp}/${dev}-stat-now > ${g_tmp}/${dev}-stat
          # Identical counters = no I/O since the last loop: skip this disk.
          if diff ${g_tmp}/${dev}-stat-now ${g_tmp}/${dev}-stat >/dev/null 2>&1
          then
            continue
          fi
          # Disk was used: remember the new counters for the next loop.
          cat ${g_tmp}/${dev}-stat-now >${g_tmp}/${dev}-stat
        fi
        # Check disk SMART; USB-attached disks are addressed with "-d sat".
        OPT=""
        udevadm info --query=all -n $disk | grep -q usb- && OPT="-d sat"
        smartctl -H $OPT $disk >${g_tmp}/smartctl
        if ! grep -Eiq "SMART overall-health self-assessment test result: PASSED|SMART Health Status: OK" ${g_tmp}/smartctl
        then
          g_echo_error "SMART of $disk unhealthy: $(cat ${g_tmp}/smartctl)"
        fi
      done
    validate: /bin/bash -n %s
    backup: true
  notify:
    - Restart runchecks
# mdadm RAID check: for every array listed by `mdadm -D --scan`, report an
# error unless its "State :" line is exactly "clean " or "active ".
- name: /usr/local/sbin/runchecks.d/raid-mdadm.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/raid-mdadm.check
    create: true
    owner: root
    group: root
    mode: '0400'
    backup: true
    validate: /bin/bash -n %s
    marker: '# {mark} ANSIBLE MANAGED BLOCK'
    block: |
      mdadm -D --scan | egrep "^ARRAY" | cut -d" " -f2 | while read array
      do
      mdadm -D $array >$g_tmp/raiddetails
      if ! cat $g_tmp/raiddetails | egrep "^ +State : " | egrep -q "^ +State : clean $|^ +State : active $"
      then
      g_echo_error "RAID $array state unhealthy - mdadm -D $array: $(cat $g_tmp/raiddetails)"
      fi
      done
  notify:
    - Restart runchecks
# Connectivity check: report an error when three pings to 9.9.9.9 fail.
- name: /usr/local/sbin/runchecks.d/internet.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/internet.check
    create: true
    owner: root
    group: root
    mode: '0400'
    backup: true
    validate: /bin/bash -n %s
    marker: '# {mark} ANSIBLE MANAGED BLOCK'
    block: |
      testip=9.9.9.9
      if ! ping -c3 ${testip} >/dev/null 2>&1
      then
      g_echo_error "No Internet connection? ping $testip failed!"
      fi
  notify:
    - Restart runchecks
# Stuck-process check: reports processes that have been seen in state D, Z
# or R for more than 30 minutes. One marker file per PID is kept in
# ${g_tmp}/proc-check; the marker's mtime is the time the PID was first seen.
#
# Fix: the mtime was extracted with `sed 's/ */ /g'`, whose pattern also
# matches the empty string and therefore inserts a space between every
# character. The saved timestamp was garbage, `touch -d` could not restore
# it, and the 30-minute threshold was never reached. `tr -s ' '` squeezes
# runs of spaces as intended.
- name: /usr/local/sbin/runchecks.d/processes.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/processes.check
    mode: "0400"
    owner: root
    group: root
    create: true
    marker: "# {mark} ANSIBLE MANAGED BLOCK"
    block: |
      # process check
      mkdir -p ${g_tmp}/proc-check
      # Remove markers of processes that are no longer in state D/Z/R
      ls ${g_tmp}/proc-check | while read odzpid
      do
        if ! grep -Eq "^State:.+D \(|State:.+Z \(|State:.+R \(" /proc/$odzpid/status 2>/dev/null
        then
          rm ${g_tmp}/proc-check/$odzpid
        fi
      done
      # Find new or known ones and report when the state is older than 30 minutes
      grep -E "^State:.+D \(|State:.+Z \(|State:.+R \(" /proc/[0-9]*/status | cut -d/ -f3 | while read dzpid
      do
        if [ -f ${g_tmp}/proc-check/$dzpid ]
        then
          # Appending a sample must not reset the marker's age:
          # save the mtime, append, then restore it.
          timestamp=$(ls --full-time ${g_tmp}/proc-check/$dzpid | tr -s ' ' | cut -d" " -f6,7)
          date >>${g_tmp}/proc-check/$dzpid
          touch -d "$timestamp" ${g_tmp}/proc-check/$dzpid
          if find ${g_tmp}/proc-check/$dzpid -mmin +30 | grep -q $dzpid
          then
            ps aufx | grep $dzpid | grep -v grep >${g_tmp}/proc-check-$dzpid.notify
            pstree $dzpid >>${g_tmp}/proc-check-$dzpid.notify
            cat ${g_tmp}/proc-check/$dzpid >>${g_tmp}/proc-check-$dzpid.notify
            g_echo_error "$(cat ${g_tmp}/proc-check-$dzpid.notify)"
          fi
        else
          # First sighting: create the marker.
          date >${g_tmp}/proc-check/$dzpid
        fi
      done
    backup: true
    validate: /bin/bash -n %s
  notify:
    - Restart runchecks
# Temperature check via /sys/class/thermal/thermal_zone0/temp: the raw value
# is divided by 1000 and an error is reported above 75. Skipped when the
# sysfs file does not exist.
- name: /usr/local/sbin/runchecks.d/thermal.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/thermal.check
    create: true
    owner: root
    group: root
    mode: '0400'
    validate: /bin/bash -n %s
    backup: true
    marker: '# {mark} ANSIBLE MANAGED BLOCK'
    block: |
      if [ -f /sys/class/thermal/thermal_zone0/temp ]
      then
      let temp=`cat /sys/class/thermal/thermal_zone0/temp`/1000
      if [ $temp -gt 75 ]
      then
      g_echo_error "Server temperature high $temp°C"
      fi
      fi
  notify:
    - Restart runchecks
# Temperature check via `vcgencmd measure_temp`; only runs where vcgencmd
# is installed. Reports an error above 75.
#
# Fix: the guard used `2&>1`, which bash reads as the extra argument `2`
# plus `&>1` (stdout and stderr to a file named "1"). The lookup of "2"
# always failed, so the check never ran, and a stray file "1" was written
# to the working directory. The intended redirection is `2>&1`.
- name: /usr/local/sbin/runchecks.d/measuretemp.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/measuretemp.check
    mode: "0400"
    owner: root
    group: root
    create: true
    marker: "# {mark} ANSIBLE MANAGED BLOCK"
    block: |
      if command -v vcgencmd >/dev/null 2>&1
      then
        # Output looks like temp=<int>.<frac>...; keep the integer part.
        temp=$(vcgencmd measure_temp | grep ^temp= | cut -d"=" -f2 | cut -d. -f1)
        if [ "$temp" -gt 75 ]
        then
          g_echo_error "Server Temperatur zu hoch $temp°C"
        fi
      fi
    backup: true
    validate: /bin/bash -n %s
  notify:
    - Restart runchecks
# Load check: report an error when the integer part of the first
# /proc/loadavg field exceeds (number of processor entries in
# /proc/cpuinfo) + 10.
- name: /usr/local/sbin/runchecks.d/load.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/load.check
    create: true
    owner: root
    group: root
    mode: '0400'
    validate: /bin/bash -n %s
    backup: true
    marker: '# {mark} ANSIBLE MANAGED BLOCK'
    block: |
      let maxload=$(cat /proc/cpuinfo | grep ^processor | wc -l)+10
      load=`cat /proc/loadavg | cut -d" " -f1`
      intload=`echo $load | cut -d'.' -f1`
      if [ $intload -gt $maxload ]
      then
      g_echo_error "System Load high at $load"
      fi
  notify:
    - Restart runchecks
# Service check: every word in /usr/local/sbin/runchecks.d/services must
# appear somewhere in the `ps aux` output, otherwise "<name> down" is
# reported.
- name: /usr/local/sbin/runchecks.d/services.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/services.check
    create: true
    owner: root
    group: root
    mode: '0400'
    validate: /bin/bash -n %s
    backup: true
    marker: '# {mark} ANSIBLE MANAGED BLOCK'
    block: |
      for service in $(cat /usr/local/sbin/runchecks.d/services | sort)
      do
      if ! ps aux | grep -v grep | grep -q "$service"
      then
      g_echo_error "$service down"
      fi
      done
  notify:
    - Restart runchecks
# Disk-space check: report every filesystem whose Use% (5th df column) is
# above 88. Rows with "-" usage, the header row and /rom are ignored.
#
# Fix: `sed 's/ */ /g'` also matches the empty string and inserts a space
# between every character, so `cut -f1,5` returned fragments instead of the
# device and percentage. `tr -s ' '` squeezes the column padding as intended.
- name: /usr/local/sbin/runchecks.d/df.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/df.check
    mode: "0400"
    owner: root
    group: root
    create: true
    marker: "# {mark} ANSIBLE MANAGED BLOCK"
    block: |
      df -al | grep -Ev ' - |Mounted on|/rom' | tr -s ' ' | sed 's/\%//' | cut -d" " -f1,5 | sort -u | while read disk usa
      do
        if [ "$usa" -gt 88 ]
        then
          g_echo_error "Space of $disk full: $usa%"
        fi
      done
    backup: true
    validate: /bin/bash -n %s
  notify:
    - Restart runchecks
# Inode check: report every filesystem whose IUse% (5th `df -i` column) is
# above 88. Rows with "-" usage, the header row and /rom are ignored.
#
# Fixes:
#  * `sed 's/ */ /g'` also matches the empty string and inserts a space
#    between every character, breaking the `cut` field extraction;
#    replaced by `tr -s ' '`.
#  * Message typo "Inodes if" corrected to "Inodes of".
- name: /usr/local/sbin/runchecks.d/dfinodes.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/dfinodes.check
    mode: "0400"
    owner: root
    group: root
    create: true
    marker: "# {mark} ANSIBLE MANAGED BLOCK"
    block: |
      df -ali | grep -Ev ' - |Mounted on|/rom' | tr -s ' ' | sed 's/\%//' | cut -d" " -f1,5 | sort -u | while read disk usa
      do
        if [ "$usa" -gt 88 ]
        then
          g_echo_error "Inodes of $disk full: $usa%"
        fi
      done
    backup: true
    validate: /bin/bash -n %s
  notify:
    - Restart runchecks
# Memory check: report an error when the 4th field of the "Mem:" row of
# `free` drops below 20480.
#
# Fix: `sed 's/ */ /g'` also matches the empty string and inserts a space
# between every character, so `cut -f 4` never returned the intended
# column. `tr -s ' '` squeezes the column padding instead.
- name: /usr/local/sbin/runchecks.d/freemem.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/freemem.check
    mode: "0400"
    owner: root
    group: root
    create: true
    marker: "# {mark} ANSIBLE MANAGED BLOCK"
    block: |
      mem=$(free | grep "^Mem:" | tr -s ' ' | cut -d" " -f 4)
      if [ "$mem" -lt 20480 ]
      then
        g_echo_error "Memory full! Free: ${mem}kB"
      fi
    backup: true
    validate: /bin/bash -n %s
  notify:
    - Restart runchecks
# Swap check: when the "Swap:" row of `free` shows a non-zero 2nd field
# (total), report an error if the 4th field (free) drops below 50000.
#
# Fix: `sed 's/ */ /g'` also matches the empty string and inserts a space
# between every character, so neither `cut` returned the intended column.
# `tr -s ' '` squeezes the column padding instead.
- name: /usr/local/sbin/runchecks.d/freeswap.check
  ansible.builtin.blockinfile:
    path: /usr/local/sbin/runchecks.d/freeswap.check
    mode: "0400"
    owner: root
    group: root
    create: true
    marker: "# {mark} ANSIBLE MANAGED BLOCK"
    block: |
      # A total of 0 means no swap is configured: nothing to check.
      if ! [ "$(free | grep "^Swap:" | tr -s ' ' | cut -d" " -f 2)" == 0 ]
      then
        mem=$(free | grep "^Swap:" | tr -s ' ' | cut -d" " -f 4)
        if [ "$mem" -lt 50000 ]
        then
          g_echo_error "Swap full! Free: ${mem}kB"
        fi
      fi
    backup: true
    validate: /bin/bash -n %s
  notify:
    - Restart runchecks
# Enable the unit at boot through the systemd module instead of shelling out
# to `systemctl enable` behind a hard-coded `creates:` symlink path. The
# module reads the real enablement state, so the task stays idempotent.
# daemon_reload makes systemd read the unit file written earlier in this
# play before the enable is attempted.
- name: 'add runchecks to startup'
  ansible.builtin.systemd:
    name: runchecks
    enabled: true
    daemon_reload: true
handlers:
  # Make systemd re-read its unit files, then chain into the service
  # restart. The shell task always reports "changed", which is what lets
  # the chained notify fire.
  - name: Restart systemd
    ansible.builtin.shell:
      cmd: systemctl daemon-reload
    notify:
      - Restart runchecks
  # Restart the check loop so it picks up new or changed scripts.
  - name: Restart runchecks
    ansible.builtin.service:
      state: restarted
      name: runchecks