# Listing metadata from the source viewer: 396 lines, 12 KiB, YAML
---
# Deploys a small check runner: /usr/local/sbin/runchecks.sh runs as the
# systemd service "runchecks" and, every five minutes, sources each *.check
# script found in /usr/local/sbin/runchecks.d. The g_* helpers and ${g_tmp}
# used by the scripts are expected to come from /etc/bash/gaboshlib.include,
# which this play does not install.
- name: Checks
  hosts: all
  tasks:
    # smartctl is used by disks.check, mdadm by raid-mdadm.check.
    - name: Install Packages
      ansible.builtin.apt:
        name:
          - smartmontools
          - mdadm
        update_cache: false
        install_recommends: false

    - name: Create checks dir /usr/local/sbin/runchecks.d
      ansible.builtin.file:
        path: /usr/local/sbin/runchecks.d
        owner: root
        group: root
        state: directory
        mode: "0700"

    # Main loop of the runner. The shebang line is prepended by the separate
    # "shebang" task further down.
    - name: /usr/local/sbin/runchecks.sh
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.sh
        mode: "0500"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          . /etc/bash/gaboshlib.include
          g_lockfile
          g_nice
          g_all-to-syslog
          g_echo_ok "Starting $0"
          while true
          do
            g_echo "Waiting 5min"
            sleep 300
            g_echo "Next Loop"
            # Source every check in name order; checks share this shell's g_* helpers.
            find /usr/local/sbin/runchecks.d -name "*.check" -type f | sort | while read -r check
            do
              g_echo "Running: $check"
              . "$check"
            done
          done
        backup: true
        validate: /bin/bash -n %s
      notify:
        - Restart runchecks

    # Whitespace-separated list of process names read by services.check.
    # insertafter: EOF appends the line; "EOF" is only a special value for
    # insertafter (insertbefore would treat it as a regular expression).
    - name: /usr/local/sbin/runchecks.d/services
      ansible.builtin.lineinfile:
        path: /usr/local/sbin/runchecks.d/services
        create: true
        owner: root
        group: root
        mode: "0600"
        insertafter: EOF
        line: "cron rsyslog sshd"

    - name: /usr/local/sbin/runchecks.sh shebang
      ansible.builtin.lineinfile:
        path: /usr/local/sbin/runchecks.sh
        insertbefore: BOF
        line: "#!/bin/bash"

    - name: /etc/systemd/system/runchecks.service
      ansible.builtin.blockinfile:
        path: /etc/systemd/system/runchecks.service
        create: true
        mode: "0444"
        owner: root
        group: root
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          [Unit]
          Description=checks
          After=syslog.target network.target

          [Service]
          Type=simple
          User=root
          ExecStart=/usr/local/sbin/runchecks.sh
          Restart=on-abort

          [Install]
          WantedBy=multi-user.target
      notify:
        - Restart systemd

    # SMART health check for /dev/sd? and /dev/nvme? devices.
    - name: /usr/local/sbin/runchecks.d/disks.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/disks.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          for disk in $(find /dev/ \( -name "nvme?" -o -name "sd?" \) \( -type b -o -type c \))
          do
            dev=$(basename "$disk")
            # Only query SMART when the disk is in use: skip it when its I/O
            # counters have not moved since the previous loop, so an idle or
            # suspended disk is not touched. A disk seen for the first time, or a
            # device without /sys/block/<dev>/stat, is always checked.
            if [ -r "/sys/block/${dev}/stat" ]
            then
              tr -s ' ' <"/sys/block/${dev}/stat" | cut -d" " -f1-8 >"${g_tmp}/${dev}-stat-now"
              if [ -f "${g_tmp}/${dev}-stat" ] && cmp -s "${g_tmp}/${dev}-stat-now" "${g_tmp}/${dev}-stat"
              then
                continue
              fi
              cat "${g_tmp}/${dev}-stat-now" >"${g_tmp}/${dev}-stat"
            fi
            # Check disk SMART
            OPT=""
            udevadm info --query=all -n "$disk" | grep -q usb- && OPT="-d sat"
            # $OPT is intentionally unquoted so "-d sat" splits into two arguments.
            smartctl -H $OPT "$disk" >"${g_tmp}/smartctl"
            if ! grep -Eiq "SMART overall-health self-assessment test result: PASSED|SMART Health Status: OK" "${g_tmp}/smartctl"
            then
              g_echo_error "SMART of $disk unhealthy: $(cat "${g_tmp}/smartctl")"
            fi
          done
        validate: /bin/bash -n %s
        backup: true
      notify:
        - Restart runchecks

    # Reports every md array whose "State :" line is not exactly clean or active.
    - name: /usr/local/sbin/runchecks.d/raid-mdadm.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/raid-mdadm.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          mdadm -D --scan | grep -E "^ARRAY" | cut -d" " -f2 | while read -r array
          do
            mdadm -D "$array" >"$g_tmp/raiddetails"
            if ! grep -E "^ +State : " "$g_tmp/raiddetails" | grep -Eq "^ +State : clean $|^ +State : active $"
            then
              g_echo_error "RAID $array state unhealthy - mdadm -D $array: $(cat "$g_tmp/raiddetails")"
            fi
          done
        validate: /bin/bash -n %s
        backup: true
      notify:
        - Restart runchecks

    # Pings a fixed public IP three times and reports when that fails.
    - name: /usr/local/sbin/runchecks.d/internet.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/internet.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          testip=9.9.9.9
          if ! ping -c3 "${testip}" >/dev/null 2>&1
          then
            g_echo_error "No Internet connection? ping $testip failed!"
          fi
        validate: /bin/bash -n %s
        backup: true
      notify:
        - Restart runchecks

    # Reports processes that have been seen in state D, Z or R on every loop
    # for more than 30 minutes. One marker file per PID is kept in
    # ${g_tmp}/proc-check; its mtime records when the state was first seen.
    - name: /usr/local/sbin/runchecks.d/processes.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/processes.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          # process check
          mkdir -p "${g_tmp}/proc-check"
          # Remove markers of processes that are OK again (or no longer exist)
          ls "${g_tmp}/proc-check" | while read -r odzpid
          do
            if ! grep -Eq "^State:.+D \(|State:.+Z \(|State:.+R \(" "/proc/$odzpid/status" 2>/dev/null
            then
              rm "${g_tmp}/proc-check/$odzpid"
            fi
          done
          # Find new or known ones and report when the state is older than 30 minutes
          grep -E "^State:.+D \(|State:.+Z \(|State:.+R \(" /proc/[0-9]*/status 2>/dev/null | cut -d/ -f3 | while read -r dzpid
          do
            if [ -f "${g_tmp}/proc-check/$dzpid" ]
            then
              # Append a timestamp to the marker, then restore its original mtime
              # so the age test below still measures from the first sighting.
              timestamp=$(stat -c %y "${g_tmp}/proc-check/$dzpid")
              date >>"${g_tmp}/proc-check/$dzpid"
              touch -d "$timestamp" "${g_tmp}/proc-check/$dzpid"
              if find "${g_tmp}/proc-check/$dzpid" -mmin +30 | grep -q "$dzpid"
              then
                ps aufx | grep "$dzpid" | grep -v grep >"${g_tmp}/proc-check-$dzpid.notify"
                pstree "$dzpid" >>"${g_tmp}/proc-check-$dzpid.notify"
                cat "${g_tmp}/proc-check/$dzpid" >>"${g_tmp}/proc-check-$dzpid.notify"
                g_echo_error "$(cat "${g_tmp}/proc-check-$dzpid.notify")"
              fi
            else
              date >"${g_tmp}/proc-check/$dzpid"
            fi
          done
        backup: true
        validate: /bin/bash -n %s
      notify:
        - Restart runchecks

    # Reads thermal_zone0 (value divided by 1000) and reports above 75.
    - name: /usr/local/sbin/runchecks.d/thermal.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/thermal.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          if [ -f /sys/class/thermal/thermal_zone0/temp ]
          then
            temp=$(( $(cat /sys/class/thermal/thermal_zone0/temp) / 1000 ))
            if [ "$temp" -gt 75 ]
            then
              g_echo_error "Server temperature high $temp°C"
            fi
          fi
        backup: true
        validate: /bin/bash -n %s
      notify:
        - Restart runchecks

    # Same threshold as thermal.check, but read via vcgencmd when that tool exists.
    - name: /usr/local/sbin/runchecks.d/measuretemp.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/measuretemp.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          # "2>&1" sends stderr to stdout; the former "2&>1" passed "2" as an
          # extra argument and redirected all output into a file named "1".
          if command -v vcgencmd >/dev/null 2>&1
          then
            temp=$(vcgencmd measure_temp | grep ^temp= | cut -d"=" -f2 | cut -d. -f1)
            # Skip the comparison when no temp= line could be parsed.
            if [ -n "$temp" ] && [ "$temp" -gt 75 ]
            then
              g_echo_error "Server Temperatur zu hoch $temp°C"
            fi
          fi
        backup: true
        validate: /bin/bash -n %s
      notify:
        - Restart runchecks

    # Reports when the integer part of the 1-minute load average exceeds
    # (number of "processor" lines in /proc/cpuinfo) + 10.
    - name: /usr/local/sbin/runchecks.d/load.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/load.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          maxload=$(( $(grep -c ^processor /proc/cpuinfo) + 10 ))
          load=$(cut -d" " -f1 /proc/loadavg)
          intload=$(echo "$load" | cut -d'.' -f1)
          if [ "$intload" -gt "$maxload" ]
          then
            g_echo_error "System Load high at $load"
          fi
        backup: true
        validate: /bin/bash -n %s
      notify:
        - Restart runchecks

    # Reports each name from runchecks.d/services that does not appear as a
    # substring anywhere in the "ps aux" output.
    - name: /usr/local/sbin/runchecks.d/services.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/services.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          for service in $(sort /usr/local/sbin/runchecks.d/services)
          do
            if ! ps aux | grep -v grep | grep -q -- "$service"
            then
              g_echo_error "$service down"
            fi
          done
        backup: true
        validate: /bin/bash -n %s
      notify:
        - Restart runchecks

    # Reports local filesystems whose Use% (5th df column) is above 88.
    - name: /usr/local/sbin/runchecks.d/df.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/df.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          # tr -s squeezes the column padding to single spaces so cut can pick fields.
          df -al | grep -Ev ' - |Mounted on|/rom' | tr -s ' ' | sed 's/%//' | cut -d" " -f1,5 | sort -u | while read -r disk usa
          do
            if [ "$usa" -gt 88 ]
            then
              g_echo_error "Space of $disk full: $usa%"
            fi
          done
        backup: true
        validate: /bin/bash -n %s
      notify:
        - Restart runchecks

    # Same as df.check, but for inode usage (df -i, 5th column IUse%).
    - name: /usr/local/sbin/runchecks.d/dfinodes.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/dfinodes.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          # tr -s squeezes the column padding to single spaces so cut can pick fields.
          df -ali | grep -Ev ' - |Mounted on|/rom' | tr -s ' ' | sed 's/%//' | cut -d" " -f1,5 | sort -u | while read -r disk usa
          do
            if [ "$usa" -gt 88 ]
            then
              g_echo_error "Inodes of $disk full: $usa%"
            fi
          done
        backup: true
        validate: /bin/bash -n %s
      notify:
        - Restart runchecks

    # Reports when the 4th field of the "Mem:" line of free drops below 20480.
    - name: /usr/local/sbin/runchecks.d/freemem.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/freemem.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          mem=$(free | awk '/^Mem:/ { print $4 }')
          if [ "$mem" -lt 20480 ]
          then
            g_echo_error "Memory full! Free: ${mem}kB"
          fi
        backup: true
        validate: /bin/bash -n %s
      notify:
        - Restart runchecks

    # Reports when swap is configured (2nd field of "Swap:" is not 0) and its
    # 4th field drops below 50000.
    - name: /usr/local/sbin/runchecks.d/freeswap.check
      ansible.builtin.blockinfile:
        path: /usr/local/sbin/runchecks.d/freeswap.check
        mode: "0400"
        owner: root
        group: root
        create: true
        marker: "# {mark} ANSIBLE MANAGED BLOCK"
        block: |
          if ! [ "$(free | awk '/^Swap:/ { print $2 }')" == 0 ]
          then
            mem=$(free | awk '/^Swap:/ { print $4 }')
            if [ "$mem" -lt 50000 ]
            then
              g_echo_error "Swap full! Free: ${mem}kB"
            fi
          fi
        backup: true
        validate: /bin/bash -n %s
      notify:
        - Restart runchecks

    # Enable the unit at boot. daemon_reload makes systemd read the unit file
    # written earlier in this play before the enable is attempted.
    - name: add runchecks to startup
      ansible.builtin.systemd:
        name: runchecks
        enabled: true
        daemon_reload: true

  handlers:
    # Kept as a plain command with changed_when: true so this handler always
    # reports "changed" and the chained notify below fires. "Restart runchecks"
    # must stay defined after this handler so it runs afterwards.
    - name: Restart systemd
      ansible.builtin.command: systemctl daemon-reload
      changed_when: true
      notify: Restart runchecks

    - name: Restart runchecks
      ansible.builtin.service:
        name: runchecks
        state: restarted