脚本功能
- 脚本输出到 /opt/xunjian/当前时间-xunjian.txt
- 检查所有节点的kubelet状态
- 检查集群状态
- 检查节点状态
- 检查namespace状态
- 检查pod状态
- 正常pod数量
- 镜像无法拉取的pod
- Error和Terminating的Pod
- 一小时内重启次数大于3次的Running Pod
- 1小时内重启次数大于3次的非Running Pod(CrashLoopBackOff)
- 检查事件情况
- 检查所有节点资源使用情况
- 检查资源CPU大于1000m的pod
- 检查资源内存大于1000Mi的pod
#!/bin/bash
# Kubernetes cluster inspection script.
# Force a UTF-8 locale for every command run below.
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
# Create the inspection directory (disabled: the block below is commented out).
#if ! [ -d /opt/xunjian ];then
# mkdir -p /opt/xunjian
#fi
#######################################
# Report the Ready condition of every node into check_kubelet.txt.
#
# For each node name returned by `kubectl get nodes`, the node's Ready
# condition is read through a jsonpath query. "Ready" / "NotReady" is echoed
# to stdout and one result line per node is appended to check_kubelet.txt in
# the current directory. The file is recreated on every call.
# Globals:   none
# Arguments: none
# Outputs:   progress text on stdout; check_kubelet.txt in the cwd
#######################################
function check_kubelet() {
  local node ready
  echo -e "===检查kubelet状态==="
  # Start from an empty file so a cluster with no nodes still yields a report.
  : > check_kubelet.txt
  for node in $(kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'); do
    if ! kubectl get node "$node" &> /dev/null; then
      # Append, never truncate: rows already written for other nodes must
      # survive, and the row names the node that could not be queried.
      echo "$node Node not found" >> check_kubelet.txt
      continue
    fi
    ready=$(kubectl get node "$node" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')
    if grep -q "True" <<< "$ready"; then
      echo "Ready"
      echo "$node kubelet无异常" >> check_kubelet.txt
    else
      echo "NotReady"
      echo "$node kubelet status is not running" >> check_kubelet.txt
    fi
  done
}
#######################################
# Report abnormal control-plane components into check_cluster.txt.
#
# Reads `kubectl get cs` once and treats every row whose STATUS column is not
# exactly "Healthy" (for example Unhealthy or Unknown) as abnormal. Abnormal
# rows are written to check_cluster.txt; otherwise a single "all healthy"
# line is written.
# NOTE(review): componentstatuses is reported as deprecated by recent
# Kubernetes releases - confirm it still returns data on the target cluster.
# Globals:   none
# Arguments: none
# Outputs:   progress text on stdout; check_cluster.txt in the cwd
#######################################
function check_cluster() {
  local cs_output unhealthy
  echo -e "===检查scheduler,controller-manager,etcd状态==="
  # Query once so the decision and the listing describe the same snapshot.
  cs_output=$(kubectl get cs 2>/dev/null)
  # Skip the header row and blank lines; keep everything that is not Healthy.
  unhealthy=$(awk 'NR != 1 && NF > 0 && $2 != "Healthy"' <<< "$cs_output")
  if [[ -n "$unhealthy" ]]; then
    echo -e "异常项为:"
    printf '%s\n' "$unhealthy" > check_cluster.txt
  else
    echo -e "scheduler,controller-manager,etcd无异常" > check_cluster.txt
  fi
}
#######################################
# Record nodes whose `kubectl get nodes` row contains "NotReady".
#
# Writes check_node.txt in the current directory: either a heading line
# followed by the names of the matching nodes, or a single "no abnormal
# nodes" line.
# Arguments: none
# Outputs:   progress text on stdout; check_node.txt in the cwd
#######################################
function check_node() {
  local not_ready_count
  printf '%s\n' "===检查异常节点状态==="
  not_ready_count=$(kubectl get nodes | awk '/NotReady/ { n++ } END { print n + 0 }')
  if [[ "$not_ready_count" -le 0 ]]; then
    printf '%s\n' "无异常节点" > check_node.txt
    return
  fi
  {
    printf '%s\n' "异常节点"
    # First column of every matching row is the node name.
    kubectl get nodes | awk '/NotReady/ { print $1 }'
  } > check_node.txt
}
#######################################
# Record namespaces whose `kubectl get ns` row does not contain "Active".
#
# Writes check_nsmespace.txt (file name kept as-is) in the current directory:
# either the names of the non-Active namespaces or a single "no abnormal
# namespace" line.
# Arguments: none
# Outputs:   progress text on stdout; check_nsmespace.txt in the cwd
#######################################
function check_namespace() {
  local inactive_count
  printf '%s\n' "===检查命名空间状态==="
  # Header row (NR == 1) is never counted.
  inactive_count=$(kubectl get ns | awk 'NR > 1 && !/Active/ { n++ } END { print n + 0 }')
  case "$inactive_count" in
    0)
      printf '%s\n' "Namespace无异常" > check_nsmespace.txt
      ;;
    *)
      printf '%s\n' "异常的namespace为:"
      kubectl get ns | awk 'NR > 1 && !/Active/ { print $1 }' > check_nsmespace.txt
      ;;
  esac
}
#######################################
# Print "namespace name" for pods whose first container has restartCount >= 3
# and whose last terminated run started at or after the given timestamp.
# Arguments: $1 - pod list JSON (output of `kubectl get po -A -o json`)
#            $2 - lower bound as an RFC 3339 UTC timestamp
#            $3 - "running" keeps pods whose first container is currently
#                 running; any other value keeps the remaining pods
# Outputs:   one "namespace name" line per match on stdout
#######################################
_check_pod_restarted() {
  local pods_json=$1 since=$2 mode=$3
  printf '%s\n' "$pods_json" | jq -r --arg since "$since" --arg mode "$mode" '
    .items[]
    | select(.status.containerStatuses != null)
    | .status.containerStatuses[0] as $c
    | select($c.restartCount >= 3 and $c.lastState.terminated.startedAt >= $since)
    | select(($c.state.running != null) == ($mode == "running"))
    | "\(.metadata.namespace) \(.metadata.name)"'
}

#######################################
# Inspect pod health and write three reports to the current directory:
#   check_pod_num.txt          - number of Running pods, pods that cannot pull
#                                their image, pods in Error/Terminating
#   check_restart_runn.txt     - recently restarted pods, container running
#   check_restart_notrunn.txt  - recently restarted pods, container not
#                                running (e.g. CrashLoopBackOff)
# The pod table and the pod JSON are each fetched once so that all sections
# describe the same snapshot.
# NOTE(review): headings say "more than 3 restarts" while the filter keeps
# restartCount >= 3; the threshold is left unchanged - confirm which is meant.
# Arguments: none
# Outputs:   progress text on stdout; the three txt files above
# Requires:  kubectl, jq, GNU date
#######################################
function check_pod() {
  local pods_table pods_json running_count matches start_time

  echo -e "===检查正常Pod数量==="
  # Header row removed once; every section below works on the data rows.
  pods_table=$(kubectl get pods -A | awk 'NR != 1')
  running_count=$(grep -c Running <<< "$pods_table")
  echo -n "正常Pod数量为:$running_count" > check_pod_num.txt

  echo -e "===检查镜像无法拉取的Pod==="
  matches=$(grep -E 'ImagePullBackOff|ErrImagePull' <<< "$pods_table" | awk '{print $1, $2}')
  if [[ -n "$matches" ]]; then
    echo -e "\n\n镜像无法拉取的Pod为:" >> check_pod_num.txt
    echo -e "\n命名空间\tpod名称" >> check_pod_num.txt
    printf '%s\n' "$matches" >> check_pod_num.txt
  else
    echo -e "\n没有镜像无法拉取的Pod" >> check_pod_num.txt
  fi

  echo -e "===检查Error和Terminating的Pod==="
  matches=$(grep -E 'Error|Terminating' <<< "$pods_table" | awk '{print $1, $2}')
  if [[ -n "$matches" ]]; then
    echo -e "\n\nError和Terminating的Pod为:" >> check_pod_num.txt
    echo -e "\n命名空间\tpod名称" >> check_pod_num.txt
    printf '%s\n' "$matches" >> check_pod_num.txt
  else
    echo -e "\n没有Error和Terminating的Pod" >> check_pod_num.txt
  fi

  # Kubernetes timestamps are UTC, so compute "one hour ago" in UTC instead of
  # relying on the host's time-zone offset.
  start_time=$(date -u -d '-1 hour' +%Y-%m-%dT%H:%M:%SZ)
  pods_json=$(kubectl get po -A -o json)

  echo -e "===检查1小时内重启次数大于3次的Running Pod==="
  echo -e "命名空间\tpod名称" > check_restart_runn.txt
  matches=$(_check_pod_restarted "$pods_json" "$start_time" running)
  if [[ -n "$matches" ]]; then
    echo -e "\n1小时内重启次数大于3次的Running Pod为:"
    printf '%s\n' "$matches" >> check_restart_runn.txt
  else
    echo "没有1小时内重启次数大于3次的Running Pod" >> check_restart_runn.txt
  fi

  echo -e "===检查1小时内重启次数大于3次的非Running Pod==="
  echo -e "命名空间\tpod名称" > check_restart_notrunn.txt
  matches=$(_check_pod_restarted "$pods_json" "$start_time" notrunning)
  if [[ -n "$matches" ]]; then
    echo -e "\n1小时内重启次数大于3次的非Running Pod为:"
    printf '%s\n' "$matches" >> check_restart_notrunn.txt
  else
    echo "没有1小时内重启次数大于3次的非Running Pod" >> check_restart_notrunn.txt
  fi
}
#######################################
# Collect Warning events into check_ev.txt.
#
# The file starts with the header row of `kubectl get ev -A`, followed either
# by every event row containing "Warning" or by a "no abnormal events" line.
# Finally each run of spaces in the file is replaced with "。 " in place
# (presumably a column separator for a later conversion step - confirm).
# Arguments: none
# Outputs:   progress text on stdout; check_ev.txt in the cwd
#######################################
function check_event() {
  local warning_total
  printf '%s\n' "===检查事件情况==="
  kubectl get ev -A | sed -n '1p' > check_ev.txt
  warning_total=$(kubectl get events -A | awk '/Warning/ { c++ } END { print c + 0 }')
  if [[ "$warning_total" -gt 0 ]]; then
    printf '%s\n' "异常事件为:"
    kubectl get events -A | awk '/Warning/' >> check_ev.txt
  else
    printf '\n%s\n' "无异常事件" >> check_ev.txt
  fi
  sed -i -e 's/ \{1,\}/。 /g' check_ev.txt
}
#######################################
# Report node and pod resource usage taken from `kubectl top`.
#
# Writes three files to the current directory:
#   check_node_resources.txt     - CPU% / memory% per node
#   check_pod_cpu_resources.txt  - pods using more than 1000m CPU
#   check_pod_mem_resources.txt  - pods using more than 1000Mi memory
# Both pod lists are sorted by usage, highest first (numeric sort).
# Nodes whose metrics are missing or "<unknown>" are reported on stdout and
# skipped instead of producing arithmetic errors.
# Arguments: none
# Outputs:   per-node verdicts on stdout; the three txt files above
#######################################
function check_resource() {
  local node usage cpu_used mem_used

  echo -e "===检测节点资源使用情况==="
  echo -e "node节点\tcpu\t内存" > check_node_resources.txt
  for node in $(kubectl get nodes | awk 'NR == 1 {next}{print $1}'); do
    # One `kubectl top` call per node: column 3 is CPU%, column 5 is MEMORY%.
    usage=$(kubectl top nodes "$node" | awk 'NR == 2 {print $3, $5}')
    # Keep digits only; a value such as "<unknown>" becomes empty.
    read -r cpu_used mem_used <<< "${usage//[^0-9 ]/}"
    if [[ -z "$cpu_used" || -z "$mem_used" ]]; then
      echo -e "$node 无法获取资源使用数据"
      continue
    fi
    if (( cpu_used > 0 )) || (( mem_used > 0 )); then
      echo -e "$node CPU使用率:$cpu_used%\t内存使用率:$mem_used%" >> check_node_resources.txt
    fi
    if (( mem_used > 85 )); then
      echo -e "$node 内存使用率过高!"
    else
      echo -e "$node 该节点的内存在正常范围内"
    fi
    if (( cpu_used > 85 )); then
      echo -e "$node CPU使用率过高!"
    else
      echo -e "$node 该节点的CPU在正常范围内"
    fi
  done

  echo -e "===检查资源CPU大于1000m的pod==="
  echo -e "命名空间\tpod名称\tcpu使用值(单位m)" > check_pod_cpu_resources.txt
  # Strip the unit from the value column only, so names are never altered;
  # portable awk (no gensub) and a numeric, descending sort on the value.
  kubectl top pods 2>/dev/null -A \
    | awk 'NR > 1 { v = $3; sub(/m$/, "", v); if (v + 0 > 1000) print $1, $2, v }' \
    | sort -k3,3nr -k1,2 | uniq >> check_pod_cpu_resources.txt

  echo -e "命名空间\tpod名称\t内存使用值(单位Mi)" > check_pod_mem_resources.txt
  echo -e "===检查资源内存大于1000Mi的pod==="
  kubectl top pods 2>/dev/null --all-namespaces \
    | awk 'NR > 1 { v = $4; sub(/Mi$/, "", v); if (v + 0 > 1000) print $1, $2, v }' \
    | sort -k3,3nr -k1,2 | uniq >> check_pod_mem_resources.txt
}
#######################################
# Run every inspection check in order.
#
# The check functions write their check_*.txt reports to the current
# directory, so main first enters the report directory. Without this the
# files land wherever the caller happened to be (cron starts jobs in the
# user's home directory), not where the conversion step looks for them.
# Globals:   XUNJIAN_WORKDIR (read, optional) - directory for the txt
#            reports; defaults to the directory containing this script
# Arguments: none
# Returns:   1 if the report directory cannot be entered, otherwise the
#            status of the last check
#######################################
function main() {
  local workdir
  workdir=${XUNJIAN_WORKDIR:-$(dirname -- "${BASH_SOURCE[0]}")}
  cd -- "$workdir" || {
    echo "cannot enter report directory: $workdir" >&2
    return 1
  }
  check_kubelet
  check_cluster
  check_node
  check_namespace
  check_pod
  check_event
  check_resource
}
# Entry point: run all inspection checks.
main
txt转换成html
由于输出格式不一致,所以上面的脚本生成了多个txt文件,针对每个txt分别生成html,再合并到一起。
下面是一个转换集群状态的脚本,每个txt都需要进行转换。
cat html-cluster.sh
#!/bin/bash
# Convert the cluster-status text report into an HTML table.
# bash is required: the functions below are declared with the `function`
# keyword, which plain POSIX sh does not provide.

# Text report to convert.
file_input='/root/xunjian/check_cluster.txt'
# Daily HTML page; output is appended to it, one file per calendar day.
file_output="/opt/xunjian/check-$(date +%F).html"
# <td> cells of the row being built; set by create_td, read by create_tr.
td_str=''
# Print the opening <html>/<body> tags and the section heading to stdout.
function create_html_head(){
  printf '%s\n' \
    '<html>' \
    '<body>' \
    '<h3>巡检项目二:检查k8s集群状态</h3>'
}
# Print the opening <table> tag to stdout (the border value is unquoted in
# the emitted markup).
function create_table_head(){
  printf '%s\n' '<table border=1>'
}
#######################################
# Build the <td> cells for one report row.
#
# The row is split on whitespace; each field becomes one <td> element, one
# element per line. &, < and > in the cell text are HTML-escaped.
# Globals:   td_str (written) - the generated cells
# Arguments: $1 - one line of the text report
# Outputs:   the raw row and the generated cells on stdout (progress trace)
#######################################
function create_td(){
  # printf with a quoted argument: no glob expansion of characters such as
  # '*' and no option parsing of rows that start with '-'.
  printf '%s\n' "$1"
  td_str=$(printf '%s\n' "$1" | awk '{
    for (i = 1; i <= NF; i++) {
      cell = $i
      gsub(/&/, "\\&amp;", cell)
      gsub(/</, "\\&lt;", cell)
      gsub(/>/, "\\&gt;", cell)
      print "<td style=background-color:#F0F0F0>" cell "</td>"
    }
  }')
  printf '%s\n' "$td_str"
}
#######################################
# Append one <tr> row for a report line to the HTML file.
# Globals:   td_str (read, set by create_td), file_output (read)
# Arguments: $1 - one line of the text report
#######################################
function create_tr(){
  create_td "$1"
  # printf '%s' writes the cells verbatim; backslash sequences such as \n or
  # \t that occur in the report text are not expanded.
  printf '%s\n' "<tr>" "$td_str" "</tr>" >> "$file_output"
}
# Print the closing </table> tag to stdout.
function create_table_end(){
  printf '%s\n' '</table>'
}
# Print the closing </body></html> tags to stdout.
function create_html_end(){
  printf '%s\n' '</body></html>'
}
#######################################
# Append an HTML section for the text report to the HTML file: heading,
# table start, one row per report line, table end, document end.
# Globals:   file_input (read), file_output (read)
# Arguments: none
# Outputs:   each processed line on stdout (progress trace)
# Returns:   1 if the text report cannot be read; nothing is written then
#######################################
function create_html(){
  local line
  if [[ ! -r "$file_input" ]]; then
    echo "input file not readable: $file_input" >&2
    return 1
  fi
  touch "$file_output"  # make sure the HTML file exists
  create_html_head >> "$file_output"
  create_table_head >> "$file_output"
  # IFS= and -r keep each line verbatim (backslashes included); the extra
  # -n test also processes a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    printf '%s\n' "$line"
    create_tr "$line"
  done < "$file_input"
  create_table_end >> "$file_output"
  create_html_end >> "$file_output"
}
# Entry point: generate the HTML section for this report.
create_html
输出是这样的:
<html>
<body>
<h3>巡检项目二:检查k8s集群状态</h3>
<table border=1>
<tr>
<td style=background-color:#F0F0F0>scheduler,controller-manager,etcd无异常</td>
</tr>
</table>
</body></html>
web查看
创建nginx deployment,挂载主机的/opt/xunjian/目录并暴露端口。巡检脚本运行在master01上(放到其他节点运行需要额外配置kubeconfig),报告也生成在该节点,因此Pod需要固定调度到master01,并添加容忍以允许运行在master节点上。
cat nginx.yaml
# nginx Deployment that serves the inspection reports stored on the host.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: xunjian
spec:
  replicas: 1
  selector:
    matchLabels:
      app: xunjian
  template:
    metadata:
      labels:
        app: xunjian
    spec:
      # Allow the pod on nodes tainted as master / control-plane.
      tolerations:
        - key: node-role.kubernetes.io/master
          operator: Exists
          effect: NoSchedule
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      containers:
        - name: xunjian
          image: nginx:stable-alpine
          imagePullPolicy: IfNotPresent
          volumeMounts:
            - name: xunjian-volume
              mountPath: /usr/share/nginx/html/xunjian
              # nginx only serves these files; never let the container
              # write to the host directory.
              readOnly: true
      volumes:
        - name: xunjian-volume
          hostPath:
            path: /opt/xunjian
            # "Directory" requires /opt/xunjian to exist on the node already.
            type: Directory
      # Pin the pod to the node that holds /opt/xunjian.
      nodeName: kube-master01
---
# NodePort Service in front of the xunjian nginx pod (node port 32360).
apiVersion: v1
kind: Service
metadata:
  name: xunjian
  labels: {app: xunjian}
spec:
  type: NodePort
  selector: {app: xunjian}
  ports:
    - name: http
      protocol: TCP
      port: 80
      nodePort: 32360
kubectl apply -f nginx.yaml
访问:http://ip:32360/xunjian/check-2023-11-17.html
创建定时任务
创建执行检测和转换txt脚本:
cat /root/check_send.sh
#!/bin/bash
# Run the inspection script, then convert every txt report to HTML.
#
# The body sticks to POSIX sh syntax so it also works when started as
# `sh check_send.sh`; the child scripts use bash-only syntax and are
# therefore always started with bash.
# Every command below runs to completion before the next one starts, so no
# delays are needed between the steps.

# Directory holding check.sh and the html-*.sh converters.
script_dir=/root/xunjian

# Run the inspection; without fresh txt reports there is nothing to convert.
if ! bash "$script_dir/check.sh"; then
  echo "check.sh failed; skipping HTML conversion" >&2
  exit 1
fi

# Run the converters in report order. A failing converter is reported and the
# remaining ones still run; the exit status records whether any failed.
status=0
for name in kubelet cluster node namespace pod restartrun restartnotrun ev \
  node-resources pod-mem pod-cpu; do
  if ! bash "$script_dir/html-$name.sh"; then
    echo "html-$name.sh failed" >&2
    status=1
  fi
done
exit "$status"
每天早上9点执行。
0 9 * * * sh /root/check_send.sh
发送邮件
#!/usr/bin/python
#coding:utf-8
import smtplib, time, os
from email.mime.text import MIMEText
from email.header import Header
def send_mail_html(file):
    '''Send the contents of an HTML file as the body of an e-mail.

    Sender, receiver, SMTP server and credentials are fixed inside the
    function. A success or failure message is printed; SMTP errors are
    reported and not re-raised.

    Args:
        file: path of the HTML file to send.

    Raises:
        IOError/OSError: if ``file`` cannot be read.
    '''
    from email.utils import formataddr

    sender_addr = '123@qq.com'
    # Encode only the display name (RFC 2047) so the address part of the
    # From header stays a plain, parseable address.
    sender = formataddr((Header('k8s巡检报告', 'utf-8').encode(), sender_addr))
    # Recipient address
    receiver = '234@qq.com'
    # Mail subject
    t = time.strftime("%Y年%m月%d日", time.localtime())
    subject = t + 'k8s巡检报告!'
    # Outgoing SMTP server
    smtpserver = 'smtp.163.com'
    # SMTP user name / password
    # NOTE(review): credentials are hard-coded and the session below is not
    # upgraded to TLS - consider loading them from the environment and using
    # an encrypted connection.
    username = 'qwer'
    password = 'asdf'
    # Read the HTML file; the context manager closes it even on error.
    with open(file, 'rb') as f:
        mail_body = f.read()
    # Build the message; non-ASCII text needs the 'utf-8' charset.
    msg = MIMEText(mail_body, _subtype='html', _charset='utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['From'] = sender
    msg['To'] = receiver
    # Log in and send
    smtp = None
    try:
        smtp = smtplib.SMTP()
        smtp.connect(smtpserver)
        smtp.login(username, password)
        # The SMTP envelope takes bare addresses, not the display form.
        smtp.sendmail(sender_addr, [receiver], msg.as_string())
    except Exception as exc:
        print("邮件发送失败!")
        print(repr(exc))
    else:
        print("邮件发送成功!")
    finally:
        if smtp is not None:
            try:
                smtp.quit()
            except Exception:
                # quit() fails when the connection was never established;
                # that failure has already been reported above.
                pass
def find_new_file(dir):
    '''Return the path of the most recently modified file in ``dir``.

    Sub-directories are ignored. The chosen path is also printed.

    Args:
        dir: directory to search (not recursive).

    Returns:
        ``os.path.join(dir, name)`` of the entry with the latest mtime.

    Raises:
        IndexError: if ``dir`` contains no file. The exception type matches
            what an empty directory has always produced here.
    '''
    candidates = [name for name in os.listdir(dir)
                  if not os.path.isdir(os.path.join(dir, name))]
    if not candidates:
        raise IndexError('no files found in %s' % dir)
    # Stable sort: entries with equal mtime keep their listdir() order.
    candidates.sort(key=lambda name: os.path.getmtime(os.path.join(dir, name)))
    newest = os.path.join(dir, candidates[-1])
    print('/opt/xunjian/:', newest)
    return newest
# Mail the most recently modified file of the report directory as HTML.
report_dir = '/opt/xunjian/'
send_mail_html(find_new_file(report_dir))
添加定时任务发送邮件:
10 9 * * * /root/mail_send.py