k8s

k8s巡检脚本

脚本功能

  • 巡检结果先写入脚本运行目录下的多个check_*.txt文件,再转换为/opt/xunjian/check-当前日期.html
  • 检查所有节点的kubelet状态
  • 检查集群状态
  • 检查节点状态
  • 检查namespace状态
  • 检查pod状态
    • 正常pod数量
    • 镜像无法拉取的pod
    • Error和Terminating的Pod
    • 1小时内重启次数大于3次的Running Pod
    • 1小时内重启次数大于3次的非Running Pod(CrashLoopBackOff)
  • 检查事件情况
  • 检查所有节点资源使用情况
  • 检查资源CPU大于1000m的pod
  • 检查资源内存大于1000Mi的pod
#!/bin/bash
# Kubernetes cluster inspection script.
# Pin an English UTF-8 locale for this process and every command it starts.
LANG=en_US.UTF-8
LC_ALL=en_US.UTF-8
export LANG LC_ALL
# Creation of the inspection directory is currently disabled:
#if ! [ -d /opt/xunjian ];then
#    mkdir -p /opt/xunjian
#fi
# 检查kubelet状态
# Check the Ready condition of every node and record one verdict line per
# node in check_kubelet.txt (current directory). The report from a previous
# run is removed first.
# Arguments: none
# Outputs:   a banner and "Ready"/"NotReady" per node on stdout;
#            one line per node appended to check_kubelet.txt
function check_kubelet() {
    local node ready

    echo -e "===检查kubelet状态==="
    rm -f check_kubelet.txt
    for node in $(kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'); do
        if ! kubectl get node "$node" &> /dev/null; then
            # Append instead of truncating so verdicts already written for
            # other nodes survive, and name the node that could not be read.
            echo "$node Node not found" >> check_kubelet.txt
            continue
        fi

        # The Ready condition is "True" only while the node reports healthy.
        ready=$(kubectl get node "$node" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')
        if [[ "$ready" == *True* ]]; then
            echo "Ready"
            echo "$node kubelet无异常" >> check_kubelet.txt
        else
            echo "NotReady"
            echo "$node kubelet status is not running" >> check_kubelet.txt
        fi
    done
}

# 检查集群异常状态:
# Check the control-plane component statuses (`kubectl get cs`) and write
# the result to check_cluster.txt in the current directory.
# Arguments: none
# Outputs:   a banner (and a heading when something is wrong) on stdout;
#            check_cluster.txt holds either the non-Healthy rows or an
#            "all healthy" line
# Returns:   1 when the component statuses could not be queried
function check_cluster() {
    local cs_table unhealthy

    echo -e "===检查scheduler,controller-manager,etcd状态==="
    # Query once so the decision and the listing use the same data.
    if ! cs_table=$(kubectl get cs 2>/dev/null); then
        # An unreachable API must not be reported as "everything healthy".
        echo "无法获取组件状态(kubectl get cs 执行失败)" > check_cluster.txt
        return 1
    fi

    # Anything whose STATUS column is not exactly "Healthy" counts as a
    # problem, so "Unknown" is caught as well as "Unhealthy".
    unhealthy=$(awk 'NR != 1 && NF && $2 != "Healthy"' <<<"$cs_table")
    if [[ -n "$unhealthy" ]]; then
        echo -e "异常项为:"
        printf '%s\n' "$unhealthy" > check_cluster.txt
    else
        echo -e "scheduler,controller-manager,etcd无异常" > check_cluster.txt
    fi
}

# 检查节点状态:
# List the nodes whose `kubectl get nodes` row contains "NotReady".
# Outputs: a banner on stdout; check_node.txt (current directory) holds
#          either a heading followed by the node names, or a "no abnormal
#          nodes" line.
function check_node() {
    local bad_total

    printf '%s\n' "===检查异常节点状态==="
    bad_total=$(kubectl get nodes | awk '/NotReady/ {hits++} END {print hits + 0}')
    if [[ "$bad_total" -le 0 ]]; then
        printf '%s\n' "无异常节点" > check_node.txt
        return
    fi

    printf '%s\n' "异常节点" > check_node.txt
    kubectl get nodes | awk '/NotReady/ {print $1}' >> check_node.txt
}

# 检查命名空间
# List the namespaces whose `kubectl get ns` row does not contain "Active".
# Outputs: a banner (and a heading when something is wrong) on stdout;
#          check_nsmespace.txt (current directory) holds either the
#          namespace names or a "no abnormal namespace" line.
function check_namespace() {
    local inactive_total

    printf '%s\n' "===检查命名空间状态==="
    inactive_total=$(kubectl get ns | awk 'NR!=1 && !/Active/ {n++} END {print n + 0}')
    case "$inactive_total" in
        0)
            printf '%s\n' "Namespace无异常" > check_nsmespace.txt
            ;;
        *)
            printf '%s\n' "异常的namespace为:"
            kubectl get ns | awk 'NR!=1 && !/Active/ {print $1}' > check_nsmespace.txt
            ;;
    esac
}

# 检查Pod状态:
# Append to check_pod_num.txt the "namespace name" of every pod row that
# matches an extended regex, or a "none found" note when nothing matches.
# Arguments: $1 - output of `kubectl get pods -A` (header row included)
#            $2 - extended regex matched against each pod row
#            $3 - label inserted into the report headings
function _check_pod_report_status() {
    local pod_table=$1 pattern=$2 label=$3 matched

    matched=$(awk 'NR!=1{print}' <<<"$pod_table" | grep -E -- "$pattern" | awk '{print $1, $2}')
    if [[ -n "$matched" ]]; then
        echo -e "\n\n${label}的Pod为:" >> check_pod_num.txt
        echo -e "\n命名空间\tpod名称"  >> check_pod_num.txt
        printf '%s\n' "$matched" >> check_pod_num.txt
    else
        echo -e "\n没有${label}的Pod" >> check_pod_num.txt
    fi
}

# Write a report of pods whose first container has restarted at least three
# times and whose previous instance started at or after a cut-off time.
# Arguments: $1 - output of `kubectl get po -A -o json`
#            $2 - cut-off as an RFC 3339 UTC timestamp (YYYY-MM-DDTHH:MM:SSZ)
#            $3 - "running" to keep pods whose first container is currently
#                 running, anything else to keep the others
#            $4 - report file to (re)create
#            $5 - label inserted into the messages
function _check_pod_report_restarts() {
    local pod_json=$1 since=$2 want=$3 out_file=$4 label=$5 matched

    echo -e "===检查1小时内重启次数大于3次的${label}==="
    echo -e "命名空间\tpod名称" > "$out_file"
    # NOTE(review): the messages say "more than 3 restarts" but the filter
    # keeps restartCount >= 3, as the original did - confirm which is meant.
    # Values are passed with --arg instead of being spliced into the program.
    matched=$(jq -r --arg since "$since" --arg want "$want" '
        .items[]
        | select(.status.containerStatuses != null)
        | select(.status.containerStatuses[0].restartCount >= 3
                 and .status.containerStatuses[0].lastState.terminated.startedAt >= $since
                 and ((.status.containerStatuses[0].state.running != null) == ($want == "running")))
        | "\(.metadata.namespace) \(.metadata.name)"' <<<"$pod_json")
    if [[ -n "$matched" ]]; then
        echo -e "\n1小时内重启次数大于3次的${label}为:"
        printf '%s\n' "$matched" >> "$out_file"
    else
        echo "没有1小时内重启次数大于3次的${label}"  >>  "$out_file"
    fi
}

# Inspect pod health and write the findings to report files in the current
# directory:
#   check_pod_num.txt          - Running count, image-pull failures,
#                                Error/Terminating pods
#   check_restart_runn.txt     - recently restarted pods, container running
#   check_restart_notrunn.txt  - recently restarted pods, container not
#                                running (e.g. waiting in CrashLoopBackOff)
# Arguments: none
# Outputs:   progress banners on stdout plus the three report files
function check_pod() {
    local pod_table pod_json running_total start_time

    # One table snapshot and one JSON snapshot, so each count and its
    # listing always describe the same cluster state.
    pod_table=$(kubectl get pods -A)

    echo -e "===检查正常Pod数量==="
    running_total=$(grep -c Running <<<"$pod_table")
    echo -n "正常Pod数量为:$running_total" > check_pod_num.txt

    echo -e "===检查镜像无法拉取的Pod==="
    _check_pod_report_status "$pod_table" 'ImagePullBackOff|ErrImagePull' "镜像无法拉取"

    echo -e "===检查Error和Terminating的Pod==="
    _check_pod_report_status "$pod_table" 'Error|Terminating' "Error和Terminating"

    # The API reports timestamps in UTC, so build "one hour ago" in UTC
    # rather than shifting local time by a fixed number of hours.
    start_time=$(date -u -d "-1 hour" +%Y-%m-%dT%H:%M:%SZ)
    pod_json=$(kubectl get po -A -o json)

    _check_pod_report_restarts "$pod_json" "$start_time" running \
        check_restart_runn.txt "Running Pod"
    _check_pod_report_restarts "$pod_json" "$start_time" notrunning \
        check_restart_notrunn.txt "非Running Pod"
}

# 检查事件状态:
# Collect Warning events from all namespaces into check_ev.txt (current
# directory): the column header first, then every event row containing
# "Warning", or a "no abnormal events" line. Finally each run of three or
# more spaces in the file is replaced by "。 ".
# Outputs: a banner (and a heading when warnings exist) on stdout.
function check_event() {
    local warn_total

    printf '%s\n' "===检查事件情况==="
    kubectl get ev -A | sed -n '1p' > check_ev.txt
    warn_total=$(kubectl get events -A | awk '/Warning/ {n++} END {print n + 0}')
    if [[ "$warn_total" -gt 0 ]]; then
        printf '%s\n' "异常事件为:"
        kubectl get events -A | awk '/Warning/' >> check_ev.txt
    else
        printf '\n%s\n' "无异常事件" >> check_ev.txt
    fi
    sed -i 's/   \{1,\}/。 /g' check_ev.txt
}

# 检查资源使用情况
# Report node and pod resource usage from `kubectl top`.
# Writes, in the current directory:
#   check_node_resources.txt     - CPU% / memory% of every node with metrics
#   check_pod_cpu_resources.txt  - pods using more than 1000m CPU
#   check_pod_mem_resources.txt  - pods using more than 1000Mi memory
# Both pod reports are sorted by usage, highest first.
# Globals:   warning (written) - number of node metrics above 85%
# Arguments: none
# Outputs:   banners and a per-node verdict on stdout; nodes without
#            metrics are reported on stderr
function check_resource() {
    local node usage cpu_used mem_used

    echo -e "===检测节点资源使用情况==="
    echo -e "node节点\tcpu\t内存" > check_node_resources.txt
    warning=0
    for node in $(kubectl get nodes | awk 'NR > 1 {print $1}'); do
        # One `kubectl top` call per node; keep only the digits of the
        # CPU% and MEMORY% columns.
        usage=$(kubectl top nodes "$node" \
            | awk 'NR == 2 {gsub(/[^0-9]/, "", $3); gsub(/[^0-9]/, "", $5); print $3 "," $5}')
        cpu_used=${usage%%,*}
        mem_used=${usage#*,}

        # Missing metrics must not be reported as "within the normal range".
        if [[ -z "$cpu_used" || -z "$mem_used" ]]; then
            echo "$node 无法获取资源使用率" >&2
            continue
        fi

        if (( cpu_used > 0 )) || (( mem_used > 0 )); then
            echo -e "$node CPU使用率:$cpu_used%\t内存使用率:$mem_used%" >> check_node_resources.txt
        fi

        if (( mem_used > 85 )); then
            (( warning++ )) || true
            echo -e "$node 内存使用率过高!"
        else
            echo -e "$node 该节点的内存在正常范围内"
        fi

        if (( cpu_used > 85 )); then
            (( warning++ )) || true
            echo -e "$node CPU使用率过高!"
        else
            echo -e "$node 该节点的CPU在正常范围内"
        fi
    done

    echo -e "===检查资源CPU大于1000m的pod==="
    echo -e "命名空间\tpod名称\tcpu使用值(单位m)" > check_pod_cpu_resources.txt
    # Skip the header row explicitly, compare numerically, and sort the
    # value column numerically so 10000 ranks above 2000.
    kubectl top pods -A 2>/dev/null \
        | awk 'NR > 1 {v = $3; sub(/m$/, "", v); if (v + 0 > 1000) print $1, $2, v}' \
        | sort -k3,3nr >> check_pod_cpu_resources.txt

    echo -e "命名空间\tpod名称\t内存使用值(单位Mi)" > check_pod_mem_resources.txt
    echo -e "===检查资源内存大于1000Mi的pod==="
    kubectl top pods -A 2>/dev/null \
        | awk 'NR > 1 {v = $4; sub(/Mi$/, "", v); if (v + 0 > 1000) print $1, $2, v}' \
        | sort -k3,3nr >> check_pod_mem_resources.txt
}

# Entry point: run every inspection step once, in the fixed order below.
function main() {
    local step

    for step in check_kubelet check_cluster check_node check_namespace \
                check_pod check_event check_resource; do
        "$step"
    done
}

main

txt转换成html

由于输出格式不一致,所以上面的脚本生成了多个txt文件,针对每个txt分别生成html,再合并到一起。

file

下面是一个转换集群状态的脚本,每个txt都需要进行转换。

cat html-cluster.sh
#!/bin/bash
# Convert the cluster-status inspection report (plain text) into an HTML
# fragment appended to the daily report page.
# bash is required: the functions below use the `function` keyword and
# `echo -e`, which plain POSIX sh does not guarantee.

# Plain-text report to convert.
file_input='/root/xunjian/check_cluster.txt'
# HTML page the fragment is appended to; one file per calendar day.
file_output="/opt/xunjian/check-$(date +%F).html"

# Filled in by create_td with the <td> cells of the row being built.
td_str=''

# Print the opening <html>/<body> tags and the section heading to stdout.
function create_html_head(){
  cat <<'EOF'
<html>
    <body>
      <h3>巡检项目二:检查k8s集群状态</h3>
EOF
}

# Print the opening table tag to stdout. The emitted attribute is the
# unquoted form border=1.
function create_table_head(){
  printf '%s\n' '<table border=1>'
}

# Build the <td> cells for one report line and store them in td_str.
# The line is split on whitespace; every field becomes one cell.
# Globals:   td_str (written) - one "<td ...>field</td>" per line
# Arguments: $1 - the report line
# Outputs:   echoes the input line and the generated cells to stdout
function create_td(){
  printf '%s\n' "$1"
  # Feed the line through a quoted printf so that characters such as "*"
  # are not glob-expanded, and HTML-escape each field so that values like
  # "<none>" or "a&b" cannot break the table markup.
  td_str=$(printf '%s\n' "$1" | awk '{
    for (i = 1; i <= NF; i++) {
      cell = $i
      gsub(/&/, "\\&amp;", cell)
      gsub(/</, "\\&lt;", cell)
      gsub(/>/, "\\&gt;", cell)
      print "<td style=background-color:#F0F0F0>" cell "</td>"
    }
  }')
  printf '%s\n' "$td_str"
}

# Append one table row for a report line to the output file.
# Globals:   td_str (read, set by create_td), file_output (read)
# Arguments: $1 - the report line
function create_tr(){
  create_td "$1"
  # printf with %s writes the cells verbatim; echo -e would interpret
  # backslash sequences (\n, \t, ...) that happen to occur in the data.
  printf '<tr>\n    %s\n  </tr>\n' "$td_str" >> "$file_output"
}

# Print the closing table tag to stdout.
function create_table_end(){
  printf '%s\n' '</table>'
}

# Print the closing </body> and </html> tags to stdout.
function create_html_end(){
  printf '%s\n' '</body></html>'
}

# Append the HTML rendering of the input report to the output file:
# page head, table head, one row per input line, table end, page end.
# Globals:   file_input (read), file_output (read)
# Outputs:   echoes each input line to stdout as it is processed
# Returns:   1 when the input file cannot be read
function create_html(){
  local line

  if [[ ! -r "$file_input" ]]; then
    printf 'create_html: cannot read input file: %s\n' "$file_input" >&2
    return 1
  fi

  touch "$file_output"           # make sure the HTML file exists

  create_html_head >> "$file_output"
  create_table_head >> "$file_output"

  # IFS= and -r keep whitespace and backslashes intact; the extra test
  # keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    printf '%s\n' "$line"
    create_tr "$line"
  done < "$file_input"

  create_table_end >> "$file_output"
  create_html_end >> "$file_output"
}

create_html

输出是这样的:

<html>
    <body>
      <h3>巡检项目二:检查k8s集群状态</h3>
<table border=1>
<tr>
    <td style=background-color:#F0F0F0>scheduler,controller-manager,etcd无异常</td>
  </tr>
</table>
</body></html>

web查看

创建nginx Deployment,通过hostPath挂载主机的/opt/xunjian/目录,并用NodePort暴露端口。我这里巡检脚本运行在master01上(放在别的节点上需要另外准备kubeconfig),报告也生成在这个节点,所以Pod要固定调度到master01,并且需要加上对master污点的容忍。

cat nginx.yaml
# nginx Deployment that serves the inspection reports stored on the host.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: xunjian
spec:
  replicas: 1
  selector:
    matchLabels:
      app: xunjian
  template:
    metadata:
      labels:
        app: xunjian
    spec:
      # Tolerate the control-plane taints so the pod may run on a master node.
      tolerations:
        - key: node-role.kubernetes.io/master
          operator: Exists
          effect: NoSchedule
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      containers:
      - name: xunjian
        image: nginx:stable-alpine
        imagePullPolicy: IfNotPresent
        volumeMounts:
        - name: xunjian-volume
          mountPath: /usr/share/nginx/html/xunjian
          # nginx only serves these files, so it gets no write access to the host directory.
          readOnly: true
      volumes:
      - name: xunjian-volume
        hostPath:
          path: /opt/xunjian
          # "Directory" requires the path to exist on the node already.
          type: Directory
      # Pin the pod to the node on which the reports are generated.
      nodeName: kube-master01
---

# NodePort Service in front of the pods labelled app=xunjian.
apiVersion: v1
kind: Service
metadata:
  name: xunjian
  labels:
    app: xunjian
spec:
  type: NodePort
  ports:
  # Service port 80 is published on port 32360 of the nodes.
  - port: 80
    protocol: TCP
    name: http
    nodePort: 32360
  # Traffic is routed to pods carrying this label.
  selector:
    app: xunjian

kubectl apply -f nginx.yaml

访问:http://ip:32360/xunjian/check-2023-11-17.html

file

创建定时任务

创建执行检测和转换txt脚本:

cat /root/check_send.sh
#!/bin/bash
# Run the cluster inspection, then convert every text report to HTML.
#
# The scripts are started with bash because they use bash-only syntax
# (`function`, `(( ))`, `&>`, `echo -e`).

# check.sh writes its check_*.txt reports relative to the current directory,
# and html-cluster.sh reads /root/xunjian/check_cluster.txt (the other
# converters presumably read from the same directory - confirm). Work from
# there so the reports land where the converters look, also under cron.
cd /root/xunjian || exit 1

# Run the inspection script.
bash /root/xunjian/check.sh

# Run the converters in page order. Every command runs synchronously, so no
# sleep is needed between steps. A failing converter is reported and the
# remaining ones still run.
for converter in html-kubelet html-cluster html-node html-namespace \
                 html-pod html-restartrun html-restartnotrun html-ev \
                 html-node-resources html-pod-mem html-pod-cpu; do
    bash "/root/xunjian/${converter}.sh" \
        || echo "check_send: ${converter}.sh failed" >&2
done

每天早上9点执行。

0 9 * * * bash /root/check_send.sh

发送邮件

#!/usr/bin/python
#coding:utf-8
import smtplib, time, os
from email.mime.text import MIMEText
from email.header import Header

def send_mail_html(file):
    '''Send the contents of an HTML file as the body of an e-mail.

    The outcome is printed to stdout; a failure to send is reported but
    never raised to the caller.

    :param file: path of the HTML file to send.
    :return: None
    '''
    sender = ("%s<123@qq.com>")%(Header('k8s巡检报告','utf-8'),)
    # Recipient address
    receiver =  '234@qq.com'
    # Mail subject, prefixed with today's date
    t = time.strftime("%Y年%m月%d日", time.localtime())
    subject = t +'k8s巡检报告!'
    # Outgoing SMTP server
    smtpserver = 'smtp.163.com'
    # SMTP account user / password
    # NOTE(review): credentials are hard-coded and the login below uses a
    # plain SMTP connection - consider reading them from the environment
    # and using SMTP_SSL/STARTTLS.
    username = 'qwer'
    password = 'asdf'

    # Read the HTML file; the context manager closes it even on error
    with open(file, 'rb') as f:
        mail_body = f.read()

    # Build the message; non-ASCII text needs the 'utf-8' charset
    msg = MIMEText(mail_body, _subtype='html', _charset='utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['From'] = sender
    msg['To'] = receiver

    # Log in and send. smtp stays None until the client object exists, so
    # the cleanup below never touches an unbound name.
    smtp = None
    try:
        smtp = smtplib.SMTP()
        smtp.connect(smtpserver)
        smtp.login(username, password)
        smtp.sendmail(sender, receiver, msg.as_string())
    except Exception as exc:
        # Report why sending failed instead of silently swallowing it.
        print("邮件发送失败! %s" % (exc,))
    else:
        print("邮件发送成功!")
    finally:
        if smtp is not None:
            try:
                smtp.quit()
            except Exception:
                # Best-effort close: quit() itself raises when the
                # connection was never established or is already gone.
                pass

def find_new_file(dir):
    '''Return the path of the most recently modified regular file in a directory.

    Sub-directories are ignored. Files are not filtered by extension.

    :param dir: directory to search (not recursive).
    :return: path of the newest file, joined onto ``dir``.
    :raises OSError: if ``dir`` contains no regular file.
    '''
    candidates = [os.path.join(dir, fn) for fn in os.listdir(dir)]
    candidates = [path for path in candidates if os.path.isfile(path)]
    if not candidates:
        # Fail with a clear message instead of an IndexError, and never
        # hand a directory back to the caller.
        raise OSError("no regular files found in %s" % dir)
    newest = sorted(candidates, key=os.path.getmtime)[-1]
    # Label the output with the directory that was actually searched.
    print('%s:' % dir, newest)
    return newest

# Run only when executed as a script, so importing this module does not
# send mail as a side effect.
if __name__ == '__main__':
    report_dir = '/opt/xunjian/'  # directory that holds the generated reports
    report = find_new_file(report_dir)  # newest file in that directory
    send_mail_html(report)  # send it as an HTML mail body

添加定时任务发送邮件:

10 9 * * * /root/mail_send.py
分类: k8s
0 0 投票数
文章评分
订阅评论
提醒
guest

0 评论
内联反馈
查看所有评论

相关文章

开始在上面输入您的搜索词,然后按回车进行搜索。按ESC取消。

返回顶部
0
希望看到您的想法,请您发表评论x