背景
环境中使用 vector 采集服务日志。服务有 50 多个,每个服务的配置文件是分开的,这就意味着有 50 多个 vector 进程。某个 vector 进程挂了,对应服务的日志就采集不到了;而每次都是查日志时发现没有采集到,才知道 vector 进程挂了,所以需要给 vector 进程加一个监控。
实现
使用 Python 脚本实现:定期检查各个 vector 进程,如果进程挂了就发送消息到钉钉,进程恢复后也会发送恢复通知。
check_vector.py
import subprocess
import time
import requests
import hmac
import hashlib
import base64
import urllib.parse
import json
# ===== DingTalk webhook URL and signing secret =====
# NOTE(review): both values are placeholders ("xxx"); replace them with the
# real robot token and secret before running.
WEBHOOK_URL = "https://oapi.dingtalk.com/robot/send?access_token=xxx"
SECRET = "xxx"

# ===== Services to monitor =====
# Each entry is passed to check_process_exists() as the process pattern.
# The trailing ``...`` is a placeholder for the remaining service names.
services = [
    "service-a",
    "service-b",
    ...,
]
# ===== Check whether a matching process exists =====
def check_process_exists(process_name):
    """Return True if ``pgrep -f`` finds a process matching *process_name*.

    ``pgrep -f`` matches the pattern against the full command line, so any
    process whose command line contains the pattern counts as a hit.
    NOTE(review): this presumably relies on the service name appearing only
    on the corresponding vector process's command line -- confirm against how
    the vector processes are launched.

    Args:
        process_name: Pattern handed to ``pgrep -f``.

    Returns:
        True when pgrep printed at least one PID. False when nothing matched,
        when *process_name* is empty, or when pgrep could not be run.
    """
    # An empty pattern matches every process, which would always report "up".
    if not process_name:
        return False
    try:
        # "--" ends option parsing so a name starting with "-" is treated as
        # the pattern rather than as a pgrep flag.
        result = subprocess.run(
            ["pgrep", "-f", "--", process_name],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except (OSError, TypeError, ValueError, subprocess.SubprocessError) as e:
        # Best effort: a probe that cannot run is reported as "not running".
        print(f"检查进程 {process_name} 失败: {e}")
        return False
    # pgrep prints one PID per match; any non-blank output means a match.
    return bool(result.stdout.strip())
# ===== Send a text message to the DingTalk robot =====
def send_dingding_message(message):
    """Post *message* as a text message to the DingTalk webhook.

    The request is signed: the string "<timestamp>", a newline, then SECRET is
    hashed with HMAC-SHA256 keyed by SECRET, base64-encoded and URL-encoded,
    and sent together with the timestamp as query parameters on WEBHOOK_URL.

    Args:
        message: Text placed in the message's ``content`` field.

    Returns:
        The raw response body returned by the webhook, as text.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Millisecond timestamp, as a string, used both in the signature and URL.
    timestamp = str(round(time.time() * 1000))

    # Compute the signature.
    string_to_sign = f"{timestamp}\n{SECRET}"
    digest = hmac.new(
        SECRET.encode("utf-8"),
        string_to_sign.encode("utf-8"),
        hashlib.sha256,
    ).digest()
    # quote_plus also escapes "/" and "+", which can both occur in base64.
    sign = urllib.parse.quote_plus(base64.b64encode(digest))

    # WEBHOOK_URL already carries "?access_token=...", so append with "&".
    full_url = f"{WEBHOOK_URL}&timestamp={timestamp}&sign={sign}"
    headers = {"Content-Type": "application/json"}
    payload = {
        "msgtype": "text",
        "text": {"content": message},
    }
    # A timeout keeps one stalled connection from blocking the monitor loop.
    response = requests.post(
        full_url,
        headers=headers,
        data=json.dumps(payload),
        timeout=10,
    )
    return response.text
# ===== Main monitoring loop =====
def monitor_services():
    """Poll every service forever and notify DingTalk on state changes.

    Every 5 minutes each entry of ``services`` is probed with
    check_process_exists(). A message is sent when a service goes from
    running to stopped, or from stopped back to running. A service that has
    no recorded state yet is treated as previously running, so one that is
    already down when the monitor starts is reported on the first pass.

    If sending a notification fails, the error is printed and the recorded
    state is left unchanged, so the same notification is retried on the next
    pass instead of being lost. This function never returns.
    """
    # Last state that was successfully recorded for each service.
    last_status = {}
    while True:
        for service in services:
            exists = check_process_exists(service)
            was_running = last_status.get(service, True)

            if exists == was_running:
                # No transition; just remember the state.
                last_status[service] = exists
                continue

            if exists:
                # Service recovered.
                message = f"✅ 服务恢复:{service} 日志采集进程已重新启动"
            else:
                # Service went down.
                message = f"❌ 服务异常:{service} 日志采集进程已停止,请检查!"

            try:
                send_dingding_message(message)
            except Exception as e:
                # Daemon boundary: a failed notification must not stop the
                # monitor. Keep the old state so the next pass retries.
                print(f"发送钉钉消息失败: {e}")
                continue
            last_status[service] = exists

        # Check every 5 minutes (adjustable).
        time.sleep(300)
# ===== Script entry point =====
# Start the monitoring loop only when the file is run as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    monitor_services()
启动
nohup python3.8 check_vector.py &> /dev/null &
