巡检时可能会遇到集群节点内存使用率超过 90% 的情况。此时可以筛选出一批允许重启的 Pod(例如下面脚本中名称以 svc- 开头的 Pod),编写 shell 脚本,在超过阈值时自动触发重启。注意:脚本依赖 kubectl top 命令,因此集群中需要预先安装 metrics 组件(如 metrics-server)。
#!/bin/bash
#
# Restart prefix-selected Pods on nodes whose memory usage is too high.
#
# For every node whose MEMORY% (as reported by `kubectl top node`) is above
# MEMORY_THRESHOLD, the Pods in NAMESPACE that run on that node and whose
# name starts with POD_PREFIX are deleted. Deleting a Pod only "restarts" it
# when a controller (Deployment, StatefulSet, ...) recreates it.
#
# Requirements:
#   - `kubectl top` needs a metrics component (e.g. metrics-server) installed
#     in the cluster.
#   - kubectl is invoked through sudo.
#
# Optional environment overrides (defaults in parentheses):
#   MEMORY_THRESHOLD  integer percentage a node must exceed (90)
#   NAMESPACE         namespace whose Pods may be deleted (test)
#   POD_PREFIX        name prefix of the Pods that may be deleted (svc-)
#
# Exit status: 0 on success, 1 if a kubectl call failed, 2 on bad settings.
# Failures are checked explicitly instead of relying on `set -e`.

# Memory usage threshold in percent.
MEMORY_THRESHOLD=${MEMORY_THRESHOLD:-90}
# Namespace the Pods are looked up in.
NAMESPACE=${NAMESPACE:-test}
# Only Pods whose name starts with this prefix are ever deleted.
POD_PREFIX=${POD_PREFIX:-svc-}

# kubectl invocation, kept as an array so each argument stays a single word.
KUBECTL_CMD=(sudo kubectl)

# Names of the nodes whose memory usage exceeds the threshold.
EXCEED_NODES=()

#######################################
# Parse `kubectl top node` output and record the nodes above the threshold.
# Globals:   EXCEED_NODES (appended to), POD_PREFIX (read, for the message)
# Arguments: $1 - threshold, integer percent
# Inputs:    `kubectl top node` table on stdin
# Outputs:   one status line per node on stdout; skipped rows on stderr
# Returns:   0
#######################################
collect_exceeding_nodes() {
  local threshold=$1
  local line node mem_pct

  while IFS= read -r line; do
    # Only rows that start with a non-blank character carry a node.
    [[ "$line" =~ ^[^[:space:]] ]] || continue

    # Columns: NAME CPU(cores) CPU% MEMORY(bytes) MEMORY% -> MEMORY% is #5.
    read -r node _ _ _ mem_pct _ <<< "$line"

    # Skip the header row.
    [[ "$node" == "NAME" ]] && continue

    mem_pct=${mem_pct%\%}
    # Rows without usable metrics (e.g. "<unknown>") cannot be compared.
    if [[ ! "$mem_pct" =~ ^[0-9]+$ ]]; then
      printf 'Node %s has no usable memory metric, skipping.\n' "$node" >&2
      continue
    fi

    # 10# forces base 10 so a value such as "08" is not read as octal.
    if (( 10#$mem_pct > 10#$threshold )); then
      EXCEED_NODES+=("$node")
      printf 'Node %s memory usage is over %s%%, checking for %s prefixed Pods...\n' \
        "$node" "$threshold" "$POD_PREFIX"
    else
      printf 'Node %s is within acceptable memory usage.\n' "$node"
    fi
  done
  return 0
}

#######################################
# Keep only the Pod names that start with the given prefix.
# Arguments: $1 - required name prefix
# Inputs:    one Pod name per line on stdin
# Outputs:   matching names, one per line, on stdout
# Returns:   0
#######################################
pods_with_prefix() {
  local prefix=$1
  local pod

  # Default word splitting trims padding kubectl may add around the name.
  while read -r pod _; do
    if [[ -n "$pod" && "$pod" == "$prefix"* ]]; then
      printf '%s\n' "$pod"
    fi
  done
  return 0
}

#######################################
# Find overloaded nodes and delete the prefix-matching Pods running on them.
# Globals:   MEMORY_THRESHOLD, NAMESPACE, POD_PREFIX, KUBECTL_CMD (read),
#            EXCEED_NODES (written)
# Returns:   0 on success, 1 if any kubectl call failed, 2 on bad settings
#######################################
main() {
  if [[ ! "$MEMORY_THRESHOLD" =~ ^[0-9]+$ ]]; then
    printf 'Error: MEMORY_THRESHOLD must be a non-negative integer, got "%s".\n' \
      "$MEMORY_THRESHOLD" >&2
    return 2
  fi

  # Fetch the memory usage of all nodes.
  local top_output
  if ! top_output=$("${KUBECTL_CMD[@]}" top node); then
    printf 'Error: "kubectl top node" failed; is the metrics component installed?\n' >&2
    return 1
  fi

  EXCEED_NODES=()
  collect_exceeding_nodes "$MEMORY_THRESHOLD" <<< "$top_output"

  local node pod pod_names
  local -a targets
  local deleted=0 failed=0

  # For every node above the threshold, find and delete the matching Pods.
  for node in "${EXCEED_NODES[@]}"; do
    # The field selector matches the node name exactly, so "node1" can never
    # pick up Pods scheduled on "node10".
    if ! pod_names=$("${KUBECTL_CMD[@]}" get pods -n "$NAMESPACE" \
      --field-selector "spec.nodeName=${node}" \
      --no-headers -o custom-columns=NAME:.metadata.name); then
      printf 'Error: could not list Pods on Node %s.\n' "$node" >&2
      failed=$((failed + 1))
      continue
    fi

    targets=()
    while IFS= read -r pod; do
      targets+=("$pod")
    done < <(pods_with_prefix "$POD_PREFIX" <<< "$pod_names")

    if (( ${#targets[@]} == 0 )); then
      printf 'No %s prefixed Pods found on Node %s.\n' "$POD_PREFIX" "$node"
      continue
    fi

    for pod in "${targets[@]}"; do
      if "${KUBECTL_CMD[@]}" delete pod "$pod" -n "$NAMESPACE"; then
        printf 'Deleted Pod %s/%s\n' "$NAMESPACE" "$pod"
        deleted=$((deleted + 1))
      else
        printf 'Error: failed to delete Pod %s/%s\n' "$NAMESPACE" "$pod" >&2
        failed=$((failed + 1))
      fi
    done
  done

  # Report the final state based on what actually happened.
  if (( ${#EXCEED_NODES[@]} == 0 )); then
    echo "No Pods deleted, all nodes are below the threshold."
  elif (( deleted > 0 )); then
    echo "Some Pods were deleted due to high memory usage."
  else
    echo "No Pods deleted, although some nodes are above the threshold."
  fi

  (( failed == 0 ))
}

# Last statement on purpose: the script's exit status is main's return value.
main "$@"