部署集群监控
配置该功能可以让管理员在管理系统中查看集群资源信息和告警日志,需要配置 Prometheus、Grafana、Alertmanager、alertsnitch、MySQL 安装
集群资源信息
告警日志
一、环境准备
本监控方案通过 docker-compose 安装,需要安装 docker 和 docker-compose:
- 安装 docker
# 安装所需的包
yum install -y yum-utils \
device-mapper-persistent-data \
lvm2
# 设置稳定存储库
yum-config-manager \
--add-repo \
https://download.docker.com/linux/centos/docker-ce.repo
**# **安装Docker CE**
**yum install docker-ce -y
# 安装指定版本Docker CE
yum list docker-ce --showduplicates | sort -r
yum install docker-ce-23.0.6 -y
**# **启动Docker CE并设置开机启动
systemctl start docker
systemctl enable docker
**# **验证Docker环境**
**docker run hello-world
- 安装 docker-compose
**# **下载安装**
**curl -L "https://github.com/docker/compose/releases/download/v2.7.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
mv docker-compose-linux-x86_64 /usr/local/bin/docker-compose
**# **赋权**
**chmod +x /usr/local/bin/docker-compose
**# **验证安装成功**
**docker-compose --version
- 创建持久化配置目录
# 创建存放配置的目录
mkdir -p /root/monitor
mkdir -p /root/monitor/alertmanager
mkdir -p /root/monitor/alertmanager/template
mkdir -p /root/monitor/grafana
mkdir -p /root/monitor/grafana/provisioning
mkdir -p /root/monitor/grafana/provisioning/dashboards
mkdir -p /root/monitor/grafana/provisioning/dashboards/tmp
mkdir -p /root/monitor/grafana/provisioning/datasources
mkdir -p /root/monitor/prometheus
mkdir -p /root/monitor/prometheus/rules
二、创建配置文件
注意:配置文件中各主机名、主机 IP 以及集群信息按实际情况进行修改!
配置 Grafana
编辑 grafana 配置文件:/root/monitor/grafana/grafana.ini
此处需要修改 [server] 下的 root_url,将 your_scow_mis_path
修改为 scow 管理系统的路径
此处配置了 root_url 和 allow_embedding,将允许 grafana 通过 iframe 被嵌入。并通过 scow 代理,将请求转发到 grafana。
[paths]
# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used)
data = /var/lib/grafana
# folder that contains provisioning config files that grafana will apply on startup and while running.
provisioning = /etc/grafana/provisioning
[auth.anonymous]
# enable anonymous access
enabled = true
# specify role for unauthenticated users
org_role = Viewer
[server]
# Root URL for specifying the Grafana instance
# for example
# root_url=http://localhost:5003/api/admin/monitor/getResourceStatus
root_url=[your_scow_mis_path]/api/admin/monitor/getResourceStatus
[security]
# Allows embedding Grafana dashboards into iframes in other websites
allow_embedding = true
编辑 grafana 默认数据源
/root/monitor/grafana/provisioning/datasources/prometheus.yaml
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
orgId: 1
access: proxy
url: http://prometheus:9090
editable: true
isDefault: true
jsonData:
timeInterval: 5s
编辑 grafana Mysql 数据源
/root/monitor/grafana/provisioning/datasources/mysql.yaml
apiVersion: 1
datasources:
- name: AlarmDB-MySQL
type: mysql
url: alert-db:3306
user: alertsnitch
jsonData:
database: alertsnitch
maxOpenConns: 100 # Grafana v5.4+
maxIdleConns: 100 # Grafana v5.4+
maxIdleConnsAuto: true # Grafana v9.5.1+
connMaxLifetime: 14400 # Grafana v5.4+
secureJsonData:
password: alertsnitch
编辑 grafana 面板配置
/root/monitor/grafana/provisioning/dashboards/dashboard.yaml
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards
options:
path: /etc/grafana/provisioning/dashboards/tmp
编辑 grafana 面板模板文件
/root/monitor/grafana/provisioning/dashboards/tmp/JobScheduler.json
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"liveNow": false,
"panels": [
{
"collapsed": false,
"datasource": {
"type": "prometheus"
},
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"panels": [],
"targets": [
{
"datasource": {
"type": "prometheus"
},
"refId": "A"
}
],
"title": "CPU Cores",
"type": "row"
},
{
"datasource": {
"type": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 8,
"x": 0,
"y": 1
},
"id": 4,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max",
"min"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "10.2.2",
"repeat": "partition",
"targets": [
{
"datasource": {
"type": "prometheus"
},
"editorMode": "code",
"expr": "slurm_partition_cpus_idle{clustername=\"$cluster\", partition=\"$partition\"}",
"interval": "",
"legendFormat": "available",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus"
},
"editorMode": "code",
"expr": "slurm_partition_cpus_allocated{clustername=\"$cluster\", partition=\"$partition\"}",
"interval": "",
"legendFormat": "allocated",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus"
},
"editorMode": "code",
"expr": "slurm_partition_cpus_other{clustername=\"$cluster\", partition=\"$partition\"}",
"interval": "",
"legendFormat": "unavailable",
"range": true,
"refId": "C"
}
],
"title": "${partition}",
"type": "timeseries"
},
{
"collapsed": true,
"datasource": {
"type": "prometheus"
},
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 10
},
"id": 8,
"panels": [],
"targets": [
{
"datasource": {
"type": "prometheus"
},
"refId": "A"
}
],
"title": "Job Count",
"type": "row"
},
{
"datasource": {
"type": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 12,
"w": 8,
"x": 0,
"y": 11
},
"id": 12,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max",
"min"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "10.2.2",
"repeat": "partition",
"targets": [
{
"datasource": {
"type": "prometheus"
},
"editorMode": "code",
"expr": "slurm_partition_jobs_running{clustername=\"$cluster\", partition=\"$partition\"}",
"interval": "",
"legendFormat": "RUNNING",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus"
},
"editorMode": "code",
"expr": "slurm_partition_jobs_pending{clustername=\"$cluster\", partition=\"$partition\"}",
"interval": "",
"legendFormat": "PENDING",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus"
},
"editorMode": "code",
"expr": "slurm_partition_jobs_suspended{clustername=\"$cluster\", partition=\"$partition\"}",
"interval": "",
"legendFormat": "SUSPENDED",
"range": true,
"refId": "C"
},
{
"datasource": {
"type": "prometheus"
},
"editorMode": "code",
"expr": "slurm_partition_jobs_completing{clustername=\"$cluster\", partition=\"$partition\"}",
"interval": "",
"legendFormat": "COMPLETING",
"range": true,
"refId": "D"
}
],
"title": "$partition",
"type": "timeseries"
}
],
"refresh": "",
"schemaVersion": 38,
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "hpc01",
"value": "hpc01"
},
"datasource": {
"type": "prometheus"
},
"definition": "label_values(clustername)",
"hide": 0,
"includeAll": false,
"label": "集群",
"multi": false,
"name": "cluster",
"options": [],
"query": "label_values(clustername)",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus"
},
"definition": "label_values(slurm_partition_cpus_total{clustername=\"$cluster\"}, partition)",
"hide": 0,
"includeAll": true,
"label": "分区",
"multi": true,
"name": "partition",
"options": [],
"query": "label_values(slurm_partition_cpus_total{clustername=\"$cluster\"}, partition)",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "Job Scheduler",
"uid": "shZOtO4Sk",
"version": 1,
"weekStart": ""
}
配置 Prometheus
配置 Prometheus 配置文件
/root/monitor/prometheus/prometheus.yml
注意:scrape_configs 下 slurm_exporter 中 slurmctlIP 需要替换成 slurm 管理节点实际的 IP 地址
如果有多个 slurm 管理节点则在 scrape_configs
下 slurm_exporter 中的 static_configs
里面配置更多。
global:
scrape_interval: 60s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 60s # Evaluate rules every 15 seconds. The default is every 1 minute.
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager:9093"]
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "rules/rules.yml"
scrape_configs:
- job_name: 'slurm_exporter'
scrape_interval: 30s
scrape_timeout: 30s
static_configs:
- targets: ['[slurmctlIP]:9341']
labels:
appname: 'hpc01' #添加的标签
clustername: 'hpc01' #添加的标签
- job_name: 'prometheus'
static_configs:
- targets: [ 'prometheus:9090']
labels:
appname: 'prometheus'
配置 Prometheus 告警规则
/root/monitor/prometheus/rules/rules.yml
# slurmAlert
groups:
- name: slurmAlert
rules:
- alert: SlurmPartitionCPUUsage
expr: (slurm_partition_cpus_total - slurm_partition_cpus_idle)/slurm_partition_cpus_total > 0.80
for: 1m
labels:
severity: "Warning"
annotations:
summary: "SLURM partition CPU usgae high"
description: "{{ $labels.clustername }} partition: {{ $labels.partition }} - CPU usage above 80% (current value: {{ $value }})"
- alert: SlurmPendingJob
expr: slurm_partition_jobs_pending >= 5
for: 1m
labels:
severity: "Warning"
annotations:
summary: "Too many jobs queued"
description: "{{ $labels.clustername }} partition: {{ $labels.partition }} - Too many jobs queued (current value: {{ $value }})"
配置 Alertmanager
配置 Alertmanager 配置文件
/root/monitor/alertmanager/alertmanager.yml
global:
smtp_smarthost: 'smtp.163.com:25'
smtp_from: 'xxxxxxx@163.com'
smtp_auth_username: 'xxxxxxx@163.com'
smtp_auth_password: 'XXXXXXXXXXXXXXXX'
smtp_require_tls: false
templates:
- '/etc/alertmanager/template/*.tmpl'
route:
group_by: ['alertname','cluster','service']
group_wait: 30s
group_interval: 10s
repeat_interval: 10m
receiver: 'default-receiver'
routes:
- receiver: 'alertsnitch'
continue: true
receivers:
- name: 'default-receiver'
email_configs:
- to: 'xxxxxxx@qq.com'
html: '{{ template "email.default.html" . }}'
headers: { Subject: "Prometheus 告警测试邮件" }
- name: 'alertsnitch'
webhook_configs:
- send_resolved: true
http_config:
follow_redirects: true
url: http://alertsnitch:9567/webhook
配置 Alertmanager 告警模板
/root/monitor/alertmanager/template/default.tmpl
{{ define "__alertmanager" }}Alertmanager{{ end }}
{{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver | urlquery }}{{ end }}
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
{{ define "__description" }}{{ end }}
{{ define "__text_alert_list" }}{{ range . }}Labels:
{{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }}
{{ end }}Annotations:
{{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }}
{{ end }}Source: {{ .GeneratorURL }}
{{ end }}{{ end }}
{{ define "__text_alert_list_markdown" }}{{ range . }}
Labels:
{{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }}
{{ end }}
Annotations:
{{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }}
{{ end }}
Source: {{ .GeneratorURL }}
{{ end }}
{{ end }}
{{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }}
{{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }}
{{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }}
{{ define "slack.default.callbackid" }}{{ end }}
{{ define "slack.default.pretext" }}{{ end }}
{{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }}
{{ define "slack.default.iconemoji" }}{{ end }}
{{ define "slack.default.iconurl" }}{{ end }}
{{ define "slack.default.text" }}{{ end }}
{{ define "slack.default.footer" }}{{ end }}
{{ define "pagerduty.default.description" }}{{ template "__subject" . }}{{ end }}
{{ define "pagerduty.default.client" }}{{ template "__alertmanager" . }}{{ end }}
{{ define "pagerduty.default.clientURL" }}{{ template "__alertmanagerURL" . }}{{ end }}
{{ define "pagerduty.default.instances" }}{{ template "__text_alert_list" . }}{{ end }}
{{ define "opsgenie.default.message" }}{{ template "__subject" . }}{{ end }}
{{ define "opsgenie.default.description" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }}
{{ if gt (len .Alerts.Firing) 0 -}}
Alerts Firing:
{{ template "__text_alert_list" .Alerts.Firing }}
{{- end }}
{{ if gt (len .Alerts.Resolved) 0 -}}
Alerts Resolved:
{{ template "__text_alert_list" .Alerts.Resolved }}
{{- end }}
{{- end }}
{{ define "opsgenie.default.source" }}{{ template "__alertmanagerURL" . }}{{ end }}
{{ define "wechat.default.message" }}{{ template "__subject" . }}
{{ .CommonAnnotations.SortedPairs.Values | join " " }}
{{ if gt (len .Alerts.Firing) 0 -}}
Alerts Firing:
{{ template "__text_alert_list" .Alerts.Firing }}
{{- end }}
{{ if gt (len .Alerts.Resolved) 0 -}}
Alerts Resolved:
{{ template "__text_alert_list" .Alerts.Resolved }}
{{- end }}
AlertmanagerUrl:
{{ template "__alertmanagerURL" . }}
{{- end }}
{{ define "wechat.default.to_user" }}{{ end }}
{{ define "wechat.default.to_party" }}{{ end }}
{{ define "wechat.default.to_tag" }}{{ end }}
{{ define "wechat.default.agent_id" }}{{ end }}
{{ define "victorops.default.state_message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }}
{{ if gt (len .Alerts.Firing) 0 -}}
Alerts Firing:
{{ template "__text_alert_list" .Alerts.Firing }}
{{- end }}
{{ if gt (len .Alerts.Resolved) 0 -}}
Alerts Resolved:
{{ template "__text_alert_list" .Alerts.Resolved }}
{{- end }}
{{- end }}
{{ define "victorops.default.entity_display_name" }}{{ template "__subject" . }}{{ end }}
{{ define "victorops.default.monitoring_tool" }}{{ template "__alertmanager" . }}{{ end }}
{{ define "pushover.default.title" }}{{ template "__subject" . }}{{ end }}
{{ define "pushover.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }}
{{ if gt (len .Alerts.Firing) 0 }}
Alerts Firing:
{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
Alerts Resolved:
{{ template "__text_alert_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{ define "pushover.default.url" }}{{ template "__alertmanagerURL" . }}{{ end }}
{{ define "sns.default.subject" }}{{ template "__subject" . }}{{ end }}
{{ define "sns.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }}
{{ if gt (len .Alerts.Firing) 0 }}
Alerts Firing:
{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
Alerts Resolved:
{{ template "__text_alert_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{ define "telegram.default.message" }}
{{ if gt (len .Alerts.Firing) 0 }}
Alerts Firing:
{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
Alerts Resolved:
{{ template "__text_alert_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{ define "discord.default.title" }}{{ template "__subject" . }}{{ end }}
{{ define "discord.default.message" }}
{{ if gt (len .Alerts.Firing) 0 }}
Alerts Firing:
{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
Alerts Resolved:
{{ template "__text_alert_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{ define "webex.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }}
{{ if gt (len .Alerts.Firing) 0 }}
Alerts Firing:
{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
Alerts Resolved:
{{ template "__text_alert_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{ define "msteams.default.title" }}{{ template "__subject" . }}{{ end }}
{{ define "msteams.default.text" }}
{{ if gt (len .Alerts.Firing) 0 }}
# Alerts Firing:
{{ template "__text_alert_list_markdown" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
# Alerts Resolved:
{{ template "__text_alert_list_markdown" .Alerts.Resolved }}
{{ end }}
{{ end }}
三、启动服务
编辑集群监控的 docker compose 配置文件
/root/monitor/monitor.yaml
如果需要修改 alert-db
密码,则在对应容器配置下修改即可。如果修改 mysql 用户 alertsnitch
的密码,需要在配置 grafana 的 mysql 数据源处同步修改密码
version: "3.1"
services:
grafana:
container_name: grafana
image: grafana/grafana:10.2.2
user: root
ports:
- "4000:3000"
volumes:
- /root/monitor/grafana:/etc/grafana
- grafana_data:/var/lib/grafana
restart: unless-stopped
prometheus:
container_name: prometheus
image: prom/prometheus
user: root
ports:
- "9090:9090"
volumes:
- /root/monitor/prometheus:/etc/prometheus
- prometheus_data:/prometheus
restart: unless-stopped
alertmanager:
container_name: alertmanager
image: prom/alertmanager
user: root
ports:
- 9093:9093
volumes:
- /root/monitor/alertmanager:/etc/alertmanager
- alertmanager_data:/alertmanager
command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--storage.path=/alertmanager"
restart: unless-stopped
alert-db:
container_name: alert-db
image: mysql:5.7
command: --default-authentication-plugin=mysql_native_password
environment:
MYSQL_DATABASE: alertsnitch
MYSQL_USER: "alertsnitch"
MYSQL_PASSWORD: "alertsnitch"
MYSQL_ROOT_PASSWORD: "root"
volumes:
- alert_db_data:/var/lib/mysql
restart: unless-stopped
alertsnitch:
container_name: alertsnitch
image: registry.gitlab.com/yakshaving.art/alertsnitch
ports:
- 9567:9567
environment:
ALERTSNITCH_BACKEND: "mysql"
ALERTSNITCH_DSN: "alertsnitch:alertsnitch@tcp(alert-db)/alertsnitch"
restart: unless-stopped
depends_on:
- alert-db
volumes:
prometheus_data: {}
grafana_data: {}
alertmanager_data: {}
alert_db_data: {}