[Debian] Debian 11.2 Prometheus 和 Alertmanager實戰配置
[Debian] Debian 11.2 Prometheus 和 Alertmanager實戰配置
#Step 01 – Prometheus安裝
機器名稱 配置 系統 IP位址 角色
prometheus 8C16G Debian 11.2 192.168.88.70 prometheus server,grafana server
prometheus-alertmanager 8C16G Debian 11.2 192.168.88.80 alertmanager server
# Create a dedicated system group and a no-login system user that will run Prometheus.
sudo groupadd --system prometheus
sudo useradd -s /sbin/nologin --system -g prometheus prometheus
# Data directory plus the configuration sub-directories.
# -p keeps the commands from failing when the directories already exist (re-runs).
sudo mkdir -p /var/lib/prometheus
for i in rules rules.d files_sd; do sudo mkdir -p /etc/prometheus/${i}; done
# Tools needed to fetch the release archive.
sudo apt-get update
sudo apt-get -y install wget curl
# Work in a scratch directory and download the linux-amd64 asset of the latest
# release, as listed by the GitHub releases API.
mkdir -p /tmp/prometheus && cd /tmp/prometheus
curl -s https://api.github.com/repos/prometheus/prometheus/releases/latest|grep browser_download_url|grep linux-amd64|cut -d '"' -f 4|wget -qi -
tar xvf prometheus*.tar.gz
cd prometheus*/
# Install the binaries and the sample configuration.
sudo mv prometheus promtool /usr/local/bin/
sudo mv prometheus.yml /etc/prometheus/prometheus.yml
# Console templates: moved only when the archive contains them.
# NOTE(review): newer release archives appear to no longer ship these
# directories - confirm against the release you downloaded.
for d in consoles console_libraries; do
  if [ -d "${d}" ]; then sudo mv "${d}" /etc/prometheus/; fi
done
# Leave and remove the scratch directory.
cd ~/
rm -rf /tmp/prometheus
cat /etc/prometheus/prometheus.yml
# Sample configuration installed to /etc/prometheus/prometheus.yml.
# Indentation is significant: each nested key must be indented under its parent.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# No Alertmanager target is configured yet; add "host:port" entries under targets.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093

# No rule files are loaded yet; list them here once they exist.
rule_files:
  # - "first_rules.yml"

# Prometheus scrapes its own metrics endpoint.
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
```
# Write the systemd unit for Prometheus.
# The heredoc operator (<<) is required, otherwise tee treats "EOF" as a second
# file name and waits on stdin. The delimiter is quoted ('EOF') so the shell
# does not expand $MAINPID or consume the line-continuation backslashes;
# systemd must receive them literally.
sudo tee /etc/systemd/system/prometheus.service <<'EOF'
[Unit]
Description=Prometheus
Documentation=https://prometheus.io/docs/introduction/overview/
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=prometheus
Group=prometheus
ExecReload=/bin/kill -HUP $MAINPID
ExecStart=/usr/local/bin/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/var/lib/prometheus \
  --web.console.templates=/etc/prometheus/consoles \
  --web.console.libraries=/etc/prometheus/console_libraries \
  --web.listen-address=0.0.0.0:9090 \
  --web.external-url=
SyslogIdentifier=prometheus
Restart=always

[Install]
WantedBy=multi-user.target
EOF
```
# Give the prometheus user ownership of (and group-writable access to) the
# rule and file-based service-discovery directories.
for dir in rules rules.d files_sd; do
  sudo chown -R prometheus:prometheus /etc/prometheus/${dir}
  sudo chmod -R 775 /etc/prometheus/${dir}
done
# The data directory must be writable by the service user as well.
sudo chown -R prometheus:prometheus /var/lib/prometheus/
# Pick up the new unit file, start the service now, enable it at boot,
# then show its state.
sudo systemctl daemon-reload
sudo systemctl start prometheus
sudo systemctl enable prometheus
sudo systemctl status prometheus
#Step 02 – 安裝 node_exporter
# Download the linux-amd64 asset of the latest node_exporter release into the
# current directory, unpack it and install the binary.
curl -s https://api.github.com/repos/prometheus/node_exporter/releases/latest| grep browser_download_url|grep linux-amd64|cut -d '"' -f 4|wget -qi -
tar -xvf node_exporter*.tar.gz
cd node_exporter*/
sudo cp node_exporter /usr/local/bin
# Verify the installed binary is found on PATH (no leading "$" prompt, so the
# line can be pasted as-is).
node_exporter --version
# Write the systemd unit for node_exporter. The heredoc operator (<<) is
# required so tee reads the unit text from stdin; the quoted delimiter keeps
# the text literal.
sudo tee /etc/systemd/system/node_exporter.service <<'EOF'
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target

[Service]
User=prometheus
ExecStart=/usr/local/bin/node_exporter

[Install]
WantedBy=default.target
EOF
# Load the new unit, start it, enable it at boot, and check its state.
sudo systemctl daemon-reload
sudo systemctl start node_exporter
sudo systemctl enable node_exporter
systemctl status node_exporter.service
sudo vim /etc/prometheus/prometheus.yml
# scrape_configs section of /etc/prometheus/prometheus.yml after adding the
# node_exporter job (node_exporter is scraped on port 9100).
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: 'node_exporter'
    static_configs:
      - targets: ['localhost:9100']
sudo systemctl restart prometheus
#Step 03 – prometheus配置檔案的設定
prometheus的配置檔案採用的是yaml檔案,yaml檔案書寫的要求如下:
大小寫敏感
使用縮排表示層級關係
縮排時不允許使用Tab鍵,只允許使用空格。
縮排的空格數目不重要,只要相同層級的元素左側對齊即可
# Example prometheus.yml with global settings, Alertmanager targets, rule
# files and scrape jobs. Nested keys must be indented under their parent.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  # Must not exceed scrape_interval.
  scrape_timeout: 15s
  external_labels:
    monitor: 'codelab_monitor'

# Alertmanager instances that receive fired alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'localhost:9093'
            - '10.13.0.80:9093'

# Rule files; relative paths are resolved against this file's directory.
rule_files:
  - "alertmanager_rules.yml"
  - "prometheus_rules.yml"

scrape_configs:
  - job_name: 'prometheus'
    # Per-job override of the global scrape interval.
    scrape_interval: 15s
    static_configs:
      # NOTE(review): 'localdns' must resolve on this host; other examples use
      # 'localhost' - confirm which is intended.
      - targets: ['localdns:9090']
  - job_name: 'example-random'
    static_configs:
      - targets: ['localhost:8080']
或是
# Alternative alerting section: one static_configs entry per Alertmanager,
# each with its own "instance" label.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["localhost:9093"]
          labels:
            instance: After_server_Local
        - targets: ["192.168.88.90:9093"]
          labels:
            instance: After_server_1
#Step 04 – Alertmanager安裝
# Download the linux-amd64 asset of the latest Alertmanager release into the
# current directory, unpack it and install the binaries.
curl -s https://api.github.com/repos/prometheus/alertmanager/releases/latest| grep browser_download_url|grep linux-amd64|cut -d '"' -f 4|wget -qi -
tar -xvf alertmanager*.tar.gz
cd alertmanager*/
sudo mv amtool alertmanager /usr/local/bin
# Configuration and data directories (no leading "$" prompts, so the lines can
# be pasted as-is).
sudo mkdir -p /etc/alertmanager
sudo mv alertmanager.yml /etc/alertmanager
sudo mkdir -p /data/alertmanager
# Dedicated system user without a login shell, owning the binaries, the data
# directory and the configuration files.
sudo useradd -rs /bin/false alertmanager
sudo chown alertmanager:alertmanager /usr/local/bin/amtool /usr/local/bin/alertmanager
sudo chown -R alertmanager:alertmanager /data/alertmanager /etc/alertmanager/*
# Write the systemd unit for Alertmanager. The heredoc operator (<<) is
# required so tee reads the unit text from stdin; the quoted delimiter keeps
# the line-continuation backslashes literal for systemd.
sudo tee /etc/systemd/system/alertmanager.service <<'EOF'
[Unit]
Description=Alert Manager
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=alertmanager
Group=alertmanager
ExecStart=/usr/local/bin/alertmanager \
  --config.file=/etc/alertmanager/alertmanager.yml \
  --storage.path=/data/alertmanager
Restart=always

[Install]
WantedBy=multi-user.target
EOF
# Load the new unit, enable it at boot, start it, and check its state.
sudo systemctl daemon-reload
sudo systemctl enable alertmanager
sudo systemctl start alertmanager
sudo systemctl status alertmanager.service
# Scrape job for Alertmanager's own metrics: add this entry to the
# scrape_configs list in /etc/prometheus/prometheus.yml, then restart Prometheus.
- job_name: 'alertmanager'
  static_configs:
    - targets: ['192.168.20.161:9093']
vi /etc/prometheus/prometheus_rules.yml
# Minimal rule file: fire InstanceDown when a target has been unreachable
# (up == 0) for one minute.
groups:
  - name: test
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 1m
# Fuller variant of the rule file (use in place of the minimal one - a file may
# contain only one top-level "groups" key). Adds a routing label and
# templated annotations.
groups:
  - name: test-rules
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          team: node
        annotations:
          summary: "{{$labels.instance}}: has been down"
          description: "{{$labels.instance}}: job {{$labels.job}} has been down "
          # Must be quoted: an unquoted value starting with "{{" is parsed by
          # YAML as a flow mapping and the rule file fails to load.
          value: "{{$value}}"
# Email receiver entry for the "receivers" list in
# /etc/alertmanager/alertmanager.yml. Fill in the full addresses before use.
- name: 'gmail'
  email_configs:
    - to: '@gmail.com'
      from: '@gmail.com'
      smarthost: 'smtp.gmail.com:587'
      auth_username: '@gmail.com'
      auth_identity: '@gmail.com'
      # Authenticating to the SMTP server needs a secret as well as a user name.
      # NOTE(review): for Gmail this is normally an app password - confirm for
      # your account.
      auth_password: 'your_app_password'
# Apply the new configuration, then follow the service log for this boot to
# check for configuration errors. sudo is used as in the other service commands.
sudo systemctl restart alertmanager.service
sudo journalctl --follow --no-pager --boot --unit alertmanager.service
# Complete Alertmanager configuration example: a routing tree keyed on the
# "job" label with per-severity sub-routes, a Slack receiver and an email
# receiver. Nested keys must be indented under their parent.
global:
  slack_api_url: your_webhook_url

route:
  # Defaults inherited by every child route unless overridden.
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h
  # Fallback receiver for alerts that match no child route.
  receiver: "slack"
  routes:
    - match:
        job: "idc_mail"
      group_by: ['host']
      routes:
        - match:
            severity: "critical"
          receiver: "slack"
        - match:
            severity: "warning"
          receiver: "email"
    - match:
        job: "proxmox"
      group_by: ['instance']
      group_wait: 10s
      routes:
        - match:
            severity: "critical"
          receiver: "slack"
        - match:
            severity: "warning"
          receiver: "email"
    - match:
        job: "node_gce"
      group_by: ['zone']
      group_wait: 10s
      routes:
        - match:
            severity: "critical"
          receiver: "slack"
        - match:
            severity: "warning"
          receiver: "email"
    - match:
        job: "Domain"
      group_by: ['domain']
      receiver: "email"

receivers:
  - name: slack
    slack_configs:
      - api_url: 'your_webhook_url'
        username: "TigerFly Project's Alert"
        channel: 'cts_alert'
        icon_url: https://avatars3.githubusercontent.com/u/3380462
        send_resolved: true
        # NOTE(review): line breaks and indentation inside the two block
        # scalars below were reconstructed; send a test alert and confirm the
        # rendered Slack message looks as intended.
        title: |-
          [TigerFly Project][{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
          {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
            {{" "}}(
            {{- with .CommonLabels.Remove .GroupLabels.Names }}
              {{- range $index, $label := .SortedPairs -}}
                {{ if $index }}, {{ end }}
                {{- $label.Name }}="{{ $label.Value -}}"
              {{- end }}
            {{- end -}}
            )
          {{- end }}
        text: >-
          {{ with index .Alerts 0 -}}
            :chart_with_upwards_trend: *<{{ .GeneratorURL }}|Graph>*
            {{- if .Annotations.runbook }} :notebook: *<{{ .Annotations.runbook }}|Runbook>*{{ end }}
          {{ end }}

          *Alert details*:

          {{ range .Alerts -}}
            *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}

          *Summary:* {{ .Annotations.summary }}

          *Description:* {{ .Annotations.description }}

          *Details:*
            {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
            {{ end }}
          {{ end }}
  - name: 'email'
    email_configs:
      - to: cts@awoo.com.tw
        from: 'secit@awoo.com.tw'
        smarthost: smtp.gmail.com:587
        auth_username: 'useremail@aaa.com'
        # Placeholder - replace with the real secret and keep this file out of
        # version control.
        auth_password: 'your_accout_password'
        headers:
          From: "TigerFly Prometheus"
          Subject: "TigerFly Monitor Alert"
#Step 05 –
#Step 06 –
#Step 07 –
#Step 08 –
#Step 09 –
#Step 10 –
沒有留言:
張貼留言