[Debian] Debian 11.2 Prometheus 和 Alertmanager實戰配置
[Debian] Debian 11.2 Prometheus 和 Alertmanager實戰配置
#Step 01 – Prometheus安裝
机器名称 配置 系统 ip地址 角色
prometheus 8C16G ubuntu16.04 192.168.88.70 prometheus server,grafana server
prometheus-alertmanager 8C16G ubuntu16.04 192.168.88.80 alertmanager server
# 建立 user
sudo groupadd --system prometheus
sudo useradd -s /sbin/nologin --system -g prometheus prometheus
# Create configuration and data directories
sudo mkdir /var/lib/prometheus
for i in rules rules.d files_sd; do sudo mkdir -p /etc/prometheus/${i}; done
#
sudo apt-get update
sudo apt-get -y install wget curl
mkdir -p /tmp/prometheus && cd /tmp/prometheus
curl -s https://api.github.com/repos/prometheus/prometheus/releases/latest|grep browser_download_url|grep linux-amd64|cut -d '"' -f 4|wget -qi -
#
tar xvf prometheus*.tar.gz
cd prometheus*/
#
sudo mv prometheus promtool /usr/local/bin/
sudo mv prometheus.yml /etc/prometheus/prometheus.yml
sudo mv consoles/ console_libraries/ /etc/prometheus/
cd ~/
rm -rf /tmp/prometheus
# 檢視預設的設定檔內容
cat /etc/prometheus/prometheus.yml
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=` to any timeseries scraped from this config.
- job_name: 'prometheus'
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['localhost:9090']
### 建立啟動服務 Create a Prometheus systemd Service unit file
```
sudo tee /etc/systemd/system/prometheus.service <<'EOF'
[Unit]
Description=Prometheus
Documentation=https://prometheus.io/docs/introduction/overview/
Wants=network-online.target
After=network-online.target
[Service]
Type=simple
User=prometheus
Group=prometheus
ExecReload=/bin/kill -HUP $MAINPID
ExecStart=/usr/local/bin/prometheus \
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/var/lib/prometheus \
--web.console.templates=/etc/prometheus/consoles \
--web.console.libraries=/etc/prometheus/console_libraries \
--web.listen-address=0.0.0.0:9090 \
--web.external-url=
SyslogIdentifier=prometheus
Restart=always
[Install]
WantedBy=multi-user.target
EOF
```
# 建立目錄權限 Change directory permissions.
for i in rules rules.d files_sd; do sudo chown -R prometheus:prometheus /etc/prometheus/${i}; done
for i in rules rules.d files_sd; do sudo chmod -R 775 /etc/prometheus/${i}; done
sudo chown -R prometheus:prometheus /var/lib/prometheus/
# 重啟服務 Reload systemd daemon and start the service.
sudo systemctl daemon-reload
sudo systemctl start prometheus
sudo systemctl enable prometheus
sudo systemctl status prometheus
#Step 02 – 安裝 node_exporter
# 下載
curl -s https://api.github.com/repos/prometheus/node_exporter/releases/latest| grep browser_download_url|grep linux-amd64|cut -d '"' -f 4|wget -qi -
# 解壓縮並複製到目錄
tar -xvf node_exporter*.tar.gz
cd node_exporter*/
sudo cp node_exporter /usr/local/bin
# 確認設定
$ node_exporter --version
# 建立 systemd 服務單元檔
sudo tee /etc/systemd/system/node_exporter.service <<'EOF'
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
ExecStart=/usr/local/bin/node_exporter
[Install]
WantedBy=default.target
EOF
# 重啟服務
sudo systemctl daemon-reload
sudo systemctl start node_exporter
sudo systemctl enable node_exporter
# 確認服務執行狀態
systemctl status node_exporter.service
# 將node_exporter 加入到 prometheus
sudo vim /etc/prometheus/prometheus.yml
# 將 node_exporter 的 job 新增到 scrape_configs 區段底下
scrape_configs:
# The job name is added as a label `job=` to any timeseries scraped from this config.
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: 'node_exporter'
static_configs:
- targets: ['localhost:9100']
# 重啟 Prometheus 服務
sudo systemctl restart prometheus
#Step 03 – prometheus配置檔案的設定
prometheus的配置檔案採用的是yaml檔案,yaml檔案書寫的要求如下:
大小寫敏感
使用縮排表示層級關係
縮排時不允許使用Tab鍵,只允許使用空格。
縮排的空格數目不重要,只要相同層級的元素左側對齊即可
## Prometheus全域性配置項
global:
scrape_interval: 15s # 設定抓取資料的週期,預設為1min
evaluation_interval: 15s # 設定評估(evaluate)rules的週期,預設為1min
scrape_timeout: 15s # 設定抓取資料的超時時間,預設為10s
external_labels: # 額外的標籤,與外部系統(federation、remote storage、Alertmanager)通訊時會附加到時間序列與告警上
monitor: 'codelab_monitor'
# Alertmanager配置
alerting:
alertmanagers:
- static_configs:
- targets:
- localhost:9093 # 設定alertmanager和prometheus互動的介面,即alertmanager監聽的ip地址和埠
- 10.13.0.80:9093 # alertmanager主机地址
# rule配置,rule檔案在啟動或重新載入設定時讀取,之後依evaluation_interval設定的週期評估規則
rule_files:
- "alertmanager_rules.yml"
- "prometheus_rules.yml"
# scrape配置
scrape_configs:
- job_name: 'prometheus' # job_name預設寫入timeseries的labels中,可以用於查詢使用
scrape_interval: 15s # 抓取週期,預設採用global配置
static_configs: # 靜態配置
- targets: ['localdns:9090'] # prometheus所要抓取資料的地址,即instance例項項
- job_name: 'example-random'
static_configs:
- targets: ['localhost:8080']
#####
或是
alerting:
alertmanagers:
- static_configs:
- targets: ["localhost:9093"]
labels:
instance: After_server_Local
- targets: ["192.168.88.90:9093"]
labels:
instance: After_server_1
#Step 04 – Alertmanager安裝
#
# 下載
curl -s https://api.github.com/repos/prometheus/alertmanager/releases/latest| grep browser_download_url|grep linux-amd64|cut -d '"' -f 4|wget -qi -
# 解壓縮並複製到目錄
tar -xvf alertmanager*.tar.gz
cd alertmanager*/
#sudo cp alertmanager /usr/local/bin
sudo mv amtool alertmanager /usr/local/bin
$ sudo mkdir -p /etc/alertmanager
$ sudo mv alertmanager.yml /etc/alertmanager
$ sudo mkdir -p /data/alertmanager
$ sudo useradd -rs /bin/false alertmanager
$ sudo chown alertmanager:alertmanager /usr/local/bin/amtool /usr/local/bin/alertmanager
$ sudo chown -R alertmanager:alertmanager /data/alertmanager /etc/alertmanager/*
# 建立 systemd 服務單元檔
sudo tee /etc/systemd/system/alertmanager.service <<'EOF'
[Unit]
Description=Alert Manager
Wants=network-online.target
After=network-online.target
[Service]
Type=simple
User=alertmanager
Group=alertmanager
ExecStart=/usr/local/bin/alertmanager \
--config.file=/etc/alertmanager/alertmanager.yml \
--storage.path=/data/alertmanager
Restart=always
[Install]
WantedBy=multi-user.target
EOF
#
# 重啟服務
sudo systemctl daemon-reload
sudo systemctl enable alertmanager
sudo systemctl start alertmanager
#
sudo systemctl status alertmanager.service
# 監控 alertmanager 服務(將下列 job 加入 /etc/prometheus/prometheus.yml 的 scrape_configs 底下)
- job_name: 'alertmanager'
static_configs:
- targets: ['192.168.20.161:9093']
## 設定規則
vi /etc/prometheus/prometheus_rules.yml
groups:
- name: test
rules:
- alert: InstanceDown
expr: up == 0
for: 1m
# 或 alertmanager_rules.yml 配置檔案(與 prometheus.yml 放在同一目錄,即 /etc/prometheus/)
groups:
- name: test-rules
rules:
- alert: InstanceDown # 告警名稱
expr: up == 0 # 告警的判定條件,參考Prometheus高階查詢來設定
for: 2m # 滿足告警條件持續時間多久後,才會傳送告警
labels: #標籤項
team: node
annotations: # 解析項,詳細解釋告警資訊
summary: "{{$labels.instance}}: has been down"
description: "{{$labels.instance}}: job {{$labels.job}} has been down "
value: "{{ $value }}"
## 設定gmail寄信(將下列 receiver 加到 /etc/alertmanager/alertmanager.yml 的 receivers: 底下)
- name: 'gmail'
email_configs:
- to: '@gmail.com'
from: '@gmail.com'
smarthost: smtp.gmail.com:587
auth_username: '@gmail.com'
auth_identity: '@gmail.com'
auth_password: 'your_app_password' # Gmail SMTP 需要驗證,請填入 Google 帳號的應用程式密碼
sudo systemctl restart alertmanager.service
# 問題排除 Troubleshooting Alert Manager E-Mail Delivery Issues:
sudo journalctl --follow --no-pager --boot --unit alertmanager.service
#### alertmanager 範例
global:
slack_api_url: your_webhook_url
route:
group_by: ['job']
group_wait: 30s
group_interval: 5m
repeat_interval: 3h
receiver: "slack"
routes:
- match:
job: "idc_mail"
group_by: ['host']
routes:
- match:
severity: "critical"
receiver: "slack"
- match:
severity: "warning"
receiver: "email"
- match:
job: "proxmox"
group_by: ['instance']
group_wait: 10s
routes:
- match:
severity: "critical"
receiver: "slack"
- match:
severity: "warning"
receiver: "email"
- match:
job: "node_gce"
group_by: [ 'zone']
group_wait: 10s
routes:
- match:
severity: "critical"
receiver: "slack"
- match:
severity: "warning"
receiver: "email"
- match:
job: "Domain"
group_by: ['domain']
receiver: "email"
receivers:
- name: slack
slack_configs:
- api_url: 'your_webhook_url'
username: "TigerFly Project's Alert"
channel: 'cts_alert'
icon_url: https://avatars3.githubusercontent.com/u/3380462
send_resolved: true
title: |-
[TigerFly Project][{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
{{- if gt (len .CommonLabels) (len .GroupLabels) -}}
{{" "}}(
{{- with .CommonLabels.Remove .GroupLabels.Names }}
{{- range $index, $label := .SortedPairs -}}
{{ if $index }}, {{ end }}
{{- $label.Name }}="{{ $label.Value -}}"
{{- end }}
{{- end -}}
)
{{- end }}
text: >-
{{ with index .Alerts 0 -}}
:chart_with_upwards_trend: *<{{ .GeneratorURL }}|Graph>*
{{- if .Annotations.runbook }} :notebook: *<{{ .Annotations.runbook }}|Runbook>*{{ end }}
{{ end }}
*Alert details*:
{{ range .Alerts -}}
*Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
*Summary:* {{ .Annotations.summary }}
*Description:* {{ .Annotations.description }}
*Details:*
{{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
{{ end }}
{{ end }}
- name: 'email'
email_configs:
- to: cts@awoo.com.tw
from: 'secit@awoo.com.tw'
smarthost: smtp.gmail.com:587
auth_username: 'useremail@aaa.com'
auth_password: 'your_account_password'
headers:
From: "TigerFly Prometheus"
Subject: "TigerFly Monitor Alert"
#Step 05 –
#Step 06 –
#Step 07 –
#Step 08 –
#Step 09 –
#Step 10 –
沒有留言:
張貼留言