[Up]常用資訊

[重點文章] 重點文章 [重點文章] 重點文章

2022年3月3日 星期四

[Debian] Debian 11.2 Prometheus 和 Alertmanager實戰配置

[Debian] Debian 11.2 Prometheus 和 Alertmanager實戰配置

[Debian] Debian 11.2 Prometheus 和 Alertmanager實戰配置

 
#Step 01 – Prometheus安裝


机器名称                配置     系统	      ip地址	    角色
prometheus              8C16G	ubuntu16.04	  192.168.88.70	prometheus server,grafana server
prometheus-alertmanager 8C16G	ubuntu16.04	  192.168.88.80	alertmanager server


# 建立 user
sudo groupadd --system prometheus
sudo useradd -s /sbin/nologin --system -g prometheus prometheus

# Create configuration and data directories
sudo mkdir /var/lib/prometheus
for i in rules rules.d files_sd; do sudo mkdir -p /etc/prometheus/${i}; done

#

sudo apt-get update
sudo apt-get -y install wget curl
mkdir -p /tmp/prometheus && cd /tmp/prometheus
curl -s https://api.github.com/repos/prometheus/prometheus/releases/latest|grep browser_download_url|grep linux-amd64|cut -d '"' -f 4|wget -qi -
#
tar xvf prometheus*.tar.gz
cd prometheus*/
#
sudo mv prometheus promtool /usr/local/bin/

sudo mv prometheus.yml  /etc/prometheus/prometheus.yml

sudo mv consoles/ console_libraries/ /etc/prometheus/
cd ~/
rm -rf /tmp/prometheus

# 設定 設定檔

cat /etc/prometheus/prometheus.yml

# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['localhost:9090']

### 建立啟動服務 Create a Prometheus systemd Service unit file
```
sudo tee /etc/systemd/system/prometheus.service <<'EOF'
[Unit]
Description=Prometheus
Documentation=https://prometheus.io/docs/introduction/overview/
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=prometheus
Group=prometheus
ExecReload=/bin/kill -HUP $MAINPID
ExecStart=/usr/local/bin/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/var/lib/prometheus \
  --web.console.templates=/etc/prometheus/consoles \
  --web.console.libraries=/etc/prometheus/console_libraries \
  --web.listen-address=0.0.0.0:9090 \
  --web.external-url=

SyslogIdentifier=prometheus
Restart=always

[Install]
WantedBy=multi-user.target
EOF
```
# 建立目錄權限 Change directory permissions.

for i in rules rules.d files_sd; do sudo chown -R prometheus:prometheus /etc/prometheus/${i}; done
for i in rules rules.d files_sd; do sudo chmod -R 775 /etc/prometheus/${i}; done
sudo chown -R prometheus:prometheus /var/lib/prometheus/

# 重啟服務 Reload systemd daemon and start the service.

sudo systemctl daemon-reload
sudo systemctl start prometheus
sudo systemctl enable prometheus
sudo systemctl status prometheus

 
#Step 02 – 安裝 node_exporter


# 下載
curl -s https://api.github.com/repos/prometheus/node_exporter/releases/latest| grep browser_download_url|grep linux-amd64|cut -d '"' -f 4|wget -qi -

# 解壓縮並複製到目錄
tar -xvf node_exporter*.tar.gz
cd  node_exporter*/
sudo cp node_exporter /usr/local/bin

# 確認設定
$ node_exporter --version

# 建立 systemd 服務單元檔
sudo tee /etc/systemd/system/node_exporter.service <<'EOF'
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target

[Service]
User=prometheus
ExecStart=/usr/local/bin/node_exporter

[Install]
WantedBy=default.target
EOF

# 重啟服務

sudo systemctl daemon-reload
sudo systemctl start node_exporter
sudo systemctl enable node_exporter

# 確認服務執行狀態
systemctl status node_exporter.service

# 將node_exporter 加入到 prometheus

sudo vim /etc/prometheus/prometheus.yml

# 在 scrape_configs 區段底下新增 node_exporter 的 job

scrape_configs:
  # The job name is added as a label `job=` to any timeseries scraped from this config.
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: 'node_exporter'
    static_configs:
      - targets: ['localhost:9100']
      
# 重啟 Prometheus 服務

sudo systemctl restart prometheus


 
#Step 03 – prometheus配置檔案的設定


prometheus的配置檔案採用的是yaml檔案,yaml檔案書寫的要求如下:

大小寫敏感
使用縮排表示層級關係
縮排時不允許使用Tab鍵,只允許使用空格。
縮排的空格數目不重要,只要相同層級的元素左側對齊即可

## Prometheus全域性配置項
global:
  scrape_interval:     15s # 設定抓取資料的週期,預設為1min
  evaluation_interval: 15s # 設定評估(evaluate)rules的週期,預設為1min
  scrape_timeout: 15s # 設定抓取資料的超時時間,預設為10s
  external_labels: # 額外的屬性,會新增到拉取得資料並存到資料庫中
   monitor: 'codelab_monitor'


# Alertmanager配置
alerting:
 alertmanagers:
 - static_configs:
   - targets:
     - localhost:9093 # 設定alertmanager和prometheus互動的介面,即alertmanager監聽的ip地址和埠
     - 10.13.0.80:9093                                                    # alertmanager主机地址
     
# rule配置,啟動(或重新載入設定)時載入rule檔案,之後根據evaluation_interval設定的週期評估規則
rule_files:
 - "alertmanager_rules.yml"
 - "prometheus_rules.yml"

# scrape配置
scrape_configs:
- job_name: 'prometheus' # job_name預設寫入timeseries的labels中,可以用於查詢使用
  scrape_interval: 15s # 抓取週期,預設採用global配置
  static_configs: # 靜態配置
  - targets: ['localhost:9090'] # prometheus所要抓取資料的地址,即instance例項項

- job_name: 'example-random'
  static_configs:
  - targets: ['localhost:8080']
  
  
#####
或是

alerting:
  alertmanagers:
  - static_configs:
    - targets: ["localhost:9093"]
      labels:
        instance: After_server_Local
    - targets: ["192.168.88.90:9093"]
      labels:
        instance: After_server_1

 
#Step 04 – Alertmanager安裝


#
# 下載
curl -s https://api.github.com/repos/prometheus/alertmanager/releases/latest| grep browser_download_url|grep linux-amd64|cut -d '"' -f 4|wget -qi -

# 解壓縮並複製到目錄
tar -xvf alertmanager*.tar.gz
cd  alertmanager*/
#sudo cp alertmanager /usr/local/bin
sudo mv amtool alertmanager /usr/local/bin

$ sudo mkdir -p /etc/alertmanager
$ sudo mv alertmanager.yml /etc/alertmanager

$ sudo mkdir -p /data/alertmanager
$ sudo useradd -rs /bin/false alertmanager
$ sudo chown alertmanager:alertmanager /usr/local/bin/amtool /usr/local/bin/alertmanager
$ sudo chown -R alertmanager:alertmanager /data/alertmanager /etc/alertmanager/*

# 建立 systemd 服務單元檔
sudo tee /etc/systemd/system/alertmanager.service <<'EOF'
[Unit]
Description=Alert Manager
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=alertmanager
Group=alertmanager
ExecStart=/usr/local/bin/alertmanager \
  --config.file=/etc/alertmanager/alertmanager.yml \
  --storage.path=/data/alertmanager

Restart=always

[Install]
WantedBy=multi-user.target
EOF

#
# 重啟服務
sudo systemctl daemon-reload
sudo systemctl enable alertmanager
sudo systemctl start alertmanager
#
sudo systemctl status alertmanager.service

# 監控 alertmanager 服務

  - job_name: 'alertmanager'
    static_configs:
    - targets: ['192.168.20.161:9093']

## 設定規則

vi /etc/prometheus/prometheus_rules.yml

groups:
 - name: test
   rules:
   - alert: InstanceDown
     expr: up == 0
     for: 1m

# 或 alertmanager_rules.yml 配置檔案(與prometheus同目錄下)
groups:
 - name: test-rules
   rules:
   - alert: InstanceDown # 告警名稱
     expr: up == 0 # 告警的判定條件,參考Prometheus高階查詢來設定
     for: 2m # 滿足告警條件持續時間多久後,才會傳送告警
     labels: #標籤項
      team: node
     annotations: # 解析項,詳細解釋告警資訊
      summary: "{{$labels.instance}}: has been down"
      description: "{{$labels.instance}}: job {{$labels.job}} has been down "
      value: "{{$value}}"




## 設定gmail寄信
- name: 'gmail'
  email_configs:
  - to: '@gmail.com'
    from: '@gmail.com'
    smarthost: smtp.gmail.com:587
    auth_username: '@gmail.com'
    auth_identity: '@gmail.com'
    auth_password: 'your_app_password' # Gmail 需使用應用程式密碼(App Password)才能通過 SMTP 驗證
    
    
    
systemctl restart alertmanager.service
# 問題排除 Troubleshooting Alert Manager E-Mail Delivery Issues:
sudo journalctl --follow --no-pager --boot --unit alertmanager.service


#### alertmanager 範例
global:
  slack_api_url: your_webhook_url
route:
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h
  receiver: "slack"
  routes:
  - match:
      job: "idc_mail"
    group_by: ['host']
    routes:
    - match:
        severity: "critical"
      receiver: "slack"
    - match:
        severity: "warning"
      receiver: "email"
  - match:
      job: "proxmox"
    group_by: ['instance']
    group_wait: 10s
    routes:
    - match:
        severity: "critical"
      receiver: "slack"
    - match:
        severity: "warning"
      receiver: "email"
  - match:
      job: "node_gce"
    group_by: [ 'zone']
    group_wait: 10s
    routes:
    - match:
        severity: "critical"
      receiver: "slack"
    - match:
        severity: "warning"
      receiver: "email"
  - match:
      job: "Domain"
    group_by: ['domain']
    receiver: "email"

receivers:
- name: slack
  slack_configs:
  - api_url: 'your_webhook_url'
    username: "TigerFly Project's Alert"
    channel: 'cts_alert'
    icon_url: https://avatars3.githubusercontent.com/u/3380462
    send_resolved: true
    title: |-
      [TigerFly Project][{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
        {{" "}}(
        {{- with .CommonLabels.Remove .GroupLabels.Names }}
          {{- range $index, $label := .SortedPairs -}}
            {{ if $index }}, {{ end }}
            {{- $label.Name }}="{{ $label.Value -}}"
          {{- end }}
        {{- end -}}
        )
      {{- end }}
    text: >-
      {{ with index .Alerts 0 -}}
        :chart_with_upwards_trend: *<{{ .GeneratorURL }}|Graph>*
        {{- if .Annotations.runbook }}   :notebook: *<{{ .Annotations.runbook }}|Runbook>*{{ end }}
      {{ end }}

      *Alert details*:

      {{ range .Alerts -}}
        *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
      *Summary:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      *Details:*
        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
        {{ end }}
      {{ end }}

- name: 'email'
  email_configs:
  - to: cts@awoo.com.tw
    from: 'secit@awoo.com.tw'
    smarthost: smtp.gmail.com:587
    auth_username: 'useremail@aaa.com'
    auth_password: 'your_account_password'
    headers:
      From: "TigerFly Prometheus"
      Subject: "TigerFly Monitor Alert"


 
#Step 05 –


 
#Step 06 –


 
#Step 07 –


 
#Step 08 –


 
#Step 09 –


 
#Step 10 –


沒有留言:

張貼留言