# Mirror of https://github.com/genebean/dots.git
# Synced 2026-03-27 09:27:44 -04:00
# Original listing: 298 lines, 8.8 KiB, YAML
---
# Grafana alert-rule provisioning: one rule group holding five
# infrastructure rules.
#
# Every rule below is built from the same three data nodes:
#   A - query sent to the datasource whose UID is "VictoriaMetrics"
#   B - "reduce" expression keeping the last value of A
#   C - "threshold" expression comparing B against a fixed limit;
#       `condition: C` selects this node as the alert condition
#
# NOTE(review): "-100" is presumably the legacy UID of Grafana's built-in
# expression datasource - confirm against the Grafana version in use
# before changing it.
apiVersion: 1

groups:
  - orgId: 1
    name: infrastructure_alerts
    folder: Monitoring
    interval: 1m
    rules:
      # Last value of up{job="node"} below 1, pending period 2m.
      - uid: node_exporter_down
        title: Node Exporter Down
        condition: C
        data:
          - refId: A
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: VictoriaMetrics
            model:
              expr: 'up{job="node"}'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: '-100'
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ''
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 2m
        annotations:
          summary: Node exporter has been down for more than 2 minutes
        labels:
          severity: critical
        isPaused: false

      # Last value of the non-idle CPU percentage (per instance) above 80,
      # pending period 5m.
      - uid: high_cpu_usage
        title: High CPU Usage
        condition: C
        data:
          - refId: A
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: VictoriaMetrics
            model:
              expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: '-100'
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [80]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ''
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: CPU usage is above 80% for more than 5 minutes
        labels:
          severity: warning
        isPaused: false

      # Last value of the used-memory percentage
      # (1 - MemAvailable / MemTotal) above 85, pending period 5m.
      - uid: high_memory_usage
        title: High Memory Usage
        condition: C
        data:
          - refId: A
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: VictoriaMetrics
            model:
              expr: '100 * (1 - ((node_memory_MemAvailable_bytes) / (node_memory_MemTotal_bytes)))'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: '-100'
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [85]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ''
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: Memory usage is above 85% for more than 5 minutes
        labels:
          severity: warning
        isPaused: false

      # Last value of the used-space percentage on mountpoint "/" above 85,
      # pending period 5m. Filesystem types matching tmpfs|fuse.lxcfs are
      # excluded by the query.
      - uid: low_disk_space
        title: Low Disk Space
        condition: C
        data:
          - refId: A
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: VictoriaMetrics
            model:
              expr: '100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|fuse.lxcfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|fuse.lxcfs"})'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: '-100'
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [85]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ''
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: Disk usage on root filesystem is above 85%
        labels:
          severity: warning
        isPaused: false

      # Last value of up{job="cadvisor"} below 1, pending period 2m.
      - uid: cadvisor_down
        title: cAdvisor Down
        condition: C
        data:
          - refId: A
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: VictoriaMetrics
            model:
              expr: 'up{job="cadvisor"}'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: '-100'
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange: {from: 600, to: 0}
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ''
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 2m
        annotations:
          summary: cAdvisor (container metrics) has been down for more than 2 minutes
        labels:
          severity: warning
        isPaused: false