# File: dots/modules/hosts/nixos/nixnuc/grafana-files/alert-rules.yaml
# (Listing metadata at capture time: 298 lines, 8.8 KiB, YAML.)

# Grafana alert-rule provisioning file.
#
# Defines one rule group, `infrastructure_alerts` (folder `Monitoring`,
# evaluated every 1m), holding five rules that query the `VictoriaMetrics`
# datasource.
#
# Every rule uses the same three-step pipeline:
#   A - the metric query (`expr`), relativeTimeRange from 600 to 0
#       (presumably seconds before "now", i.e. the last 10 minutes - confirm)
#   B - a `reduce` expression taking the `last` value of A
#   C - a `threshold` expression whose input is B; `condition: C` selects it
#       as the alert condition
#
# NOTE(review): datasourceUid "-100" on steps B and C is presumably Grafana's
# built-in expression engine (legacy UID) - confirm for the deployed version.
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation, following Grafana's file-provisioning layout - confirm against
# the deployed file.
apiVersion: 1
groups:
  - orgId: 1
    name: infrastructure_alerts
    folder: Monitoring
    interval: 1m
    rules:
      # Node exporter scrape health: condition is last(up{job="node"}) < 1,
      # which must hold for 2m.
      - uid: node_exporter_down
        title: Node Exporter Down
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: VictoriaMetrics
            model:
              expr: 'up{job="node"}'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: "-100"
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 1
                    type: lt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ""
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 2m
        annotations:
          summary: Node exporter has been down for more than 2 minutes
        labels:
          severity: critical
        isPaused: false

      # CPU: 100 minus the per-instance average idle rate (5m window) times
      # 100; condition is > 80, which must hold for 5m.
      - uid: high_cpu_usage
        title: High CPU Usage
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: VictoriaMetrics
            model:
              expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: "-100"
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 80
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ""
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: CPU usage is above 80% for more than 5 minutes
        labels:
          severity: warning
        isPaused: false

      # Memory: 100 * (1 - MemAvailable / MemTotal); condition is > 85, which
      # must hold for 5m.
      - uid: high_memory_usage
        title: High Memory Usage
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: VictoriaMetrics
            model:
              expr: '100 * (1 - ((node_memory_MemAvailable_bytes) / (node_memory_MemTotal_bytes)))'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: "-100"
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 85
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ""
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: Memory usage is above 85% for more than 5 minutes
        labels:
          severity: warning
        isPaused: false

      # Disk: used percentage of mountpoint "/" (fstype tmpfs and fuse.lxcfs
      # excluded); condition is > 85, which must hold for 5m.
      - uid: low_disk_space
        title: Low Disk Space
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: VictoriaMetrics
            model:
              # Folded scalar: the two lines join with a single space, giving
              # the same one-line PromQL string as before.
              expr: >-
                100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|fuse.lxcfs"} * 100)
                / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|fuse.lxcfs"})
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: "-100"
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 85
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ""
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: Disk usage on root filesystem is above 85%
        labels:
          severity: warning
        isPaused: false

      # cAdvisor scrape health: condition is last(up{job="cadvisor"}) < 1,
      # which must hold for 2m.
      - uid: cadvisor_down
        title: cAdvisor Down
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: VictoriaMetrics
            model:
              expr: 'up{job="cadvisor"}'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: "-100"
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 1
                    type: lt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ""
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 2m
        annotations:
          summary: cAdvisor (container metrics) has been down for more than 2 minutes
        labels:
          severity: warning
        isPaused: false