mirror of
https://github.com/genebean/dots.git
synced 2026-03-28 09:57:43 -04:00
Build a new monitoring stack
This commit is contained in:
parent
65485e1c47
commit
a2799f5e54
13 changed files with 1104 additions and 368 deletions
298
modules/hosts/nixos/nixnuc/grafana-files/alert-rules.yaml
Normal file
298
modules/hosts/nixos/nixnuc/grafana-files/alert-rules.yaml
Normal file
|
|
@ -0,0 +1,298 @@
|
|||
apiVersion: 1
|
||||
groups:
|
||||
- orgId: 1
|
||||
name: infrastructure_alerts
|
||||
folder: Monitoring
|
||||
interval: 1m
|
||||
rules:
|
||||
- uid: node_exporter_down
|
||||
title: Node Exporter Down
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: VictoriaMetrics
|
||||
model:
|
||||
expr: up{job="node"}
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: A
|
||||
- refId: B
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
refId: B
|
||||
type: reduce
|
||||
- refId: C
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 1
|
||||
type: lt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
type: query
|
||||
expression: B
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
dashboardUid: ""
|
||||
panelId: 0
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
for: 2m
|
||||
annotations:
|
||||
summary: Node exporter has been down for more than 2 minutes
|
||||
labels:
|
||||
severity: critical
|
||||
isPaused: false
|
||||
- uid: high_cpu_usage
|
||||
title: High CPU Usage
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: VictoriaMetrics
|
||||
model:
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: A
|
||||
- refId: B
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
refId: B
|
||||
type: reduce
|
||||
- refId: C
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 80
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
type: query
|
||||
expression: B
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
dashboardUid: ""
|
||||
panelId: 0
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: CPU usage is above 80% for more than 5 minutes
|
||||
labels:
|
||||
severity: warning
|
||||
isPaused: false
|
||||
- uid: high_memory_usage
|
||||
title: High Memory Usage
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: VictoriaMetrics
|
||||
model:
|
||||
expr: 100 * (1 - ((node_memory_MemAvailable_bytes) / (node_memory_MemTotal_bytes)))
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: A
|
||||
- refId: B
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
refId: B
|
||||
type: reduce
|
||||
- refId: C
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 85
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
type: query
|
||||
expression: B
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
dashboardUid: ""
|
||||
panelId: 0
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: Memory usage is above 85% for more than 5 minutes
|
||||
labels:
|
||||
severity: warning
|
||||
isPaused: false
|
||||
- uid: low_disk_space
|
||||
title: Low Disk Space
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: VictoriaMetrics
|
||||
model:
|
||||
expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|fuse.lxcfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|fuse.lxcfs"})
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: A
|
||||
- refId: B
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
refId: B
|
||||
type: reduce
|
||||
- refId: C
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 85
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
type: query
|
||||
expression: B
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
dashboardUid: ""
|
||||
panelId: 0
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: Disk usage on root filesystem is above 85%
|
||||
labels:
|
||||
severity: warning
|
||||
isPaused: false
|
||||
- uid: cadvisor_down
|
||||
title: cAdvisor Down
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: VictoriaMetrics
|
||||
model:
|
||||
expr: up{job="cadvisor"}
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: A
|
||||
- refId: B
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
refId: B
|
||||
type: reduce
|
||||
- refId: C
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 1
|
||||
type: lt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
type: query
|
||||
expression: B
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
dashboardUid: ""
|
||||
panelId: 0
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
for: 2m
|
||||
annotations:
|
||||
summary: cAdvisor (container metrics) has been down for more than 2 minutes
|
||||
labels:
|
||||
severity: warning
|
||||
isPaused: false
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue