---
# Grafana alert-rule provisioning: one rule group, five infrastructure rules.
#
# Every rule below is built from the same three-step pipeline:
#   A  - metric query against the `VictoriaMetrics` datasource
#   B  - `reduce` expression taking the `last` value of A
#   C  - `threshold` expression applied to B; C is the rule's condition
# All steps use relativeTimeRange from 600 to 0 (seconds before evaluation
# time, per the Grafana provisioning docs).
#
# NOTE(review): datasourceUid '-100' looks like the legacy UID of Grafana's
# built-in expression engine (newer exports use `__expr__`) - confirm it is
# accepted by the Grafana version this file is deployed to.
apiVersion: 1

groups:
  - orgId: 1
    name: infrastructure_alerts
    folder: Monitoring
    interval: 1m
    rules:
      # Critical: last value of up{job="node"} is below 1, sustained for 2m.
      - uid: node_exporter_down
        title: Node Exporter Down
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: VictoriaMetrics
            model:
              expr: 'up{job="node"}'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: '-100'
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ''
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 2m
        annotations:
          summary: Node exporter has been down for more than 2 minutes
        labels:
          severity: critical
        isPaused: false

      # Warning: per-instance non-idle CPU percentage (5m rate) is above 80,
      # sustained for 5m.
      - uid: high_cpu_usage
        title: High CPU Usage
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: VictoriaMetrics
            model:
              expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: '-100'
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [80]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ''
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: CPU usage is above 80% for more than 5 minutes
        labels:
          severity: warning
        isPaused: false

      # Warning: used-memory percentage, computed from MemAvailable and
      # MemTotal, is above 85, sustained for 5m.
      - uid: high_memory_usage
        title: High Memory Usage
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: VictoriaMetrics
            model:
              expr: '100 * (1 - ((node_memory_MemAvailable_bytes) / (node_memory_MemTotal_bytes)))'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: '-100'
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [85]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ''
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: Memory usage is above 85% for more than 5 minutes
        labels:
          severity: warning
        isPaused: false

      # Warning: used-space percentage of the "/" mountpoint (tmpfs and
      # fuse.lxcfs filesystem types excluded) is above 85, sustained for 5m.
      - uid: low_disk_space
        title: Low Disk Space
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: VictoriaMetrics
            model:
              # Folded scalar: the two lines below are joined with a single
              # space, yielding one PromQL expression.
              expr: >-
                100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|fuse.lxcfs"} * 100)
                / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|fuse.lxcfs"})
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: '-100'
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [85]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ''
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: Disk usage on root filesystem is above 85%
        labels:
          severity: warning
        isPaused: false

      # Warning: last value of up{job="cadvisor"} is below 1, sustained for 2m.
      - uid: cadvisor_down
        title: cAdvisor Down
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: VictoriaMetrics
            model:
              expr: 'up{job="cadvisor"}'
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: '-100'
            model:
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  type: query
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: ''
        panelId: 0
        noDataState: NoData
        execErrState: Error
        for: 2m
        annotations:
          summary: cAdvisor (container metrics) has been down for more than 2 minutes
        labels:
          severity: warning
        isPaused: false