# PrometheusRule manifest template. Jinja expressions are expanded first and
# the rendered text is then parsed as YAML.
# NOTE(review): the use of `item` suggests this is rendered once per loop
# entry - confirm against the calling task.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # Templated scalars are quoted so the rendered value is always a YAML string,
  # even if it happens to look like a number or a boolean.
  name: "{{ ansible_operator_meta.name }}-{{ item.name }}-alertrules"
  namespace: "{{ ansible_operator_meta.namespace }}"
  labels:
    app: prometheus-postgres-exporter-alertrules
    name: "{{ ansible_operator_meta.name }}-{{ item.name }}-alertrules"
# Alert rule groups.
# Annotation values written as a quoted Jinja string literal are emitted
# verbatim, so the double-brace label references inside them survive Jinja
# rendering and are expanded later by Prometheus when the alert fires.
# Long PromQL expressions use folded block scalars: the line breaks fold into
# single spaces, which PromQL ignores.
spec:
  groups:
    # Container and volume alerts scoped to the fep-patroni container and the
    # pods / PVCs whose names are derived from item.name.
    - name: fep-container
      rules:
        # Fires when container_last_seen is more than 60 seconds in the past.
        - alert: ContainerDisappeared
          annotations:
            description: "{{ 'Container {{$labels.container}}/{{$labels.pod}} from {{$labels.namespace}} has been disappeared' }}"
            summary: Container Pod disappeared.
          expr: >-
            time() -
            container_last_seen{ container="fep-patroni", namespace="{{ ansible_operator_meta.namespace }}", pod=~"^{{ item.name }}-sts-.*" } > 60
          labels:
            severity: warning
        # Fires when CPU usage stays above 80% of the CPU limit for 5 minutes.
        # NOTE(review): the limit metric in the denominator is not filtered by
        # namespace or pod; matching relies on the shared by-labels.
        - alert: ContainerHighCPUUsage
          annotations:
            description: "{{ 'Container {{$labels.container}}/{{$labels.pod}} from {{$labels.namespace}} has been high on CPU usage(>80%) for 5 mins' }}"
            summary: High Container CPU usage.
          expr: >-
            (sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"{{ item.name }}-sts.*", namespace="{{ ansible_operator_meta.namespace }}", container="fep-patroni"}) by (pod,namespace,container)
            /
            sum(kube_pod_container_resource_limits_cpu_cores) by (pod,namespace,container))*100 > 80
          for: 5m
          labels:
            severity: warning
        # Fires when the working set stays above 80% of the memory limit for
        # 30 minutes. The namespace label is kept in the aggregation because
        # the description reads it; the selector already pins one namespace,
        # so the grouping itself is unchanged.
        - alert: ContainerHighRAMUsage
          annotations:
            description: "{{ 'Container {{$labels.container}}/{{$labels.pod}} from {{$labels.namespace}} has been high on RAM usage(>80%) since 30 mins' }}"
            summary: High container memory usage.
          expr: >-
            sum(container_memory_working_set_bytes{pod=~"{{ item.name }}-sts.*", namespace="{{ ansible_operator_meta.namespace }}", container="fep-patroni"}
            / container_spec_memory_limit_bytes * 100)
            by (pod, namespace, container, instance) > 80
          for: 30m
          labels:
            severity: warning
        # Fires when a matching PVC has had less than 10% of its capacity
        # available for 5 minutes.
        - alert: PVCLowDiskSpace
          annotations:
            description: "{{ 'Found low disk space on {{$labels.persistentvolumeclaim}} in {{$labels.namespace}} namespace.' }}"
            summary: "{{ 'Found low disk space on {{$labels.persistentvolumeclaim}} in {{$labels.namespace}} namespace.' }}"
          expr: >-
            kubelet_volume_stats_available_bytes{namespace="{{ ansible_operator_meta.namespace }}", persistentvolumeclaim=~"fep.*{{ item.name }}.*"}
            / (kubelet_volume_stats_capacity_bytes) * 100 < 10
          for: 5m
          labels:
            severity: warning
    # Database-level alerts built on metrics selected by namespace, exporter
    # service name and server name.
    - name: postgres
      rules:
        # Fires when fewer servers report pg_static than the number of
        # configured instances. count() over an empty vector returns no
        # series at all, so the vector(0) fallback is needed for the alert to
        # fire when every instance has stopped reporting.
        # NOTE(review): assumes item.instances is a list - confirm.
        - alert: PostgresqlDown
          annotations:
            description: "Postgresql one or more instances are down in FEPCluster {{ item.name }} in {{ ansible_operator_meta.namespace }} namespace. Please check the FEP pods in this cluster"
            summary: "Postgresql FEPCluster {{ item.name }} in {{ ansible_operator_meta.namespace }} namespace is degraded"
          expr: >-
            (
            count(pg_static{ namespace="{{ ansible_operator_meta.namespace }}", service="{{ ansible_operator_meta.name }}-service", server=~"{{item.name}}-sts.*" })
            or vector(0)
            ) < {{item.instances | length}}
          labels:
            severity: error
        # Fires when used connections exceed 90% of max_connections.
        - alert: PostgresqlTooManyConnections
          annotations:
            description: "{{ 'PostgreSQL instance has too many connections on server {{ $labels.server }} in {{ $labels.namespace }} namespace.' }}"
            summary: "{{ 'Postgresql too many connections (FEPCluster server {{ $labels.server }})' }}"
          expr: >-
            pg_capacity_connection_total{namespace="{{ ansible_operator_meta.namespace }}", service="{{ ansible_operator_meta.name }}-service", server=~"{{ item.name }}-sts.*"}
            /pg_settings_max_connections > 0.9
          labels:
            severity: warning
        # Fires when at least one role reports fewer than 8 valid password days.
        # NOTE(review): the alert name contains a typo ("Expierd"); it is kept
        # unchanged because renaming could break existing Alertmanager routes
        # or silences that match on the alert name.
        - alert: PostgresqlRolePasswordCloseExpierd
          annotations:
            description: "The Postgresql role's password expires in less than 7 days. Please update the password."
            summary: "Postgresql Role Password expires in less than 7 days."
          expr: >-
            count(pg_password_valid_seconds_placeholder_guard) unless on() vector(1)
            or
            count(pg_password_valid_days{ namespace="{{ ansible_operator_meta.namespace }}", service="{{ ansible_operator_meta.name }}-service", server=~"{{ item.name }}-sts.*", rolname=~".*" } < 8) > 0
          labels:
            severity: warning
        # Fires when at least one role reports a negative remaining validity.
        - alert: PostgresqlRolePasswordExpired
          annotations:
            description: "The Postgresql role's password has already expired. Please update the password."
            summary: "Postgresql Role Password has already expired. "
          expr: >-
            count(pg_password_valid_seconds{ namespace="{{ ansible_operator_meta.namespace }}", service="{{ ansible_operator_meta.name }}-service", server=~"{{ item.name }}-sts.*", rolname=~".*" } < 0) > 0
          labels:
            severity: warning