# PrometheusRule template (Jinja2) defining container-level and PostgreSQL-level
# alerts for one FEP cluster.
#
# Render context used below:
#   ansible_operator_meta.name / .namespace - used to build the resource name,
#     the namespace matchers and the "<name>-service" matcher.
#   item.name      - cluster name; used in pod, server and PVC regex matchers.
#   item.instances - its length is the expected number of pg_static series.
#
# Annotation texts that contain Prometheus double-brace label placeholders are
# wrapped in a Jinja string literal so that Jinja emits them verbatim and
# Prometheus expands them at alert time.
#
# Templated scalars are quoted so that a rendered value such as "no", "on" or
# an all-digit namespace is still loaded as a string.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: "{{ ansible_operator_meta.name }}-{{ item.name }}-alertrules"
  namespace: "{{ ansible_operator_meta.namespace }}"
  labels:
    app: prometheus-postgres-exporter-alertrules
    name: "{{ ansible_operator_meta.name }}-{{ item.name }}-alertrules"
spec:
  groups:
    # Alerts on the fep-patroni container and on the cluster's volumes.
    - name: fep-container
      rules:
        # Fires when the container has not been seen for more than 60 seconds.
        - alert: ContainerDisappeared
          annotations:
            description: "{{ 'Container {{$labels.container}}/{{$labels.pod}} from {{$labels.namespace}} has been disappeared' }}"
            summary: Container Pod disappeared.
          expr: >-
            time() - container_last_seen{
            container="fep-patroni",
            namespace="{{ ansible_operator_meta.namespace }}",
            pod=~"^{{ item.name }}-sts-.*"
            } > 60
          labels:
            severity: warning

        # CPU usage as a percentage of the CPU limit, above 80 for 5 minutes.
        - alert: ContainerHighCPUUsage
          annotations:
            description: "{{ 'Container {{$labels.container}}/{{$labels.pod}} from {{$labels.namespace}} has been high on CPU usage(>80%) for 5 mins' }}"
            summary: High Container CPU usage.
          expr: >-
            (sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"{{ item.name }}-sts.*",
            namespace="{{ ansible_operator_meta.namespace }}",
            container="fep-patroni"}) by (pod,namespace,container)
            /
            sum(kube_pod_container_resource_limits_cpu_cores) by (pod,namespace,container))
            * 100 > 80
          for: 5m
          labels:
            severity: warning

        # Working-set memory as a percentage of the memory limit, above 80 for
        # 30 minutes.
        - alert: ContainerHighRAMUsage
          annotations:
            description: "{{ 'Container {{$labels.container}}/{{$labels.pod}} from {{$labels.namespace}} has been high on RAM usage(>80%) since 30 mins' }}"
            summary: High container memory usage.
          expr: >-
            sum(container_memory_working_set_bytes{pod=~"{{ item.name }}-sts.*",
            namespace="{{ ansible_operator_meta.namespace }}",
            container="fep-patroni"}
            / container_spec_memory_limit_bytes * 100)
            by (pod, container, instance) > 80
          for: 30m
          labels:
            severity: warning

        # Available bytes as a percentage of capacity, below 10 for 5 minutes.
        - alert: PVCLowDiskSpace
          annotations:
            description: "{{ 'Found low disk space on {{$labels.persistentvolumeclaim}} in {{$labels.namespace}} namespace.' }}"
            summary: "{{ 'Found low disk space on {{$labels.persistentvolumeclaim}} in {{$labels.namespace}} namespace.' }}"
          expr: >-
            kubelet_volume_stats_available_bytes{namespace="{{ ansible_operator_meta.namespace }}",
            persistentvolumeclaim=~"fep.*{{ item.name }}.*"}
            / (kubelet_volume_stats_capacity_bytes) * 100 < 10
          for: 5m
          labels:
            severity: warning

    # Alerts on metrics exposed for the PostgreSQL servers of this cluster.
    - name: postgres
      rules:
        # Fires when fewer pg_static series are present than configured
        # instances. count() over an empty selection returns no sample at all,
        # so "or vector(0)" supplies a zero; without it the alert would stay
        # silent when every instance is down.
        - alert: PostgresqlDown
          annotations:
            description: "Postgresql one or more instances are down in FEPCluster {{ item.name }} in {{ ansible_operator_meta.namespace }} namespace. Please check the FEP pods in this cluster"
            summary: "Postgresql FEPCluster {{ item.name }} in {{ ansible_operator_meta.namespace }} namespace is degraded"
          expr: >-
            (count(pg_static{
            namespace="{{ ansible_operator_meta.namespace }}",
            service="{{ ansible_operator_meta.name }}-service",
            server=~"{{ item.name }}-sts.*"
            }) or vector(0)) < {{ item.instances | length }}
          labels:
            severity: error

        # Fires when connections in use exceed 90% of max_connections.
        - alert: PostgresqlTooManyConnections
          annotations:
            description: "{{ 'PostgreSQL instance has too many connections on server {{ $labels.server }} in {{ $labels.namespace }} namespace.' }}"
            summary: "{{ 'Postgresql too many connections (FEPCluster server {{ $labels.server }})' }}"
          expr: >-
            pg_capacity_connection_total{namespace="{{ ansible_operator_meta.namespace }}",
            service="{{ ansible_operator_meta.name }}-service",
            server=~"{{ item.name }}-sts.*"}
            / pg_settings_max_connections > 0.9
          labels:
            severity: warning