Alerts


/etc/config/alerting_rules.yml > BlackBox Alerts
Probe failure (6 active)
alert: Probe failure
expr: probe_success{job=~"compliance-service-check|dpm-service-check",namespace!="kube-system"} == 0
for: 1m
annotations:
  summary: The service {{ $labels.job }} is unreachable or down. Please check the cluster for further information.
Labels State Active Since Value
alertname="Probe failure" instance="https://compliance-association.qa-spirion.com/api/v1/association/healthcheck" job="compliance-service-check" firing 2025-12-28 08:04:38.926874181 +0000 UTC 0
alertname="Probe failure" instance="https://compliance-subjectrequest.qa-spirion.com/api/v2/healthcheck" job="compliance-service-check" firing 2025-12-28 08:04:38.926874181 +0000 UTC 0
alertname="Probe failure" instance="https://compliance-fulltext.qa-spirion.com/api/v1/FulltextSearch/healthcheck" job="compliance-service-check" firing 2025-12-28 08:00:38.926874181 +0000 UTC 0
alertname="Probe failure" instance="https://svc-sdmtranslator-i.qa-spirion.com/api/healthcheck" job="dpm-service-check" firing 2025-12-28 08:05:38.926874181 +0000 UTC 0
alertname="Probe failure" instance="https://compliance-datasubjectrequest.qa-spirion.com/api/datasubjectrequests/healthcheck" job="compliance-service-check" firing 2025-12-28 08:02:38.926874181 +0000 UTC 0
alertname="Probe failure" instance="https://compliance-piidatatypes.qa-spirion.com/api/healthcheck" job="compliance-service-check" firing 2025-12-28 08:02:38.926874181 +0000 UTC 0
Public endpoint check (0 active)
alert: Public endpoint check
expr: probe_success{job=~"external.*"} == 0
for: 1m
labels:
  severity: warning
annotations:
  summary: The service {{ $labels.job }} is unreachable from the internet. Please check whether the URL points to a public endpoint.
/etc/config/alerting_rules.yml > MSSQL Alerts
KubernetesPersistentvolumeclaimPending (1 active)
alert: KubernetesPersistentvolumeclaimPending
expr: kube_persistentvolumeclaim_status_phase{namespace!="kube-system",phase="Pending"} == 1
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Kubernetes PersistentVolumeClaim pending (claim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
Labels State Active Since Value
alertname="KubernetesPersistentvolumeclaimPending" app="prometheus" component="kube-state-metrics" instance="10.0.7.84:8080" job="kubernetes-service-endpoints" kubernetes_name="prometheus-kube-state-metrics" kubernetes_namespace="monitoring" kubernetes_node="ip-10-0-5-18.ec2.internal" module="monitoring" namespace="qa" oss="true" persistentvolumeclaim="compliance-spawler-packages" phase="Pending" provider="prometheus" severity="warning" spirion_release="2.20.1" firing 2025-11-29 14:44:46.936640537 +0000 UTC 1
KubernetesPodNotHealthy (45 active)
alert: KubernetesPodNotHealthy
expr: min_over_time(sum by(namespace, pod) (kube_pod_status_phase{namespace!="kube-system",phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
labels:
  severity: critical
annotations:
  description: |-
    Pod has been in a non-ready state for longer than 15 minutes.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Kubernetes Pod not healthy (instance {{ $labels.pod }})
Labels State Active Since Value
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-rcql5" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-rlzzk" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-wsr7l" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-926xx" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-nhzln" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-gjgkp" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="qa" pod="kafka-manager-5c8db9b7f9-6gbxq" severity="critical" firing 2025-12-28 08:22:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="qa" pod="dbadmin-db-0.0.9.245.0-nxk5d" severity="critical" firing 2025-11-29 14:45:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-dvbgt" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-v96dp" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-4slgt" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="qa" pod="svc-searchpersistence-7557485df-pjhh9" severity="critical" firing 2025-12-28 08:22:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-mv848" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="qa" pod="svc-searchcontroller-6d859594fd-7sgkt" severity="critical" firing 2025-12-28 08:23:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-bwhgh" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-957tx" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-kxp52" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-ssvtt" severity="critical" firing 2025-12-28 08:01:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-b4lw2" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-vnfnn" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-ccnrw" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-h8bv5" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-7z44n" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="qa" pod="svc-reporting-67c64c775b-fzhqz" severity="critical" firing 2025-12-28 08:22:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-qx6px" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-9xqqm" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-jt7s2" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-pf8kk" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-tlvf9" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-z5hfs" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-9zl5l" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="qa" pod="job-licensestatuscheck-0.0.9.245.0-k2qrt" severity="critical" firing 2025-12-29 00:22:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-sqmnm" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-sgnz7" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-4rppv" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-2fg28" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-j4sv9" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="qa" pod="dbadmin-db-0.0.9.242.0-l9wcq" severity="critical" firing 2025-11-29 14:45:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-857w8" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="qa" pod="cache-identity-authority-db-2cf6h" severity="critical" firing 2025-11-29 14:45:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-wjqjx" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-pqh5j" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-h9c6d" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-7sh65" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
alertname="KubernetesPodNotHealthy" namespace="monitoring" pod="grafana-7749454fdd-6j9cq" severity="critical" firing 2025-12-28 08:07:46.936640537 +0000 UTC 1
DatabaseMaintainenceJobCountIncreased (0 active)
alert: DatabaseMaintainenceJobCountIncreased
expr: jobcount{job="prometheus-query-exporter"} > 0
for: 1m
annotations:
  description: Database Maintenance Job count for the database has increased
  summary: Database Maintenance Job count for the database has increased
HostHighCpuLoad (0 active)
alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    CPU load is > 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host high CPU load (instance {{ $labels.instance }})
KubernetesDiskPressure (0 active)
alert: KubernetesDiskPressure
expr: kube_node_status_condition{condition="DiskPressure",namespace!="kube-system",status="true"} == 1
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    {{ $labels.node }} has DiskPressure condition
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Kubernetes disk pressure (node {{ $labels.node }})
KubernetesMemoryPressure (0 active)
alert: KubernetesMemoryPressure
expr: kube_node_status_condition{condition="MemoryPressure",namespace!="kube-system",status="true"} == 1
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    {{ $labels.node }} has MemoryPressure condition
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Kubernetes memory pressure (node {{ $labels.node }})
KubernetesNodeReady (0 active)
alert: KubernetesNodeReady
expr: kube_node_status_condition{condition="Ready",namespace!="kube-system",status="true"} == 0
for: 10m
labels:
  severity: critical
annotations:
  description: |-
    Node {{ $labels.node }} has been unready for a long time
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Kubernetes node not ready (node {{ $labels.node }})
KubernetesOutOfCapacity (0 active)
alert: KubernetesOutOfCapacity
expr: sum by(node) ((kube_pod_status_phase{namespace!="kube-system",phase="Running"} == 1) + on(pod, namespace) group_left(node) (0 * kube_pod_info)) / sum by(node) (kube_node_status_allocatable_pods{namespace!="kube-system"}) * 100 > 90
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    {{ $labels.node }} is out of capacity
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Kubernetes out of capacity (node {{ $labels.node }})
KubernetesOutOfDisk (0 active)
alert: KubernetesOutOfDisk
expr: kube_node_status_condition{condition="OutOfDisk",namespace!="kube-system",status="true"} == 1
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    {{ $labels.node }} has OutOfDisk condition
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Kubernetes out of disk (node {{ $labels.node }})
KubernetesPersistentvolumeError (0 active)
alert: KubernetesPersistentvolumeError
expr: kube_persistentvolume_status_phase{job="kube-state-metrics",namespace!="kube-system",phase=~"Failed|Pending"} > 0
labels:
  severity: critical
annotations:
  description: |-
    Persistent volume is in bad state
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Kubernetes PersistentVolume error (volume {{ $labels.persistentvolume }})
KubernetesPodCrashLooping (0 active)
alert: KubernetesPodCrashLooping
expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[1m]) > 3
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Pod {{ $labels.pod }} is crash looping
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Kubernetes pod crash looping (instance {{ $labels.pod }})
KubernetesVolumeOutOfDiskSpace (0 active)
MSSQL connectivity alert (0 active)
alert: MSSQL connectivity alert
expr: up{job="prometheus-mssql-exporter"} == 0
for: 1m
labels:
  severity: critical
annotations:
  summary: The service {{ $labels.job }} is unreachable or down. Please check the MSSQL server for further information.
compliance alert (0 active)
alert: compliance alert
expr: probe_success{job="compliance",namespace!="kube-system"} == 1
labels:
  severity: warning
annotations:
  summary: The service {{ $labels.job }} compliance is enabled.
compliance alert (0 active)
alert: compliance alert
expr: probe_success{job="compliance",namespace!="kube-system"} == 0
labels:
  Notification: None
  severity: warning
annotations:
  summary: The service {{ $labels.job }} compliance is disabled.