Skip to content

Commit 513291c

Browse files
committed
Create critical-op PDB on-demand to avoid false monitoring alerts
The critical-op PodDisruptionBudget was previously created permanently, but its selector (critical-operation=true) matched no pods during normal operation. This caused false alerts in monitoring systems like kube-prometheus-stack, because the PDB expected healthy pods but none matched.

Changes:
- Modified syncCriticalOpPodDisruptionBudget to check whether any pods carry the critical-operation label before creating or keeping the PDB
- The PDB is now created on demand when pods are labeled (e.g., during major version upgrades) and deleted when the labels are removed
- Updated majorVersionUpgrade to explicitly create/delete the PDB around the critical operation for immediate protection
- Removed the automatic critical-op PDB creation from initial cluster setup
- Added a test verifying on-demand PDB creation and deletion, including edge cases for idempotent create and delete operations

The explicit PDB creation in majorVersionUpgrade ensures protection is in place immediately, before the critical operation starts. The sync function serves as a safety net for edge cases such as bootstrap (where Patroni applies the labels) or operator restarts during critical operations.

Fixes #3020
1 parent a06f8d7 commit 513291c

File tree

4 files changed

+195
-18
lines changed

4 files changed

+195
-18
lines changed

pkg/cluster/majorversionupgrade.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,8 +243,22 @@ func (c *Cluster) majorVersionUpgrade() error {
243243
if err = c.criticalOperationLabel(pods, nil); err != nil {
244244
return fmt.Errorf("failed to remove critical-operation label: %s", err)
245245
}
246+
// Delete the critical-op PDB after the critical operation is complete
247+
if err = c.deleteCriticalOpPodDisruptionBudget(); err != nil {
248+
c.logger.Warningf("failed to delete critical-op PDB: %s", err)
249+
}
246250
return nil
247251
}()
252+
253+
// Create the critical-op PDB before starting the critical operation.
254+
// This ensures protection is in place immediately, rather than waiting
255+
// for a sync cycle. The sync function also creates the PDB if it detects
256+
// pods with the critical-operation label, serving as a safety net for
257+
// edge cases like operator restarts during critical operations.
258+
if err = c.createCriticalOpPodDisruptionBudget(); err != nil {
259+
c.logger.Warningf("failed to create critical-op PDB: %s", err)
260+
}
261+
248262
val := "true"
249263
if err = c.criticalOperationLabel(pods, &val); err != nil {
250264
return fmt.Errorf("failed to assign critical-operation label: %s", err)

pkg/cluster/resources.go

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -465,20 +465,13 @@ func (c *Cluster) createCriticalOpPodDisruptionBudget() error {
465465
}
466466

467467
func (c *Cluster) createPodDisruptionBudgets() error {
468-
errors := make([]string, 0)
469-
468+
// Only create the primary PDB during cluster creation.
469+
// The critical-op PDB is created on-demand during critical operations
470+
// (e.g., major version upgrades) to avoid false alerts when no pods
471+
// have the critical-operation label.
470472
err := c.createPrimaryPodDisruptionBudget()
471473
if err != nil {
472-
errors = append(errors, fmt.Sprintf("could not create primary pod disruption budget: %v", err))
473-
}
474-
475-
err = c.createCriticalOpPodDisruptionBudget()
476-
if err != nil {
477-
errors = append(errors, fmt.Sprintf("could not create pod disruption budget for critical operations: %v", err))
478-
}
479-
480-
if len(errors) > 0 {
481-
return fmt.Errorf("%v", strings.Join(errors, `', '`))
474+
return fmt.Errorf("could not create primary pod disruption budget: %v", err)
482475
}
483476
return nil
484477
}

pkg/cluster/sync.go

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -499,7 +499,34 @@ func (c *Cluster) syncCriticalOpPodDisruptionBudget(isUpdate bool) error {
499499
pdb *policyv1.PodDisruptionBudget
500500
err error
501501
)
502-
if pdb, err = c.KubeClient.PodDisruptionBudgets(c.Namespace).Get(context.TODO(), c.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{}); err == nil {
502+
503+
// Check if any pods have the critical-operation label
504+
hasCriticalOpPods, err := c.hasCriticalOperationPods()
505+
if err != nil {
506+
return fmt.Errorf("could not check for critical operation pods: %v", err)
507+
}
508+
509+
pdb, err = c.KubeClient.PodDisruptionBudgets(c.Namespace).Get(context.TODO(), c.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
510+
pdbExists := err == nil
511+
512+
if !pdbExists && !k8sutil.ResourceNotFound(err) {
513+
return fmt.Errorf("could not get pod disruption budget: %v", err)
514+
}
515+
516+
// If no pods have the critical-operation label, delete the PDB if it exists
517+
if !hasCriticalOpPods {
518+
if pdbExists {
519+
c.CriticalOpPodDisruptionBudget = pdb
520+
c.logger.Infof("no pods with critical-operation label, deleting critical-op PDB")
521+
if err = c.deleteCriticalOpPodDisruptionBudget(); err != nil {
522+
return fmt.Errorf("could not delete pod disruption budget for critical operations: %v", err)
523+
}
524+
}
525+
return nil
526+
}
527+
528+
// Pods have critical-operation label, ensure PDB exists
529+
if pdbExists {
503530
c.CriticalOpPodDisruptionBudget = pdb
504531
newPDB := c.generateCriticalOpPodDisruptionBudget()
505532
match, reason := c.comparePodDisruptionBudget(pdb, newPDB)
@@ -512,13 +539,10 @@ func (c *Cluster) syncCriticalOpPodDisruptionBudget(isUpdate bool) error {
512539
c.CriticalOpPodDisruptionBudget = pdb
513540
}
514541
return nil
515-
516-
}
517-
if !k8sutil.ResourceNotFound(err) {
518-
return fmt.Errorf("could not get pod disruption budget: %v", err)
519542
}
543+
520544
// no existing pod disruption budget, create new one
521-
c.logger.Infof("could not find pod disruption budget for critical operations")
545+
c.logger.Infof("pods with critical-operation label found, creating critical-op PDB")
522546

523547
if err = c.createCriticalOpPodDisruptionBudget(); err != nil {
524548
if !k8sutil.ResourceAlreadyExists(err) {
@@ -533,6 +557,21 @@ func (c *Cluster) syncCriticalOpPodDisruptionBudget(isUpdate bool) error {
533557
return nil
534558
}
535559

560+
// hasCriticalOperationPods checks if any pods in the cluster have the critical-operation=true label
561+
func (c *Cluster) hasCriticalOperationPods() (bool, error) {
562+
pods, err := c.listPods()
563+
if err != nil {
564+
return false, err
565+
}
566+
567+
for _, pod := range pods {
568+
if val, ok := pod.Labels["critical-operation"]; ok && val == "true" {
569+
return true, nil
570+
}
571+
}
572+
return false, nil
573+
}
574+
536575
func (c *Cluster) syncPodDisruptionBudgets(isUpdate bool) error {
537576
errors := make([]string, 0)
538577

pkg/cluster/sync_test.go

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,3 +1018,134 @@ func TestUpdateSecretNameConflict(t *testing.T) {
10181018
expectedError := fmt.Sprintf("syncing secret %s failed: could not update secret because of user name mismatch", "default/prepared-owner-user.acid-test-cluster.credentials")
10191019
assert.Contains(t, err.Error(), expectedError)
10201020
}
1021+
1022+
// TestSyncCriticalOpPodDisruptionBudget verifies that the critical-op
// PodDisruptionBudget is managed on-demand by syncCriticalOpPodDisruptionBudget:
// it must exist only while pods carry the critical-operation=true label,
// be deleted once the label disappears, and both create and delete must be
// idempotent (safe on repeated syncs and after an operator restart).
func TestSyncCriticalOpPodDisruptionBudget(t *testing.T) {
	testName := "test syncing critical-op PDB on-demand"
	clusterName := "acid-test-cluster-pdb"
	namespace := "default"

	// Fake clientsets: all API interactions below run against in-memory objects.
	clientSet := fake.NewSimpleClientset()
	acidClientSet := fakeacidv1.NewSimpleClientset()

	client := k8sutil.KubernetesClient{
		PodDisruptionBudgetsGetter: clientSet.PolicyV1(),
		PodsGetter:                 clientSet.CoreV1(),
		PostgresqlsGetter:          acidClientSet.AcidV1(),
		StatefulSetsGetter:         clientSet.AppsV1(),
	}

	pg := acidv1.Postgresql{
		ObjectMeta: metav1.ObjectMeta{
			Name:      clusterName,
			Namespace: namespace,
		},
		Spec: acidv1.PostgresSpec{
			NumberOfInstances: 3,
			Volume: acidv1.Volume{
				Size: "1Gi",
			},
		},
	}

	cluster := New(
		Config{
			OpConfig: config.Config{
				PDBNameFormat:             "postgres-{cluster}-pdb",
				EnablePodDisruptionBudget: util.True(),
				Resources: config.Resources{
					ClusterLabels:         map[string]string{"application": "spilo"},
					ClusterNameLabel:      "cluster-name",
					PodRoleLabel:          "spilo-role",
					ResourceCheckInterval: time.Duration(3),
					ResourceCheckTimeout:  time.Duration(10),
				},
			},
		}, client, pg, logger, eventRecorder)

	cluster.Name = clusterName
	cluster.Namespace = namespace

	// Create pods without critical-operation label; these are what
	// hasCriticalOperationPods inspects via the cluster-name selector.
	for i := 0; i < 3; i++ {
		pod := v1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				Name:      fmt.Sprintf("%s-%d", clusterName, i),
				Namespace: namespace,
				Labels: map[string]string{
					"application":  "spilo",
					"cluster-name": clusterName,
				},
			},
		}
		_, err := cluster.KubeClient.Pods(namespace).Create(context.TODO(), &pod, metav1.CreateOptions{})
		assert.NoError(t, err)
	}

	// Test 1: Sync with no critical-operation labels - PDB should NOT be created
	err := cluster.syncCriticalOpPodDisruptionBudget(false)
	assert.NoError(t, err)

	// A Get on the absent PDB must fail (NotFound from the fake clientset).
	_, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
	assert.Error(t, err, "%s: critical-op PDB should not exist when no pods have critical-operation label", testName)

	// Test 2: Add critical-operation label to pods - PDB should be created
	for i := 0; i < 3; i++ {
		pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{})
		assert.NoError(t, err)
		pod.Labels["critical-operation"] = "true"
		_, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{})
		assert.NoError(t, err)
	}

	err = cluster.syncCriticalOpPodDisruptionBudget(false)
	assert.NoError(t, err)

	// The generated PDB pins minAvailable to the instance count (3).
	pdb, err := cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
	assert.NoError(t, err, "%s: critical-op PDB should exist when pods have critical-operation label", testName)
	assert.Equal(t, int32(3), pdb.Spec.MinAvailable.IntVal, "%s: minAvailable should be 3", testName)

	// Test 3: Remove critical-operation label from pods - PDB should be deleted
	for i := 0; i < 3; i++ {
		pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{})
		assert.NoError(t, err)
		delete(pod.Labels, "critical-operation")
		_, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{})
		assert.NoError(t, err)
	}

	err = cluster.syncCriticalOpPodDisruptionBudget(false)
	assert.NoError(t, err)

	_, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
	assert.Error(t, err, "%s: critical-op PDB should be deleted when no pods have critical-operation label", testName)

	// Test 4: Try to delete again when PDB is already deleted - should handle gracefully
	err = cluster.syncCriticalOpPodDisruptionBudget(false)
	assert.NoError(t, err, "%s: syncing when PDB already deleted should not error", testName)

	// Test 5: Try to create when PDB already exists - should handle gracefully
	// First, add labels back to pods
	for i := 0; i < 3; i++ {
		pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{})
		assert.NoError(t, err)
		pod.Labels["critical-operation"] = "true"
		_, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{})
		assert.NoError(t, err)
	}

	// Create the PDB
	err = cluster.syncCriticalOpPodDisruptionBudget(false)
	assert.NoError(t, err)
	_, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
	assert.NoError(t, err, "%s: PDB should exist after sync with labeled pods", testName)

	// Now sync again - should handle "already exists" gracefully
	cluster.CriticalOpPodDisruptionBudget = nil // Simulate operator restart (in-memory state lost)
	err = cluster.syncCriticalOpPodDisruptionBudget(false)
	assert.NoError(t, err, "%s: syncing when PDB already exists should not error", testName)

	// Verify PDB still exists and is properly tracked
	pdb, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
	assert.NoError(t, err)
	assert.Equal(t, int32(3), pdb.Spec.MinAvailable.IntVal)
}

0 commit comments

Comments
 (0)