Skip to content

Commit 45b9167

Browse files
authored
Add configurable exponential backoff for certificate apply retries (#137)
* add per key exponential backoff for retries Signed-off-by: Hiroki Hanada <hiroki-hanada@cybozu.co.jp>
1 parent 3e1ca20 commit 45b9167

6 files changed

Lines changed: 82 additions & 48 deletions

File tree

cmd/root.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ func init() {
5050
fs.StringSlice("allowed-dns-namespaces", []string{}, "List of namespaces where DNSEndpoint resources can be created. If empty, no namespaces are allowed")
5151
fs.StringSlice("allowed-issuer-namespaces", []string{}, "List of namespaces where Certificate resources can be created. If empty, no namespaces are allowed")
5252
fs.Float64("certificate-apply-limit", 0, "Maximum number of certificate apply operations allowed per second (0 disables rate limiting)")
53+
fs.Duration("certificate-apply-retry-base-delay", controllers.DefaultRetryBaseDelay, "Base delay for certificate apply exponential backoff retry")
54+
fs.Duration("certificate-apply-retry-max-delay", controllers.DefaultRetryMaxDelay, "Maximum delay for certificate apply exponential backoff retry")
5355
if err := viper.BindPFlags(fs); err != nil {
5456
panic(err)
5557
}

cmd/run.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,18 @@ func run() error {
8383
return errors.New("certificate-apply-limit must be greater than or equal to 0")
8484
}
8585

86+
opts.CertificateApplyRetryBaseDelay = viper.GetDuration("certificate-apply-retry-base-delay")
87+
if opts.CertificateApplyRetryBaseDelay <= 0 {
88+
return errors.New("certificate-apply-retry-base-delay must be greater than 0")
89+
}
90+
opts.CertificateApplyRetryMaxDelay = viper.GetDuration("certificate-apply-retry-max-delay")
91+
if opts.CertificateApplyRetryMaxDelay <= 0 {
92+
return errors.New("certificate-apply-retry-max-delay must be greater than 0")
93+
}
94+
if opts.CertificateApplyRetryMaxDelay < opts.CertificateApplyRetryBaseDelay {
95+
return errors.New("certificate-apply-retry-max-delay must be greater than or equal to certificate-apply-retry-base-delay")
96+
}
97+
8698
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
8799
Scheme: scheme,
88100
Metrics: metricsserver.Options{

controllers/certificate_worker.go

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"math"
77
"sync"
8+
"time"
89

910
cmv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1"
1011
projectcontourv1 "github.com/projectcontour/contour/apis/projectcontour/v1"
@@ -34,6 +35,9 @@ const (
3435
labelApplyResult = "result"
3536
applyResultSuccess applyResultValue = "success"
3637
applyResultError applyResultValue = "error"
38+
39+
DefaultRetryBaseDelay = 5 * time.Second
40+
DefaultRetryMaxDelay = 10 * time.Minute
3741
)
3842

3943
type Applier[T client.Object] interface {
@@ -104,8 +108,9 @@ func NewCertificateApplyWorker(client client.Client, opt ReconcilerOptions) *Cer
104108
}
105109

106110
global := &workqueue.TypedBucketRateLimiter[types.NamespacedName]{Limiter: limiter}
107-
workqueue := workqueue.NewTypedRateLimitingQueueWithConfig(
108-
global,
111+
expFailure := workqueue.NewTypedItemExponentialFailureRateLimiter[types.NamespacedName](opt.CertificateApplyRetryBaseDelay, opt.CertificateApplyRetryMaxDelay)
112+
certQueue := workqueue.NewTypedRateLimitingQueueWithConfig(
113+
workqueue.NewTypedMaxOfRateLimiter(global, expFailure),
109114
workqueue.TypedRateLimitingQueueConfig[types.NamespacedName]{
110115
Name: certificateApplierName,
111116
},
@@ -114,7 +119,7 @@ func NewCertificateApplyWorker(client client.Client, opt ReconcilerOptions) *Cer
114119
return &CertificateApplyWorker{
115120
ReconcilerOptions: opt,
116121
client: client,
117-
workqueue: workqueue,
122+
workqueue: certQueue,
118123
manifests: make(map[types.NamespacedName]*cmv1.Certificate),
119124
limiter: limiter,
120125
retryCh: retryCh,
@@ -182,7 +187,6 @@ func (w *CertificateApplyWorker) Start(ctx context.Context) error {
182187
log.Info("processing cert queue item", "key", objKey.String())
183188

184189
func() {
185-
// there is no need to call .Forget since we are only using BucketRateLimiter
186190
defer w.workqueue.Done(objKey)
187191

188192
w.mu.Lock()
@@ -209,6 +213,9 @@ func (w *CertificateApplyWorker) Start(ctx context.Context) error {
209213

210214
log.Info("cert applied from queue", "key", objKey.String())
211215
w.recordApply(viaQueueYes, applyResultSuccess)
216+
// Forget the cert key to reset the exponential backoff counter so that
217+
// a future failure starts from the base delay again.
218+
w.workqueue.Forget(objKey)
212219
}()
213220
}
214221
}
@@ -250,35 +257,33 @@ func (w *CertificateApplyWorker) enqueueCertificate(objKey types.NamespacedName,
250257
w.workqueue.AddRateLimited(objKey)
251258
}
252259

253-
// enqueueHTTPProxy enqueues HTTPProxy in to the workqueue used by the main Reconcile function.
254-
// It requires the Controller to be setup with .WatchesRawSource using the same channel as w.retryCh
260+
// enqueueHTTPProxy sends the HTTPProxy that owns obj directly to retryCh so the main
261+
// reconcile loop picks it up. The exponential backoff is handled by the certQueue's
262+
// MaxOfRateLimiter when the reconcile loop calls AddRateLimited on subsequent failures.
255263
func (w *CertificateApplyWorker) enqueueHTTPProxy(ctx context.Context, obj *cmv1.Certificate) {
256264
log := crlog.FromContext(ctx)
257-
if w.retryCh == nil {
258-
return
259-
}
260265
annotations := obj.GetAnnotations()
261266
if annotations == nil {
262-
log.Error(fmt.Errorf("annotation does not exist"), "skipping HTTPProxy enqueue", "certificateName", obj.Name, "certificateNamespace", obj.Namespace)
267+
log.Error(fmt.Errorf("annotations do not exist on certificate %s/%s", obj.Namespace, obj.Name), "skipping HTTPProxy enqueue", "certificateName", obj.Name, "certificateNamespace", obj.Namespace)
263268
return
264269
}
265270
owner, ok := annotations[ownerAnnotation]
266271
if !ok {
267-
log.Error(fmt.Errorf("annotation not found for %s", ownerAnnotation), "skipping HTTPProxy enqueue", "certificateName", obj.Name, "certificateNamespace", obj.Namespace)
272+
log.Error(fmt.Errorf("annotation %q not found on certificate %s/%s", ownerAnnotation, obj.Namespace, obj.Name), "skipping HTTPProxy enqueue", "certificateName", obj.Name, "certificateNamespace", obj.Namespace)
268273
return
269274
}
270-
271275
ns, name, err := cache.SplitMetaNamespaceKey(owner)
272276
if err != nil {
273277
log.Error(err, "skipping HTTPProxy enqueue", "certificateName", obj.Name, "certificateNamespace", obj.Namespace)
274278
return
275279
}
276-
h := projectcontourv1.HTTPProxy{}
280+
h := &projectcontourv1.HTTPProxy{}
277281
h.SetNamespace(ns)
278282
h.SetName(name)
279-
log.Info("re-queueing HTTPProxy", "namespace", ns, "name", name)
280-
w.retryCh <- event.TypedGenericEvent[*projectcontourv1.HTTPProxy]{
281-
Object: &h,
283+
log.Info("re-queueing HTTPProxy for retry", "namespace", ns, "name", name)
284+
select {
285+
case w.retryCh <- event.TypedGenericEvent[*projectcontourv1.HTTPProxy]{Object: h}:
286+
case <-ctx.Done():
282287
}
283288
}
284289

controllers/certificate_worker_test.go

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,9 @@ func testCertificateApplyWorker() {
9393
baseClient := newFakeClient() // no existing Certificate
9494
cl := &applyAsUpdateClient{Client: baseClient}
9595
worker := NewCertificateApplyWorker(cl, ReconcilerOptions{
96-
CertificateApplyLimit: 10, // avoid rate.Limit(0) semantics
96+
CertificateApplyLimit: 10, // avoid rate.Limit(0) semantics
97+
CertificateApplyRetryBaseDelay: 1 * time.Millisecond,
98+
CertificateApplyRetryMaxDelay: 10 * time.Millisecond,
9799
})
98100

99101
reg := prometheus.NewRegistry()
@@ -120,7 +122,7 @@ func testCertificateApplyWorker() {
120122
Expect(k8serrors.IsNotFound(err)).To(BeTrue(), "new object should not be applied immediately, but queued")
121123

122124
// And the worker should have the key in its internal queue
123-
Expect(worker.workqueue.Len()).To(Equal(1))
125+
Eventually(func() int { return worker.workqueue.Len() }, 500*time.Millisecond, time.Millisecond).Should(Equal(1))
124126

125127
queuedKey, shutdown := worker.workqueue.Get()
126128
Expect(shutdown).To(BeFalse())
@@ -207,7 +209,9 @@ func testCertificateApplyWorker() {
207209
baseClient := newFakeClient(current) // init with a certificate
208210
cl := &applyAsUpdateClient{Client: baseClient}
209211
worker := NewCertificateApplyWorker(cl, ReconcilerOptions{
210-
CertificateApplyLimit: 10,
212+
CertificateApplyLimit: 10,
213+
CertificateApplyRetryBaseDelay: 1 * time.Millisecond,
214+
CertificateApplyRetryMaxDelay: 10 * time.Millisecond,
211215
})
212216

213217
reg := prometheus.NewRegistry()
@@ -221,7 +225,7 @@ func testCertificateApplyWorker() {
221225
Expect(worker.Apply(ctx, desired)).To(Succeed())
222226

223227
// Should be queued, not applied directly
224-
Expect(worker.workqueue.Len()).To(Equal(1))
228+
Eventually(func() int { return worker.workqueue.Len() }, 500*time.Millisecond, time.Millisecond).Should(Equal(1))
225229

226230
// The object in the fake client should still be the old spec (no extra DNS yet),
227231
// because the queued apply hasn’t run Start() / processed the queue.
@@ -250,7 +254,9 @@ func testCertificateApplyWorker() {
250254
baseClient := newFakeClient() // no existing certificate
251255
cl := &applyAsUpdateClient{Client: baseClient}
252256
worker := NewCertificateApplyWorker(cl, ReconcilerOptions{
253-
CertificateApplyLimit: 10, // avoid rate.Limit(0) oddness
257+
CertificateApplyLimit: 10, // avoid rate.Limit(0) oddness
258+
CertificateApplyRetryBaseDelay: 1 * time.Millisecond,
259+
CertificateApplyRetryMaxDelay: 10 * time.Millisecond,
254260
})
255261

256262
reg := prometheus.NewRegistry()
@@ -318,7 +324,9 @@ func testCertificateApplyWorker() {
318324
cl := &failingPatchClient{Client: baseClient}
319325

320326
worker := NewCertificateApplyWorker(cl, ReconcilerOptions{
321-
CertificateApplyLimit: 10,
327+
CertificateApplyLimit: 10,
328+
CertificateApplyRetryBaseDelay: 100 * time.Millisecond,
329+
CertificateApplyRetryMaxDelay: 1 * time.Second,
322330
})
323331

324332
reg := prometheus.NewRegistry()
@@ -350,11 +358,12 @@ func testCertificateApplyWorker() {
350358
// Apply should enqueue the certificate (RequiresQueue returns true for new object)
351359
Expect(worker.Apply(ctx, cert)).To(Succeed())
352360

353-
// We expect a retry event on the channel because Patch always fails
361+
// We expect a retry event on the channel because Patch always fails.
362+
// The event is delayed by CertificateApplyRetryBaseDelay (100ms) due to the exponential backoff queue.
354363
retryCh := worker.GetRetryChannel()
355364

356365
var evt event.TypedGenericEvent[*projectcontourv1.HTTPProxy]
357-
Eventually(retryCh, 2*time.Second, 100*time.Millisecond).Should(
366+
Eventually(retryCh, 5*time.Second, 100*time.Millisecond).Should(
358367
Receive(&evt),
359368
"expected an HTTPProxy event to be sent on retry channel after apply failure",
360369
)

controllers/httpproxy_controller_test.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1391,14 +1391,16 @@ func testHTTPProxyReconcile() {
13911391

13921392
prefix := "test-"
13931393
opts := ReconcilerOptions{
1394-
ServiceKey: testServiceKey,
1395-
Prefix: prefix,
1396-
DefaultIssuerName: "test-issuer",
1397-
DefaultIssuerKind: IssuerKind,
1398-
CreateDNSEndpoint: true,
1399-
CreateCertificate: true,
1400-
PropagatedAnnotations: []string{"foo"}, // must propagate an annotation to update cert metadata
1401-
CertificateApplyLimit: 1,
1394+
ServiceKey: testServiceKey,
1395+
Prefix: prefix,
1396+
DefaultIssuerName: "test-issuer",
1397+
DefaultIssuerKind: IssuerKind,
1398+
CreateDNSEndpoint: true,
1399+
CreateCertificate: true,
1400+
PropagatedAnnotations: []string{"foo"}, // must propagate an annotation to update cert metadata
1401+
CertificateApplyLimit: 1,
1402+
CertificateApplyRetryBaseDelay: 1 * time.Millisecond,
1403+
CertificateApplyRetryMaxDelay: 10 * time.Millisecond,
14021404
}
14031405

14041406
err := SetupReconciler(mgr, scm, opts)

controllers/setup.go

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package controllers
22

33
import (
4+
"time"
5+
46
cmapiv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1"
57
projectcontourv1 "github.com/projectcontour/contour/apis/projectcontour/v1"
68
"k8s.io/apimachinery/pkg/runtime"
@@ -14,22 +16,24 @@ import (
1416

1517
// ReconcilerOptions is a set of options for reconcilers
1618
type ReconcilerOptions struct {
17-
ServiceKey client.ObjectKey
18-
Prefix string
19-
DefaultIssuerName string
20-
DefaultIssuerKind string
21-
DefaultDelegatedDomain string
22-
AllowedDelegatedDomains []string
23-
AllowCustomDelegations bool
24-
CSRRevisionLimit uint
25-
CreateDNSEndpoint bool
26-
CreateCertificate bool
27-
IngressClassName string
28-
PropagatedAnnotations []string
29-
PropagatedLabels []string
30-
AllowedDNSNamespaces []string
31-
AllowedIssuerNamespaces []string
32-
CertificateApplyLimit float64
19+
ServiceKey client.ObjectKey
20+
Prefix string
21+
DefaultIssuerName string
22+
DefaultIssuerKind string
23+
DefaultDelegatedDomain string
24+
AllowedDelegatedDomains []string
25+
AllowCustomDelegations bool
26+
CSRRevisionLimit uint
27+
CreateDNSEndpoint bool
28+
CreateCertificate bool
29+
IngressClassName string
30+
PropagatedAnnotations []string
31+
PropagatedLabels []string
32+
AllowedDNSNamespaces []string
33+
AllowedIssuerNamespaces []string
34+
CertificateApplyLimit float64
35+
CertificateApplyRetryBaseDelay time.Duration
36+
CertificateApplyRetryMaxDelay time.Duration
3337
}
3438

3539
// SetupScheme initializes a schema

0 commit comments

Comments
 (0)