feat(scheduler): Enforce concurrency_limit for CONCURRENT overlap policy in scheduler (#8024)

YaweiZhang-930 · web-flow · commit e786a4257e9b · 2026-05-01T11:45:28.000-07:00
**What changed?** - Implement SchedulePolicies.concurrency_limit for CONCURRENT by tracking multiple in-flight target runs and enforcing the cap in processScheduleFireActivity **Why?** - SchedulePolicies.ConcurrencyLimit was defined in the IDL but was never used — CONCURRENT always allowed unlimited parallel runs. We need a way to cap how many instances of a target workflow run simultaneously - **Why RunningWorkflows?** Unlike LastStartedWorkflow (which tracks only the last started workflow), bounded CONCURRENT must track all in-flight workflows. On each fire, the activity describes every tracked workflow, prunes completed entries, and compares the surviving count against the cap. RunningWorkflows is the state that carries this set across fires and across ContinueAsNew. - **Why MaxConcurrencyLimitSystemLimit = 1000?** RunningWorkflows is serialized into the ContinueAsNew payload. An unbounded user-configured limit could grow this slice enough to exceed Cadence's 2MB blob size limit (BlobSizeLimitError), which causes the workflow to fail with no graceful recovery. This mirrors the existing MaxBufferedFiresSystemLimit pattern for the BUFFER policy. - **Why handleUpdate cleanup?** When a schedule switches away from bounded CONCURRENT to a different policy, or the limit is set to 0, the tracked list becomes meaningless. handleUpdate clears it to avoid carrying stale entries. **How did you test it?** - Unit tests for processScheduleFireActivity: at-capacity skip, room-available start, completed-workflow pruning, describe error propagation, and AlreadyStartedError handling - Metrics tests: bounded-CONCURRENT skip emits SchedulerFireSkippedCountPerDomain with the correct overlap-policy tag - handleUpdate tests: RunningWorkflows is cleared when leaving bounded CONCURRENT (policy change or limit → 0), and preserved when only the limit changes between positive values **Potential risks** - The ConcurrencyLimit flag is missing from the CLI command. 
A user running `cadence schedule create --help` sees no mention of concurrency_limit at all. - The early-return guard in schedule_commands silently drops the value. https://github.com/cadence-workflow/cadence/blob/95a7d9d56a25988b7bbfd7143f68597dbfe3fbcd/tools/cli/schedule_commands.go#L371 **Release notes** N/A **Documentation Changes** - Consider adding an operator note that ConcurrencyLimit should be kept below a certain threshold to avoid workflow-state size concerns --------- Signed-off-by: YaweiZhang-930 <yawei930@gmail.com>
diff --git a/service/worker/scheduler/activity.go b/service/worker/scheduler/activity.go
@@ -75,6 +75,32 @@ func processScheduleFireActivity(ctx context.Context, req ProcessFireRequest) (r
 		policy = types.ScheduleOverlapPolicySkipNew
 	}
 
+	// Bounded CONCURRENT: describe each tracked in-flight workflow, prune
+	// completed entries, and enforce the cap. When under the cap, falls through
+	// to the shared start block; stillRunning is used there to build
+	// result.ActiveWorkflows. When at or over the cap, returns early with a skip.
+	isBoundedConcurrent := policy == types.ScheduleOverlapPolicyConcurrent && req.ConcurrencyLimit > 0
+	var stillRunning []RunningWorkflowInfo
+	if isBoundedConcurrent {
+		effectiveLimit := effectiveConcurrencyLimit(req.ConcurrencyLimit)
+		for _, wf := range req.RunningWorkflows {
+			running, err := isWorkflowRunning(ctx, sc.FrontendClient, req.Domain, &wf)
+			if err != nil {
+				return nil, err
+			}
+			if running {
+				stillRunning = append(stillRunning, wf)
+			}
+		}
+		if len(stillRunning) >= int(effectiveLimit) {
+			scope.Tagged(metrics.OverlapPolicyTag(policy.String()), metrics.TriggerSourceTag(string(req.TriggerSource))).
+				IncCounter(metrics.SchedulerFireSkippedCountPerDomain)
+			result.SkippedDelta = 1
+			result.ActiveWorkflows = stillRunning
+			return result, nil
+		}
+	}
+
 	if policy != types.ScheduleOverlapPolicyConcurrent && req.LastStartedWorkflow != nil {
 		running, err := isWorkflowRunning(ctx, sc.FrontendClient, req.Domain, req.LastStartedWorkflow)
 		if err != nil {
@@ -135,11 +161,15 @@ func processScheduleFireActivity(ctx context.Context, req ProcessFireRequest) (r
 		var alreadyStarted *types.WorkflowExecutionAlreadyStartedError
 		if errors.As(err, &alreadyStarted) {
 			scope.Tagged(metrics.TriggerSourceTag(string(req.TriggerSource))).IncCounter(metrics.SchedulerFireAlreadyRunningCountPerDomain)
-			result.SkippedDelta = 1
-			result.StartedWorkflow = &RunningWorkflowInfo{
+			existing := &RunningWorkflowInfo{
 				WorkflowID: workflowID,
 				RunID:      alreadyStarted.RunID,
 			}
+			result.SkippedDelta = 1
+			result.StartedWorkflow = existing
+			if isBoundedConcurrent {
+				result.ActiveWorkflows = append(stillRunning, *existing)
+			}
 			return result, nil
 		}
 		return nil, fmt.Errorf("failed to start workflow: %w", err)
@@ -155,6 +185,9 @@ func processScheduleFireActivity(ctx context.Context, req ProcessFireRequest) (r
 		WorkflowID: workflowID,
 		RunID:      resp.GetRunID(),
 	}
+	if isBoundedConcurrent {
+		result.ActiveWorkflows = append(stillRunning, *result.StartedWorkflow)
+	}
 	return result, nil
 }
 
diff --git a/service/worker/scheduler/activity_test.go b/service/worker/scheduler/activity_test.go
@@ -383,6 +383,175 @@ func TestProcessScheduleFireActivity(t *testing.T) {
 				StartedWorkflow: &RunningWorkflowInfo{WorkflowID: expectedWfID, RunID: "new-run"},
 			},
 		},
+		{
+			name: "CONCURRENT with cap: at capacity, skips new fire",
+			req: func() ProcessFireRequest {
+				r := baseReq
+				r.OverlapPolicy = types.ScheduleOverlapPolicyConcurrent
+				r.ConcurrencyLimit = 2
+				r.RunningWorkflows = []RunningWorkflowInfo{
+					{WorkflowID: "wf-1", RunID: "run-1"},
+					{WorkflowID: "wf-2", RunID: "run-2"},
+				}
+				return r
+			}(),
+			setupMock: func(m *frontend.MockClient) {
+				m.EXPECT().DescribeWorkflowExecution(gomock.Any(), gomock.Any()).
+					Return(&types.DescribeWorkflowExecutionResponse{
+						WorkflowExecutionInfo: &types.WorkflowExecutionInfo{CloseStatus: nil},
+					}, nil).Times(2)
+			},
+			wantResult: &ProcessFireResult{
+				SkippedDelta: 1,
+				ActiveWorkflows: []RunningWorkflowInfo{
+					{WorkflowID: "wf-1", RunID: "run-1"},
+					{WorkflowID: "wf-2", RunID: "run-2"},
+				},
+			},
+		},
+		{
+			name: "CONCURRENT with cap: slot available, starts new workflow",
+			req: func() ProcessFireRequest {
+				r := baseReq
+				r.OverlapPolicy = types.ScheduleOverlapPolicyConcurrent
+				r.ConcurrencyLimit = 3
+				r.RunningWorkflows = []RunningWorkflowInfo{
+					{WorkflowID: "wf-1", RunID: "run-1"},
+					{WorkflowID: "wf-2", RunID: "run-2"},
+				}
+				return r
+			}(),
+			setupMock: func(m *frontend.MockClient) {
+				m.EXPECT().DescribeWorkflowExecution(gomock.Any(), gomock.Any()).
+					Return(&types.DescribeWorkflowExecutionResponse{
+						WorkflowExecutionInfo: &types.WorkflowExecutionInfo{CloseStatus: nil},
+					}, nil).Times(2)
+				m.EXPECT().StartWorkflowExecution(gomock.Any(), gomock.Any()).
+					Return(&types.StartWorkflowExecutionResponse{RunID: "new-run"}, nil)
+			},
+			wantResult: &ProcessFireResult{
+				TotalDelta:      1,
+				StartedWorkflow: &RunningWorkflowInfo{WorkflowID: expectedWfID, RunID: "new-run"},
+				ActiveWorkflows: []RunningWorkflowInfo{
+					{WorkflowID: "wf-1", RunID: "run-1"},
+					{WorkflowID: "wf-2", RunID: "run-2"},
+					{WorkflowID: expectedWfID, RunID: "new-run"},
+				},
+			},
+		},
+		{
+			name: "CONCURRENT with cap: completed workflows pruned, freed slot starts new workflow",
+			req: func() ProcessFireRequest {
+				r := baseReq
+				r.OverlapPolicy = types.ScheduleOverlapPolicyConcurrent
+				r.ConcurrencyLimit = 2
+				r.RunningWorkflows = []RunningWorkflowInfo{
+					{WorkflowID: "wf-1", RunID: "run-1"},
+					{WorkflowID: "wf-2", RunID: "run-2"},
+				}
+				return r
+			}(),
+			setupMock: func(m *frontend.MockClient) {
+				closed := types.WorkflowExecutionCloseStatus(0)
+				// wf-1 is closed; wf-2 is still running
+				gomock.InOrder(
+					m.EXPECT().DescribeWorkflowExecution(gomock.Any(), gomock.Any()).
+						Return(&types.DescribeWorkflowExecutionResponse{
+							WorkflowExecutionInfo: &types.WorkflowExecutionInfo{CloseStatus: &closed},
+						}, nil),
+					m.EXPECT().DescribeWorkflowExecution(gomock.Any(), gomock.Any()).
+						Return(&types.DescribeWorkflowExecutionResponse{
+							WorkflowExecutionInfo: &types.WorkflowExecutionInfo{CloseStatus: nil},
+						}, nil),
+				)
+				m.EXPECT().StartWorkflowExecution(gomock.Any(), gomock.Any()).
+					Return(&types.StartWorkflowExecutionResponse{RunID: "new-run"}, nil)
+			},
+			wantResult: &ProcessFireResult{
+				TotalDelta:      1,
+				StartedWorkflow: &RunningWorkflowInfo{WorkflowID: expectedWfID, RunID: "new-run"},
+				ActiveWorkflows: []RunningWorkflowInfo{
+					{WorkflowID: "wf-2", RunID: "run-2"},
+					{WorkflowID: expectedWfID, RunID: "new-run"},
+				},
+			},
+		},
+		{
+			name: "CONCURRENT with cap: all tracked workflows completed, starts new workflow",
+			req: func() ProcessFireRequest {
+				r := baseReq
+				r.OverlapPolicy = types.ScheduleOverlapPolicyConcurrent
+				r.ConcurrencyLimit = 1
+				r.RunningWorkflows = []RunningWorkflowInfo{
+					{WorkflowID: "wf-1", RunID: "run-1"},
+				}
+				return r
+			}(),
+			setupMock: func(m *frontend.MockClient) {
+				closed := types.WorkflowExecutionCloseStatus(0)
+				m.EXPECT().DescribeWorkflowExecution(gomock.Any(), gomock.Any()).
+					Return(&types.DescribeWorkflowExecutionResponse{
+						WorkflowExecutionInfo: &types.WorkflowExecutionInfo{CloseStatus: &closed},
+					}, nil)
+				m.EXPECT().StartWorkflowExecution(gomock.Any(), gomock.Any()).
+					Return(&types.StartWorkflowExecutionResponse{RunID: "new-run"}, nil)
+			},
+			wantResult: &ProcessFireResult{
+				TotalDelta:      1,
+				StartedWorkflow: &RunningWorkflowInfo{WorkflowID: expectedWfID, RunID: "new-run"},
+				ActiveWorkflows: []RunningWorkflowInfo{
+					{WorkflowID: expectedWfID, RunID: "new-run"},
+				},
+			},
+		},
+		{
+			name: "CONCURRENT with cap: describe error during slot check propagates",
+			req: func() ProcessFireRequest {
+				r := baseReq
+				r.OverlapPolicy = types.ScheduleOverlapPolicyConcurrent
+				r.ConcurrencyLimit = 2
+				r.RunningWorkflows = []RunningWorkflowInfo{
+					{WorkflowID: "wf-1", RunID: "run-1"},
+				}
+				return r
+			}(),
+			setupMock: func(m *frontend.MockClient) {
+				m.EXPECT().DescribeWorkflowExecution(gomock.Any(), gomock.Any()).
+					Return(nil, errors.New("connection refused"))
+			},
+			wantErr: true,
+		},
+		{
+			name: "CONCURRENT with cap: already-started includes workflow in active set",
+			req: func() ProcessFireRequest {
+				r := baseReq
+				r.OverlapPolicy = types.ScheduleOverlapPolicyConcurrent
+				r.ConcurrencyLimit = 3
+				r.RunningWorkflows = []RunningWorkflowInfo{
+					{WorkflowID: "wf-1", RunID: "run-1"},
+				}
+				return r
+			}(),
+			setupMock: func(m *frontend.MockClient) {
+				m.EXPECT().DescribeWorkflowExecution(gomock.Any(), gomock.Any()).
+					Return(&types.DescribeWorkflowExecutionResponse{
+						WorkflowExecutionInfo: &types.WorkflowExecutionInfo{CloseStatus: nil},
+					}, nil)
+				m.EXPECT().StartWorkflowExecution(gomock.Any(), gomock.Any()).
+					Return(nil, &types.WorkflowExecutionAlreadyStartedError{
+						Message: "already started",
+						RunID:   "existing-run",
+					})
+			},
+			wantResult: &ProcessFireResult{
+				SkippedDelta:    1,
+				StartedWorkflow: &RunningWorkflowInfo{WorkflowID: expectedWfID, RunID: "existing-run"},
+				ActiveWorkflows: []RunningWorkflowInfo{
+					{WorkflowID: "wf-1", RunID: "run-1"},
+					{WorkflowID: expectedWfID, RunID: "existing-run"},
+				},
+			},
+		},
 		{
 			name: "AlreadyStartedError returns skipped with RunID",
 			req:  baseReq,
@@ -648,6 +817,32 @@ func TestProcessScheduleFireActivityMetrics(t *testing.T) {
 				metrics.SchedulerFireErrorCountPerDomain,
 			},
 		},
+		{
+			name: "CONCURRENT with cap at capacity: emits skipped counter",
+			req: func() ProcessFireRequest {
+				r := baseReq
+				r.OverlapPolicy = types.ScheduleOverlapPolicyConcurrent
+				r.ConcurrencyLimit = 2
+				r.RunningWorkflows = []RunningWorkflowInfo{
+					{WorkflowID: "wf-1", RunID: "run-1"},
+					{WorkflowID: "wf-2", RunID: "run-2"},
+				}
+				return r
+			}(),
+			setupMock: func(m *frontend.MockClient) {
+				m.EXPECT().DescribeWorkflowExecution(gomock.Any(), gomock.Any()).
+					Return(&types.DescribeWorkflowExecutionResponse{
+						WorkflowExecutionInfo: &types.WorkflowExecutionInfo{CloseStatus: nil},
+					}, nil).Times(2)
+			},
+			wantCounters: []metrics.MetricIdx{metrics.SchedulerFireSkippedCountPerDomain},
+			wantNoCounter: []metrics.MetricIdx{
+				metrics.SchedulerFireStartedCountPerDomain,
+				metrics.SchedulerFireBufferedCountPerDomain,
+				metrics.SchedulerFireAlreadyRunningCountPerDomain,
+				metrics.SchedulerFireErrorCountPerDomain,
+			},
+		},
 		{
 			name: "start error emits error counter",
 			req:  baseReq,
@@ -819,3 +1014,22 @@ func TestProcessScheduleFireActivityLatency(t *testing.T) {
 		})
 	}
 }
+
+// TestEffectiveConcurrencyLimit checks that limits up to the system ceiling
+// pass through unchanged and that larger limits are clamped to the ceiling.
+func TestEffectiveConcurrencyLimit(t *testing.T) {
+	const ceiling int32 = MaxConcurrencyLimitSystemLimit
+	cases := []struct {
+		name            string
+		input, expected int32
+	}{
+		{name: "below system limit returned as-is", input: 1, expected: 1},
+		{name: "typical value returned as-is", input: 10, expected: 10},
+		{name: "at system limit returned as-is", input: ceiling, expected: ceiling},
+		{name: "one above system limit clamped", input: ceiling + 1, expected: ceiling},
+		{name: "large value clamped to system limit", input: 10000, expected: ceiling},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) { assert.Equal(t, c.expected, effectiveConcurrencyLimit(c.input)) })
+	}
+}
diff --git a/service/worker/scheduler/types.go b/service/worker/scheduler/types.go
@@ -75,6 +75,14 @@ const (
 	// entries stays well within the workflow input size limit.
 	MaxBufferedFiresSystemLimit = 1000
 
+	// MaxConcurrencyLimitSystemLimit caps ConcurrencyLimit for the bounded CONCURRENT
+	// overlap policy regardless of the user-configured value. It bounds the
+	// RunningWorkflows slice carried in ContinueAsNew payload: each RunningWorkflowInfo
+	// is ~110 bytes JSON, so 1000 entries adds ~107KB — well within the 2MB hard limit
+	// and leaving headroom for the rest of the workflow state. Exceeding the 2MB limit
+	// causes Cadence to fail the workflow entirely with no graceful degradation.
+	MaxConcurrencyLimitSystemLimit = 1000
+
 	// signal_type tag values for scheduler_signal_received_count metric.
 	signalTypeTagPause    = "pause"
 	signalTypeTagUnpause  = "unpause"
@@ -152,6 +160,9 @@ type SchedulerWorkflowState struct {
 	// the overlap policy can check whether it is still running before starting
 	// the next one. Nil when no workflow has been started yet.
 	LastStartedWorkflow *RunningWorkflowInfo `json:"lastStartedWorkflow,omitempty"`
+	// RunningWorkflows holds in-flight target workflows under bounded CONCURRENT
+	// (ConcurrencyLimit > 0); completed entries are pruned by the activity on each fire.
+	RunningWorkflows []RunningWorkflowInfo `json:"runningWorkflows,omitempty"`
 }
 
 // BufferedFire is a schedule fire queued for sequential execution by the BUFFER
@@ -259,6 +270,11 @@ type ProcessFireRequest struct {
 	TriggerSource       TriggerSource               `json:"triggerSource"`
 	OverlapPolicy       types.ScheduleOverlapPolicy `json:"overlapPolicy"`
 	LastStartedWorkflow *RunningWorkflowInfo        `json:"lastStartedWorkflow,omitempty"`
+	// ConcurrencyLimit mirrors SchedulePolicies.ConcurrencyLimit; 0 = unlimited.
+	ConcurrencyLimit int32 `json:"concurrencyLimit,omitempty"`
+	// RunningWorkflows is the current in-flight set from workflow state; used
+	// only when OverlapPolicy==CONCURRENT and ConcurrencyLimit > 0.
+	RunningWorkflows []RunningWorkflowInfo `json:"runningWorkflows,omitempty"`
 }
 
 // ProcessFireResult is the output of processScheduleFireActivity. The workflow
@@ -272,4 +288,7 @@ type ProcessFireResult struct {
 	// appends the fire to state.BufferedFires and retries draining on the
 	// next loop iteration.
 	Buffered bool `json:"buffered,omitempty"`
+	// ActiveWorkflows is the updated in-flight set for bounded CONCURRENT; the workflow
+	// replaces state.RunningWorkflows with it after each fire. Nil for all other policies.
+	ActiveWorkflows []RunningWorkflowInfo `json:"activeWorkflows,omitempty"`
 }
diff --git a/service/worker/scheduler/workflow.go b/service/worker/scheduler/workflow.go
@@ -422,6 +422,20 @@ func handleUpdate(logger *zap.Logger, sig UpdateSignal, input *SchedulerWorkflow
 			state.SkippedRuns += int64(len(state.BufferedFires))
 			state.BufferedFires = nil
 		}
+		// Drop running-workflow tracking when leaving bounded CONCURRENT: the
+		// list is meaningless under any other policy or when limit becomes 0.
+		newOverlap := input.Policies.OverlapPolicy
+		newLimit := input.Policies.ConcurrencyLimit
+		if previousOverlap == types.ScheduleOverlapPolicyConcurrent &&
+			(newOverlap != types.ScheduleOverlapPolicyConcurrent || newLimit == 0) &&
+			len(state.RunningWorkflows) > 0 {
+			logger.Warn("policy change cleared running workflows tracking",
+				zap.String("from", previousOverlap.String()),
+				zap.String("to", newOverlap.String()),
+				zap.Int32("newLimit", newLimit),
+				zap.Int("clearedCount", len(state.RunningWorkflows)))
+			state.RunningWorkflows = nil
+		}
 	}
 	if changed {
 		logger.Info("schedule updated")
@@ -533,6 +547,8 @@ func tryStartFire(ctx workflow.Context, logger *zap.Logger, input *SchedulerWork
 		TriggerSource:       trigger,
 		OverlapPolicy:       input.Policies.OverlapPolicy,
 		LastStartedWorkflow: state.LastStartedWorkflow,
+		ConcurrencyLimit:    input.Policies.ConcurrencyLimit,
+		RunningWorkflows:    state.RunningWorkflows,
 	}
 
 	var result ProcessFireResult
@@ -554,6 +570,9 @@ func tryStartFire(ctx workflow.Context, logger *zap.Logger, input *SchedulerWork
 	if result.StartedWorkflow != nil {
 		state.LastStartedWorkflow = result.StartedWorkflow
 	}
+	if result.ActiveWorkflows != nil {
+		state.RunningWorkflows = result.ActiveWorkflows
+	}
 
 	if result.TotalDelta > 0 && result.StartedWorkflow != nil {
 		logger.Info("scheduled workflow started",
@@ -612,6 +631,18 @@ func effectiveBufferLimit(userLimit int32) (effective int, reason string) {
 	return int(userLimit), BufferOverflowReasonUserLimit
 }
 
+// effectiveConcurrencyLimit resolves the cap actually enforced for the bounded
+// CONCURRENT overlap policy. A userLimit at or below
+// MaxConcurrencyLimitSystemLimit is returned unchanged; anything larger is
+// clamped to that ceiling without reporting an error. Callers are expected to
+// pass userLimit > 0 (i.e., only when isBoundedConcurrent is true).
+func effectiveConcurrencyLimit(userLimit int32) int32 {
+	if userLimit <= MaxConcurrencyLimitSystemLimit {
+		return userLimit
+	}
+	return MaxConcurrencyLimitSystemLimit
+}
+
 // drainBufferedFires executes queued fires in FIFO order, stopping as soon as
 // one re-buffers (previous target workflow still running) or
 // maxDrainFiresPerExecution fires have been processed. Returns true when more
diff --git a/service/worker/scheduler/workflow_test.go b/service/worker/scheduler/workflow_test.go