Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 65 additions & 15 deletions internal/executor/categorizer/classifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,29 @@ type rule struct {
containerName string
onExitCodes *errormatch.ExitCodeMatcher
onTerminationMessage *regexp.Regexp
onPodError *regexp.Regexp
onConditions []string
subcategory string
hint string
}

// ClassifyResult holds the classification output for a failed pod.
type ClassifyResult struct {
Category string
Subcategory string
// Hint is operator-supplied user-facing copy attached to the matching rule.
// Use AppendHint to attach it to the failure message before emitting events.
Hint string
}

// AppendHint returns the message with this result's hint appended after a blank
// line. Returns the message unchanged when no hint is set. Centralizing the
// format here keeps both event-reporting call sites consistent.
func (r ClassifyResult) AppendHint(message string) string {
if r.Hint == "" {
return message
}
return fmt.Sprintf("%s\n\n%s", message, r.Hint)
}

// Classifier evaluates pods against a set of category rules and returns
Expand Down Expand Up @@ -101,11 +116,14 @@ func buildRule(cfg CategoryRule) (rule, error) {
if cfg.OnTerminationMessage != nil {
matcherCount++
}
if cfg.OnPodError != nil {
matcherCount++
}
if matcherCount == 0 {
return rule{}, fmt.Errorf("rule must specify one of onConditions, onExitCodes, or onTerminationMessage")
return rule{}, fmt.Errorf("rule must specify one of onConditions, onExitCodes, onTerminationMessage, or onPodError")
}
if matcherCount > 1 {
return rule{}, fmt.Errorf("rule must specify only one of onConditions, onExitCodes, or onTerminationMessage")
return rule{}, fmt.Errorf("rule must specify only one of onConditions, onExitCodes, onTerminationMessage, or onPodError")
}

for _, cond := range cfg.OnConditions {
Expand All @@ -128,29 +146,57 @@ func buildRule(cfg CategoryRule) (rule, error) {
}
}

var compiledRegex *regexp.Regexp
var terminationRegex *regexp.Regexp
if cfg.OnTerminationMessage != nil {
re, err := regexp.Compile(cfg.OnTerminationMessage.Pattern)
if err != nil {
return rule{}, fmt.Errorf("invalid regex %q: %w", cfg.OnTerminationMessage.Pattern, err)
return rule{}, fmt.Errorf("invalid onTerminationMessage regex %q: %w", cfg.OnTerminationMessage.Pattern, err)
}
terminationRegex = re
}

var podErrorRegex *regexp.Regexp
if cfg.OnPodError != nil {
re, err := regexp.Compile(cfg.OnPodError.Pattern)
if err != nil {
return rule{}, fmt.Errorf("invalid onPodError regex %q: %w", cfg.OnPodError.Pattern, err)
}
compiledRegex = re
podErrorRegex = re
}

return rule{
containerName: cfg.ContainerName,
onExitCodes: cfg.OnExitCodes,
onConditions: cfg.OnConditions,
onTerminationMessage: compiledRegex,
onTerminationMessage: terminationRegex,
onPodError: podErrorRegex,
subcategory: cfg.Subcategory,
hint: cfg.Hint,
}, nil
}

// Classify returns the category and subcategory for the given pod.
// Rules are evaluated in config order; the first matching rule wins.
// ClassifyContainerError returns the category and subcategory for a pod whose
// failure is described by its own state: terminated containers, exit codes,
// and Kubernetes conditions. Use it for terminated pods (PodFailed phase).
// Returns empty result if the receiver is nil or the pod is nil.
// Returns (defaultCategory, defaultSubcategory) if no rules match.
func (c *Classifier) ClassifyContainerError(pod *v1.Pod) ClassifyResult {
return c.classify(pod, "")
}

// ClassifyPodError returns the category and subcategory for a pod-level failure
// captured by the executor (image pull, missing volume, stuck terminating,
// active deadline exceeded, etc.). It additionally matches podErrorMessage
// against onPodError rules (see CategoryRule.OnPodError); all other rule types
// are evaluated against pod state, preserving first-match-wins across config order.
// Returns empty result if the receiver is nil or the pod is nil.
// Returns (defaultCategory, defaultSubcategory) if no rules match.
func (c *Classifier) Classify(pod *v1.Pod) ClassifyResult {
func (c *Classifier) ClassifyPodError(pod *v1.Pod, podErrorMessage string) ClassifyResult {
return c.classify(pod, podErrorMessage)
}

// Rules are evaluated in config order; the first matching rule wins.
func (c *Classifier) classify(pod *v1.Pod, podErrorMessage string) ClassifyResult {
if c == nil || pod == nil {
return ClassifyResult{}
}
Expand All @@ -160,20 +206,21 @@ func (c *Classifier) Classify(pod *v1.Pod) ClassifyResult {
for _, cat := range c.categories {
for _, r := range cat.rules {
start := time.Now()
matched := ruleMatches(r, containers, podReason)
matched := ruleMatches(r, containers, podReason, podErrorMessage)
metrics.RecordRuleEvaluationDuration(cat.name, r.subcategory, time.Since(start))
if matched {
return ClassifyResult{Category: cat.name, Subcategory: r.subcategory}
return ClassifyResult{Category: cat.name, Subcategory: r.subcategory, Hint: r.hint}
}
}
}
return ClassifyResult{Category: c.defaultCategory, Subcategory: c.defaultSubcategory}
}

// ruleMatches evaluates a single rule. When containerName is set, only that
// container is considered. It checks the first non-nil matcher:
// conditions > exit codes > termination message.
func ruleMatches(r rule, containers []containerInfo, podReason string) bool {
// ruleMatches evaluates a single rule. Container-level matchers honor the
// rule's containerName scope (when set); onPodError ignores it because the
// pod-level error has no container attribution. Exactly one matcher is set
// per rule (validated at NewClassifier).
func ruleMatches(r rule, containers []containerInfo, podReason, podErrorMessage string) bool {
filtered := containers
if r.containerName != "" {
filtered = filterByName(containers, r.containerName)
Expand All @@ -187,6 +234,9 @@ func ruleMatches(r rule, containers []containerInfo, podReason string) bool {
if r.onTerminationMessage != nil {
return matchesTerminationMessage(r.onTerminationMessage, filtered)
}
if r.onPodError != nil {
return podErrorMessage != "" && errormatch.MatchPattern(r.onPodError, podErrorMessage)
}
return false
}

Expand Down
128 changes: 124 additions & 4 deletions internal/executor/categorizer/classifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ func TestClassify(t *testing.T) {
tests := map[string]struct {
config ErrorCategoriesConfig
pod *v1.Pod
podErrorMessage string
expectedCategory string
expectedSubcategory string
}{
Expand Down Expand Up @@ -283,19 +284,130 @@ func TestClassify(t *testing.T) {
pod: podWithTerminatedContainer(1, "Error", ""),
expectedCategory: "",
},
"onPodError matches the captured kubelet error": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "infrastructure", Rules: []CategoryRule{
{OnPodError: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "platform_mismatch"},
}},
}},
pod: &v1.Pod{Status: v1.PodStatus{
Phase: v1.PodPending,
ContainerStatuses: []v1.ContainerStatus{
{Name: "main", State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"},
}},
},
}},
podErrorMessage: `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`,
expectedCategory: "infrastructure",
expectedSubcategory: "platform_mismatch",
},
"empty podErrorMessage does not match onPodError": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "infrastructure", Rules: []CategoryRule{
{OnPodError: &errormatch.RegexMatcher{Pattern: "anything"}, Subcategory: "x"},
}},
}},
pod: &v1.Pod{Status: v1.PodStatus{Phase: v1.PodPending}},
podErrorMessage: "",
expectedCategory: "",
},
// Pins the public contract that ClassifyContainerError never matches onPodError rules,
// even with a pattern (".*") that would otherwise match empty input. The guard is enforced
// at two layers (ruleMatches and errormatch.MatchPattern); this test fails only if both go,
// which is the right level to assert the contract regardless of internal layering.
"ClassifyContainerError must not match onPodError even when regex matches empty": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "infrastructure", Rules: []CategoryRule{
{OnPodError: &errormatch.RegexMatcher{Pattern: ".*"}, Subcategory: "should_not_fire"},
}},
}},
pod: &v1.Pod{Status: v1.PodStatus{Phase: v1.PodPending}},
podErrorMessage: "",
expectedCategory: "",
},
"onPodError ignores ContainerName scope (pod-level error has no container attribution)": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "infrastructure", Rules: []CategoryRule{
{ContainerName: "init", OnPodError: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "platform_mismatch"},
}},
}},
pod: &v1.Pod{Status: v1.PodStatus{
Phase: v1.PodPending,
ContainerStatuses: []v1.ContainerStatus{
{Name: "main", State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"},
}},
},
}},
podErrorMessage: `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`,
expectedCategory: "infrastructure",
expectedSubcategory: "platform_mismatch",
},
"onTerminationMessage does not match pod-level podErrorMessage": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "infrastructure", Rules: []CategoryRule{
{OnTerminationMessage: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "should_not_fire"},
}},
}},
pod: &v1.Pod{Status: v1.PodStatus{
Phase: v1.PodPending,
ContainerStatuses: []v1.ContainerStatus{
{Name: "main", State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"},
}},
},
}},
podErrorMessage: `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`,
expectedCategory: "",
},
}

for name, tc := range tests {
t.Run(name, func(t *testing.T) {
classifier, err := NewClassifier(tc.config)
require.NoError(t, err)
result := classifier.Classify(tc.pod)
var result ClassifyResult
if tc.podErrorMessage == "" {
result = classifier.ClassifyContainerError(tc.pod)
} else {
result = classifier.ClassifyPodError(tc.pod, tc.podErrorMessage)
}
assert.Equal(t, tc.expectedCategory, result.Category)
assert.Equal(t, tc.expectedSubcategory, result.Subcategory)
})
}
}

func TestClassifyResult_AppendHint(t *testing.T) {
tests := map[string]struct {
result ClassifyResult
message string
expected string
}{
"empty hint returns message unchanged": {
result: ClassifyResult{Category: "x", Subcategory: "y"},
message: "raw runtime error",
expected: "raw runtime error",
},
"non-empty hint is appended after a blank line": {
result: ClassifyResult{Hint: "operator guidance"},
message: "raw runtime error",
expected: "raw runtime error\n\noperator guidance",
},
"empty message with hint preserves separator": {
result: ClassifyResult{Hint: "guidance"},
message: "",
expected: "\n\nguidance",
},
}
for name, tc := range tests {
t.Run(name, func(t *testing.T) {
assert.Equal(t, tc.expected, tc.result.AppendHint(tc.message))
})
}
}

func TestNewClassifier_ValidationErrors(t *testing.T) {
tests := map[string]struct {
config ErrorCategoriesConfig
Expand Down Expand Up @@ -348,13 +460,21 @@ func TestNewClassifier_ValidationErrors(t *testing.T) {
}},
errContains: "requires at least one value",
},
"invalid regex": {
"invalid onTerminationMessage regex": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "bad", Rules: []CategoryRule{
{OnTerminationMessage: &errormatch.RegexMatcher{Pattern: "[invalid"}},
}},
}},
errContains: "invalid regex",
errContains: "invalid onTerminationMessage regex",
},
"invalid onPodError regex": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "bad", Rules: []CategoryRule{
{OnPodError: &errormatch.RegexMatcher{Pattern: "[invalid"}},
}},
}},
errContains: "invalid onPodError regex",
},
"empty rules": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{{Name: "empty", Rules: nil}}},
Expand Down Expand Up @@ -424,7 +544,7 @@ func TestClassify_RecordsRuleEvaluationDuration(t *testing.T) {

before := ruleHistogramCounts(t)
pod := podWithTerminatedContainer(42, "Error", "")
result := classifier.Classify(pod)
result := classifier.ClassifyContainerError(pod)
require.Equal(t, "user_error", result.Category)
require.Equal(t, "bad_exit", result.Subcategory)
after := ruleHistogramCounts(t)
Expand Down
25 changes: 23 additions & 2 deletions internal/executor/categorizer/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,19 @@
// category name and the rule's optional subcategory.
//
// Each rule uses exactly one matcher:
// - OnConditions: matches Kubernetes failure signals (OOMKilled, Evicted, DeadlineExceeded, AppError)
// - OnConditions: matches Kubernetes failure signals (OOMKilled, Evicted, DeadlineExceeded)
// - OnExitCodes: matches non-zero container exit codes using In/NotIn set operators
// - OnTerminationMessage: matches container termination messages against a regex
// - OnPodError: matches a pod-level error message captured by the executor
// against a regex; covers failures with no useful container terminationMessage
// (image pull, missing volume, stuck terminating, deadline exceeded, etc.)
//
// Container-level matchers honor ContainerName scoping when set. OnPodError
// ignores it because pod-level error text has no container attribution.
//
// Each rule may also set Hint, an optional user-facing string that the executor
// appends to the failure message. Hints land in lookoutdb.job_run.error and
// are surfaced to users in Lookout alongside the raw runtime error.
//
// Exit code 0 is always skipped. Both regular and init containers are checked.
//
Expand All @@ -29,8 +39,13 @@
// rules:
// - onConditions: ["OOMKilled"]
// subcategory: "oom"
// hint: "Increase the memory request in your job spec"
// - onConditions: ["Evicted"]
// subcategory: "eviction"
// - onPodError:
// pattern: "no match for platform in manifest"
// subcategory: "platform_mismatch"
// hint: "Build the image for the cluster's CPU architecture (typically x64/arm64 mismatch)"
// - name: user_code
// rules:
// - onExitCodes:
Expand All @@ -53,5 +68,11 @@
// if err != nil {
// // handle invalid config
// }
// result := classifier.Classify(pod) // result.Category = "infrastructure", result.Subcategory = "oom"
//
// // Terminated pod: container state carries the relevant termination signals.
// result := classifier.ClassifyContainerError(pod)
//
// // Pre-terminal failure: an executor-captured error message is matched
// // against onPodError rules in addition to pod state.
// result = classifier.ClassifyPodError(pod, podErrorMessage)
package categorizer
Loading
Loading