Skip to content

Commit

Permalink
Add Prometheus rule generators for metrics aggregated by sloth ID (#2)
Browse files Browse the repository at this point in the history
To be used when SLO has one metrics series and some labels change, to continue the same series in the recording rules. Ignore all other labels except for sloth_id.
  • Loading branch information
krisleipus authored Jun 20, 2024
1 parent 7bec2a8 commit bfebb37
Show file tree
Hide file tree
Showing 3 changed files with 470 additions and 4 deletions.
181 changes: 181 additions & 0 deletions internal/prometheus/recording_rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ type sliRecordingRulesGenerator struct {
// Normally these rules are used by the SLO alerts.
var OptimizedSLIRecordingRulesGenerator = sliRecordingRulesGenerator{genFunc: optimizedFactorySLIRecordGenerator}

// OptimizedBySlothIDSLIRecordingRulesGenerator knows how to generate the SLI prometheus recording rules
// from an SLO optimizing where it can and aggregating by sloth_id to allow changes to other labels without breaking.
// Only the rule whose window equals the SLO time window is optimized; the
// remaining windows use the regular SLI rule generator.
var OptimizedBySlothIDSLIRecordingRulesGenerator = sliRecordingRulesGenerator{genFunc: optimizedFactorySLIBySlothIDRecordGenerator}

// SLIRecordingRulesGenerator knows how to generate the SLI prometheus recording rules
// from an SLO.
// Normally these rules are used by the SLO alerts.
Expand All @@ -40,6 +44,15 @@ func optimizedFactorySLIRecordGenerator(slo SLO, window time.Duration, alerts al
return factorySLIRecordGenerator(slo, window, alerts)
}

// optimizedFactorySLIBySlothIDRecordGenerator picks the rule generator for a
// single window. The window that matches the SLO total period gets the
// sloth_id-aggregated optimized rule, derived from the page-quick short window;
// every other window falls back to factorySLIRecordGenerator.
func optimizedFactorySLIBySlothIDRecordGenerator(slo SLO, window time.Duration, alerts alert.MWMBAlertGroup) (*rulefmt.Rule, error) {
	// Anything that is not the total period time window is generated the regular way.
	if window != slo.TimeWindow {
		return factorySLIRecordGenerator(slo, window, alerts)
	}

	return optimizedBySlothIDSLIRecordGenerator(slo, window, alerts.PageQuick.ShortWindow)
}

func (s sliRecordingRulesGenerator) GenerateSLIRecordingRules(ctx context.Context, slo SLO, alerts alert.MWMBAlertGroup) ([]rulefmt.Rule, error) {
// Get the windows we need the recording rules.
windows := getAlertGroupWindows(alerts)
Expand Down Expand Up @@ -196,6 +209,56 @@ count_over_time({{.metric}}{{.filter}}[{{.window}}])
}, nil
}

// optimizedBySlothIDSLIRecordGenerator generates the same optimized SLI recording rule as optimizedSLIRecordGenerator,
// just aggregates by the sloth_id label to allow changes to other labels without breaking.
//
// The expression for window is built on top of the SLI error metric that is
// recorded for shortWindow, filtered by the SLO's sloth_id. An error is
// returned when window equals shortWindow, or when the expression template
// cannot be parsed or rendered.
func optimizedBySlothIDSLIRecordGenerator(slo SLO, window, shortWindow time.Duration) (*rulefmt.Rule, error) {
	// Averages over ratios (average over average) is statistically incorrect, so we do
	// aggregate all ratios on the time window and then divide with the aggregation of all the full ratios
	// that is 1 (that's why we can use `count`), giving us a correct ratio of ratios:
	// - https://prometheus.io/docs/practices/rules/
	// - https://math.stackexchange.com/questions/95909/why-is-an-average-of-an-average-usually-incorrect
	const sliExprTplFmt = `sum by (sloth_id) (sum_over_time({{.metric}}{{.filter}}[{{.window}}]))
/
sum by (sloth_id) (count_over_time({{.metric}}{{.filter}}[{{.window}}]))
`

	// The optimized rule reads the short window recording over the big
	// window, so both being the same window would make the rule read itself.
	if window == shortWindow {
		return nil, fmt.Errorf("can't optimize using the same shortwindow as the window to optimize")
	}

	shortWindowSLIRec := slo.GetSLIErrorMetric(shortWindow)

	// Render with our templated data.
	tpl, err := template.New("sliExpr").Option("missingkey=error").Parse(sliExprTplFmt)
	if err != nil {
		return nil, fmt.Errorf("could not create SLI expression template data: %w", err)
	}

	strWindow := timeDurationToPromStr(window)
	var b bytes.Buffer
	// Only the keys the template references are passed; both sides of the
	// division are already reduced to sloth_id, so no label needs ignoring.
	err = tpl.Execute(&b, map[string]string{
		"metric": shortWindowSLIRec,
		"filter": slothIDFilter(slo.ID),
		"window": strWindow,
	})
	if err != nil {
		return nil, fmt.Errorf("could not render SLI expression template: %w", err)
	}

	return &rulefmt.Rule{
		Record: slo.GetSLIErrorMetric(window),
		Expr:   b.String(),
		Labels: mergeLabels(
			slo.GetSLOIDPromLabels(),
			map[string]string{
				sloWindowLabelName: strWindow,
			},
			slo.Labels,
		),
	}, nil
}

// metadataRecordingRulesGenerator is the receiver type that carries the
// GenerateMetadataRecordingRules method.
type metadataRecordingRulesGenerator bool

// MetadataRecordingRulesGenerator knows how to generate the metadata prometheus recording rules
Expand Down Expand Up @@ -305,7 +368,125 @@ func (m metadataRecordingRulesGenerator) GenerateMetadataRecordingRules(ctx cont
return rules, nil
}

// metadataRecordingRulesBySlothIDGenerator is the receiver type for the
// sloth_id-aggregated GenerateMetadataRecordingRules method. The method does
// not read the underlying bool value.
type metadataRecordingRulesBySlothIDGenerator bool

// MetadataRecordingRulesBySlothIDGenerator knows how to generate the metadata prometheus recording rules
// from an SLO aggregating by sloth_id to allow changes to other labels without breaking.
const MetadataRecordingRulesBySlothIDGenerator = metadataRecordingRulesBySlothIDGenerator(false)

// GenerateMetadataRecordingRules returns the metadata recording rules of an
// SLO: objective, error budget, time period, current and period burn rates,
// remaining error budget for the period and the SLO info metric.
//
// The burn rate and remaining budget expressions select series only by the
// SLO's sloth_id and collapse them with max(), so they do not depend on any
// other label of the underlying series. Every rule is labeled with the SLO ID
// labels merged with the SLO's own labels; the info rule additionally carries
// version, mode, spec and objective labels.
//
// It returns an error if any of the burn rate expressions can't be rendered.
func (m metadataRecordingRulesBySlothIDGenerator) GenerateMetadataRecordingRules(ctx context.Context, info info.Info, slo SLO, alerts alert.MWMBAlertGroup) ([]rulefmt.Rule, error) {
	labels := mergeLabels(slo.GetSLOIDPromLabels(), slo.Labels)

	// Metadata recordings.
	const (
		metricSLOObjectiveRatio                  = "slo:objective:ratio"
		metricSLOErrorBudgetRatio                = "slo:error_budget:ratio"
		metricSLOTimePeriodDays                  = "slo:time_period:days"
		metricSLOCurrentBurnRateRatio            = "slo:current_burn_rate:ratio"
		metricSLOPeriodBurnRateRatio             = "slo:period_burn_rate:ratio"
		metricSLOPeriodErrorBudgetRemainingRatio = "slo:period_error_budget_remaining:ratio"
		metricSLOInfo                            = "sloth_slo_info"
	)

	// The objective is handled as a percentage, the rules record it as a ratio.
	sloObjectiveRatio := slo.Objective / 100

	sloFilter := slothIDFilter(slo.ID)

	// renderBurnRate renders the burn rate expression for the given SLI error
	// metric. Only the keys consumed by burnRateRecordingBySlothIDExprTpl are
	// passed to the template.
	renderBurnRate := func(sliErrorMetric string) (string, error) {
		var b bytes.Buffer
		err := burnRateRecordingBySlothIDExprTpl.Execute(&b, map[string]string{
			"SLIErrorMetric":         sliErrorMetric,
			"MetricFilter":           sloFilter,
			"ErrorBudgetRatioMetric": metricSLOErrorBudgetRatio,
		})
		if err != nil {
			return "", err
		}

		return b.String(), nil
	}

	currentBurnRateExpr, err := renderBurnRate(slo.GetSLIErrorMetric(alerts.PageQuick.ShortWindow))
	if err != nil {
		return nil, fmt.Errorf("could not render current burn rate prometheus metadata recording rule expression: %w", err)
	}

	periodBurnRateExpr, err := renderBurnRate(slo.GetSLIErrorMetric(slo.TimeWindow))
	if err != nil {
		return nil, fmt.Errorf("could not render period burn rate prometheus metadata recording rule expression: %w", err)
	}

	rules := []rulefmt.Rule{
		// SLO Objective.
		{
			Record: metricSLOObjectiveRatio,
			Expr:   fmt.Sprintf(`vector(%g)`, sloObjectiveRatio),
			Labels: labels,
		},

		// Error budget.
		{
			Record: metricSLOErrorBudgetRatio,
			Expr:   fmt.Sprintf(`vector(1-%g)`, sloObjectiveRatio),
			Labels: labels,
		},

		// Total period.
		{
			Record: metricSLOTimePeriodDays,
			Expr:   fmt.Sprintf(`vector(%g)`, slo.TimeWindow.Hours()/24),
			Labels: labels,
		},

		// Current burning speed.
		{
			Record: metricSLOCurrentBurnRateRatio,
			Expr:   currentBurnRateExpr,
			Labels: labels,
		},

		// Total period burn rate.
		{
			Record: metricSLOPeriodBurnRateRatio,
			Expr:   periodBurnRateExpr,
			Labels: labels,
		},

		// Total Error budget remaining period.
		{
			Record: metricSLOPeriodErrorBudgetRemainingRatio,
			Expr:   fmt.Sprintf(`1 - max(%s%s)`, metricSLOPeriodBurnRateRatio, sloFilter),
			Labels: labels,
		},

		// Info.
		{
			Record: metricSLOInfo,
			Expr:   `vector(1)`,
			Labels: mergeLabels(labels, map[string]string{
				sloVersionLabelName:   info.Version,
				sloModeLabelName:      string(info.Mode),
				sloSpecLabelName:      info.Spec,
				sloObjectiveLabelName: strconv.FormatFloat(slo.Objective, 'f', -1, 64),
			}),
		},
	}

	return rules, nil
}

// burnRateRecordingExprTpl renders a burn rate expression: the SLI error
// metric divided by the error budget ratio metric, both selected with the same
// metric filter and matched on the SLO ID, SLO name and SLO service label
// names (group_left).
//
// Expected data keys: SLIErrorMetric, MetricFilter, SLOIDName, SLOLabelName,
// SLOServiceName and ErrorBudgetRatioMetric. With missingkey=error, executing
// it against a map lacking any of them fails instead of rendering a blank.
// The template is parsed at package initialization and panics if it is invalid.
var burnRateRecordingExprTpl = template.Must(template.New("burnRateExpr").Option("missingkey=error").Parse(`{{ .SLIErrorMetric }}{{ .MetricFilter }}
/ on({{ .SLOIDName }}, {{ .SLOLabelName }}, {{ .SLOServiceName }}) group_left
{{ .ErrorBudgetRatioMetric }}{{ .MetricFilter }}
`))

// burnRateRecordingBySlothIDExprTpl renders a burn rate expression that does
// not rely on label matching: each side of the division (SLI error metric and
// error budget ratio metric, both selected with the same metric filter) is
// collapsed with max() before dividing.
//
// Expected data keys: SLIErrorMetric, MetricFilter and ErrorBudgetRatioMetric.
// With missingkey=error, executing it against a map lacking any of them fails
// instead of rendering a blank. The template is parsed at package
// initialization and panics if it is invalid.
//
// NOTE(review): it reuses the "burnRateExpr" template name of
// burnRateRecordingExprTpl; both are independent template sets, so the name
// only shows up in template error messages.
var burnRateRecordingBySlothIDExprTpl = template.Must(template.New("burnRateExpr").Option("missingkey=error").Parse(`max({{ .SLIErrorMetric }}{{ .MetricFilter }})
/
max({{ .ErrorBudgetRatioMetric }}{{ .MetricFilter }})
`))

// slothIDFilter returns a Prometheus label selector that matches only the
// given SLO ID on the sloth_id label, e.g. {sloth_id="my-svc-my-slo"}.
// The ID is rendered as a double-quoted, escaped string.
func slothIDFilter(sloID string) string {
	const selectorFmt = `{sloth_id=%q}`

	selector := fmt.Sprintf(selectorFmt, sloID)

	return selector
}
Loading

0 comments on commit bfebb37

Please sign in to comment.