Content-Length: 33307 | pFad | http://github.com/coder/coder/pull/18008.patch
thub.com
From 6fb6800d0d0b4357aabef958c20a3f78295f4b34 Mon Sep 17 00:00:00 2001
From: evgeniy-scherbina
Date: Thu, 22 May 2025 18:32:19 +0000
Subject: [PATCH 01/10] feat: add hard-limited presets metric
---
.../coderd/prebuilds/metricscollector.go | 41 +++++++++++++++++++
enterprise/coderd/prebuilds/reconcile.go | 2 +
enterprise/coderd/prebuilds/reconcile_test.go | 38 ++++++++++++++++-
3 files changed, 80 insertions(+), 1 deletion(-)
diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go
index 7a7734b6f8093..8d1d3185b7feb 100644
--- a/enterprise/coderd/prebuilds/metricscollector.go
+++ b/enterprise/coderd/prebuilds/metricscollector.go
@@ -27,6 +27,7 @@ const (
MetricDesiredGauge = namespace + "desired"
MetricRunningGauge = namespace + "running"
MetricEligibleGauge = namespace + "eligible"
+ MetricPresetHardLimitedGauge = namespace + "preset_hard_limited"
MetricLastUpdatedGauge = namespace + "metrics_last_updated"
)
@@ -82,6 +83,12 @@ var (
labels,
nil,
)
+ presetHardLimitedDesc = prometheus.NewDesc(
+ MetricPresetHardLimitedGauge,
+ "Indicates whether a given preset has reached the hard failure limit (1 for hard-limited, 0 otherwise).",
+ labels,
+ nil,
+ )
lastUpdateDesc = prometheus.NewDesc(
MetricLastUpdatedGauge,
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
@@ -104,17 +111,22 @@ type MetricsCollector struct {
replacementsCounter map[replacementKey]float64
replacementsCounterMu sync.Mutex
+
+ isPresetHardLimited map[hardLimitedPresetKey]bool
+ isPresetHardLimitedMu sync.Mutex
}
var _ prometheus.Collector = new(MetricsCollector)
func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
log := logger.Named("prebuilds_metrics_collector")
+
return &MetricsCollector{
database: db,
logger: log,
snapshotter: snapshotter,
replacementsCounter: make(map[replacementKey]float64),
+ isPresetHardLimited: make(map[hardLimitedPresetKey]bool),
}
}
@@ -126,6 +138,7 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
descCh <- desiredPrebuildsDesc
descCh <- runningPrebuildsDesc
descCh <- eligiblePrebuildsDesc
+ descCh <- presetHardLimitedDesc
descCh <- lastUpdateDesc
}
@@ -173,6 +186,17 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
}
+ mc.isPresetHardLimitedMu.Lock()
+ for key, isHardLimited := range mc.isPresetHardLimited {
+ var val float64
+ if isHardLimited {
+ val = 1
+ }
+
+ metricsCh <- prometheus.MustNewConstMetric(presetHardLimitedDesc, prometheus.GaugeValue, val, key.templateName, key.presetName, key.orgName)
+ }
+ mc.isPresetHardLimitedMu.Unlock()
+
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix()))
}
@@ -247,3 +271,20 @@ func (mc *MetricsCollector) trackResourceReplacement(orgName, templateName, pres
// cause an issue (or indeed if either would), so we just track the replacement.
mc.replacementsCounter[key]++
}
+
+type hardLimitedPresetKey struct {
+ orgName, templateName, presetName string
+}
+
+func (k hardLimitedPresetKey) String() string {
+ return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName)
+}
+
+func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) {
+ mc.isPresetHardLimitedMu.Lock()
+ defer mc.isPresetHardLimitedMu.Unlock()
+
+ key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName}
+
+ mc.isPresetHardLimited[key] = isHardLimited
+}
diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go
index 7796e43777951..f4a7887b6033a 100644
--- a/enterprise/coderd/prebuilds/reconcile.go
+++ b/enterprise/coderd/prebuilds/reconcile.go
@@ -361,6 +361,8 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
slog.F("preset_name", ps.Preset.Name),
)
+ c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited)
+
// If the preset was previously hard-limited, log it and exit early.
if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited {
logger.Warn(ctx, "skipping hard limited preset")
diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go
index f52a77ca500b9..681a557bd3fe7 100644
--- a/enterprise/coderd/prebuilds/reconcile_test.go
+++ b/enterprise/coderd/prebuilds/reconcile_test.go
@@ -696,7 +696,8 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
).Leveled(slog.LevelDebug)
db, pubSub := dbtestutil.NewDB(t)
fakeEnqueuer := newFakeEnqueuer()
- controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry(), fakeEnqueuer)
+ registry := prometheus.NewRegistry()
+ controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, registry, fakeEnqueuer)
// Template admin to receive a notification.
templateAdmin := dbgen.User(t, db, database.User{
@@ -732,6 +733,17 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
workspaceCount := len(workspaces)
require.Equal(t, 1, workspaceCount)
+ // Verify initial state: metric is not set - meaning preset is not hard limited.
+ require.NoError(t, controller.ForceMetricsUpdate(ctx))
+ mf, err := registry.Gather()
+ require.NoError(t, err)
+ metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+ "template_name": template.Name,
+ "preset_name": preset.Name,
+ "org_name": org.Name,
+ })
+ require.Nil(t, metric)
+
// We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called.
// Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond.
clock.Advance(time.Nanosecond).MustWait(ctx)
@@ -755,6 +767,18 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
// When hard limit is not reached, a new workspace should be created.
require.Equal(t, 2, len(workspaces))
require.Equal(t, database.PrebuildStatusHealthy, updatedPreset.PrebuildStatus)
+
+ // When hard limit is not reached, metric is set to 0.
+ mf, err = registry.Gather()
+ require.NoError(t, err)
+ metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+ "template_name": template.Name,
+ "preset_name": preset.Name,
+ "org_name": org.Name,
+ })
+ require.NotNil(t, metric)
+ require.NotNil(t, metric.GetGauge())
+ require.EqualValues(t, 0, metric.GetGauge().GetValue())
return
}
@@ -775,6 +799,18 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
return true
})
require.Len(t, matching, 1)
+
+ // When hard limit is reached, metric is set to 1.
+ mf, err = registry.Gather()
+ require.NoError(t, err)
+ metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+ "template_name": template.Name,
+ "preset_name": preset.Name,
+ "org_name": org.Name,
+ })
+ require.NotNil(t, metric)
+ require.NotNil(t, metric.GetGauge())
+ require.EqualValues(t, 1, metric.GetGauge().GetValue())
})
}
}
From a1a4f2a2a7b1be28b3d93228183ccf6f7df1e618 Mon Sep 17 00:00:00 2001
From: evgeniy-scherbina
Date: Fri, 23 May 2025 15:59:45 +0000
Subject: [PATCH 02/10] fix: report metric only for active template version
---
enterprise/coderd/prebuilds/reconcile.go | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go
index f4a7887b6033a..d90316d567170 100644
--- a/enterprise/coderd/prebuilds/reconcile.go
+++ b/enterprise/coderd/prebuilds/reconcile.go
@@ -361,7 +361,9 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
slog.F("preset_name", ps.Preset.Name),
)
- c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited)
+ if !ps.Preset.Deleted && ps.Preset.UsingActiveVersion {
+ c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited)
+ }
// If the preset was previously hard-limited, log it and exit early.
if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited {
From 26676840a5d970a345a59dd1732d2433b0af4009 Mon Sep 17 00:00:00 2001
From: evgeniy-scherbina
Date: Fri, 23 May 2025 16:43:38 +0000
Subject: [PATCH 03/10] fix: hard-limiting should only apply to creation of
prebuilds
---
enterprise/coderd/prebuilds/reconcile.go | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go
index d90316d567170..b18d2c1b0c3f4 100644
--- a/enterprise/coderd/prebuilds/reconcile.go
+++ b/enterprise/coderd/prebuilds/reconcile.go
@@ -365,16 +365,10 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited)
}
- // If the preset was previously hard-limited, log it and exit early.
- if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited {
- logger.Warn(ctx, "skipping hard limited preset")
- return nil
- }
-
// If the preset reached the hard failure limit for the first time during this iteration:
// - Mark it as hard-limited in the database
// - Send notifications to template admins
- if ps.IsHardLimited {
+ if ps.Preset.PrebuildStatus != database.PrebuildStatusHardLimited && ps.IsHardLimited {
logger.Warn(ctx, "skipping hard limited preset")
err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
@@ -388,10 +382,7 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
err = c.notifyPrebuildFailureLimitReached(ctx, ps)
if err != nil {
logger.Error(ctx, "failed to notify that number of prebuild failures reached the limit", slog.Error(err))
- return nil
}
-
- return nil
}
state := ps.CalculateState()
@@ -456,6 +447,14 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
actions.Create = desired
}
+ if actions.Create > 0 {
+ // If the preset is hard-limited, log it and exit early.
+ if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited || ps.IsHardLimited {
+ logger.Warn(ctx, "skipping hard limited preset")
+ return nil
+ }
+ }
+
var multiErr multierror.Error
for range actions.Create {
From 6a14a8b2564f7491ce971181b2d330c2a7cfe4d2 Mon Sep 17 00:00:00 2001
From: evgeniy-scherbina
Date: Fri, 23 May 2025 20:26:09 +0000
Subject: [PATCH 04/10] test: improve test coverage for hard-limited presets
---
enterprise/coderd/prebuilds/reconcile_test.go | 207 ++++++++++++++++++
1 file changed, 207 insertions(+)
diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go
index 681a557bd3fe7..1d4a47fc442ad 100644
--- a/enterprise/coderd/prebuilds/reconcile_test.go
+++ b/enterprise/coderd/prebuilds/reconcile_test.go
@@ -815,6 +815,213 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
}
}
+func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
+ t.Parallel()
+
+ if !dbtestutil.WillUsePostgres() {
+ t.Skip("This test requires postgres")
+ }
+
+ // Test cases verify the behavior of prebuild creation depending on configured failure limits.
+ testCases := []struct {
+ name string
+ hardLimit int64
+ isHardLimitHit bool
+ }{
+ {
+ name: "hard limit is hit - skip creation of prebuilt workspace",
+ hardLimit: 1,
+ isHardLimitHit: true,
+ },
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ clock := quartz.NewMock(t)
+ ctx := testutil.Context(t, testutil.WaitShort)
+ cfg := codersdk.PrebuildsConfig{
+ FailureHardLimit: serpent.Int64(tc.hardLimit),
+ ReconciliationBackoffInterval: 0,
+ }
+ logger := slogtest.Make(
+ t, &slogtest.Options{IgnoreErrors: true},
+ ).Leveled(slog.LevelDebug)
+ db, pubSub := dbtestutil.NewDB(t)
+ fakeEnqueuer := newFakeEnqueuer()
+ registry := prometheus.NewRegistry()
+ controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, registry, fakeEnqueuer)
+
+ // Template admin to receive a notification.
+ templateAdmin := dbgen.User(t, db, database.User{
+ RBACRoles: []string{codersdk.RoleTemplateAdmin},
+ })
+
+ // Set up test environment with a template, version, and preset.
+ ownerID := uuid.New()
+ dbgen.User(t, db, database.User{
+ ID: ownerID,
+ })
+ org, template := setupTestDBTemplate(t, db, ownerID, false)
+ templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID)
+ preset := setupTestDBPreset(t, db, templateVersionID, 2, uuid.New().String())
+
+ // Create a successful prebuilt workspace.
+ successfulWorkspace, _ := setupTestDBPrebuild(
+ t,
+ clock,
+ db,
+ pubSub,
+ database.WorkspaceTransitionStart,
+ database.ProvisionerJobStatusSucceeded,
+ org.ID,
+ preset,
+ template.ID,
+ templateVersionID,
+ )
+
+ // Make sure that prebuilt workspaces created in such order: [successful, failed].
+ clock.Advance(time.Second).MustWait(ctx)
+
+ // Create a failed prebuilt workspace that counts toward the hard failure limit.
+ setupTestDBPrebuild(
+ t,
+ clock,
+ db,
+ pubSub,
+ database.WorkspaceTransitionStart,
+ database.ProvisionerJobStatusFailed,
+ org.ID,
+ preset,
+ template.ID,
+ templateVersionID,
+ )
+
+ getJobStatusMap := func(workspaces []database.WorkspaceTable) map[database.ProvisionerJobStatus]int {
+ jobStatusMap := make(map[database.ProvisionerJobStatus]int)
+ for _, workspace := range workspaces {
+ workspaceBuilds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{
+ WorkspaceID: workspace.ID,
+ })
+ require.NoError(t, err)
+
+ for _, workspaceBuild := range workspaceBuilds {
+ job, err := db.GetProvisionerJobByID(ctx, workspaceBuild.JobID)
+ require.NoError(t, err)
+ jobStatusMap[job.JobStatus]++
+ }
+ }
+ return jobStatusMap
+ }
+
+ // Verify initial state: two workspaces exist, one successful, one failed.
+ workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID)
+ require.NoError(t, err)
+ require.Equal(t, 2, len(workspaces))
+ jobStatusMap := getJobStatusMap(workspaces)
+ require.Len(t, jobStatusMap, 2)
+ require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded])
+ require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed])
+
+ //Verify initial state: metric is not set - meaning preset is not hard limited.
+ require.NoError(t, controller.ForceMetricsUpdate(ctx))
+ mf, err := registry.Gather()
+ require.NoError(t, err)
+ metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+ "template_name": template.Name,
+ "preset_name": preset.Name,
+ "org_name": org.Name,
+ })
+ require.Nil(t, metric)
+
+ // We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called.
+ // Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond.
+ clock.Advance(time.Nanosecond).MustWait(ctx)
+
+ // Trigger reconciliation to attempt creating a new prebuild.
+ // The outcome depends on whether the hard limit has been reached.
+ require.NoError(t, controller.ReconcileAll(ctx))
+
+ // These two additional calls to ReconcileAll should not trigger any notifications.
+ // A notification is only sent once.
+ require.NoError(t, controller.ReconcileAll(ctx))
+ require.NoError(t, controller.ReconcileAll(ctx))
+
+ // Verify the final state after reconciliation.
+ // When hard limit is reached, no new workspace should be created.
+ workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
+ require.NoError(t, err)
+ require.Equal(t, 2, len(workspaces))
+ jobStatusMap = getJobStatusMap(workspaces)
+ require.Len(t, jobStatusMap, 2)
+ require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded])
+ require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed])
+
+ updatedPreset, err := db.GetPresetByID(ctx, preset.ID)
+ require.NoError(t, err)
+ require.Equal(t, database.PrebuildStatusHardLimited, updatedPreset.PrebuildStatus)
+
+ // When hard limit is reached, a notification should be sent.
+ matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool {
+ if !assert.Equal(t, notifications.PrebuildFailureLimitReached, notification.TemplateID, "unexpected template") {
+ return false
+ }
+
+ if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") {
+ return false
+ }
+
+ return true
+ })
+ require.Len(t, matching, 1)
+
+ // When hard limit is reached, metric is set to 1.
+ mf, err = registry.Gather()
+ require.NoError(t, err)
+ metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+ "template_name": template.Name,
+ "preset_name": preset.Name,
+ "org_name": org.Name,
+ })
+ require.NotNil(t, metric)
+ require.NotNil(t, metric.GetGauge())
+ require.EqualValues(t, 1, metric.GetGauge().GetValue())
+
+ // When: the template is deleted.
+ require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{
+ ID: template.ID,
+ Deleted: true,
+ UpdatedAt: dbtime.Now(),
+ }))
+
+ // Trigger reconciliation to make sure that successful, but outdated prebuilt workspace will be deleted.
+ require.NoError(t, controller.ReconcileAll(ctx))
+
+ workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
+ require.NoError(t, err)
+ require.Equal(t, 2, len(workspaces))
+
+ jobStatusMap = getJobStatusMap(workspaces)
+ require.Len(t, jobStatusMap, 3)
+ require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded])
+ require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed])
+ // Pending job should be the job that deletes successful, but outdated prebuilt workspace.
+ // Prebuilt workspace MUST be deleted, despite the fact that preset is marked as hard limited.
+ require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusPending])
+
+ workspaceBuilds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{
+ WorkspaceID: successfulWorkspace.ID,
+ })
+ require.NoError(t, err)
+ require.Equal(t, 2, len(workspaceBuilds))
+ // Make sure that successfully created, but outdated prebuilt workspace was scheduled for deletion.
+ require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition)
+ require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition)
+ })
+ }
+}
+
func TestRunLoop(t *testing.T) {
t.Parallel()
From 0882626f6d04fe11dc349ff37fa4a5b7a626027b Mon Sep 17 00:00:00 2001
From: evgeniy-scherbina
Date: Fri, 23 May 2025 20:54:21 +0000
Subject: [PATCH 05/10] fix: make fmt
---
enterprise/coderd/prebuilds/reconcile_test.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go
index 1d4a47fc442ad..22d1318208e68 100644
--- a/enterprise/coderd/prebuilds/reconcile_test.go
+++ b/enterprise/coderd/prebuilds/reconcile_test.go
@@ -924,7 +924,7 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded])
require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed])
- //Verify initial state: metric is not set - meaning preset is not hard limited.
+ // Verify initial state: metric is not set - meaning preset is not hard limited.
require.NoError(t, controller.ForceMetricsUpdate(ctx))
mf, err := registry.Gather()
require.NoError(t, err)
From 1c530741d401a105f728a26868f52303bfb5b0c9 Mon Sep 17 00:00:00 2001
From: evgeniy-scherbina
Date: Fri, 23 May 2025 23:04:56 +0000
Subject: [PATCH 06/10] refactor: improve documentation
---
enterprise/coderd/prebuilds/reconcile.go | 26 ++++++++++++-------
enterprise/coderd/prebuilds/reconcile_test.go | 12 +++++++++
2 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go
index b18d2c1b0c3f4..603c0da3588a6 100644
--- a/enterprise/coderd/prebuilds/reconcile.go
+++ b/enterprise/coderd/prebuilds/reconcile.go
@@ -361,15 +361,22 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
slog.F("preset_name", ps.Preset.Name),
)
- if !ps.Preset.Deleted && ps.Preset.UsingActiveVersion {
- c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited)
- }
+ // Report a preset as hard-limited only if all the following conditions are met:
+ // - The preset is marked as hard-limited
+ // - The preset is using the active version of its template, and the template has not been deleted
+ //
+ // The second condition is important because a hard-limited preset that has become outdated is no longer relevant.
+ // Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
+ // as hard-limited to the admin.
+ reportAsHardLimited := ps.IsHardLimited && ps.Preset.UsingActiveVersion && !ps.Preset.Deleted
+ c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, reportAsHardLimited)
// If the preset reached the hard failure limit for the first time during this iteration:
// - Mark it as hard-limited in the database
// - Send notifications to template admins
+ // - Continue execution, we disallow only creation operation for hard-limited presets. Deletion is allowed.
if ps.Preset.PrebuildStatus != database.PrebuildStatusHardLimited && ps.IsHardLimited {
- logger.Warn(ctx, "skipping hard limited preset")
+ logger.Warn(ctx, "preset is hard limited, notifying template admins")
err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
Status: database.PrebuildStatusHardLimited,
@@ -447,12 +454,11 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
actions.Create = desired
}
- if actions.Create > 0 {
- // If the preset is hard-limited, log it and exit early.
- if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited || ps.IsHardLimited {
- logger.Warn(ctx, "skipping hard limited preset")
- return nil
- }
+ // If preset is hard-limited, and it's a create operation, log it and exit early.
+ // Creation operation is disallowed for hard-limited preset.
+ if ps.IsHardLimited && actions.Create > 0 {
+ logger.Warn(ctx, "skipping hard limited preset for create operation")
+ return nil
}
var multiErr multierror.Error
diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go
index 22d1318208e68..42a5af552c4b5 100644
--- a/enterprise/coderd/prebuilds/reconcile_test.go
+++ b/enterprise/coderd/prebuilds/reconcile_test.go
@@ -1018,6 +1018,18 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
// Make sure that successfully created, but outdated prebuilt workspace was scheduled for deletion.
require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition)
require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition)
+
+ // Metric is reset to zero after preset became outdated.
+ mf, err = registry.Gather()
+ require.NoError(t, err)
+ metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+ "template_name": template.Name,
+ "preset_name": preset.Name,
+ "org_name": org.Name,
+ })
+ require.NotNil(t, metric)
+ require.NotNil(t, metric.GetGauge())
+ require.EqualValues(t, 0, metric.GetGauge().GetValue())
})
}
}
From 3100e01a3d404cde877088c1dacf7de14193d1da Mon Sep 17 00:00:00 2001
From: evgeniy-scherbina
Date: Fri, 23 May 2025 23:32:25 +0000
Subject: [PATCH 07/10] test: improve testing coverage
---
enterprise/coderd/prebuilds/reconcile_test.go | 41 +++++++++++++------
1 file changed, 29 insertions(+), 12 deletions(-)
diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go
index 42a5af552c4b5..391fb21022bb7 100644
--- a/enterprise/coderd/prebuilds/reconcile_test.go
+++ b/enterprise/coderd/prebuilds/reconcile_test.go
@@ -824,14 +824,24 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
// Test cases verify the behavior of prebuild creation depending on configured failure limits.
testCases := []struct {
- name string
- hardLimit int64
- isHardLimitHit bool
+ name string
+ hardLimit int64
+ createNewTemplateVersion bool
+ deleteTemplate bool
}{
{
- name: "hard limit is hit - skip creation of prebuilt workspace",
- hardLimit: 1,
- isHardLimitHit: true,
+ // hard limit is hit - but we allow deletion of prebuilt workspace because it's outdated (new template version was created)
+ name: "new template version is created",
+ hardLimit: 1,
+ createNewTemplateVersion: true,
+ deleteTemplate: false,
+ },
+ {
+ // hard limit is hit - but we allow deletion of prebuilt workspace because template is deleted
+ name: "template is deleted",
+ hardLimit: 1,
+ createNewTemplateVersion: false,
+ deleteTemplate: true,
},
}
@@ -988,12 +998,19 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
require.NotNil(t, metric.GetGauge())
require.EqualValues(t, 1, metric.GetGauge().GetValue())
- // When: the template is deleted.
- require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{
- ID: template.ID,
- Deleted: true,
- UpdatedAt: dbtime.Now(),
- }))
+ if tc.createNewTemplateVersion {
+ // Create a new template version and mark it as active
+ // This marks the template version that we care about as inactive
+ setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID)
+ }
+
+ if tc.deleteTemplate {
+ require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{
+ ID: template.ID,
+ Deleted: true,
+ UpdatedAt: dbtime.Now(),
+ }))
+ }
// Trigger reconciliation to make sure that successful, but outdated prebuilt workspace will be deleted.
require.NoError(t, controller.ReconcileAll(ctx))
From 80c89fea87f181454af812f3df147f3ad4c42f42 Mon Sep 17 00:00:00 2001
From: evgeniy-scherbina
Date: Mon, 26 May 2025 12:39:58 +0000
Subject: [PATCH 08/10] refactor: CR's fixes
---
enterprise/coderd/prebuilds/metricscollector.go | 6 +++++-
enterprise/coderd/prebuilds/reconcile_test.go | 6 ++----
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go
index 8d1d3185b7feb..2109362867a8a 100644
--- a/enterprise/coderd/prebuilds/metricscollector.go
+++ b/enterprise/coderd/prebuilds/metricscollector.go
@@ -286,5 +286,9 @@ func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, preset
key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName}
- mc.isPresetHardLimited[key] = isHardLimited
+ if isHardLimited {
+ mc.isPresetHardLimited[key] = true
+ } else {
+ delete(mc.isPresetHardLimited, key)
+ }
}
diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go
index 391fb21022bb7..7bc0ac1cd641d 100644
--- a/enterprise/coderd/prebuilds/reconcile_test.go
+++ b/enterprise/coderd/prebuilds/reconcile_test.go
@@ -1036,7 +1036,7 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition)
require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition)
- // Metric is reset to zero after preset became outdated.
+ // Metric is deleted after preset became outdated.
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
@@ -1044,9 +1044,7 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
"preset_name": preset.Name,
"org_name": org.Name,
})
- require.NotNil(t, metric)
- require.NotNil(t, metric.GetGauge())
- require.EqualValues(t, 0, metric.GetGauge().GetValue())
+ require.Nil(t, metric)
})
}
}
From 1b9593b3ae895012f1062fa1c801a4ea4b7fd59f Mon Sep 17 00:00:00 2001
From: evgeniy-scherbina
Date: Mon, 26 May 2025 13:04:01 +0000
Subject: [PATCH 09/10] fix: fix tests & linter
---
enterprise/coderd/prebuilds/metricscollector.go | 1 +
enterprise/coderd/prebuilds/reconcile_test.go | 6 ++----
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go
index 2109362867a8a..e4aca7c989033 100644
--- a/enterprise/coderd/prebuilds/metricscollector.go
+++ b/enterprise/coderd/prebuilds/metricscollector.go
@@ -280,6 +280,7 @@ func (k hardLimitedPresetKey) String() string {
return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName)
}
+// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus.
func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) {
mc.isPresetHardLimitedMu.Lock()
defer mc.isPresetHardLimitedMu.Unlock()
diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go
index 7bc0ac1cd641d..b7f5974197150 100644
--- a/enterprise/coderd/prebuilds/reconcile_test.go
+++ b/enterprise/coderd/prebuilds/reconcile_test.go
@@ -768,7 +768,7 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
require.Equal(t, 2, len(workspaces))
require.Equal(t, database.PrebuildStatusHealthy, updatedPreset.PrebuildStatus)
- // When hard limit is not reached, metric is set to 0.
+ // When hard limit is not reached, metric is not set.
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
@@ -776,9 +776,7 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
"preset_name": preset.Name,
"org_name": org.Name,
})
- require.NotNil(t, metric)
- require.NotNil(t, metric.GetGauge())
- require.EqualValues(t, 0, metric.GetGauge().GetValue())
+ require.Nil(t, metric)
return
}
From 00a2fcb379a979d3a2e10657dfef161375d71555 Mon Sep 17 00:00:00 2001
From: Yevhenii Shcherbina
Date: Mon, 26 May 2025 11:27:36 -0400
Subject: [PATCH 10/10] Update metric description
Co-authored-by: Susana Ferreira
---
enterprise/coderd/prebuilds/metricscollector.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go
index e4aca7c989033..90257c26dd580 100644
--- a/enterprise/coderd/prebuilds/metricscollector.go
+++ b/enterprise/coderd/prebuilds/metricscollector.go
@@ -85,7 +85,7 @@ var (
)
presetHardLimitedDesc = prometheus.NewDesc(
MetricPresetHardLimitedGauge,
- "Indicates whether a given preset has reached the hard failure limit (1 for hard-limited, 0 otherwise).",
+ "Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise.",
labels,
nil,
)
--- a PPN by Garber Painting Akron. With Image Size Reduction included!Fetched URL: http://github.com/coder/coder/pull/18008.patch
Alternative Proxies:
Alternative Proxy
pFad Proxy
pFad v3 Proxy
pFad v4 Proxy