Skip to content

Commit 79bcf01

Browse files
chore: rollback rule (#733)
1 parent 6ce45fb commit 79bcf01

File tree

13 files changed

+1456
-5
lines changed

13 files changed

+1456
-5
lines changed

apps/workspace-engine/oapi/openapi.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1313,6 +1313,9 @@
13131313
"retry": {
13141314
"$ref": "#/components/schemas/RetryRule"
13151315
},
1316+
"rollback": {
1317+
"$ref": "#/components/schemas/RollbackRule"
1318+
},
13161319
"verification": {
13171320
"$ref": "#/components/schemas/VerificationRule"
13181321
},
@@ -1851,6 +1854,23 @@
18511854
],
18521855
"type": "object"
18531856
},
1857+
"RollbackRule": {
1858+
"properties": {
1859+
"onJobStatuses": {
1860+
"description": "Job statuses that will trigger a rollback",
1861+
"items": {
1862+
"$ref": "#/components/schemas/JobStatus"
1863+
},
1864+
"type": "array"
1865+
},
1866+
"onVerificationFailure": {
1867+
"default": false,
1868+
"description": "If true, a release target will be rolled back if the verification fails",
1869+
"type": "boolean"
1870+
}
1871+
},
1872+
"type": "object"
1873+
},
18541874
"RuleEvaluation": {
18551875
"properties": {
18561876
"actionRequired": {

apps/workspace-engine/oapi/spec/schemas/policy.jsonnet

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ local openapi = import '../lib/openapi.libsonnet';
6565
deploymentWindow: openapi.schemaRef('DeploymentWindowRule'),
6666
verification: openapi.schemaRef('VerificationRule'),
6767
versionCooldown: openapi.schemaRef('VersionCooldownRule'),
68+
rollback: openapi.schemaRef('RollbackRule'),
6869
},
6970
},
7071

@@ -235,6 +236,22 @@ local openapi = import '../lib/openapi.libsonnet';
235236
},
236237
},
237238

239+
// RollbackRule describes the conditions under which a release target is rolled back:
// on specific job statuses and/or when verification fails.
RollbackRule: {
  type: 'object',
  properties: {
    onJobStatuses: {
      description: 'Job statuses that will trigger a rollback',
      type: 'array',
      items: openapi.schemaRef('JobStatus'),
    },
    onVerificationFailure: {
      description: 'If true, a release target will be rolled back if the verification fails',
      type: 'boolean',
      default: false,
    },
  },
},
254+
238255
DeployDecision: {
239256
type: 'object',
240257
required: ['policyResults'],

apps/workspace-engine/pkg/oapi/oapi.gen.go

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

apps/workspace-engine/pkg/workspace/releasemanager/action/action.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@ import (
99
type ActionTrigger string
1010

1111
const (
12-
TriggerJobCreated ActionTrigger = "job.created"
13-
TriggerJobStarted ActionTrigger = "job.started"
14-
TriggerJobSuccess ActionTrigger = "job.success"
15-
TriggerJobFailure ActionTrigger = "job.failure"
12+
TriggerJobCreated ActionTrigger = "job.created"
13+
TriggerJobStarted ActionTrigger = "job.started"
14+
TriggerJobSuccess ActionTrigger = "job.success"
15+
TriggerJobFailure ActionTrigger = "job.failure"
16+
TriggerJobStatusChange ActionTrigger = "job.statuschange"
1617
)
1718

1819
// ActionContext provides context for action execution

apps/workspace-engine/pkg/workspace/releasemanager/action/orchestrator.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,5 +132,9 @@ func determineTrigger(
132132
return TriggerJobFailure
133133
}
134134

135+
if currentStatus != previousStatus {
136+
return TriggerJobStatusChange
137+
}
138+
135139
return ""
136140
}
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
package rollback
2+
3+
import (
4+
"context"
5+
"time"
6+
"workspace-engine/pkg/oapi"
7+
"workspace-engine/pkg/workspace/releasemanager/deployment/jobs"
8+
"workspace-engine/pkg/workspace/releasemanager/verification"
9+
"workspace-engine/pkg/workspace/store"
10+
11+
"github.com/google/uuid"
12+
"go.opentelemetry.io/otel"
13+
"go.opentelemetry.io/otel/attribute"
14+
"go.opentelemetry.io/otel/codes"
15+
)
16+
17+
// hookTracer is the OpenTelemetry tracer used for spans started by RollbackHooks.
var hookTracer = otel.Tracer("RollbackHooks")
18+
19+
type RollbackHooks struct {
20+
store *store.Store
21+
dispatcher *jobs.Dispatcher
22+
}
23+
24+
// Compile-time check that *RollbackHooks satisfies verification.VerificationHooks.
var _ verification.VerificationHooks = (*RollbackHooks)(nil)
25+
26+
func NewRollbackHooks(store *store.Store, dispatcher *jobs.Dispatcher) *RollbackHooks {
27+
return &RollbackHooks{
28+
store: store,
29+
dispatcher: dispatcher,
30+
}
31+
}
32+
33+
func (h *RollbackHooks) OnVerificationStarted(ctx context.Context, verification *oapi.JobVerification) error {
34+
return nil
35+
}
36+
37+
func (h *RollbackHooks) OnMeasurementTaken(ctx context.Context, verification *oapi.JobVerification, metricIndex int, measurement *oapi.VerificationMeasurement) error {
38+
return nil
39+
}
40+
41+
func (h *RollbackHooks) OnMetricComplete(ctx context.Context, verification *oapi.JobVerification, metricIndex int) error {
42+
return nil
43+
}
44+
45+
func (h *RollbackHooks) OnVerificationComplete(ctx context.Context, verificationResult *oapi.JobVerification) error {
46+
ctx, span := hookTracer.Start(ctx, "RollbackHooks.OnVerificationComplete")
47+
defer span.End()
48+
49+
span.SetAttributes(
50+
attribute.String("verification.id", verificationResult.Id),
51+
attribute.String("verification.job_id", verificationResult.JobId),
52+
)
53+
54+
status := verificationResult.Status()
55+
span.SetAttributes(attribute.String("verification.status", string(status)))
56+
57+
if status != oapi.JobVerificationStatusFailed {
58+
span.SetStatus(codes.Ok, "verification did not fail")
59+
return nil
60+
}
61+
62+
job, ok := h.store.Jobs.Get(verificationResult.JobId)
63+
if !ok {
64+
span.SetStatus(codes.Error, "job not found")
65+
return nil
66+
}
67+
68+
release, ok := h.store.Releases.Get(job.ReleaseId)
69+
if !ok {
70+
span.SetStatus(codes.Error, "release not found")
71+
return nil
72+
}
73+
74+
span.SetAttributes(
75+
attribute.String("release.id", release.ID()),
76+
attribute.String("release_target.key", release.ReleaseTarget.Key()),
77+
)
78+
79+
policies, err := h.store.ReleaseTargets.GetPolicies(ctx, &release.ReleaseTarget)
80+
if err != nil {
81+
span.RecordError(err)
82+
span.SetStatus(codes.Error, "failed to get policies")
83+
return nil
84+
}
85+
86+
if !h.shouldRollbackOnVerificationFailure(policies) {
87+
span.SetAttributes(attribute.Bool("rollback_applicable", false))
88+
span.SetStatus(codes.Ok, "no applicable rollback policy for verification failure")
89+
return nil
90+
}
91+
92+
span.SetAttributes(attribute.Bool("rollback_applicable", true))
93+
94+
currentRelease, lastSuccessfulJob, err := h.store.ReleaseTargets.GetCurrentRelease(ctx, &release.ReleaseTarget)
95+
if err != nil {
96+
span.AddEvent("No previous release to roll back to")
97+
span.SetStatus(codes.Ok, "no previous release available")
98+
return nil
99+
}
100+
101+
// Don't rollback to the same release
102+
if currentRelease.ID() == release.ID() {
103+
span.AddEvent("Current release is the same as failed release, no rollback needed")
104+
span.SetStatus(codes.Ok, "already on current release")
105+
return nil
106+
}
107+
108+
span.SetAttributes(
109+
attribute.String("rollback_to_release.id", currentRelease.ID()),
110+
attribute.String("rollback_to_version.id", currentRelease.Version.Id),
111+
attribute.String("rollback_to_version.tag", currentRelease.Version.Tag),
112+
)
113+
114+
now := time.Now()
115+
newJob := oapi.Job{
116+
Id: uuid.New().String(),
117+
ReleaseId: lastSuccessfulJob.ReleaseId,
118+
JobAgentId: lastSuccessfulJob.JobAgentId,
119+
JobAgentConfig: lastSuccessfulJob.JobAgentConfig,
120+
Status: oapi.JobStatusPending,
121+
CreatedAt: now,
122+
UpdatedAt: now,
123+
}
124+
125+
h.store.Jobs.Upsert(ctx, &newJob)
126+
127+
if err := h.dispatcher.DispatchJob(ctx, &newJob); err != nil {
128+
span.RecordError(err)
129+
span.SetStatus(codes.Error, "rollback execution failed")
130+
return err
131+
}
132+
133+
span.SetStatus(codes.Ok, "rollback executed successfully")
134+
return nil
135+
}
136+
137+
func (h *RollbackHooks) OnVerificationStopped(ctx context.Context, verification *oapi.JobVerification) error {
138+
return nil
139+
}
140+
141+
func (h *RollbackHooks) shouldRollbackOnVerificationFailure(policies []*oapi.Policy) bool {
142+
for _, policy := range policies {
143+
if !policy.Enabled {
144+
continue
145+
}
146+
147+
for _, rule := range policy.Rules {
148+
if rule.Rollback == nil {
149+
continue
150+
}
151+
152+
if rule.Rollback.OnVerificationFailure != nil && *rule.Rollback.OnVerificationFailure {
153+
return true
154+
}
155+
}
156+
}
157+
158+
return false
159+
}
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
package rollback
2+
3+
import (
4+
"context"
5+
"slices"
6+
"time"
7+
"workspace-engine/pkg/oapi"
8+
"workspace-engine/pkg/workspace/releasemanager/action"
9+
"workspace-engine/pkg/workspace/releasemanager/deployment/jobs"
10+
"workspace-engine/pkg/workspace/store"
11+
12+
"github.com/google/uuid"
13+
"go.opentelemetry.io/otel"
14+
"go.opentelemetry.io/otel/attribute"
15+
"go.opentelemetry.io/otel/codes"
16+
)
17+
18+
// tracer is the OpenTelemetry tracer used for spans started by RollbackAction.
var tracer = otel.Tracer("RollbackAction")
19+
20+
type RollbackAction struct {
21+
store *store.Store
22+
dispatcher *jobs.Dispatcher
23+
}
24+
25+
func NewRollbackAction(store *store.Store, dispatcher *jobs.Dispatcher) *RollbackAction {
26+
return &RollbackAction{
27+
store: store,
28+
dispatcher: dispatcher,
29+
}
30+
}
31+
32+
func (r *RollbackAction) Name() string {
33+
return "rollback"
34+
}
35+
36+
func (r *RollbackAction) Execute(
37+
ctx context.Context,
38+
trigger action.ActionTrigger,
39+
actx action.ActionContext,
40+
) error {
41+
ctx, span := tracer.Start(ctx, "RollbackAction.Execute")
42+
defer span.End()
43+
44+
span.SetAttributes(
45+
attribute.String("trigger", string(trigger)),
46+
attribute.String("release.id", actx.Release.ID()),
47+
attribute.String("job.id", actx.Job.Id),
48+
attribute.String("job.status", string(actx.Job.Status)),
49+
)
50+
51+
if !r.shouldRollback(actx.Policies, actx.Job.Status) {
52+
span.SetAttributes(attribute.Bool("rollback_applicable", false))
53+
span.SetStatus(codes.Ok, "no applicable rollback policy")
54+
return nil
55+
}
56+
57+
span.SetAttributes(attribute.Bool("rollback_applicable", true))
58+
59+
currentRelease, lastSuccessfulJob, err := r.store.ReleaseTargets.GetCurrentRelease(ctx, &actx.Release.ReleaseTarget)
60+
if err != nil {
61+
span.AddEvent("No previous release to roll back to")
62+
span.SetStatus(codes.Ok, "no previous release available")
63+
return nil
64+
}
65+
66+
if currentRelease.ID() == actx.Release.ID() {
67+
span.AddEvent("Current release is the same as failed release, no rollback needed")
68+
span.SetStatus(codes.Ok, "already on current release")
69+
return nil
70+
}
71+
72+
span.SetAttributes(
73+
attribute.String("rollback_to_release.id", currentRelease.ID()),
74+
attribute.String("rollback_to_version.id", currentRelease.Version.Id),
75+
attribute.String("rollback_to_version.tag", currentRelease.Version.Tag),
76+
)
77+
78+
now := time.Now()
79+
newJob := oapi.Job{
80+
Id: uuid.New().String(),
81+
ReleaseId: lastSuccessfulJob.ReleaseId,
82+
JobAgentId: lastSuccessfulJob.JobAgentId,
83+
JobAgentConfig: lastSuccessfulJob.JobAgentConfig,
84+
Status: oapi.JobStatusPending,
85+
CreatedAt: now,
86+
UpdatedAt: now,
87+
}
88+
89+
r.store.Jobs.Upsert(ctx, &newJob)
90+
91+
if err := r.dispatcher.DispatchJob(ctx, &newJob); err != nil {
92+
span.RecordError(err)
93+
span.SetStatus(codes.Error, "rollback execution failed")
94+
return err
95+
}
96+
97+
span.SetStatus(codes.Ok, "rollback executed successfully")
98+
return nil
99+
}
100+
101+
func (r *RollbackAction) shouldRollback(policies []*oapi.Policy, jobStatus oapi.JobStatus) bool {
102+
for _, policy := range policies {
103+
if !policy.Enabled {
104+
continue
105+
}
106+
107+
for _, rule := range policy.Rules {
108+
if rule.Rollback == nil {
109+
continue
110+
}
111+
112+
if rule.Rollback.OnJobStatuses != nil &&
113+
slices.Contains(*rule.Rollback.OnJobStatuses, jobStatus) {
114+
return true
115+
}
116+
}
117+
}
118+
119+
return false
120+
}

0 commit comments

Comments
 (0)