@@ -51,6 +51,7 @@ export class RunExecution {
51
51
52
52
private _runFriendlyId ?: string ;
53
53
private currentSnapshotId ?: string ;
54
+ private currentAttemptNumber ?: number ;
54
55
private currentTaskRunEnv ?: Record < string , string > ;
55
56
56
57
private dequeuedAt ?: Date ;
@@ -65,6 +66,7 @@ export class RunExecution {
65
66
private snapshotPoller ?: RunExecutionSnapshotPoller ;
66
67
67
68
private lastHeartbeat ?: Date ;
69
+ private isShuttingDown = false ;
68
70
69
71
constructor ( opts : RunExecutionOptions ) {
70
72
this . id = randomBytes ( 4 ) . toString ( "hex" ) ;
@@ -86,10 +88,6 @@ export class RunExecution {
86
88
throw new Error ( "prepareForExecution called after process was already created" ) ;
87
89
}
88
90
89
- if ( this . isPreparedForNextRun ) {
90
- throw new Error ( "prepareForExecution called after execution was already prepared" ) ;
91
- }
92
-
93
91
this . taskRunProcess = this . createTaskRunProcess ( {
94
92
envVars : opts . taskRunEnv ,
95
93
isWarmStart : true ,
@@ -150,9 +148,14 @@ export class RunExecution {
150
148
}
151
149
152
150
/**
 * Whether this execution can still be used to run something.
 *
 * Evaluates to true only when both conditions hold:
 *  - no run friendly ID has been recorded on this execution, and
 *  - a task run process exists and reports `isPreparedForNextRun`.
 *
 * @returns true if no run has been started yet and the process is prepared for the next run.
 */
get canExecute(): boolean {
  // Once a run ID has been recorded this execution can't be reused, so the
  // process state is not even consulted in that case (short-circuit).
  return !this._runFriendlyId && Boolean(this.taskRunProcess?.isPreparedForNextRun);
}
158
161
@@ -161,6 +164,11 @@ export class RunExecution {
161
164
* or when the snapshot poller detects a change
162
165
*/
163
166
public async handleSnapshotChange ( runData : RunExecutionData ) : Promise < void > {
167
+ if ( this . isShuttingDown ) {
168
+ this . sendDebugLog ( "handleSnapshotChange: shutting down, skipping" ) ;
169
+ return ;
170
+ }
171
+
164
172
const { run, snapshot, completedWaitpoints } = runData ;
165
173
166
174
const snapshotMetadata = {
@@ -191,8 +199,6 @@ export class RunExecution {
191
199
return ;
192
200
}
193
201
194
- this . sendDebugLog ( `enqueued snapshot change: ${ snapshot . executionStatus } ` , snapshotMetadata ) ;
195
-
196
202
this . snapshotChangeQueue . push ( runData ) ;
197
203
await this . processSnapshotChangeQueue ( ) ;
198
204
}
@@ -240,11 +246,16 @@ export class RunExecution {
240
246
}
241
247
242
248
if ( snapshot . friendlyId === this . currentSnapshotId ) {
243
- this . sendDebugLog ( "handleSnapshotChange: snapshot not changed" , snapshotMetadata ) ;
244
249
return ;
245
250
}
246
251
247
- this . sendDebugLog ( `snapshot change: ${ snapshot . executionStatus } ` , snapshotMetadata ) ;
252
+ if ( this . currentAttemptNumber && this . currentAttemptNumber !== run . attemptNumber ) {
253
+ this . sendDebugLog ( "ERROR: attempt number mismatch" , snapshotMetadata ) ;
254
+ await this . taskRunProcess ?. suspend ( ) ;
255
+ return ;
256
+ }
257
+
258
+ this . sendDebugLog ( `snapshot has changed to: ${ snapshot . executionStatus } ` , snapshotMetadata ) ;
248
259
249
260
// Reset the snapshot poll interval so we don't do unnecessary work
250
261
this . snapshotPoller ?. resetCurrentInterval ( ) ;
@@ -456,6 +467,16 @@ export class RunExecution {
456
467
// A snapshot was just created, so update the snapshot ID
457
468
this . currentSnapshotId = start . data . snapshot . friendlyId ;
458
469
470
+ // Also set or update the attempt number - we do this to detect illegal attempt number changes, e.g. from stalled runners coming back online
471
+ const attemptNumber = start . data . run . attemptNumber ;
472
+ if ( attemptNumber && attemptNumber > 0 ) {
473
+ this . currentAttemptNumber = attemptNumber ;
474
+ } else {
475
+ this . sendDebugLog ( "ERROR: invalid attempt number returned from start attempt" , {
476
+ attemptNumber : String ( attemptNumber ) ,
477
+ } ) ;
478
+ }
479
+
459
480
const metrics = this . measureExecutionMetrics ( {
460
481
attemptCreatedAt : attemptStartedAt ,
461
482
dequeuedAt : this . dequeuedAt ?. getTime ( ) ,
@@ -597,8 +618,18 @@ export class RunExecution {
597
618
metrics : TaskRunExecutionMetrics ;
598
619
isWarmStart ?: boolean ;
599
620
} ) {
621
+ // For immediate retries, we need to ensure the task run process is prepared for the next attempt
622
+ if (
623
+ this . runFriendlyId &&
624
+ this . taskRunProcess &&
625
+ ! this . taskRunProcess . isPreparedForNextAttempt
626
+ ) {
627
+ this . sendDebugLog ( "killing existing task run process before executing next attempt" ) ;
628
+ await this . kill ( ) . catch ( ( ) => { } ) ;
629
+ }
630
+
600
631
// To skip this step and eagerly create the task run process, run prepareForExecution first
601
- if ( ! this . taskRunProcess || ! this . isPreparedForNextRun ) {
632
+ if ( ! this . taskRunProcess || ! this . taskRunProcess . isPreparedForNextRun ) {
602
633
this . taskRunProcess = this . createTaskRunProcess ( { envVars, isWarmStart } ) ;
603
634
}
604
635
@@ -655,11 +686,15 @@ export class RunExecution {
655
686
}
656
687
657
688
/**
 * Force-exits the task run process, but only when that process reports
 * `isPreparedForNextRun`. Does nothing when there is no process or the
 * process is not in that state.
 */
public exit() {
  const preparedProcess = this.taskRunProcess;

  if (preparedProcess?.isPreparedForNextRun) {
    preparedProcess.forceExit();
  }
}
662
693
694
/**
 * Kills the task run process with `SIGKILL`, if one exists, and awaits the
 * result of that call. No error handling is done here, so a failure from the
 * underlying `kill` call reaches the caller.
 */
public async kill() {
  const processToKill = this.taskRunProcess;

  await processToKill?.kill("SIGKILL");
}
697
+
663
698
private async complete ( { completion } : { completion : TaskRunExecutionResult } ) : Promise < void > {
664
699
if ( ! this . runFriendlyId || ! this . currentSnapshotId ) {
665
700
throw new Error ( "Cannot complete run: missing run or snapshot ID" ) ;
@@ -897,7 +932,7 @@ export class RunExecution {
897
932
this . lastHeartbeat = new Date ( ) ;
898
933
}
899
934
900
- sendDebugLog (
935
+ private sendDebugLog (
901
936
message : string ,
902
937
properties ?: SendDebugLogOptions [ "properties" ] ,
903
938
runIdOverride ?: string
@@ -958,6 +993,11 @@ export class RunExecution {
958
993
}
959
994
960
995
/**
 * Stops the background services owned by this execution.
 *
 * The first call sets `isShuttingDown`, stops the snapshot poller and
 * detaches the task run heartbeat handler. Later calls are no-ops because
 * the flag is already set.
 */
private stopServices() {
  if (!this.isShuttingDown) {
    // Set the flag before tearing anything down; handleSnapshotChange also
    // checks it and skips incoming snapshots once it is set.
    this.isShuttingDown = true;

    this.snapshotPoller?.stop();
    this.taskRunProcess?.onTaskRunHeartbeat.detach();
  }
}
0 commit comments