TEP-0097 breakpoint before steps for taskrun

Signed-off-by: chengjoey <zchengjoey@gmail.com>
tektoncd · Aug 22, 2024 · 4f04964 · 4f04964
1 parent c6c33e0
commit 4f04964
Show file tree

Hide file tree

Showing 28 changed files with 938 additions and 153 deletions.
diff --git a/cmd/entrypoint/main.go b/cmd/entrypoint/main.go
@@ -17,7 +17,6 @@ limitations under the License.
 package main
 
 import (
- "context"
  "encoding/json"
  "errors"
  "flag"
@@ -56,6 +55,7 @@ var (
  stdoutPath = flag.String("stdout_path", "", "If specified, file to copy stdout to")
  stderrPath = flag.String("stderr_path", "", "If specified, file to copy stderr to")
  breakpointOnFailure = flag.Bool("breakpoint_on_failure", false, "If specified, expect steps to not skip on failure")
+ debugBeforeStep = flag.Bool("debug_before_step", false, "If specified, wait for a debugger to attach before executing the step")
  onError = flag.String("on_error", "", "Set to \"continue\" to ignore an error and continue when a container terminates with a non-zero exit code."+
  " Set to \"stopAndFail\" to declare a failure with a step error and stop executing the rest of the steps.")
  stepMetadataDir = flag.String("step_metadata_dir", "", "If specified, create directory to store the step metadata e.g. /tekton/steps/<step-name>/")
@@ -66,25 +66,8 @@ var (
 
 const (
  defaultWaitPollingInterval = time.Second
- breakpointExitSuffix = ".breakpointexit"
 )
 
-func checkForBreakpointOnFailure(e entrypoint.Entrypointer, breakpointExitPostFile string) {
- if e.BreakpointOnFailure {
- if waitErr := e.Waiter.Wait(context.Background(), breakpointExitPostFile, false, false); waitErr != nil {
- log.Println("error occurred while waiting for " + breakpointExitPostFile + " : " + waitErr.Error())
- }
- // get exitcode from .breakpointexit
- exitCode, readErr := e.BreakpointExitCode(breakpointExitPostFile)
- // if readErr exists, the exitcode with default to 0 as we would like
- // to encourage to continue running the next steps in the taskRun
- if readErr != nil {
- log.Println("error occurred while reading breakpoint exit code : " + readErr.Error())
- }
- os.Exit(exitCode)
- }
-}
-
 func main() {
  // Add credential flags originally introduced with our legacy credentials helper
  // image (creds-init).
@@ -172,6 +155,7 @@ func main() {
  Timeout: timeout,
  StepWhenExpressions: when,
  BreakpointOnFailure: *breakpointOnFailure,
+ DebugBeforeStep: *debugBeforeStep,
  OnError: *onError,
  StepMetadataDir: *stepMetadataDir,
  SpireWorkloadAPI: spireWorkloadAPI,
@@ -185,8 +169,10 @@ func main() {
  }
 
  if err := e.Go(); err != nil {
- breakpointExitPostFile := e.PostFile + breakpointExitSuffix
  switch t := err.(type) { //nolint:errorlint // checking for multiple types with errors.As is ugly.
+ case entrypoint.DebugBeforeStepError:
+ log.Println("Skipping execute step script because before step breakpoint fail-continue")
+ os.Exit(1)
  case entrypoint.SkipError:
  log.Print("Skipping step because a previous step failed")
  os.Exit(1)
@@ -210,7 +196,7 @@ func main() {
  // in both cases has an ExitStatus() method with the
  // same signature.
  if status, ok := t.Sys().(syscall.WaitStatus); ok {
- checkForBreakpointOnFailure(e, breakpointExitPostFile)
+ e.CheckForBreakpointOnFailure()
  // ignore a step error i.e. do not exit if a container terminates with a non-zero exit code when onError is set to "continue"
  if e.OnError != entrypoint.ContinueOnError {
  os.Exit(status.ExitStatus())
@@ -221,7 +207,7 @@ func main() {
  log.Fatalf("Error executing command (ExitError): %v", err)
  }
  default:
- checkForBreakpointOnFailure(e, breakpointExitPostFile)
+ e.CheckForBreakpointOnFailure()
  log.Fatalf("Error executing command: %v", err)
  }
  }

diff --git a/docs/debug.md b/docs/debug.md
@@ -13,7 +13,8 @@ weight: 108
  - [Breakpoint on Failure](#breakpoint-on-failure)
  - [Failure of a Step](#failure-of-a-step)
  - [Halting a Step on failure](#halting-a-step-on-failure)
- - [Exiting breakpoint](#exiting-breakpoint)
+ - [Exiting onfailure breakpoint](#exiting-onfailure-breakpoint)
+ - [Breakpoint before step](#breakpoint-before-step)
 - [Debug Environment](#debug-environment)
  - [Mounts](#mounts)
  - [Debug Scripts](#debug-scripts)
@@ -59,12 +60,26 @@ stopping write of the `<step-no>.err` file and waiting on a signal by the user t
 In this breakpoint, which is essentially a limbo state the TaskRun finds itself in, the user can interact with the step 
 environment using a CLI or an IDE. 
 
-#### Exiting breakpoint
+#### Exiting onfailure breakpoint
 
 To exit a step which has been paused upon failure, the step would wait on a file similar to `<step-no>.breakpointexit` which 
 would unpause and exit the step container. eg: Step 0 fails and is paused. Writing `0.breakpointexit` in `/tekton/run`
 would unpause and exit the step container.
 
+### Breakpoint before step
+
+
+The TaskRun pauses before the step is executed and waits for the user to debug.
+When the before-step breakpoint takes effect, the user can see the following message
+in the log of the corresponding step container:
+```
+debug before step breakpoint has taken effect, waiting for user's decision:
+1) continue, use cmd: /tekton/debug/scripts/debug-beforestep-continue
+2) fail-continue, use cmd: /tekton/debug/scripts/debug-beforestep-fail-continue
+```
+1. Executing `/tekton/debug/scripts/debug-beforestep-continue` exits the breakpoint and continues to execute the step program.
+2. Executing `/tekton/debug/scripts/debug-beforestep-fail-continue` exits the breakpoint without executing the step program, and marks the step as failed.
+
 ## Debug Environment 
 
 Additional environment augmentations made available to the TaskRun Pod to aid in troubleshooting and managing step lifecycle.
@@ -80,7 +95,13 @@ to reflect step number. eg: Step 0 will have `/tekton/debug/info/0`, Step 1 will
 ### Debug Scripts
 
 `/tekton/debug/scripts/debug-continue` : Mark the step as completed with success by writing to `/tekton/run`. eg: User wants to exit
-breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0.breakpointexit`.
+onfailure breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.breakpointexit`.
 
 `/tekton/debug/scripts/debug-fail-continue` : Mark the step as completed with failure by writing to `/tekton/run`. eg: User wants to exit
-breakpoint for failed step 0. Running this script would create `/tekton/run/0.err` and `/tekton/run/0.breakpointexit`.
+onfailure breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.breakpointexit.err`.
+
+`/tekton/debug/scripts/debug-beforestep-continue` : Let the step continue to execute by writing to `/tekton/run`. eg: User wants to exit
+the before-step breakpoint for step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.beforestepexit`.
+
+`/tekton/debug/scripts/debug-beforestep-fail-continue` : Prevent the step from executing by writing to `/tekton/run`. eg: User wants to exit
+the before-step breakpoint for step 0 without running the step. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.beforestepexit.err`.
diff --git a/docs/developers/taskruns.md b/docs/developers/taskruns.md
@@ -284,4 +284,54 @@ There are known issues with the existing implementation of sidecars:
  but an Error when the sidecar exits with an error. This is only apparent when
  using `kubectl` to get the pods of a TaskRun, not when describing the Pod
  using `kubectl describe pod ...` nor when looking at the TaskRun, but can be
- quite confusing.
+ quite confusing.
+
+## Breakpoint on Failure
+
+Halting a TaskRun execution on Failure of a step.
+
+### Failure of a Step
+
+The entrypoint binary is used to manage the lifecycle of a step. Steps are aligned beforehand by the TaskRun controller
+allowing each step to run in a particular order. This is done using the `-wait_file` and `-post_file` flags. The former
+lets the entrypoint binary know that it has to wait for the creation of a particular file before starting execution of the step.
+The latter provides information on the step number and signals the next step on completion of the step.
+
+On success of a step, the `-post_file` is written as is, signalling the next step, which has the same file given
+for `-wait_file`, to resume the entrypoint process and move ahead with the step.
+
+On failure of a step, the `-post_file` is written with `.err` appended to it, denoting that the previous step has failed with
+an error. The subsequent steps are skipped in this case as well, marking the TaskRun as a failure.
+
+### Halting a Step on failure
+
+The failed step writes `<step-no>.err` to `/tekton/run` and stops running completely. To be able to debug a step we would
+need it to continue running (not exit), not skip the next steps and signal health of the step. By disabling step skipping,
+stopping write of the `<step-no>.err` file and waiting on a signal by the user to disable the halt, we would be simulating a
+"breakpoint".
+
+In this breakpoint, which is essentially a limbo state the TaskRun finds itself in, the user can interact with the step
+environment using a CLI or an IDE.
+
+### Exiting onfailure breakpoint
+
+To exit a step which has been paused upon failure, the step would wait on a file similar to `<step-no>.breakpointexit` which
+would unpause and exit the step container. eg: Step 0 fails and is paused. Writing `0.breakpointexit` in `/tekton/run`
+would unpause and exit the step container.
+
+## Breakpoint before step
+
+The TaskRun pauses before the step is executed and waits for the user to debug.
+
+### Halting a Step before execution
+
+The step program is executed once all the `-wait_file` monitoring ends. To let the user start debugging before the step is executed,
+pass the `debug_before_step` parameter to `entrypoint`.
+After it finishes monitoring the `waitFiles`, `entrypoint` pauses
+and waits for the `/tekton/run/0/out.beforestepexit` file (in the case of step 0) to be written.
+
+### Exiting before step breakpoint
+
+`entrypoint` waits for either `/tekton/run/{{ stepID }}/out.beforestepexit` or `/tekton/run/{{ stepID }}/out.beforestepexit.err` to
+decide whether to proceed with this step: `out.beforestepexit` means continue with the step,
+`out.beforestepexit.err` means do not continue with the step.
diff --git a/docs/pipeline-api.md b/docs/pipeline-api.md
@@ -5142,6 +5142,17 @@ string
 failed step will not exit</p>
 </td>
 </tr>
+<tr>
+<td>
+<code>beforeSteps</code><br/>
+<em>
+[]string
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+</td>
+</tr>
 </tbody>
 </table>
 <h3 id="tekton.dev/v1.TaskKind">TaskKind
@@ -14926,6 +14937,17 @@ string
 failed step will not exit</p>
 </td>
 </tr>
+<tr>
+<td>
+<code>beforeSteps</code><br/>
+<em>
+[]string
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+</td>
+</tr>
 </tbody>
 </table>
 <h3 id="tekton.dev/v1beta1.TaskKind">TaskKind

diff --git a/docs/taskruns.md b/docs/taskruns.md
@@ -909,6 +909,18 @@ spec:
  onFailure: "enabled"
 ```
 
+### Breakpoint before step
+
+If you want to set a breakpoint before the step is executed, you can add the step name to the `beforeSteps` field in the following way:
+
+```yaml
+spec:
+ debug:
+ breakpoints:
+ beforeSteps: 
+ - {{ stepName }}
+```
+
 Upon failure of a step, the TaskRun Pod execution is halted. If this TaskRun Pod continues to run without any lifecycle
 change done by the user (running the debug-continue or debug-fail-continue script) the TaskRun would be subject to
 [TaskRunTimeout](#configuring-the-failure-timeout).
@@ -931,6 +943,10 @@ perform :-
 
 `debug-fail-continue`: Mark the step as a failure and exit the breakpoint.
 
+`debug-beforestep-continue`: Exit the before-step breakpoint and continue executing the step.
+
+`debug-beforestep-fail-continue`: Exit the before-step breakpoint without executing the step, marking the step as failed.
+
 *More information on the inner workings of debug can be found in the [Debug documentation](debug.md)*
 
 ## Code examples

diff --git a/pkg/apis/pipeline/v1/openapi_generated.go b/pkg/apis/pipeline/v1/openapi_generated.go
diff --git a/pkg/apis/pipeline/v1/swagger.json b/pkg/apis/pipeline/v1/swagger.json
@@ -1825,6 +1825,14 @@
  "description": "TaskBreakpoints defines the breakpoint config for a particular Task",
  "type": "object",
  "properties": {
+ "beforeSteps": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "default": ""
+ },
+ "x-kubernetes-list-type": "atomic"
+ },
  "onFailure": {
  "description": "if enabled, pause TaskRun on failure of a step failed step will not exit",
  "type": "string"

diff --git a/pkg/apis/pipeline/v1/taskrun_types.go b/pkg/apis/pipeline/v1/taskrun_types.go
@@ -26,6 +26,7 @@ import (
  metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  "k8s.io/apimachinery/pkg/runtime/schema"
  "k8s.io/apimachinery/pkg/types"
+ "k8s.io/apimachinery/pkg/util/sets"
  "k8s.io/utils/clock"
  "knative.dev/pkg/apis"
  duckv1 "knative.dev/pkg/apis/duck/v1"
@@ -121,6 +122,9 @@ type TaskBreakpoints struct {
  // failed step will not exit
  // +optional
  OnFailure string `json:"onFailure,omitempty"`
+ // +optional
+ // +listType=atomic
+ BeforeSteps []string `json:"beforeSteps,omitempty"`
 }
 
 // NeedsDebugOnFailure return true if the TaskRun is configured to debug on failure
@@ -131,14 +135,28 @@ func (trd *TaskRunDebug) NeedsDebugOnFailure() bool {
  return trd.Breakpoints.OnFailure == EnabledOnFailureBreakpoint
 }
 
+// NeedsDebugBeforeStep returns true if the named step is configured with a breakpoint before its execution
+func (trd *TaskRunDebug) NeedsDebugBeforeStep(stepName string) bool {
+ if trd.Breakpoints == nil {
+ return false
+ }
+ beforeStepSets := sets.NewString(trd.Breakpoints.BeforeSteps...)
+ return beforeStepSets.Has(stepName)
+}
+
 // StepNeedsDebug return true if the step is configured to debug
 func (trd *TaskRunDebug) StepNeedsDebug(stepName string) bool {
- return trd.NeedsDebugOnFailure()
+ return trd.NeedsDebugOnFailure() || trd.NeedsDebugBeforeStep(stepName)
 }
 
 // NeedsDebug return true if defined onfailure or have any before, after steps
 func (trd *TaskRunDebug) NeedsDebug() bool {
- return trd.NeedsDebugOnFailure()
+ return trd.NeedsDebugOnFailure() || trd.HaveBeforeSteps()
+}
+
+// HaveBeforeSteps returns true if at least one before-step breakpoint is configured
+func (trd *TaskRunDebug) HaveBeforeSteps() bool {
+ return trd.Breakpoints != nil && len(trd.Breakpoints.BeforeSteps) > 0
 }
 
 // TaskRunInputs holds the input values that this task was invoked with.