Skip to content

Commit

Permalink
fix: operator retry bug fixes (#524)
Browse files Browse the repository at this point in the history
This PR fixes two bugs around operator reconciliation/retry handling:
- Infinite pending/failure loop caused by placement of uidSeen addition
- Package retry new status phase to prevent packages getting stuck in
pending

Fixes #
<!-- or -->
Relates to #

- [x] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Other (security config, docs update, etc)

- [ ] Test, docs, adr added or updated as needed
- [ ] [Contributor Guide
Steps](https://github.com/defenseunicorns/uds-template-capability/blob/main/CONTRIBUTING.md)(https://github.com/defenseunicorns/uds-template-capability/blob/main/CONTRIBUTING.md#submitting-a-pull-request)
followed

---------

Co-authored-by: Micah Nagel <micah.nagel@defenseunicorns.com>
  • Loading branch information
jeff-mccoy and mjnagel committed Jul 2, 2024
1 parent e603eaa commit d2960a9
Show file tree
Hide file tree
Showing 8 changed files with 291 additions and 374 deletions.
623 changes: 259 additions & 364 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/istio/common/manifests/pepr-istio-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Have to pre-create the namespace and also patch it with the istio-injection label later because
# Helm is kind of dumb: https://github.com/helm/helm/issues/350
kind: Namespace
apiVersion: v1
metadata:
name: pepr-system
labels:
Expand Down
13 changes: 11 additions & 2 deletions src/pepr/operator/controllers/keycloak/client-sync.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ export async function purgeSSOClients(pkg: UDSPackage, refs: string[] = []) {
async function syncClient(
{ isAuthSvcClient, secretName, secretTemplate, ...clientReq }: Sso,
pkg: UDSPackage,
isRetry = false,
) {
Log.debug(pkg.metadata, `Processing client request: ${clientReq.clientId}`);
// Not including the CR data in the ref because Keycloak client IDs must be unique already
Expand All @@ -91,7 +92,7 @@ async function syncClient(
const token = Store.getItem(name);

// If an existing client is found, update it
if (token) {
if (token && !isRetry) {
Log.debug(pkg.metadata, `Found existing token for ${clientReq.clientId}`);
client = await apiCall(clientReq, "PUT", token);
} else {
Expand All @@ -103,7 +104,15 @@ async function syncClient(
`Failed to process client request '${clientReq.clientId}' for ` +
`${pkg.metadata?.namespace}/${pkg.metadata?.name}. Error: ${err.message}`;
Log.error({ err }, msg);
throw new Error(msg);

if (isRetry) {
Log.error(`${msg}, retry failed, aborting`);
throw new Error(`${msg}. RETRY FAILED, aborting: ${err.message}`);
}

// Retry the request
Log.warn(pkg.metadata, `Failed to process client request: ${clientReq.clientId}, retrying`);
return syncClient(clientReq, pkg, true);
}

// Write the new token to the store
Expand Down
4 changes: 2 additions & 2 deletions src/pepr/operator/crd/generated/package-v1alpha1.ts
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,7 @@ export enum ClientAuthenticatorType {
*/
export interface Groups {
/**
* List of group allowed to access to client
* List of groups allowed to access to client
*/
anyOf?: string[];
}
Expand All @@ -561,8 +561,8 @@ export interface Status {
export enum Phase {
Failed = "Failed",
Pending = "Pending",
PendingRetry = "Pending Retry",
Ready = "Ready",
Retrying = "Retrying",
}

RegisterKind(Package, {
Expand Down
2 changes: 1 addition & 1 deletion src/pepr/operator/crd/sources/package/v1alpha1.ts
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ export const v1alpha1: V1CustomResourceDefinitionVersion = {
type: "integer",
},
phase: {
enum: ["Pending", "Pending Retry", "Ready", "Failed"],
enum: ["Pending", "Ready", "Failed", "Retrying"],
type: "string",
},
ssoClients: {
Expand Down
5 changes: 4 additions & 1 deletion src/pepr/operator/reconcilers/index.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,10 @@ describe("handleFailure", () => {

expect(PatchStatus).toHaveBeenCalledWith({
metadata: { namespace: "default", name: "test" },
status: { phase: Phase.PendingRetry, retryAttempt: 1 },
status: {
phase: Phase.Retrying,
retryAttempt: 1,
},
});
});

Expand Down
15 changes: 12 additions & 3 deletions src/pepr/operator/reconcilers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export const uidSeen = new Set<string>();
* @returns true if the CRD is pending or the current generation has been processed
*/
export function shouldSkip(cr: UDSPackage) {
const isRetrying = cr.status?.phase === Phase.Retrying;
const isPending = cr.status?.phase === Phase.Pending;
const isCurrentGeneration = cr.metadata?.generation === cr.status?.observedGeneration;

Expand All @@ -22,6 +23,12 @@ export function shouldSkip(cr: UDSPackage) {
return false;
}

// If the CR is retrying, it should not be skipped
if (isRetrying) {
Log.debug(cr, `Should skip? No, retrying`);
return false;
}

// This is the second time the CR has been seen, so check if it is pending or the current generation
if (isPending || isCurrentGeneration) {
Log.debug(cr, `Should skip? Yes, pending or current generation and not first time seen`);
Expand Down Expand Up @@ -50,6 +57,9 @@ export async function updateStatus(cr: UDSPackage, status: PkgStatus) {
},
status,
});

// Track the UID of the CRD to know if it has been seen before
uidSeen.add(cr.metadata!.uid!);
}

/**
Expand Down Expand Up @@ -95,8 +105,7 @@ export async function handleFailure(err: { status: number; message: string }, cr
const identifier = `${metadata.namespace}/${metadata.name}`;
let status: Status;

// todo: "errors" end up here from (1) our own thrown errors, (2) thrown k8s/store failures which are http responses? (3) unknown other places - each of these have a different type and should be handled differently
// todo: identify exact 404 we are targetting, possibly in `updateStatus`
// todo: identify exact 404 we are targeting, possibly in `updateStatus`
if (err.status === 404) {
Log.warn({ err }, `Package metadata seems to have been deleted`);
return;
Expand All @@ -109,7 +118,7 @@ export async function handleFailure(err: { status: number; message: string }, cr
Log.error({ err }, `Reconciliation attempt ${currRetry} failed for ${identifier}, retrying...`);

status = {
phase: Phase.PendingRetry,
phase: Phase.Retrying,
retryAttempt: currRetry,
};
} else {
Expand Down
2 changes: 1 addition & 1 deletion src/pepr/operator/reconcilers/package-reconciler.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { Log } from "pepr";

import { handleFailure, shouldSkip, uidSeen, updateStatus } from ".";
import { handleFailure, shouldSkip, updateStatus } from ".";
import { UDSConfig } from "../../config";
import { enableInjection } from "../controllers/istio/injection";
import { istioResources } from "../controllers/istio/istio-resources";
Expand Down

0 comments on commit d2960a9

Please sign in to comment.