1427 - Add DNS and cert cleanup into the preview env cron #9525

Merged: 7 commits, Apr 27, 2022
219 changes: 155 additions & 64 deletions .werft/platform-delete-preview-environments-cron.ts
@@ -5,11 +5,22 @@ import { wipePreviewEnvironmentAndNamespace, helmInstallName, listAllPreviewName
 import { exec } from './util/shell';
 import { previewNameFromBranchName } from './util/preview';
 import { CORE_DEV_KUBECONFIG_PATH, HARVESTER_KUBECONFIG_PATH } from './jobs/build/const';
+import {deleteDNSRecord} from "./util/gcloud";
 
 // for testing purposes
 // if set to 'true' it shows only previews that would be deleted
 const DRY_RUN = false
 
+const SLICES = {
+    CONFIGURE_ACCESS: "Configuring access to relevant resources",
+    FETCHING_PREVIEW_ENVIRONMENTS: "Fetching preview environments",
+    FETCHING_BRANCHES: "Fetching branches",
+    DETERMINING_STALE_PREVIEW_ENVIRONMENTS: "Determining stale preview environments",
+    CHECKING_STALE_BRANCH: (branch: string) => `Checking for commit activity on ${branch}`,
+    CHECKING_DB_ACTIVITY: (preview: string) => `Checking for DB activity in ${preview}`,
+    DELETING_PREVIEW_ENVIRONMENTS: "Deleting preview environments"
+}
+
 // Will be set once tracing has been initialized
 let werft: Werft

@@ -24,6 +35,9 @@ Tracing.initialize()
         code: SpanStatusCode.ERROR,
         message: err
     })
+    console.error("Werft job failed with an error", err)
+    // Explicitly not using process.exit as we need to flush tracing, see tracing.js
+    process.exitCode = 1
 })
 .finally(() => {
     werft.phase("Flushing telemetry", "Flushing telemetry before stopping job")
@@ -32,108 +46,185 @@ Tracing.initialize()
 
 async function deletePreviewEnvironments() {
 
-    werft.phase("prep");
+    werft.phase("Configure access");
     try {
         const GCLOUD_SERVICE_ACCOUNT_PATH = "/mnt/secrets/gcp-sa/service-account.json";
-        exec(`gcloud auth activate-service-account --key-file "${GCLOUD_SERVICE_ACCOUNT_PATH}"`);
-        exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} gcloud container clusters get-credentials core-dev --zone europe-west1-b --project gitpod-core-dev`);
+        exec(`gcloud auth activate-service-account --key-file "${GCLOUD_SERVICE_ACCOUNT_PATH}"`, {slice: SLICES.CONFIGURE_ACCESS});
+        exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} gcloud container clusters get-credentials core-dev --zone europe-west1-b --project gitpod-core-dev`, {slice: SLICES.CONFIGURE_ACCESS});
+        werft.done(SLICES.CONFIGURE_ACCESS)
     } catch (err) {
-        werft.fail("prep", err)
+        werft.fail(SLICES.CONFIGURE_ACCESS, err)
     }
-    werft.done("prep")
 
-    werft.phase("Fetching previews");
+    werft.phase("Fetching preview environments");
     let previews: string[]
     try {
         previews = listAllPreviewNamespaces(CORE_DEV_KUBECONFIG_PATH, {});
-        previews.forEach(previewNs => werft.log("Fetching preview", previewNs));
-        werft.done("Fetching preview");
+        previews.forEach(previewNs => werft.log(SLICES.FETCHING_PREVIEW_ENVIRONMENTS, previewNs));
+        werft.log(SLICES.FETCHING_PREVIEW_ENVIRONMENTS, `Found ${previews.length} preview environments`)
+        werft.done(SLICES.FETCHING_PREVIEW_ENVIRONMENTS);
     } catch (err) {
-        werft.fail("Fetching preview", err)
+        werft.fail(SLICES.FETCHING_PREVIEW_ENVIRONMENTS, err)
     }
 
-    werft.phase("Fetching outdated branches");
+    werft.phase("Fetching branches");
     const branches = getAllBranches();
+    werft.log(SLICES.FETCHING_BRANCHES, `Found ${branches.length} branches`)
 
-    const outdatedPreviews = new Set(branches
+    werft.phase("Determining which preview environments are stale");
+
+    const previewNamespaceBasedOnBranches = new Set(branches.map(branch => expectedNamespaceFromBranch(branch)));
+
+    const previewNamespaceBasedOnStaleBranches = new Set(branches
         .filter(branch => {
-            const lastCommit = exec(`git log origin/${branch} --since=$(date +%Y-%m-%d -d "5 days ago")`, { silent: true })
-            return lastCommit.length < 1
+            const sliceID = SLICES.CHECKING_STALE_BRANCH(branch)
+            const lastCommit = exec(`git log origin/${branch} --since=$(date +%Y-%m-%d -d "5 days ago")`, { slice: sliceID })
+            const hasRecentCommits = lastCommit.length > 1
+            werft.log(sliceID, `Has recent commits: ${hasRecentCommits}`)
+            werft.done(sliceID)
+            return !hasRecentCommits
        })
        .map(branch => expectedNamespaceFromBranch(branch)))
 
-    const expectedPreviewEnvironmentNamespaces = new Set(branches.map(branch => expectedNamespaceFromBranch(branch)));
-
-    werft.phase("deleting previews")
-    try {
-        const deleteDueToMissingBranch = previews.filter(ns => !expectedPreviewEnvironmentNamespaces.has(ns))
-        const deleteDueToNoCommitActivity = previews.filter(ns => outdatedPreviews.has(ns))
-        const deleteDueToNoDBActivity = previews.filter(ns => isInactive(ns))
-        const previewsToDelete = new Set([...deleteDueToMissingBranch, ...deleteDueToNoCommitActivity, ...deleteDueToNoDBActivity])
-
-        if (previewsToDelete.has("staging-main")) {
-            previewsToDelete.delete("staging-main")
-        }
-
-        if (DRY_RUN) {
-            previewsToDelete.forEach(preview => werft.log("deleting preview", `would have deleted preview environment ${preview}`))
-        }
-        else {
-            const promises: Promise<any>[] = [];
-            previewsToDelete.forEach(preview => {
-                werft.log("deleting preview", preview)
-                promises.push(wipePreviewEnvironmentAndNamespace(helmInstallName, preview, CORE_DEV_KUBECONFIG_PATH, { slice: `Deleting preview ${preview}` }))
-            })
-            await Promise.all(promises)
-        }
-        werft.done("deleting preview")
+    const deleteDueToMissingBranch = previews.filter(ns => !previewNamespaceBasedOnBranches.has(ns))
+    const deleteDueToNoCommitActivity = previews.filter(ns => previewNamespaceBasedOnStaleBranches.has(ns))
+    const deleteDueToNoDBActivity = previews.filter(ns => isInactive(ns))
+    const previewsToDelete = new Set([...deleteDueToMissingBranch, ...deleteDueToNoCommitActivity, ...deleteDueToNoDBActivity])
+
+    if (previewsToDelete.has("staging-main")) {
+        previewsToDelete.delete("staging-main")
+    }
+
+    if (previewsToDelete.size == 0) {
+        werft.log(SLICES.DETERMINING_STALE_PREVIEW_ENVIRONMENTS, "No stale preview environments.")
+        werft.done(SLICES.DETERMINING_STALE_PREVIEW_ENVIRONMENTS)
+        return
+    } else {
+        werft.log(SLICES.DETERMINING_STALE_PREVIEW_ENVIRONMENTS, `Found ${previewsToDelete.size} stale preview environments`)
+    }
+
+    werft.phase("Deleting stale preview environments")
+    if (DRY_RUN) {
+        previewsToDelete.forEach(preview => {
+            werft.log(SLICES.DELETING_PREVIEW_ENVIRONMENTS, `Would have deleted preview environment ${preview}`)
+        })
+        werft.done(SLICES.DELETING_PREVIEW_ENVIRONMENTS)
+        return
+    }
+
+    try {
+        const promises: Promise<any>[] = [];
+        previewsToDelete.forEach(preview => promises.push(removePreviewEnvironment(preview)))
+        await Promise.all(promises)
+        werft.done(SLICES.DELETING_PREVIEW_ENVIRONMENTS)
     } catch (err) {
-        werft.fail("deleting preview", err)
+        werft.fail(SLICES.DELETING_PREVIEW_ENVIRONMENTS, err)
     }
 }
 
+async function removePreviewEnvironment(previewNamespace: string) {
+    const sliceID = `Deleting preview ${previewNamespace}`
+    werft.log(sliceID, `Starting deletion of all resources related to ${previewNamespace}`)
+    try {
+        const previewDNSName = previewNamespace.replace('staging-', '')
+
+        // We're running these promises sequentially to make it easier to read the log output.
+        await removeCertificate(previewNamespace, CORE_DEV_KUBECONFIG_PATH, sliceID)
+        await removeStagingDNSRecord(previewDNSName, sliceID)
+        await removePreviewDNSRecord(previewDNSName, sliceID)
+        await wipePreviewEnvironmentAndNamespace(helmInstallName, previewNamespace, CORE_DEV_KUBECONFIG_PATH, { slice: sliceID })
+        werft.done(sliceID)
+    } catch (e) {
+        werft.fail(sliceID, e)
+    }
+}
+
+/**
+ * Checks whether or not a preview environment is considered inactive.
+ *
+ * It errors on the side of caution, so in case of connection issues etc. it will consider the
+ * preview environment active.
+ */
 function isInactive(previewNS: string): boolean {
-    const statusNS = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get ns ${previewNS} -o jsonpath='{.status.phase}'`, { silent: true})
-
-    if ( statusNS == "Active") {
-
-        const emptyNS = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get pods -n ${previewNS} -o jsonpath='{.items.*}'`, { silent: true})
-
-        if ( emptyNS.length < 1 ) {
-            return false;
-        }
-
-        const statusDB = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get pods mysql-0 -n ${previewNS} -o jsonpath='{.status.phase}'`, { silent: true})
-        const statusDbContainer = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get pods mysql-0 -n ${previewNS} -o jsonpath='{.status.containerStatuses.*.ready}'`, { silent: true})
-
-        if (statusDB.code == 0 && statusDB == "Running" && statusDbContainer != "false") {
-
-            const connectionToDb = `KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get secret db-password -n ${previewNS} -o jsonpath='{.data.mysql-root-password}' | base64 -d | mysql --host=db.${previewNS}.svc.cluster.local --port=3306 --user=root --database=gitpod -s -N -p`
-
-            const latestInstanceTimeout = 24
-            const latestInstance = exec(`${connectionToDb} --execute="SELECT creationTime FROM d_b_workspace_instance WHERE creationTime > DATE_SUB(NOW(), INTERVAL '${latestInstanceTimeout}' HOUR) LIMIT 1"`, { silent: true })
-
-            const latestUserTimeout = 24
-            const latestUser = exec(`${connectionToDb} --execute="SELECT creationDate FROM d_b_user WHERE creationDate > DATE_SUB(NOW(), INTERVAL '${latestUserTimeout}' HOUR) LIMIT 1"`, { silent: true })
-
-            const lastModifiedTimeout = 24
-            const lastModified = exec(`${connectionToDb} --execute="SELECT _lastModified FROM d_b_user WHERE _lastModified > DATE_SUB(NOW(), INTERVAL '${lastModifiedTimeout}' HOUR) LIMIT 1"`, { silent: true })
-
-            const heartbeatTimeout = 24
-            const heartbeat = exec(`${connectionToDb} --execute="SELECT lastSeen FROM d_b_workspace_instance_user WHERE lastSeen > DATE_SUB(NOW(), INTERVAL '${heartbeatTimeout}' HOUR) LIMIT 1"`, { silent: true })
-
-            if ( (heartbeat.length < 1) &&
-                (latestInstance.length < 1) &&
-                (latestUser.length < 1) &&
-                (lastModified.length < 1) ) {
-                return true;
-            } else {
-                return false;
-            }
-        }
-    }
-}
+    const sliceID = SLICES.CHECKING_DB_ACTIVITY(previewNS)
+    try {
+        werft.log(sliceID, "Checking namespace status")
+        const statusNS = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get ns ${previewNS} -o jsonpath='{.status.phase}'`, { slice: sliceID })
+
+        if (statusNS != "Active") {
+            werft.log(sliceID, `Is inactive: false - The namespace is ${statusNS}`)
+            werft.done(sliceID)
+            return false
+        }
+
+        werft.log(sliceID, "Checking status of the MySQL pod")
+        const statusDB = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get pods mysql-0 -n ${previewNS} -o jsonpath='{.status.phase}'`, { slice: sliceID})
+        const statusDbContainer = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get pods mysql-0 -n ${previewNS} -o jsonpath='{.status.containerStatuses.*.ready}'`, { slice: sliceID})
+
+        if (statusDB.code != 0 || statusDB != "Running" || statusDbContainer == "false") {
+            werft.log(sliceID, "Is inactive: false - The database is not reachable")
+            werft.done(sliceID)
+            return false
+        }
+
+        const dbPassword = exec(`KUBECONFIG=${CORE_DEV_KUBECONFIG_PATH} kubectl get secret db-password -n ${previewNS} -o jsonpath='{.data.mysql-root-password}' | base64 -d`, {silent: true}).stdout.trim()
+        const connectionToDb = `mysql --host=db.${previewNS}.svc.cluster.local --port=3306 --user=root --database=gitpod -s -N --password=${dbPassword}`
+
+        const latestInstanceTimeout = 48
+        const latestInstance = exec(`${connectionToDb} --execute="SELECT creationTime FROM d_b_workspace_instance WHERE creationTime > DATE_SUB(NOW(), INTERVAL '${latestInstanceTimeout}' HOUR) LIMIT 1"`, { slice: sliceID})
+
+        const latestUserTimeout = 48
+        const latestUser = exec(`${connectionToDb} --execute="SELECT creationDate FROM d_b_user WHERE creationDate > DATE_SUB(NOW(), INTERVAL '${latestUserTimeout}' HOUR) LIMIT 1"`, { slice: sliceID})
+
+        const lastModifiedTimeout = 48
+        const lastModified = exec(`${connectionToDb} --execute="SELECT _lastModified FROM d_b_user WHERE _lastModified > DATE_SUB(NOW(), INTERVAL '${lastModifiedTimeout}' HOUR) LIMIT 1"`, { slice: sliceID})
+
+        const heartbeatTimeout = 48
+        const heartbeat = exec(`${connectionToDb} --execute="SELECT lastSeen FROM d_b_workspace_instance_user WHERE lastSeen > DATE_SUB(NOW(), INTERVAL '${heartbeatTimeout}' HOUR) LIMIT 1"`, { slice: sliceID})
+
+        const isInactive = (heartbeat.length < 1) && (latestInstance.length < 1) && (latestUser.length < 1) && (lastModified.length < 1)
+        werft.log(sliceID, `Is inactive: ${isInactive}`)
+        werft.done(sliceID)
+        return isInactive
+    } catch (err) {
+        werft.log(sliceID, "Is inactive: false - Unable to check DB activity")
+        werft.done(sliceID)
+        return false
+    }
+}
 
+async function removeCertificate(preview: string, kubectlConfig: string, slice: string) {
+    exec(`kubectl --kubeconfig ${kubectlConfig} -n certs delete cert ${preview}`, {slice: slice})
+}
+
+// remove DNS records for core-dev-based preview environments
+async function removeStagingDNSRecord(preview: string, sliceID: string) {
+    werft.log(sliceID, "Deleting core-dev related DNS records for the preview environment")
+    await Promise.all([
+        deleteDNSRecord('A', `*.ws-dev.${preview}.staging.gitpod-dev.com`, 'gitpod-dev', 'gitpod-dev-com', sliceID),
+        deleteDNSRecord('A', `*.${preview}.staging.gitpod-dev.com`, 'gitpod-dev', 'gitpod-dev-com', sliceID),
+        deleteDNSRecord('A', `${preview}.staging.gitpod-dev.com`, 'gitpod-dev', 'gitpod-dev-com', sliceID),
+        deleteDNSRecord('A', `prometheus-${preview}.staging.gitpod-dev.com`, 'gitpod-dev', 'gitpod-dev-com', sliceID),
+        deleteDNSRecord('TXT', `prometheus-${preview}.staging.gitpod-dev.com`, 'gitpod-dev', 'gitpod-dev-com', sliceID),
+        deleteDNSRecord('A', `grafana-${preview}.staging.gitpod-dev.com`, 'gitpod-dev', 'gitpod-dev-com', sliceID),
+        deleteDNSRecord('TXT', `grafana-${preview}.staging.gitpod-dev.com`, 'gitpod-dev', 'gitpod-dev-com', sliceID),
+        deleteDNSRecord('TXT', `_acme-challenge.${preview}.staging.gitpod-dev.com`, 'gitpod-dev', 'gitpod-dev-com', sliceID),
+        deleteDNSRecord('TXT', `_acme-challenge.ws-dev.${preview}.staging.gitpod-dev.com`, 'gitpod-dev', 'gitpod-dev-com', sliceID)
+    ])
+}
+
+// remove DNS records for harvester-based preview environments
+async function removePreviewDNSRecord(preview: string, sliceID: string) {
+    werft.log(sliceID, "Deleting harvester related DNS records for the preview environment")
+    await Promise.all([
+        deleteDNSRecord('A', `*.ws-dev.${preview}.preview.gitpod-dev.com`, 'gitpod-core-dev', 'preview-gitpod-dev-com', sliceID),
+        deleteDNSRecord('A', `*.${preview}.preview.gitpod-dev.com`, 'gitpod-core-dev', 'preview-gitpod-dev-com', sliceID),
+        deleteDNSRecord('A', `${preview}.preview.gitpod-dev.com`, 'gitpod-core-dev', 'preview-gitpod-dev-com', sliceID),
+        deleteDNSRecord('A', `prometheus-${preview}.preview.gitpod-dev.com`, 'gitpod-core-dev', 'preview-gitpod-dev-com', sliceID),
+        deleteDNSRecord('TXT', `prometheus-${preview}.preview.gitpod-dev.com`, 'gitpod-core-dev', 'preview-gitpod-dev-com', sliceID),
+        deleteDNSRecord('A', `grafana-${preview}.preview.gitpod-dev.com`, 'gitpod-core-dev', 'preview-gitpod-dev-com', sliceID),
+        deleteDNSRecord('TXT', `grafana-${preview}.preview.gitpod-dev.com`, 'gitpod-core-dev', 'preview-gitpod-dev-com', sliceID)
+    ])
+}
 
 async function cleanLoadbalancer() {
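
For reference, the staleness selection in the hunk above reduces to one small pure function. The sketch below is illustrative only; the helper staleNamespaces and its parameter names are invented for this note and are not code from this PR:

// Minimal sketch of how the cron combines its three staleness signals.
// Hypothetical helper; names are assumptions, not part of the diff above.
function staleNamespaces(
    previews: string[],                  // namespaces currently in the cluster
    branchNamespaces: Set<string>,       // namespaces expected from existing branches
    staleBranchNamespaces: Set<string>,  // namespaces whose branch saw no commits for 5 days
    isInactive: (ns: string) => boolean, // DB-activity probe, as in isInactive above
): Set<string> {
    const missingBranch = previews.filter(ns => !branchNamespaces.has(ns))
    const noCommitActivity = previews.filter(ns => staleBranchNamespaces.has(ns))
    const noDBActivity = previews.filter(ns => isInactive(ns))
    const toDelete = new Set([...missingBranch, ...noCommitActivity, ...noDBActivity])
    toDelete.delete("staging-main") // the main preview environment is never deleted
    return toDelete
}

A namespace is deleted if any one signal fires, which is why the DB-activity probe errs toward "active" on failure.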
20 changes: 19 additions & 1 deletion .werft/util/gcloud.ts
@@ -52,6 +52,24 @@ export async function createDNSRecord(options: {domain: string, projectId: strin
     }
 }
 
+export async function deleteDNSRecord(recordType: string, domain: string, projectId: string, dnsZone: string, sliceID: string): Promise<void> {
+    const werft = getGlobalWerftInstance()
+
+    const dnsClient = new DNS({
+        projectId: projectId,
+        keyFilename: GCLOUD_SERVICE_ACCOUNT_PATH,
+    })
+    const zone = dnsClient.zone(dnsZone)
+    const [records] = await zone.getRecords({ name: `${domain}.`, type: recordType })
+
+    werft.log(sliceID, `Found ${records.length} records for ${domain}`)
+
+    await Promise.all(records.map(record => {
+        werft.log(sliceID, `Deleting ${record.metadata.name}`)
+        return record.delete()
+    }))
+}
+
 // matchesExistingRecord will return true only if the existing record matches the same name and IP.
 // If IP doesn't match, then the record needs to be replaced in a following step.
 async function matchesExistingRecord(zone: Zone, domain: string, IP: string): Promise<boolean> {
@@ -88,4 +106,4 @@ async function createOrReplaceRecord(zone: Zone, domain: string, IP: string, sli
 
     werft.log(slice, `Creating DNS record: ${JSON.stringify(record)}`) // delete before submitting PR
     await zone.addRecords(record)
-}
+}
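
As a usage sketch for deleteDNSRecord, mirroring the calls in removePreviewDNSRecord above: the domain and slice ID below are made-up examples, while the project and zone names are the ones used in the diff.

// Hypothetical call; assumes a branch named "my-branch" (illustrative only).
await deleteDNSRecord(
    'A',                                  // record type
    'my-branch.preview.gitpod-dev.com',   // example preview domain
    'gitpod-core-dev',                    // GCP project, as used above
    'preview-gitpod-dev-com',             // Cloud DNS zone, as used above
    'Deleting preview staging-my-branch', // Werft slice ID for log grouping
)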