From e338e54a9326adbb8e7097cc3064fbc0d73fcee9 Mon Sep 17 00:00:00 2001 From: Emika Hammond Date: Fri, 9 Aug 2024 18:54:59 -0400 Subject: [PATCH] Integrate new probe interface into GCP framework (#259) * add gcp_curl_probe * change startup-script to split printing and curl service to print userdata begin runs multiple times despite succeeding. service to run curl fails. printing end token requires curl to succeed so it does not start running. crash recovery kernel arming runs every time vm is created. * added shell script to run curl and access config shell script prints starting and ending token and runs curl. shell script checks if command was successful. also added external IP so curl has an output * add systemd instance termination after set delay used a systemd script and shell script that gets GCP NAME and ZONE so the instance self deletes. this ensures that if something happens to the client, the resource is not left on customer accounts. * silence kernel completely and fetch egress URLs Egress URLs are fetched from GitHub as done in the AWS verifier * remove target after multi-user target after multi-user unneeded because other outputs are completely silenced and won't interfere with probe output * ignore curl error codes that are network errors systemd service that runs curl should only fail if bash script returns an error code, this happens if curl error code is 1-4, 27, 41-43, 45 which means curl failed, not a network failure * update startup-script and gcp probe comments * combine GCP curl probe into AWS curl probe added userDataTemplate to the probe interface so getExpandedUserData can take in different templates for GCP and AWS * Update pkg/probes/legacy/legacy.go Co-authored-by: Alex Vulaj * Update pkg/verifier/aws/entry_point.go Co-authored-by: Alex Vulaj * clean code * change startup-script to use all userDataVariables made minor changes to curl probe to make more readable and added functionality to startup-script to add cacerts and export proxy 
environment variables if specified * revert probe interface and pass platform as map a workaround of not changing the probe interface and using a different template for GetExpandedUserData is to pass a userDataVariable through the probe interface. In the GetExpandedUserData, based off if a GCP platform exists, the function chooses userdata-template or startup-script. also changed gcp verifier to use GetMachineImageID which sets default machine image. * remove unused testing function * remove unused packages and add comment * use CPUArchitecture.DefaultInstanceType function * add error * Update comment for userDataVariable Co-authored-by: Anthony Byrne * Change time unit to seconds Co-authored-by: Anthony Byrne * import time * change get_tokens function to camel case --------- Co-authored-by: Alex Vulaj Co-authored-by: Anthony Byrne --- pkg/probes/curl/curl_json.go | 9 ++ pkg/probes/curl/machine_images.go | 4 +- pkg/probes/curl/systemd-template.sh | 83 +++++++++++++ pkg/verifier/gcp/entry_point.go | 112 +++++++++++++----- pkg/verifier/gcp/gcp_verifier.go | 7 ++ pkg/verifier/gcp/gcp_verifier_functions.go | 44 +++++++ .../gcp/gcp_verifier_functions_test.go | 60 ++++++++++ 7 files changed, 286 insertions(+), 33 deletions(-) create mode 100644 pkg/probes/curl/systemd-template.sh create mode 100644 pkg/verifier/gcp/gcp_verifier_functions.go create mode 100644 pkg/verifier/gcp/gcp_verifier_functions_test.go diff --git a/pkg/probes/curl/curl_json.go b/pkg/probes/curl/curl_json.go index 12555b98..225917d0 100644 --- a/pkg/probes/curl/curl_json.go +++ b/pkg/probes/curl/curl_json.go @@ -26,6 +26,9 @@ type Probe struct{} //go:embed userdata-template.yaml var userDataTemplate string +//go:embed systemd-template.sh +var systemdTemplate string + const startingToken = "NV_CURLJSON_BEGIN" const endingToken = "NV_CURLJSON_END" const outputLinePrefix = "@NV@" @@ -82,6 +85,12 @@ func (clp Probe) GetMachineImageID(platformType string, cpuArch cpu.Architecture // values *are* provided 
for variables that must be set to a certain value for the probe to // function correctly (presetUserDataVariables) -- this function will fill-in those values for you. func (clp Probe) GetExpandedUserData(userDataVariables map[string]string) (string, error) { + // Use systemd to run curl (instead of cloud-init) if requested. Useful for + // platforms that don't include cloud-init in their OS images (e.g., GCP) + if userDataVariables["USE_SYSTEMD"] == "true" { + userDataTemplate = systemdTemplate + } + // Extract required variables specified in template (if any) directivelessUserDataTemplate, requiredVariables := helpers.ExtractRequiredVariablesDirective(userDataTemplate) diff --git a/pkg/probes/curl/machine_images.go b/pkg/probes/curl/machine_images.go index 39802dd7..f6bbc7ed 100644 --- a/pkg/probes/curl/machine_images.go +++ b/pkg/probes/curl/machine_images.go @@ -75,10 +75,10 @@ var cloudMachineImageMap = map[string]map[cpu.Architecture]map[string]string{ // See function docstring's note on GCP; tl;dr: deepest key should be "*" helpers.PlatformGCP: { cpu.ArchX86: { - "*": "rhel-9", + "*": "rhel-9-v20240709", }, cpu.ArchARM: { - "*": "rhel-9-arm64", + "*": "rhel-9-arm64-v20240709", }, }, } diff --git a/pkg/probes/curl/systemd-template.sh b/pkg/probes/curl/systemd-template.sh new file mode 100644 index 00000000..51c3deea --- /dev/null +++ b/pkg/probes/curl/systemd-template.sh @@ -0,0 +1,83 @@ +#!/bin/sh +# GCP compute engine copies startup script to VM and runs script as root when the VM boots + +# get name and zone needed for instance deletion from compute metadata server +cat <<EOF > /usr/bin/terminate.sh +#! 
/bin/sh +if gcloud --quiet compute instances delete $(curl -X GET http://metadata.google.internal/computeMetadata/v1/instance/name -H 'Metadata-Flavor: Google') --zone=$(curl -X GET http://metadata.google.internal/computeMetadata/v1/instance/zone -H 'Metadata-Flavor: Google'); then : ; else + exit 255 +fi +EOF + +# print curl output and tokens to serial output for client +cat <<'EOF' > /usr/bin/curl.sh +#! /bin/sh +array=(1 2 3 4 27 41 42 43 45) +if echo ${USERDATA_BEGIN} > /dev/ttyS0 ; then : ; else + exit 255 +fi +curl --retry 3 --retry-connrefused -t B -Z -s -I -m ${TIMEOUT} -w "%{stderr}${LINE_PREFIX}%{json}\n" ${CURLOPT} ${URLS} --proto =http,https,telnet ${TLSDISABLED_URLS_RENDERED} 2>/dev/ttyS0 +ret=$? +value="\<${ret}\>" +if [[ " ${array[@]} " =~ $value ]]; then + exit 255 +fi +if echo ${USERDATA_END} > /dev/ttyS0 ; then : ; else + exit 255 +fi +EOF + +# create systemd units for silencing serial console, running curl and deleting instance +cat <<EOF > /etc/systemd/system/silence.service +[Unit] +Description=Serial Console Silencing Service +[Service] +Type=oneshot +ExecStart=systemctl mask --now serial-getty@ttyS0.service +ExecStart=systemctl disable --now syslog.socket rsyslog.service +ExecStart=sysctl -w kernel.printk="0 4 0 7" +ExecStart=kill -SIGRTMIN+21 1 +Restart=on-failure +[Install] +WantedBy=multi-user.target +EOF +cat <<EOF > /etc/systemd/system/curl.service +[Unit] +Description=Curl Output Service +[Service] +Type=oneshot +ExecStart=/usr/bin/curl.sh +Restart=on-failure +RemainAfterExit=true +[Install] +WantedBy=multi-user.target +EOF +cat <<EOF > /etc/systemd/system/terminate.service +[Unit] +Description=Compute Instance Deletion Service +[Service] +Type=oneshot +ExecStart=/usr/bin/terminate.sh +Restart=on-failure +EOF +cat <<EOF > /etc/systemd/system/terminate.timer +[Unit] +Description=Instance Deletion Timer +[Timer] +OnBootSec=${DELAY}min +Unit=terminate.service +[Install] +WantedBy=multi-user.target +EOF + +# if cacert is provided, curl probe adds CURLOPT to use 
the provided cacert +echo "${CACERT}" | base64 > /proxy.pem +chmod 0755 /proxy.pem + +# set proxy environment variables, make script executable and start systemd services +export http_proxy=${HTTP_PROXY} https_proxy=${HTTPS_PROXY} +chmod 777 /usr/bin/curl.sh /usr/bin/terminate.sh +systemctl daemon-reload +systemctl start silence +systemctl start curl +systemctl start terminate.timer \ No newline at end of file diff --git a/pkg/verifier/gcp/entry_point.go b/pkg/verifier/gcp/entry_point.go index 9e09e8b8..f91dc3d5 100644 --- a/pkg/verifier/gcp/entry_point.go +++ b/pkg/verifier/gcp/entry_point.go @@ -5,15 +5,18 @@ import ( "fmt" "math/rand" "strconv" + "time" + "github.com/openshift/osd-network-verifier/pkg/data/cpu" + "github.com/openshift/osd-network-verifier/pkg/data/egress_lists" + "github.com/openshift/osd-network-verifier/pkg/helpers" "github.com/openshift/osd-network-verifier/pkg/output" - "github.com/openshift/osd-network-verifier/pkg/probes/dummy" + "github.com/openshift/osd-network-verifier/pkg/probes/curl" "github.com/openshift/osd-network-verifier/pkg/verifier" ) const ( - cloudImageIDDefault = "rhel-9-v20240703" - DEFAULT_INSTANCE_TYPE = "e2-micro" + DEFAULT_TIMEOUT = 5 * time.Second ) // validateEgress performs validation process for egress @@ -23,52 +26,98 @@ const ( // - find unreachable endpoints & parse output, then terminate instance // - return `g.output` which stores the execution results func (g *GcpVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.Output { - g.Logger.Debug(vei.Ctx, "Using configured timeout of %s for each egress request", vei.Timeout.String()) - //default gcp machine e2 - if vei.InstanceType == "" { - vei.InstanceType = DEFAULT_INSTANCE_TYPE + // Validate cloud platform type and default to PlatformGCP if not specified + if vei.PlatformType == "" { + vei.PlatformType = helpers.PlatformGCP + } + if !vei.CPUArchitecture.IsValid() { + vei.CPUArchitecture = cpu.ArchX86 + } + // Default to curl.Probe if no Probe 
specified + if vei.Probe == nil { + vei.Probe = curl.Probe{} + g.Logger.Debug(vei.Ctx, "defaulted to curl probe") + } + + // Set timeout to default if not specified + if vei.Timeout <= 0 { + vei.Timeout = DEFAULT_TIMEOUT } + g.Logger.Debug(vei.Ctx, "configured a %s timeout for each egress request", vei.Timeout) - // need to set InstanceType here because default is a AWS machine type - vei.InstanceType = DEFAULT_INSTANCE_TYPE + // Set instance type to default if not specified and validate it + if vei.InstanceType == "" { + var err error + vei.InstanceType, err = vei.CPUArchitecture.DefaultInstanceType(helpers.PlatformGCP) + if err != nil { + return g.Output.AddError(err) + } + g.Logger.Debug(vei.Ctx, fmt.Sprintf("defaulted to instance type %s", vei.InstanceType)) + } if err := g.validateMachineType(vei.GCP.ProjectID, vei.GCP.Zone, vei.InstanceType); err != nil { return g.Output.AddError(fmt.Errorf("instance type %s is invalid: %s", vei.InstanceType, err)) } - // Fetch the egress URL list as two strings (one for normal URLs, the other - // for TLS disabled URLs); note that this is TOTALLY IGNORED by LegacyProbe, - // as that probe only knows how to use the egress URL lists baked into its - // AMIs/container images - // egressListStr, tlsDisabledEgressListStr, err := egress_lists.GetEgressListAsString(vei.PlatformType, a.AwsClient.Region) - // if err != nil { - // return a.Output.AddError(err) - // } + // Fetch the egress URL list from github, falling back to local lists in the event of a failure. 
+ egressListYaml := vei.EgressListYaml + var egressListStr, tlsDisabledEgressListStr string + if egressListYaml == "" { + githubEgressList, githubListErr := egress_lists.GetGithubEgressList(vei.PlatformType) + if githubListErr == nil { + egressListYaml, githubListErr = githubEgressList.GetContent() + if githubListErr == nil { + g.Logger.Debug(vei.Ctx, "Using egress URL list from %s at SHA %s", githubEgressList.GetURL(), githubEgressList.GetSHA()) + egressListStr, tlsDisabledEgressListStr, githubListErr = egress_lists.EgressListToString(egressListYaml, map[string]string{}) + } + } + if githubListErr != nil { + var err error + g.Output.AddError(fmt.Errorf("failed to get egress list from GitHub, falling back to local list: %v", githubListErr)) + egressListYaml, err = egress_lists.GetLocalEgressList(vei.PlatformType) + if err != nil { + return g.Output.AddError(err) + } + egressListStr, tlsDisabledEgressListStr, err = egress_lists.EgressListToString(egressListYaml, map[string]string{}) + if err != nil { + return g.Output.AddError(err) + } + } + } + + // Generate the userData file + // Expand replaces all ${var} (using empty string for unknown ones), adding the env variables used in startup-script.sh userDataVariables := map[string]string{ - "AWS_REGION": "us-east-2", // Not sure if this is the correct data - "TIMEOUT": vei.Timeout.String(), - "HTTP_PROXY": vei.Proxy.HttpProxy, - "HTTPS_PROXY": vei.Proxy.HttpsProxy, - "CACERT": base64.StdEncoding.EncodeToString([]byte(vei.Proxy.Cacert)), - "NOTLS": strconv.FormatBool(vei.Proxy.NoTls), - "DELAY": "5", - "URLS": "quay.io", + "TIMEOUT": vei.Timeout.String(), + "HTTP_PROXY": vei.Proxy.HttpProxy, + "HTTPS_PROXY": vei.Proxy.HttpsProxy, + "CACERT": base64.StdEncoding.EncodeToString([]byte(vei.Proxy.Cacert)), + "NOTLS": strconv.FormatBool(vei.Proxy.NoTls), + "DELAY": "5", + "URLS": egressListStr, + "TLSDISABLED_URLS": tlsDisabledEgressListStr, + // Add fake userDatavariables to replace normal shell variables in 
startup-script.sh which will otherwise be erased by os.Expand + "ret": "${ret}", + "?": "$?", + "array[@]": "${array[@]}", + "value": "$value", + "USE_SYSTEMD": "true", } - // set probe - vei.Probe = dummy.Probe{} + userData, err := vei.Probe.GetExpandedUserData(userDataVariables) if err != nil { return g.Output.AddError(err) } - g.Logger.Debug(vei.Ctx, "Generated userdata script:\n---\n%s\n---", userData) if vei.CloudImageID == "" { - vei.CloudImageID = cloudImageIDDefault + vei.CloudImageID, err = vei.Probe.GetMachineImageID(vei.PlatformType, vei.CPUArchitecture, vei.GCP.Region) + if err != nil { + return g.Output.AddError(err) + } } //image list https://cloud.google.com/compute/docs/images/os-details#red_hat_enterprise_linux_rhel - instance, err := g.createComputeServiceInstance(createComputeServiceInstanceInput{ projectID: vei.GCP.ProjectID, zone: vei.GCP.Zone, @@ -89,7 +138,8 @@ func (g *GcpVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O g.Logger.Debug(vei.Ctx, "Waiting for ComputeService instance %s to be running", instance.Name) if instanceReadyErr := g.waitForComputeServiceInstanceCompletion(vei.GCP.ProjectID, vei.GCP.Zone, instance.Name); instanceReadyErr != nil { - err = g.GcpClient.TerminateComputeServiceInstance(vei.GCP.ProjectID, vei.GCP.Zone, instance.Name) // try to terminate the created instanc + // try to terminate instance if instance is not running + err = g.GcpClient.TerminateComputeServiceInstance(vei.GCP.ProjectID, vei.GCP.Zone, instance.Name) if err != nil { g.Output.AddError(err) } diff --git a/pkg/verifier/gcp/gcp_verifier.go b/pkg/verifier/gcp/gcp_verifier.go index 37f99ccd..654d2772 100644 --- a/pkg/verifier/gcp/gcp_verifier.go +++ b/pkg/verifier/gcp/gcp_verifier.go @@ -92,6 +92,13 @@ func (g *GcpVerifier) createComputeServiceInstance(input createComputeServiceIns { Name: input.networkName, Subnetwork: input.vpcSubnetID, + // Only one accessConfigs exist which is ONE_TO_ONE_NAT + // needed for external internet access 
including egress + AccessConfigs: []*computev1.AccessConfig{ + { + Name: "External NAT", + }, + }, }, }, ServiceAccounts: []*computev1.ServiceAccount{ diff --git a/pkg/verifier/gcp/gcp_verifier_functions.go b/pkg/verifier/gcp/gcp_verifier_functions.go new file mode 100644 index 00000000..407678e5 --- /dev/null +++ b/pkg/verifier/gcp/gcp_verifier_functions.go @@ -0,0 +1,44 @@ +package gcpverifier + +import ( + "fmt" + "strings" + + "github.com/openshift/osd-network-verifier/pkg/helpers" + "github.com/openshift/osd-network-verifier/pkg/probes" +) + +// function that tests probe order logic that is part of findUnreachableEndpoints in gcp_verifier.go +// getTokens checks for the presence of startingToken and endingToken in the consoleOutput +// probe output should be between startingToken and endingToken +func getTokens(consoleOutput string, probe probes.Probe) bool { + // Check for startingToken and endingToken + startingTokenSeen := strings.Contains(consoleOutput, probe.GetStartingToken()) + endingTokenSeen := strings.Contains(consoleOutput, probe.GetEndingToken()) + if !startingTokenSeen { + if endingTokenSeen { + fmt.Printf("raw console logs:\n---\n%s\n---", consoleOutput) + fmt.Printf("probe output corrupted: endingToken encountered before startingToken") + return false + } + fmt.Printf("consoleOutput contains data, but probe has not yet printed startingToken, continuing to wait...") + return false + } + if !endingTokenSeen { + fmt.Printf("consoleOutput contains data, but probe has not yet printed endingToken, continuing to wait...") + return false + } + // If we make it this far, we know that both startingTokenSeen and endingTokenSeen are true + + // Separate the probe's output from the rest of the console output (using startingToken and endingToken) + rawProbeOutput := strings.TrimSpace(helpers.CutBetween(consoleOutput, probe.GetStartingToken(), probe.GetEndingToken())) + if len(rawProbeOutput) < 1 { + fmt.Printf("raw console logs:\n---\n%s\n---", 
consoleOutput) + fmt.Printf("probe output corrupted: no data between startingToken and endingToken") + return false + } + // Send probe's output off to the Probe interface for parsing + fmt.Printf("probe output:\n---\n%s\n---", rawProbeOutput) + + return true +} diff --git a/pkg/verifier/gcp/gcp_verifier_functions_test.go b/pkg/verifier/gcp/gcp_verifier_functions_test.go new file mode 100644 index 00000000..d6165d16 --- /dev/null +++ b/pkg/verifier/gcp/gcp_verifier_functions_test.go @@ -0,0 +1,60 @@ +package gcpverifier + +import ( + "testing" + + "github.com/openshift/osd-network-verifier/pkg/probes" + "github.com/openshift/osd-network-verifier/pkg/probes/curl" +) + +func TestGetTokens(t *testing.T) { + type args struct { + consoleOutput string + probe probes.Probe + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "tokens in order", + args: args{ + consoleOutput: "otherinfoNV_CURLJSON_BEGIN\nhello world\nNV_CURLJSON_END\njj", + probe: curl.Probe{}, + }, + want: true, + }, + { + name: "only start token", + args: args{ + consoleOutput: "NV_CURLJSON_BEGIN\nhello world\n", + probe: curl.Probe{}, + }, + want: false, + }, + { + name: "only end token", + args: args{ + consoleOutput: "hello world\nNV_CURLJSON_END\njj", + probe: curl.Probe{}, + }, + want: false, + }, + { + name: "token order reversed", + args: args{ + consoleOutput: "fjsdklNV_CURLJSON_END\nhello world\nNV_CURLJSON_BEGIN\njj", + probe: curl.Probe{}, + }, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := getTokens(tt.args.consoleOutput, tt.args.probe); got != tt.want { + t.Errorf("get_tokens() = %v, want %v", got, tt.want) + } + }) + } +}