Skip to content

Commit

Permalink
Integrate new probe interface into GCP framework (#259)
Browse files Browse the repository at this point in the history
* add gcp_curl_probe

* change startup-script to split printing and curl

service to print userdata begin runs multiple times despite
succeeding. service to run curl fails. printing end token requires
curl to succeed so it does not start running. crash recovery kernel
arming runs every time vm is created.

* added shell script to run curl and access config

shell script prints starting and ending token and runs curl. shell
script checks if command was successful. also added external IP so
curl has an output

* add systemd instance termination after set delay

used a systemd script and shell script that gets GCP NAME and ZONE
so the instance self deletes. this ensures that if something
happens to the client, the resource is not left on customer
accounts.

* silence kernel completely and fetch egress URLs

Egress URLs are fetched from GitHub as done in the AWS verifier

* remove target after multi-user

target after multi-user unneeded because other outputs are
completely silenced and won't interfere with probe output

* ignore curl error codes that are network errors

systemd service that runs curl should only fail if bash script
returns an error code, this happens if curl error code is
1-4, 27, 41-43, 45 which means curl failed, not a network failure

* update startup-script and gcp probe comments

* combine GCP curl probe into AWS curl probe

added userDataTemplate to the probe interface so
getExpandedUserData can take in different templates for GCP and AWS

* Update pkg/probes/legacy/legacy.go

Co-authored-by: Alex Vulaj <ajvulaj@gmail.com>

* Update pkg/verifier/aws/entry_point.go

Co-authored-by: Alex Vulaj <ajvulaj@gmail.com>

* clean code

* change startup-script to use all userDataVariables

made minor changes to curl probe to make it more readable and added
functionality to startup-script to add cacerts and export proxy
environment variables if specified

* revert probe interface and pass platform as map

a workaround of not changing the probe interface and using a
different template for GetExpandedUserData is to pass a
userDataVariable through the probe interface. In the
GetExpandedUserData, based off if a GCP platform exists, the
function chooses userdata-template or startup-script.
also changed gcp verifier to use GetMachineImageID which sets
default machine image.

* remove unused testing function

* remove unused packages and add comment

* use CPUArchitecture.DefaultInstanceType function

* add error

* Update comment for userDataVariable

Co-authored-by: Anthony Byrne <abyrne@redhat.com>

* Change time unit to seconds

Co-authored-by: Anthony Byrne <abyrne@redhat.com>

* import time

* change get_tokens function to camel case

---------

Co-authored-by: Alex Vulaj <ajvulaj@gmail.com>
Co-authored-by: Anthony Byrne <abyrne@redhat.com>
  • Loading branch information
3 people authored Aug 9, 2024
1 parent 90e5f19 commit e338e54
Show file tree
Hide file tree
Showing 7 changed files with 286 additions and 33 deletions.
9 changes: 9 additions & 0 deletions pkg/probes/curl/curl_json.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ type Probe struct{}
//go:embed userdata-template.yaml
var userDataTemplate string

//go:embed systemd-template.sh
var systemdTemplate string

const startingToken = "NV_CURLJSON_BEGIN"
const endingToken = "NV_CURLJSON_END"
const outputLinePrefix = "@NV@"
Expand Down Expand Up @@ -82,6 +85,12 @@ func (clp Probe) GetMachineImageID(platformType string, cpuArch cpu.Architecture
// values *are* provided for variables that must be set to a certain value for the probe to
// function correctly (presetUserDataVariables) -- this function will fill-in those values for you.
func (clp Probe) GetExpandedUserData(userDataVariables map[string]string) (string, error) {
// Use systemd to run curl (instead of cloud-init) if requested. Useful for
// platforms that don't include cloud-init in their OS images (e.g., GCP)
if userDataVariables["USE_SYSTEMD"] == "true" {
userDataTemplate = systemdTemplate
}

// Extract required variables specified in template (if any)
directivelessUserDataTemplate, requiredVariables := helpers.ExtractRequiredVariablesDirective(userDataTemplate)

Expand Down
4 changes: 2 additions & 2 deletions pkg/probes/curl/machine_images.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ var cloudMachineImageMap = map[string]map[cpu.Architecture]map[string]string{
// See function docstring's note on GCP; tl;dr: deepest key should be "*"
helpers.PlatformGCP: {
cpu.ArchX86: {
"*": "rhel-9",
"*": "rhel-9-v20240709",
},
cpu.ArchARM: {
"*": "rhel-9-arm64",
"*": "rhel-9-arm64-v20240709",
},
},
}
83 changes: 83 additions & 0 deletions pkg/probes/curl/systemd-template.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/bin/sh
# GCP compute engine copies startup script to VM and runs script as root when the VM boots
# This file is a template: the curly-brace placeholders are filled in by the
# probe before the script is handed to the VM.

# get name and zone needed for instance deletion from compute metadata server
# (the heredoc delimiter is unquoted, so the metadata lookups run now and their
# results are written into terminate.sh as literal values)
cat <<EOF > /usr/bin/terminate.sh
#! /bin/sh
if gcloud --quiet compute instances delete $(curl -X GET http://metadata.google.internal/computeMetadata/v1/instance/name -H 'Metadata-Flavor: Google') --zone=$(curl -X GET http://metadata.google.internal/computeMetadata/v1/instance/zone -H 'Metadata-Flavor: Google'); then : ; else
exit 255
fi
EOF

# print curl output and tokens to serial output for client
# (quoted delimiter: nothing in this heredoc is expanded by the shell at write time;
# the listed curl exit codes are treated as curl failures rather than network failures)
cat <<'EOF' > /usr/bin/curl.sh
#! /bin/sh
array=(1 2 3 4 27 41 42 43 45)
if echo ${USERDATA_BEGIN} > /dev/ttyS0 ; then : ; else
exit 255
fi
curl --retry 3 --retry-connrefused -t B -Z -s -I -m ${TIMEOUT} -w "%{stderr}${LINE_PREFIX}%{json}\n" ${CURLOPT} ${URLS} --proto =http,https,telnet ${TLSDISABLED_URLS_RENDERED} 2>/dev/ttyS0
ret=$?
value="\<${ret}\>"
if [[ " ${array[@]} " =~ $value ]]; then
exit 255
fi
if echo ${USERDATA_END} > /dev/ttyS0 ; then : ; else
exit 255
fi
EOF

# create systemd units for silencing serial console, running curl and deleting instance
cat <<EOF > /etc/systemd/system/silence.service
[Unit]
Description=Serial Console Silencing Service
[Service]
Type=oneshot
ExecStart=systemctl mask --now serial-getty@ttyS0.service
ExecStart=systemctl disable --now syslog.socket rsyslog.service
ExecStart=sysctl -w kernel.printk="0 4 0 7"
ExecStart=kill -SIGRTMIN+21 1
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
cat <<EOF > /etc/systemd/system/curl.service
[Unit]
Description=Curl Output Service
[Service]
Type=oneshot
ExecStart=/usr/bin/curl.sh
Restart=on-failure
RemainAfterExit=true
[Install]
WantedBy=multi-user.target
EOF
cat <<EOF > /etc/systemd/system/terminate.service
[Unit]
Description=Compute Instance Deletion Service
[Service]
Type=oneshot
ExecStart=/usr/bin/terminate.sh
Restart=on-failure
EOF
cat <<EOF > /etc/systemd/system/terminate.timer
[Unit]
Description=Instance Deletion Timer
[Timer]
OnBootSec=${DELAY}min
Unit=terminate.service
[Install]
WantedBy=multi-user.target
EOF

# if cacert is provided, curl probe adds CURLOPT to use the provided cacert
# The certificate is handed to this template base64-encoded, so it must be
# decoded (not encoded a second time) to obtain a usable PEM file.
echo "${CACERT}" | base64 --decode > /proxy.pem
chmod 0755 /proxy.pem

# set proxy environment variables, make script executable and start systemd services
# NOTE(review): these exports only affect this script's own environment;
# confirm whether the systemd-started curl.service actually sees them.
export http_proxy=${HTTP_PROXY} https_proxy=${HTTPS_PROXY}
# scripts are run by system services, so keep them writable by the owner only
chmod 755 /usr/bin/curl.sh /usr/bin/terminate.sh
systemctl daemon-reload
systemctl start silence
systemctl start curl
systemctl start terminate.timer
112 changes: 81 additions & 31 deletions pkg/verifier/gcp/entry_point.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,18 @@ import (
"fmt"
"math/rand"
"strconv"
"time"

"github.com/openshift/osd-network-verifier/pkg/data/cpu"
"github.com/openshift/osd-network-verifier/pkg/data/egress_lists"
"github.com/openshift/osd-network-verifier/pkg/helpers"
"github.com/openshift/osd-network-verifier/pkg/output"
"github.com/openshift/osd-network-verifier/pkg/probes/dummy"
"github.com/openshift/osd-network-verifier/pkg/probes/curl"
"github.com/openshift/osd-network-verifier/pkg/verifier"
)

const (
cloudImageIDDefault = "rhel-9-v20240703"
DEFAULT_INSTANCE_TYPE = "e2-micro"
DEFAULT_TIMEOUT = 5 * time.Second
)

// validateEgress performs validation process for egress
Expand All @@ -23,52 +26,98 @@ const (
// - find unreachable endpoints & parse output, then terminate instance
// - return `g.output` which stores the execution results
func (g *GcpVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.Output {
g.Logger.Debug(vei.Ctx, "Using configured timeout of %s for each egress request", vei.Timeout.String())
//default gcp machine e2
if vei.InstanceType == "" {
vei.InstanceType = DEFAULT_INSTANCE_TYPE
// Validate cloud platform type and default to PlatformGCP if not specified
if vei.PlatformType == "" {
vei.PlatformType = helpers.PlatformGCP
}
if !vei.CPUArchitecture.IsValid() {
vei.CPUArchitecture = cpu.ArchX86
}
// Default to curl.Probe if no Probe specified
if vei.Probe == nil {
vei.Probe = curl.Probe{}
g.Logger.Debug(vei.Ctx, "defaulted to curl probe")
}

// Set timeout to default if not specified
if vei.Timeout <= 0 {
vei.Timeout = DEFAULT_TIMEOUT
}
g.Logger.Debug(vei.Ctx, "configured a %s timeout for each egress request", vei.Timeout)

// need to set InstanceType here because default is a AWS machine type
vei.InstanceType = DEFAULT_INSTANCE_TYPE
// Set instance type to default if not specified and validate it
if vei.InstanceType == "" {
var err error
vei.InstanceType, err = vei.CPUArchitecture.DefaultInstanceType(helpers.PlatformGCP)
if err != nil {
return g.Output.AddError(err)
}
g.Logger.Debug(vei.Ctx, fmt.Sprintf("defaulted to instance type %s", vei.InstanceType))
}

if err := g.validateMachineType(vei.GCP.ProjectID, vei.GCP.Zone, vei.InstanceType); err != nil {
return g.Output.AddError(fmt.Errorf("instance type %s is invalid: %s", vei.InstanceType, err))
}

// Fetch the egress URL list as two strings (one for normal URLs, the other
// for TLS disabled URLs); note that this is TOTALLY IGNORED by LegacyProbe,
// as that probe only knows how to use the egress URL lists baked into its
// AMIs/container images
// egressListStr, tlsDisabledEgressListStr, err := egress_lists.GetEgressListAsString(vei.PlatformType, a.AwsClient.Region)
// if err != nil {
// return a.Output.AddError(err)
// }
// Fetch the egress URL list from github, falling back to local lists in the event of a failure.
egressListYaml := vei.EgressListYaml
var egressListStr, tlsDisabledEgressListStr string
if egressListYaml == "" {
githubEgressList, githubListErr := egress_lists.GetGithubEgressList(vei.PlatformType)
if githubListErr == nil {
egressListYaml, githubListErr = githubEgressList.GetContent()
if githubListErr == nil {
g.Logger.Debug(vei.Ctx, "Using egress URL list from %s at SHA %s", githubEgressList.GetURL(), githubEgressList.GetSHA())
egressListStr, tlsDisabledEgressListStr, githubListErr = egress_lists.EgressListToString(egressListYaml, map[string]string{})
}
}
if githubListErr != nil {
var err error
g.Output.AddError(fmt.Errorf("failed to get egress list from GitHub, falling back to local list: %v", githubListErr))
egressListYaml, err = egress_lists.GetLocalEgressList(vei.PlatformType)
if err != nil {
return g.Output.AddError(err)
}
egressListStr, tlsDisabledEgressListStr, err = egress_lists.EgressListToString(egressListYaml, map[string]string{})
if err != nil {
return g.Output.AddError(err)
}
}
}

// Generate the userData file
// Expand replaces all ${var} (using empty string for unknown ones), adding the env variables used in startup-script.sh
userDataVariables := map[string]string{
"AWS_REGION": "us-east-2", // Not sure if this is the correct data
"TIMEOUT": vei.Timeout.String(),
"HTTP_PROXY": vei.Proxy.HttpProxy,
"HTTPS_PROXY": vei.Proxy.HttpsProxy,
"CACERT": base64.StdEncoding.EncodeToString([]byte(vei.Proxy.Cacert)),
"NOTLS": strconv.FormatBool(vei.Proxy.NoTls),
"DELAY": "5",
"URLS": "quay.io",
"TIMEOUT": vei.Timeout.String(),
"HTTP_PROXY": vei.Proxy.HttpProxy,
"HTTPS_PROXY": vei.Proxy.HttpsProxy,
"CACERT": base64.StdEncoding.EncodeToString([]byte(vei.Proxy.Cacert)),
"NOTLS": strconv.FormatBool(vei.Proxy.NoTls),
"DELAY": "5",
"URLS": egressListStr,
"TLSDISABLED_URLS": tlsDisabledEgressListStr,
// Add fake userDatavariables to replace normal shell variables in startup-script.sh which will otherwise be erased by os.Expand
"ret": "${ret}",
"?": "$?",
"array[@]": "${array[@]}",
"value": "$value",
"USE_SYSTEMD": "true",
}
// set probe
vei.Probe = dummy.Probe{}

userData, err := vei.Probe.GetExpandedUserData(userDataVariables)
if err != nil {
return g.Output.AddError(err)
}

g.Logger.Debug(vei.Ctx, "Generated userdata script:\n---\n%s\n---", userData)

if vei.CloudImageID == "" {
vei.CloudImageID = cloudImageIDDefault
vei.CloudImageID, err = vei.Probe.GetMachineImageID(vei.PlatformType, vei.CPUArchitecture, vei.GCP.Region)
if err != nil {
return g.Output.AddError(err)
}
}

//image list https://cloud.google.com/compute/docs/images/os-details#red_hat_enterprise_linux_rhel

instance, err := g.createComputeServiceInstance(createComputeServiceInstanceInput{
projectID: vei.GCP.ProjectID,
zone: vei.GCP.Zone,
Expand All @@ -89,7 +138,8 @@ func (g *GcpVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O

g.Logger.Debug(vei.Ctx, "Waiting for ComputeService instance %s to be running", instance.Name)
if instanceReadyErr := g.waitForComputeServiceInstanceCompletion(vei.GCP.ProjectID, vei.GCP.Zone, instance.Name); instanceReadyErr != nil {
err = g.GcpClient.TerminateComputeServiceInstance(vei.GCP.ProjectID, vei.GCP.Zone, instance.Name) // try to terminate the created instanc
// try to terminate instance if instance is not running
err = g.GcpClient.TerminateComputeServiceInstance(vei.GCP.ProjectID, vei.GCP.Zone, instance.Name)
if err != nil {
g.Output.AddError(err)
}
Expand Down
7 changes: 7 additions & 0 deletions pkg/verifier/gcp/gcp_verifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ func (g *GcpVerifier) createComputeServiceInstance(input createComputeServiceIns
{
Name: input.networkName,
Subnetwork: input.vpcSubnetID,
// Only one accessConfigs exist which is ONE_TO_ONE_NAT
// needed for external internet access including egress
AccessConfigs: []*computev1.AccessConfig{
{
Name: "External NAT",
},
},
},
},
ServiceAccounts: []*computev1.ServiceAccount{
Expand Down
44 changes: 44 additions & 0 deletions pkg/verifier/gcp/gcp_verifier_functions.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package gcpverifier

import (
"fmt"
"strings"

"github.com/openshift/osd-network-verifier/pkg/helpers"
"github.com/openshift/osd-network-verifier/pkg/probes"
)

// getTokens reports whether consoleOutput holds a complete probe output, i.e.
// non-empty data enclosed between the probe's starting and ending tokens.
//
// It exists to exercise the token-ordering logic that is part of
// findUnreachableEndpoints in gcp_verifier.go. A diagnostic line is printed to
// stdout for every outcome, and false is returned when:
//   - the starting token has not been printed yet (whether or not the ending
//     token is present),
//   - the ending token has not been printed yet, or
//   - nothing but whitespace can be extracted between the two tokens.
func getTokens(consoleOutput string, probe probes.Probe) bool {
	startingToken := probe.GetStartingToken()
	endingToken := probe.GetEndingToken()

	// Check for startingToken and endingToken
	startingTokenSeen := strings.Contains(consoleOutput, startingToken)
	endingTokenSeen := strings.Contains(consoleOutput, endingToken)

	if !startingTokenSeen {
		if endingTokenSeen {
			// An ending token without a starting token means the output is unusable
			fmt.Printf("raw console logs:\n---\n%s\n---\n", consoleOutput)
			fmt.Println("probe output corrupted: endingToken encountered before startingToken")
			return false
		}
		fmt.Println("consoleOutput contains data, but probe has not yet printed startingToken, continuing to wait...")
		return false
	}
	if !endingTokenSeen {
		fmt.Println("consoleOutput contains data, but probe has not yet printed endingToken, continuing to wait...")
		return false
	}
	// If we make it this far, we know that both startingTokenSeen and endingTokenSeen are true

	// Separate the probe's output from the rest of the console output (using startingToken and endingToken)
	rawProbeOutput := strings.TrimSpace(helpers.CutBetween(consoleOutput, startingToken, endingToken))
	if len(rawProbeOutput) < 1 {
		fmt.Printf("raw console logs:\n---\n%s\n---\n", consoleOutput)
		fmt.Println("probe output corrupted: no data between startingToken and endingToken")
		return false
	}

	// At this point the real verifier would hand the output to the Probe
	// interface for parsing; here it is only printed.
	fmt.Printf("probe output:\n---\n%s\n---\n", rawProbeOutput)

	return true
}
60 changes: 60 additions & 0 deletions pkg/verifier/gcp/gcp_verifier_functions_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package gcpverifier

import (
"testing"

"github.com/openshift/osd-network-verifier/pkg/probes"
"github.com/openshift/osd-network-verifier/pkg/probes/curl"
)

// TestGetTokens checks that getTokens only reports success when the console
// output contains the curl probe's starting token, then some data, then the
// ending token.
func TestGetTokens(t *testing.T) {
	type args struct {
		consoleOutput string
		probe         probes.Probe
	}
	tests := []struct {
		name string
		args args
		want bool
	}{
		{
			name: "tokens in order",
			args: args{
				consoleOutput: "otherinfoNV_CURLJSON_BEGIN\nhello world\nNV_CURLJSON_END\njj",
				probe:         curl.Probe{},
			},
			want: true,
		},
		{
			name: "only start token",
			args: args{
				consoleOutput: "NV_CURLJSON_BEGIN\nhello world\n",
				probe:         curl.Probe{},
			},
			want: false,
		},
		{
			name: "only end token",
			args: args{
				consoleOutput: "hello world\nNV_CURLJSON_END\njj",
				probe:         curl.Probe{},
			},
			want: false,
		},
		{
			name: "token order reversed",
			args: args{
				consoleOutput: "fjsdklNV_CURLJSON_END\nhello world\nNV_CURLJSON_BEGIN\njj",
				probe:         curl.Probe{},
			},
			want: false,
		},
		{
			// Neither token has been printed yet
			name: "no tokens",
			args: args{
				consoleOutput: "hello world\n",
				probe:         curl.Probe{},
			},
			want: false,
		},
		{
			// Both tokens present and ordered, but only whitespace in between
			name: "nothing between tokens",
			args: args{
				consoleOutput: "NV_CURLJSON_BEGIN\n \nNV_CURLJSON_END",
				probe:         curl.Probe{},
			},
			want: false,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := getTokens(tt.args.consoleOutput, tt.args.probe); got != tt.want {
				t.Errorf("getTokens() = %v, want %v", got, tt.want)
			}
		})
	}
}

0 comments on commit e338e54

Please sign in to comment.