Skip to content
This repository has been archived by the owner on Jul 25, 2022. It is now read-only.

SSH code optimization #231

Merged
merged 1 commit into from
Jul 23, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 37 additions & 32 deletions pkg/cmd/ssh_aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,27 @@ func sshToAWSNode(nodeName, path, user, pathSSKeypair string, sshPublicKey []byt
fmt.Println("(3/4) Creating bastion host")
a.createBastionHostInstance()

fmt.Println("Waiting SSH 22 port open")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe in the future it makes sense to have a logging framework with configurable verbosity levels.
For instance, it would let us also see the verbose output of the SSH command (useful for understanding what went wrong).
Just to keep in mind.

Copy link
Contributor Author

@tedteng tedteng Jul 23, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, no problem. At the moment, after this fix, the output looks like the example below when SSH fails. The brief message "Connection closed by 3.121.114.80 port 22" lets us identify whether the failure is between the local machine and the bastion instance or between the bastion instance and the node instance.

SSH gardener@3.121.114.80 => gardener@ip-10-250-23-169.eu-central-1.compute.internalConnection closed by 3.121.114.80 port 22
kex_exchange_identification: Connection closed by remote host
exit status 255

After investigating again, I have some new findings. I think the root cause lies in AWS EC2: in my testing, the SSH service of an EC2 instance may not be fully running immediately after the instance status changes from pending/initializing to running. This can cause the first SSH command to hang until it times out and exits. So this time I detect when port 22 is open and retry the SSH command.

-Ted

attemptCnt := 0
for attemptCnt < 60 {
ncCmd := fmt.Sprintf("nc -rz -w 5 %s 22", a.BastionIP)
cmd := exec.Command("bash", "-c", ncCmd)
output, _ := cmd.CombinedOutput()
fmt.Println(" ")
if strings.Contains(string(output), "succeeded") {
break
}
time.Sleep(time.Second * 5)
attemptCnt++
}

bastionNode := user + "@" + a.BastionIP
node := user + "@" + nodeName
fmt.Println("Waiting 60 seconds until ports are open.")
time.Sleep(60 * time.Second)

fmt.Print("SSH " + bastionNode + " => " + node)
key := filepath.Join(pathSSKeypair, "key")
sshCmd := fmt.Sprintf("ssh -i " + key + " -o \"ProxyCommand ssh -W %%h:%%p -i " + key + " -o IdentitiesOnly=yes -o StrictHostKeyChecking=no " + bastionNode + "\" " + node + " -o IdentitiesOnly=yes -o StrictHostKeyChecking=no")

sshCmd := fmt.Sprintf("ssh -i " + key + " -o ConnectionAttempts=3 -o \"ProxyCommand ssh -W %%h:%%p -i " + key + " -o IdentitiesOnly=yes -o StrictHostKeyChecking=no " + bastionNode + "\" " + node + " -o IdentitiesOnly=yes -o StrictHostKeyChecking=no")
cmd := exec.Command("bash", "-c", sshCmd)
cmd.Stdout = os.Stdout
cmd.Stdin = os.Stdin
Expand Down Expand Up @@ -161,7 +175,7 @@ func (a *AwsInstanceAttribute) createBastionHostSecurityGroup() {
// check if security group exists
a.getBastionSecurityGroupID()
if a.BastionSecurityGroupID != "" {
fmt.Println("Security Group exists, skipping creation.")
fmt.Println("Security Group exists " + a.BastionSecurityGroupID + " skipping creation.")
return
}

Expand Down Expand Up @@ -266,34 +280,22 @@ func (a *AwsInstanceAttribute) createBastionHostInstance() {
a.BastionInstanceID = value
}
}
fmt.Println("Bastion host instance created.")
fmt.Println("Bastion host instance " + a.BastionInstanceID + " Initializing.")
fmt.Println("")

// check if bastion host is up and running, timeout after 3 minutes
attemptCnt := 0
for attemptCnt < 60 {
arguments = "aws ec2 describe-instances --instance-id=" + a.BastionInstanceID + " --query Reservations[*].Instances[].[State.Name] --output text"
captured = capture()
operate("aws", arguments)
capturedOutput, err = captured()
checkError(err)
fmt.Println("Instance State: " + capturedOutput)
if strings.Trim(capturedOutput, "\n") == "running" {
arguments := "aws ec2 describe-instances --instance-id " + a.BastionInstanceID + " --query Reservations[*].Instances[*].PublicIpAddress"
captured := capture()
operate("aws", arguments)
capturedOutput, err := captured()
checkError(err)
a.BastionIP = capturedOutput
return
}
time.Sleep(time.Second * 2)
attemptCnt++
}
if attemptCnt == 90 {
fmt.Println("Bastion server instance timeout. Please try again.")
os.Exit(2)
}
// wait until the bastion instance is running
arguments = "aws ec2 wait instance-running --instance-ids " + a.BastionInstanceID
operate("aws", arguments)
fmt.Println("Bastion host instance running.")

// fetch the bastion instance's public IP address
arguments = "aws ec2 describe-instances --instance-id " + a.BastionInstanceID + " --query Reservations[*].Instances[*].PublicIpAddress"
captured = capture()
operate("aws", arguments)
capturedOutput, err = captured()
checkError(err)
a.BastionIP = strings.Trim(capturedOutput, "\n")

}

// getAWSMachineClasses returns machine classes for the cluster nodes
Expand Down Expand Up @@ -344,8 +346,11 @@ func (a *AwsInstanceAttribute) cleanupAwsBastionHost() {
// clean up bastion security group
fmt.Println(" (3/3) Clean up bastion host security group")
fmt.Println("")
fmt.Println(" Waiting 45 seconds until instance is deleted to remove all dependencies.")
time.Sleep(time.Second * 45)
arguments = "aws ec2 wait instance-terminated --instance-ids " + a.BastionInstanceID
captured = capture()
operate("aws", arguments)
_, err = captured()
checkError(err)
arguments = fmt.Sprintf("aws ec2 delete-security-group --group-id %s", a.BastionSecurityGroupID)
captured = capture()
operate("aws", arguments)
Expand Down