added current eks docs #968

Open · wants to merge 1 commit into base: staging

docs/eks/cluster-autoscaler.yaml (160 additions, 0 deletions)
@@ -0,0 +1,160 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    k8s-addon: cluster-autoscaler.addons.k8s.io
    k8s-app: cluster-autoscaler
  name: cluster-autoscaler
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: cluster-autoscaler
  labels:
    k8s-addon: cluster-autoscaler.addons.k8s.io
    k8s-app: cluster-autoscaler
rules:
  - apiGroups: [""]
    resources: ["events", "endpoints"]
    verbs: ["create", "patch"]
  - apiGroups: [""]
    resources: ["pods/eviction"]
    verbs: ["create"]
  - apiGroups: [""]
    resources: ["pods/status"]
    verbs: ["update"]
  - apiGroups: [""]
    resources: ["endpoints"]
    resourceNames: ["cluster-autoscaler"]
    verbs: ["get", "update"]
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["watch", "list", "get", "update"]
  - apiGroups: [""]
    resources:
      - "pods"
      - "services"
      - "replicationcontrollers"
      - "persistentvolumeclaims"
      - "persistentvolumes"
    verbs: ["watch", "list", "get"]
  - apiGroups: ["extensions"]
    resources: ["replicasets", "daemonsets"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["policy"]
    resources: ["poddisruptionbudgets"]
    verbs: ["watch", "list"]
  - apiGroups: ["apps"]
    resources: ["statefulsets", "replicasets", "daemonsets"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["batch", "extensions"]
    resources: ["jobs"]
    verbs: ["get", "list", "watch", "patch"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["csinodes"]
    verbs: ["watch", "list", "get"]

---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: cluster-autoscaler
  namespace: kube-system
  labels:
    k8s-addon: cluster-autoscaler.addons.k8s.io
    k8s-app: cluster-autoscaler
rules:
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["create", "list", "watch"]
  - apiGroups: [""]
    resources: ["configmaps"]
    resourceNames: ["cluster-autoscaler-status", "cluster-autoscaler-priority-expander"]
    verbs: ["delete", "get", "update", "watch"]

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: cluster-autoscaler
  labels:
    k8s-addon: cluster-autoscaler.addons.k8s.io
    k8s-app: cluster-autoscaler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-autoscaler
subjects:
  - kind: ServiceAccount
    name: cluster-autoscaler
    namespace: kube-system

---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: cluster-autoscaler
  namespace: kube-system
  labels:
    k8s-addon: cluster-autoscaler.addons.k8s.io
    k8s-app: cluster-autoscaler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: cluster-autoscaler
subjects:
  - kind: ServiceAccount
    name: cluster-autoscaler
    namespace: kube-system

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cluster-autoscaler
  namespace: kube-system
  labels:
    app: cluster-autoscaler
spec:
  replicas: 1
  selector:
    matchLabels:
      app: cluster-autoscaler
  template:
    metadata:
      labels:
        app: cluster-autoscaler
    spec:
      serviceAccountName: cluster-autoscaler
      containers:
        - image: us.gcr.io/k8s-artifacts-prod/autoscaling/cluster-autoscaler:v1.16.7
          name: cluster-autoscaler
          resources:
            limits:
              cpu: 100m
              memory: 600Mi
            requests:
              cpu: 100m
              memory: 600Mi
          command:
            - ./cluster-autoscaler
            - --v=4
            - --stderrthreshold=info
            - --cloud-provider=aws
            - --skip-nodes-with-local-storage=false
            - --expander=least-waste
            - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/pangeo
          volumeMounts:
            - name: ssl-certs
              mountPath: /etc/ssl/certs/ca-certificates.crt
              readOnly: true
          imagePullPolicy: "Always"
      volumes:
        - name: ssl-certs
          hostPath:
            path: "/etc/ssl/certs/ca-bundle.crt"
docs/eks/eksctl-config-20200510.yml (200 additions, 0 deletions)
@@ -0,0 +1,200 @@
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
  name: pangeo
  region: us-west-2

availabilityZones: ["us-west-2a", "us-west-2b", "us-west-2d"]

cloudWatch:
  clusterLogging:
    enableTypes: ["*"]

# manually create role and modify trust relationships

iam:
  withOIDC: true
  serviceAccounts:
    - metadata:
        name: pangeo
        namespace: icesat2-prod
        labels:
          aws-usage: "application"
      attachPolicyARNs:
        - "arn:aws:iam::783380859522:policy/pangeo-data-s3"
    - metadata:
        name: pangeo
        namespace: icesat2-staging
        labels:
          aws-usage: "application"
      attachPolicyARNs:
        - "arn:aws:iam::783380859522:policy/pangeo-data-s3"
    - metadata:
        name: cluster-autoscaler
        namespace: kube-system
        labels:
          aws-usage: "cluster-ops"
          k8s-addon: cluster-autoscaler.addons.k8s.io
          k8s-app: cluster-autoscaler
      attachPolicyARNs:
        - "arn:aws:iam::783380859522:policy/cluster-autoscaler"

nodeGroups:
  - name: core-spot
    availabilityZones: ["us-west-2a"]  # where the persistent hub-db PV lives
    minSize: 1
    maxSize: 2
    desiredCapacity: 1
    privateNetworking: true
    volumeSize: 100
    volumeType: gp2
    labels:
      node-role.kubernetes.io/core: core
      hub.jupyter.org/node-purpose: core
    ami: auto
    amiFamily: AmazonLinux2
    instancesDistribution:
      instanceTypes:
        - t3a.large
        - t3.large
      spotInstancePools: 2
      onDemandBaseCapacity: 0
      onDemandPercentageAboveBaseCapacity: 0  # all spot
    iam:
      withAddonPolicies:
        autoScaler: true
  - name: user-spot
    minSize: 0
    maxSize: 50
    desiredCapacity: 0
    privateNetworking: true
    instancesDistribution:
      instanceTypes:
        - m5.2xlarge
        - m5a.2xlarge
        - m5n.2xlarge
      spotInstancePools: 3
      onDemandBaseCapacity: 0
      onDemandPercentageAboveBaseCapacity: 0  # all spot
    volumeSize: 100
    volumeType: gp2
    labels:
      node-role.kubernetes.io/user: user
      hub.jupyter.org/node-purpose: user
    taints:
      hub.jupyter.org/dedicated: 'user:NoSchedule'
    tags:
      k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user
      k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org/dedicated: 'user:NoSchedule'
    ami: auto
    amiFamily: AmazonLinux2
    iam:
      withAddonPolicies:
        autoScaler: true
    preBootstrapCommands:  # block pod access to EC2 instance metadata; see https://github.com/weaveworks/eksctl/issues/1310
      - yum install -y iptables-services
      - iptables --insert FORWARD 1 --in-interface eni+ --destination 169.254.169.254/32 --jump DROP
      - iptables-save | tee /etc/sysconfig/iptables
      - systemctl enable --now iptables
  - name: worker-spot
    minSize: 0
    maxSize: 50
    desiredCapacity: 0
    privateNetworking: true
    instancesDistribution:
      instanceTypes:
        - r5.2xlarge
        - r5a.2xlarge
        - r5n.2xlarge
      spotInstancePools: 3
      onDemandBaseCapacity: 0
      onDemandPercentageAboveBaseCapacity: 0
    volumeSize: 100
    volumeType: gp2
    labels:
      node-role.kubernetes.io/worker: worker
      k8s.dask.org/node-purpose: worker
    taints:
      k8s.dask.org/dedicated: 'worker:NoSchedule'
    tags:
      k8s.io/cluster-autoscaler/node-template/label/k8s.dask.org/node-purpose: worker
      k8s.io/cluster-autoscaler/node-template/taint/k8s.dask.org/dedicated: 'worker:NoSchedule'
    ami: auto
    amiFamily: AmazonLinux2
    iam:
      withAddonPolicies:
        autoScaler: true
    preBootstrapCommands:  # block pod access to EC2 instance metadata; see https://github.com/weaveworks/eksctl/issues/1310
      - yum install -y iptables-services
      - iptables --insert FORWARD 1 --in-interface eni+ --destination 169.254.169.254/32 --jump DROP
      - iptables-save | tee /etc/sysconfig/iptables
      - systemctl enable --now iptables
  - name: scheduler-spot
    minSize: 0
    maxSize: 20
    desiredCapacity: 0
    privateNetworking: true
    instancesDistribution:
      instanceTypes:
        - t3.large
        - t3a.large
      spotInstancePools: 2
      onDemandBaseCapacity: 0
      onDemandPercentageAboveBaseCapacity: 0
    volumeSize: 100
    volumeType: gp2
    labels:
      node-role.kubernetes.io/scheduler: scheduler
      k8s.dask.org/node-purpose: scheduler
    taints:
      k8s.dask.org/dedicated: 'scheduler:NoSchedule'
    tags:
      k8s.io/cluster-autoscaler/node-template/label/k8s.dask.org/node-purpose: scheduler
      k8s.io/cluster-autoscaler/node-template/taint/k8s.dask.org/dedicated: 'scheduler:NoSchedule'
    ami: auto
    amiFamily: AmazonLinux2
    iam:
      withAddonPolicies:
        autoScaler: true
    preBootstrapCommands:  # block pod access to EC2 instance metadata; see https://github.com/weaveworks/eksctl/issues/1310
      - yum install -y iptables-services
      - iptables --insert FORWARD 1 --in-interface eni+ --destination 169.254.169.254/32 --jump DROP
      - iptables-save | tee /etc/sysconfig/iptables
      - systemctl enable --now iptables
  - name: user-gpu-spot
    privateNetworking: true
    instancesDistribution:
      instanceTypes:
        - p2.xlarge
        - g3s.xlarge
        # - p3.2xlarge
      spotInstancePools: 2
      onDemandBaseCapacity: 0
      onDemandPercentageAboveBaseCapacity: 0
    minSize: 0
    maxSize: 4
    desiredCapacity: 0
    volumeSize: 100
    volumeType: gp2
    # https://github.com/kubernetes/autoscaler/issues/2642
    labels:
      node-role.kubernetes.io/user-gpu: user-gpu-spot
      hub.jupyter.org/node-purpose: user
      cloud.google.com/gke-accelerator: nvidia
    tags:
      k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user
      k8s.io/cluster-autoscaler/node-template/taint/nvidia.com/gpu: 'present:NoSchedule'
      k8s.io/cluster-autoscaler/node-template/label/cloud.google.com/gke-accelerator: nvidia
    taints:
      nvidia.com/gpu: 'present:NoSchedule'
    ami: auto
    amiFamily: AmazonLinux2
    iam:
      withAddonPolicies:
        autoScaler: true
    preBootstrapCommands:  # block pod access to EC2 instance metadata; see https://github.com/weaveworks/eksctl/issues/1310
      - yum install -y iptables-services
      - iptables --insert FORWARD 1 --in-interface eni+ --destination 169.254.169.254/32 --jump DROP
      - iptables-save | tee /etc/sysconfig/iptables
      - systemctl enable --now iptables
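
A sketch of standing up (or rebuilding) the cluster from this file, assuming eksctl is installed and AWS credentials for account 783380859522 are configured:

    eksctl create cluster -f docs/eks/eksctl-config-20200510.yml
    # add or recreate a single node group later from the same config:
    eksctl create nodegroup -f docs/eks/eksctl-config-20200510.yml --include=user-gpu-spot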