Running Production Workloads on AWS EKS
Setting up EKS for production - networking, security, monitoring, and cost optimization tips.
Table of Contents
EKS is AWS’s managed Kubernetes service. Here’s what I’ve learned running production workloads on it.
Cluster Setup #
Use eksctl for Quick Setup #
# Create a managed EKS cluster with an autoscaling managed node group.
eksctl create cluster \
  --name production \
  --version 1.28 \
  --region us-west-2 \
  --nodegroup-name workers \
  --node-type m5.large \
  --nodes 3 \
  --nodes-min 2 \
  --nodes-max 10 \
  --managed
Or Terraform for Infrastructure as Code #
# Production EKS cluster: an on-demand pool for steady-state services plus a
# tainted spot pool for interruption-tolerant batch work.
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 19.0"

  cluster_name    = "production"
  cluster_version = "1.28"

  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnets

  cluster_endpoint_public_access = true

  eks_managed_node_groups = {
    # Steady-state workloads: on-demand capacity, never scales to zero.
    general = {
      desired_size   = 3
      min_size       = 2
      max_size       = 10
      instance_types = ["m5.large"]
      capacity_type  = "ON_DEMAND"

      labels = {
        workload = "general"
      }
    }

    # Batch workloads on spot capacity; several similar instance types
    # improve the chance of obtaining spot capacity in any one AZ.
    spot = {
      desired_size   = 2
      min_size       = 0
      max_size       = 20
      instance_types = ["m5.large", "m5a.large", "m5n.large"]
      capacity_type  = "SPOT"

      labels = {
        workload = "batch"
      }

      # Taint keeps ordinary pods off spot nodes unless they tolerate it.
      taints = [{
        key    = "spot"
        value  = "true"
        effect = "NO_SCHEDULE"
      }]
    }
  }
}
Networking #
VPC Design #
┌─────────────────────────────────────────────────────────────┐
│ VPC: 10.0.0.0/16 │
├─────────────────────────────────────────────────────────────┤
│ AZ-a │ AZ-b │ AZ-c │
├───────────────────────┼───────────────────────┼─────────────┤
│ Public: 10.0.1.0/24 │ Public: 10.0.2.0/24 │ ... │
│ (NAT Gateway, ALB) │ │ │
├───────────────────────┼───────────────────────┼─────────────┤
│ Private: 10.0.10.0/24 │ Private: 10.0.11.0/24 │ ... │
│ (EKS Nodes) │ │ │
├───────────────────────┼───────────────────────┼─────────────┤
│ Data: 10.0.20.0/24 │ Data: 10.0.21.0/24 │ ... │
│ (RDS, ElastiCache) │ │ │
└───────────────────────┴───────────────────────┴─────────────┘
AWS Load Balancer Controller #
# Install AWS Load Balancer Controller
helm repo add eks https://aws.github.io/eks-charts
helm install aws-load-balancer-controller eks/aws-load-balancer-controller \
-n kube-system \
--set clusterName=production \
--set serviceAccount.create=false \
--set serviceAccount.name=aws-load-balancer-controller
Create an ALB Ingress:
# ALB Ingress reconciled by the AWS Load Balancer Controller.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: app-ingress
  annotations:
    alb.ingress.kubernetes.io/scheme: internet-facing
    # target-type ip routes directly to pod IPs (also required on Fargate).
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/healthcheck-path: /healthz
    alb.ingress.kubernetes.io/certificate-arn: arn:aws:acm:...
    # Redirect plain HTTP to HTTPS at the load balancer.
    alb.ingress.kubernetes.io/ssl-redirect: "443"
spec:
  # ingressClassName replaces the deprecated kubernetes.io/ingress.class
  # annotation (deprecated since AWS LB Controller v2.4 / Kubernetes 1.18+).
  ingressClassName: alb
  rules:
    - host: api.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 80
Security #
IRSA (IAM Roles for Service Accounts) #
Don’t grant application permissions through the shared node IAM role — every pod on the node would inherit them. Use IRSA so each pod gets only the permissions its own service account is bound to:
# ServiceAccount bound to an IAM role via IRSA: pods that use it receive
# role-scoped AWS credentials through the SDK's web identity provider.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: s3-reader
  annotations:
    eks.amazonaws.com/role-arn: arn:aws:iam::123456789:role/S3ReaderRole
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: app
spec:
  # spec.selector is REQUIRED for apps/v1 Deployments and must match the
  # pod template labels — the original manifest omitted both.
  selector:
    matchLabels:
      app: app
  template:
    metadata:
      labels:
        app: app
    spec:
      serviceAccountName: s3-reader
      containers:
        - name: app
          # NOTE(review): pin a concrete tag in production — :latest is not
          # reproducible and defeats rollback.
          image: myapp:latest
          # AWS SDK automatically uses IRSA credentials
Create the IAM role:
# IAM role trusted by the cluster's OIDC provider, assumable only by the
# default/s3-reader ServiceAccount.
module "s3_reader_role" {
  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"

  role_name = "S3ReaderRole"

  role_policy_arns = {
    policy = aws_iam_policy.s3_read.arn
  }

  oidc_providers = {
    main = {
      provider_arn = module.eks.oidc_provider_arn
      # Restrict the trust policy to exactly this namespace/ServiceAccount.
      namespace_service_accounts = ["default:s3-reader"]
    }
  }
}
Network Policies #
Restrict pod-to-pod communication:
# Locks down the api pods: only frontend may connect in, and egress is
# limited to the database plus cluster DNS.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: api-policy
spec:
  podSelector:
    matchLabels:
      app: api
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app: frontend
      ports:
        - port: 8080
  egress:
    - to:
        - podSelector:
            matchLabels:
              app: database
      ports:
        - port: 5432
    # Allow DNS to kube-dns in any namespace. TCP/53 is added because DNS
    # retries over TCP when a response exceeds the UDP size limit — allowing
    # only UDP causes intermittent resolution failures.
    - to:
        - namespaceSelector: {}
          podSelector:
            matchLabels:
              k8s-app: kube-dns
      ports:
        - port: 53
          protocol: UDP
        - port: 53
          protocol: TCP
Secrets with External Secrets #
# Syncs a value from AWS Secrets Manager into a Kubernetes Secret,
# re-reading the backend every hour.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: app-secrets
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: aws-secrets-manager
    kind: ClusterSecretStore
  # Name of the Kubernetes Secret that gets created/updated.
  target:
    name: app-secrets
  data:
    - secretKey: database-url
      remoteRef:
        key: production/database
        property: url
Monitoring #
Container Insights #
# Enable Container Insights via the managed CloudWatch observability addon.
aws eks create-addon \
  --cluster-name production \
  --addon-name amazon-cloudwatch-observability
Prometheus + Grafana #
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
# Install Prometheus + Grafana with 30 days of retention on a 50Gi PVC.
# Don't hardcode a weak Grafana admin password — read it from the
# environment (or use grafana.admin.existingSecret to reference a Secret).
helm install prometheus prometheus-community/kube-prometheus-stack \
  --namespace monitoring \
  --create-namespace \
  --set grafana.adminPassword="${GRAFANA_ADMIN_PASSWORD}" \
  --set prometheus.prometheusSpec.retention=30d \
  --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi
Key Metrics to Monitor #
# Create PrometheusRule for alerts (picked up by the Prometheus Operator).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: app-alerts
spec:
  groups:
    - name: app
      rules:
        # Fire when more than 5% of requests return 5xx for 5 minutes.
        - alert: HighErrorRate
          expr: |
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: High error rate detected
        # Warn on pods that stay not-ready for 5 minutes.
        - alert: PodNotReady
          expr: |
            kube_pod_status_ready{condition="false"} == 1
          for: 5m
          labels:
            severity: warning
Cost Optimization #
Spot Instances for Non-Critical Workloads #
# Batch workload pinned to the tainted spot node group: the nodeSelector
# targets spot nodes and the toleration permits scheduling onto them.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: batch-processor
spec:
  # spec.selector is REQUIRED for apps/v1 Deployments and must match the
  # pod template labels — the original manifest omitted both.
  selector:
    matchLabels:
      app: batch-processor
  template:
    metadata:
      labels:
        app: batch-processor
    spec:
      nodeSelector:
        workload: batch
      tolerations:
        - key: spot
          value: "true"
          effect: NoSchedule
      containers:
        - name: processor
          image: batch:latest
Karpenter for Smarter Autoscaling #
# Karpenter provisioner: amd64 nodes, spot preferred over on-demand,
# capped at 1000 vCPU, with empty nodes reclaimed after 30s.
# NOTE(review): karpenter.sh/v1alpha5 Provisioner is deprecated in newer
# Karpenter releases in favor of NodePool — confirm your installed version.
apiVersion: karpenter.sh/v1alpha5
kind: Provisioner
metadata:
  name: default
spec:
  requirements:
    # Karpenter prefers spot when both capacity types are allowed.
    - key: karpenter.sh/capacity-type
      operator: In
      values: ["spot", "on-demand"]
    - key: kubernetes.io/arch
      operator: In
      values: ["amd64"]
  # Hard ceiling on total provisioned capacity.
  limits:
    resources:
      cpu: 1000
  providerRef:
    name: default
  # Scale empty nodes down quickly to save cost.
  ttlSecondsAfterEmpty: 30
Right-Size Pods #
Use Vertical Pod Autoscaler (VPA) recommendations to set realistic resource requests and limits instead of guessing:
# Print the recommended CPU/memory targets from every VPA object's status.
kubectl get vpa -o jsonpath='{.items[*].status.recommendation}'
Common Issues #
DNS Throttling #
If DNS lookups time out under load, spread the query volume across more CoreDNS replicas:
# Spread DNS query load across more CoreDNS pods.
kubectl scale deployment coredns -n kube-system --replicas=5
Node Not Ready #
Check kubelet and containerd:
# SSH to node
# Is the kubelet service running or crash-looping?
systemctl status kubelet
# Follow kubelet logs live to see why the node reports NotReady.
journalctl -u kubelet -f
ImagePullBackOff #
Check ECR permissions and image existence:
# Verify the tag actually exists in ECR (rules out a typo vs. a permissions issue).
aws ecr describe-images --repository-name myapp --image-ids imageTag=v1.0.0
Key Takeaways #
- Use managed node groups for easier upgrades
- Implement IRSA for pod-level AWS permissions
- Use Network Policies for security
- Enable Container Insights for basic monitoring
- Use Spot instances for cost savings on non-critical workloads
- Consider Karpenter over Cluster Autoscaler
EKS removes much of the Kubernetes operational burden, but understanding the fundamentals is still essential.