Running Production Workloads on AWS EKS

Setting up EKS for production - networking, security, monitoring, and cost optimization tips.

EKS is AWS’s managed Kubernetes service. Here’s what I’ve learned running production workloads on it.

Cluster Setup #

Use eksctl for Quick Setup #

eksctl create cluster \
  --name production \
  --version 1.28 \
  --region us-west-2 \
  --nodegroup-name workers \
  --node-type m5.large \
  --nodes 3 \
  --nodes-min 2 \
  --nodes-max 10 \
  --managed

Or Terraform for Infrastructure as Code #

module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 19.0"

  cluster_name    = "production"
  cluster_version = "1.28"

  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnets

  cluster_endpoint_public_access = true

  eks_managed_node_groups = {
    general = {
      desired_size = 3
      min_size     = 2
      max_size     = 10

      instance_types = ["m5.large"]
      capacity_type  = "ON_DEMAND"

      labels = {
        workload = "general"
      }
    }

    spot = {
      desired_size = 2
      min_size     = 0
      max_size     = 20

      instance_types = ["m5.large", "m5a.large", "m5n.large"]
      capacity_type  = "SPOT"

      labels = {
        workload = "batch"
      }

      taints = [{
        key    = "spot"
        value  = "true"
        effect = "NO_SCHEDULE"
      }]
    }
  }
}

Networking #

VPC Design #

┌─────────────────────────────────────────────────────────────┐
│ VPC: 10.0.0.0/16                                            │
├─────────────────────────────────────────────────────────────┤
│ AZ-a                  │ AZ-b                  │ AZ-c        │
├───────────────────────┼───────────────────────┼─────────────┤
│ Public: 10.0.1.0/24   │ Public: 10.0.2.0/24   │ ...         │
│ (NAT Gateway, ALB)    │                       │             │
├───────────────────────┼───────────────────────┼─────────────┤
│ Private: 10.0.10.0/24 │ Private: 10.0.11.0/24 │ ...         │
│ (EKS Nodes)           │                       │             │
├───────────────────────┼───────────────────────┼─────────────┤
│ Data: 10.0.20.0/24    │ Data: 10.0.21.0/24    │ ...         │
│ (RDS, ElastiCache)    │                       │             │
└───────────────────────┴───────────────────────┴─────────────┘
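A VPC like this can be sketched with the community VPC module. The CIDRs below mirror the diagram; the AZs, module version, and subnet tags are illustrative (the tags are what the AWS Load Balancer Controller uses for subnet discovery):

```hcl
module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "~> 5.0"

  name = "production"
  cidr = "10.0.0.0/16"

  azs              = ["us-west-2a", "us-west-2b", "us-west-2c"]
  public_subnets   = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
  private_subnets  = ["10.0.10.0/24", "10.0.11.0/24", "10.0.12.0/24"]
  database_subnets = ["10.0.20.0/24", "10.0.21.0/24", "10.0.22.0/24"]

  enable_nat_gateway     = true
  single_nat_gateway     = false
  one_nat_gateway_per_az = true

  # Tags the AWS Load Balancer Controller uses for subnet discovery
  public_subnet_tags = {
    "kubernetes.io/role/elb" = 1
  }
  private_subnet_tags = {
    "kubernetes.io/role/internal-elb" = 1
  }
}
```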

AWS Load Balancer Controller #

# Install AWS Load Balancer Controller
helm repo add eks https://aws.github.io/eks-charts
helm install aws-load-balancer-controller eks/aws-load-balancer-controller \
  -n kube-system \
  --set clusterName=production \
  --set serviceAccount.create=false \
  --set serviceAccount.name=aws-load-balancer-controller
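Note that `serviceAccount.create=false` assumes the IRSA-backed service account already exists. One way to create it with eksctl (the policy document URL and account ID are illustrative; check the controller's install docs for the current policy):

```shell
# Download the controller's IAM policy and create it
curl -o iam-policy.json \
  https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/main/docs/install/iam_policy.json
aws iam create-policy \
  --policy-name AWSLoadBalancerControllerIAMPolicy \
  --policy-document file://iam-policy.json

# Create the IRSA-backed service account in kube-system
eksctl create iamserviceaccount \
  --cluster production \
  --namespace kube-system \
  --name aws-load-balancer-controller \
  --attach-policy-arn arn:aws:iam::123456789012:policy/AWSLoadBalancerControllerIAMPolicy \
  --approve
```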

Create an ALB Ingress:

apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: app-ingress
  annotations:
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/healthcheck-path: /healthz
    alb.ingress.kubernetes.io/certificate-arn: arn:aws:acm:...
    alb.ingress.kubernetes.io/ssl-redirect: "443"
spec:
  ingressClassName: alb
  rules:
    - host: api.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 80

Security #

IRSA (IAM Roles for Service Accounts) #

Don’t use node IAM roles. Use IRSA:

apiVersion: v1
kind: ServiceAccount
metadata:
  name: s3-reader
  annotations:
    eks.amazonaws.com/role-arn: arn:aws:iam::123456789012:role/S3ReaderRole
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: app
spec:
  template:
    spec:
      serviceAccountName: s3-reader
      containers:
        - name: app
          image: myapp:latest
          # AWS SDK automatically uses IRSA credentials

Create the IAM role:

module "s3_reader_role" {
  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"

  role_name = "S3ReaderRole"

  role_policy_arns = {
    policy = aws_iam_policy.s3_read.arn
  }

  oidc_providers = {
    main = {
      provider_arn               = module.eks.oidc_provider_arn
      namespace_service_accounts = ["default:s3-reader"]
    }
  }
}
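The `aws_iam_policy.s3_read` referenced above isn't shown; a minimal read-only sketch might look like this (bucket name is illustrative):

```hcl
resource "aws_iam_policy" "s3_read" {
  name = "S3ReadOnly"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect = "Allow"
      Action = ["s3:GetObject", "s3:ListBucket"]
      Resource = [
        "arn:aws:s3:::my-bucket",   # illustrative bucket
        "arn:aws:s3:::my-bucket/*"
      ]
    }]
  })
}
```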

Network Policies #

Restrict pod-to-pod communication:

apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: api-policy
spec:
  podSelector:
    matchLabels:
      app: api
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app: frontend
      ports:
        - port: 8080
  egress:
    - to:
        - podSelector:
            matchLabels:
              app: database
      ports:
        - port: 5432
    - to:
        - namespaceSelector: {}
          podSelector:
            matchLabels:
              k8s-app: kube-dns
      ports:
        - port: 53
          protocol: UDP
        - port: 53
          protocol: TCP
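Policies like this are allow-lists, so they're most useful on top of a namespace-wide default deny. A common baseline, applied per namespace:

```yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: default-deny-all
spec:
  podSelector: {}        # empty selector matches every pod in the namespace
  policyTypes:
    - Ingress
    - Egress
```

Keep in mind that on EKS, NetworkPolicy enforcement depends on your CNI setup (the VPC CNI's network policy support or an engine like Calico); without one, these objects are accepted but not enforced.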

Secrets with External Secrets #

apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: app-secrets
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: aws-secrets-manager
    kind: ClusterSecretStore
  target:
    name: app-secrets
  data:
    - secretKey: database-url
      remoteRef:
        key: production/database
        property: url

Monitoring #

Container Insights #

# Enable Container Insights
aws eks create-addon \
  --cluster-name production \
  --addon-name amazon-cloudwatch-observability

Prometheus + Grafana #

helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm install prometheus prometheus-community/kube-prometheus-stack \
  --namespace monitoring \
  --create-namespace \
  --set grafana.adminPassword=admin \
  --set prometheus.prometheusSpec.retention=30d \
  --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi

Key Metrics to Monitor #

# Create PrometheusRule for alerts
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: app-alerts
spec:
  groups:
    - name: app
      rules:
        - alert: HighErrorRate
          expr: |
            sum(rate(http_requests_total{status=~"5.."}[5m])) 
            / sum(rate(http_requests_total[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: High error rate detected
            
        - alert: PodNotReady
          expr: |
            kube_pod_status_ready{condition="false"} == 1
          for: 5m
          labels:
            severity: warning

Cost Optimization #

Spot Instances for Non-Critical Workloads #

apiVersion: apps/v1
kind: Deployment
metadata:
  name: batch-processor
spec:
  template:
    spec:
      nodeSelector:
        workload: batch
      tolerations:
        - key: spot
          value: "true"
          effect: NoSchedule
      containers:
        - name: processor
          image: batch:latest

Karpenter for Smarter Autoscaling #

apiVersion: karpenter.sh/v1alpha5
kind: Provisioner
metadata:
  name: default
spec:
  requirements:
    - key: karpenter.sh/capacity-type
      operator: In
      values: ["spot", "on-demand"]
    - key: kubernetes.io/arch
      operator: In
      values: ["amd64"]
  limits:
    resources:
      cpu: 1000
  providerRef:
    name: default
  ttlSecondsAfterEmpty: 30
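The `providerRef` points at an AWSNodeTemplate, which isn't shown above. A minimal sketch, assuming the cluster's subnets and security groups carry the usual `karpenter.sh/discovery` tag (note this is the older v1alpha5/v1alpha1 API; newer Karpenter releases replace Provisioner and AWSNodeTemplate with NodePool and EC2NodeClass):

```yaml
apiVersion: karpenter.k8s.aws/v1alpha1
kind: AWSNodeTemplate
metadata:
  name: default
spec:
  subnetSelector:
    karpenter.sh/discovery: production    # assumes subnets tagged for discovery
  securityGroupSelector:
    karpenter.sh/discovery: production
```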

Right-Size Pods #

Use Vertical Pod Autoscaler recommendations:

kubectl get vpa -o jsonpath='{.items[*].status.recommendation}'
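That command assumes VPA objects exist for your workloads. A recommendation-only VPA (updateMode "Off" reports suggested requests without ever evicting pods; the target name is illustrative):

```yaml
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: api
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: api
  updatePolicy:
    updateMode: "Off"   # recommend only; never evict or resize
```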

Common Issues #

DNS Throttling #

Increase CoreDNS replicas:

kubectl scale deployment coredns -n kube-system --replicas=5

Node Not Ready #

Check kubelet and containerd:

# SSH to node
systemctl status kubelet
journalctl -u kubelet -f

ImagePullBackOff #

Check ECR permissions and image existence:

aws ecr describe-images --repository-name myapp --image-ids imageTag=v1.0.0

Key Takeaways #

  1. Use managed node groups for easier upgrades
  2. Implement IRSA for pod-level AWS permissions
  3. Use Network Policies for security
  4. Enable Container Insights for basic monitoring
  5. Use Spot instances for cost savings on non-critical workloads
  6. Consider Karpenter over Cluster Autoscaler

EKS removes much of the Kubernetes operational burden, but understanding the fundamentals is still essential.