1
Check pod status and logs
```bash
kubectl get pods -n mcp-server-langgraph
# Output: pod/mcp-server-xxxx 0/1 CrashLoopBackOff

# Check pod events
kubectl describe pod mcp-server-xxxx -n mcp-server-langgraph | tail -20

# Check logs from the previous (crashed) container
kubectl logs mcp-server-xxxx -n mcp-server-langgraph --previous
```
2
Common causes (a quick triage sketch follows this list):
- Application errors:
  - Database connection failed
  - Missing environment variables
  - Config file not found
- Resource limits:
  - Out of memory (OOMKilled)
  - CPU throttling
- Permission issues:
  - Can't read secrets
  - IRSA role misconfigured
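A fast way to triage between these is the last container termination state, which usually names the culprit. A minimal sketch (mcp-server-xxxx is a placeholder pod name):

```bash
# Print each container's last termination reason and exit code
# (mcp-server-xxxx is a placeholder pod name)
kubectl get pod mcp-server-xxxx -n mcp-server-langgraph \
  -o jsonpath='{range .status.containerStatuses[*]}{.name}{": "}{.lastState.terminated.reason}{" (exit "}{.lastState.terminated.exitCode}{")"}{"\n"}{end}'
```

`OOMKilled` points at resource limits; `Error` with a nonzero exit code points at the application logs.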
3
Fix based on cause
Database Connection
```bash
# Verify database secret exists
kubectl get secret database-credentials -n mcp-server-langgraph

# Check RDS endpoint is reachable
kubectl run -it --rm debug --image=postgres:15 --restart=Never -- \
  psql -h DB_ENDPOINT -U mcp_langgraph -d mcp_langgraph

# Verify RDS security group allows EKS nodes
# Check in AWS Console: RDS > Security Groups
```
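If the secret exists but connections still fail, confirm it actually holds the values the app expects. A minimal sketch, assuming a key named `password` (the key name is an assumption; check the real secret):

```bash
# List the keys stored in the secret
kubectl get secret database-credentials -n mcp-server-langgraph -o jsonpath='{.data}'

# Decode one key to verify its value (key name "password" is an assumption);
# avoid doing this in shared terminals or CI logs
kubectl get secret database-credentials -n mcp-server-langgraph \
  -o jsonpath='{.data.password}' | base64 -d
```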
IRSA Issues
```bash
# Check service account annotation
kubectl describe serviceaccount mcp-server-langgraph -n mcp-server-langgraph | grep role-arn

# Check if role exists
aws iam get-role --role-name mcp-langgraph-prod-application

# Check pod has AWS credentials injected
kubectl describe pod mcp-server-xxxx -n mcp-server-langgraph | grep AWS_ROLE_ARN
```
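The annotation alone doesn't prove credentials reach the pod; checking from inside a running pod closes the loop. A sketch, assuming the deployment name above (the `sts` call only works if the image ships the AWS CLI):

```bash
# Confirm the IRSA webhook injected the env vars
kubectl exec deploy/mcp-server-langgraph -n mcp-server-langgraph -- \
  env | grep -E 'AWS_(ROLE_ARN|WEB_IDENTITY_TOKEN_FILE)'

# If the AWS CLI is in the image (an assumption), verify the role resolves
kubectl exec deploy/mcp-server-langgraph -n mcp-server-langgraph -- \
  aws sts get-caller-identity
```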
Resource Limits
```bash
# Check if pod was OOMKilled
kubectl describe pod mcp-server-xxxx -n mcp-server-langgraph | grep -A 5 "Last State"

# Increase memory limit
kubectl set resources deployment mcp-server-langgraph \
  -n mcp-server-langgraph \
  --limits=memory=2Gi \
  --requests=memory=1Gi
```
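Before raising limits, actual usage shows how much headroom to add; note that `kubectl top` assumes metrics-server is installed, which is not guaranteed on EKS:

```bash
# Live CPU/memory per pod (requires metrics-server)
kubectl top pod -n mcp-server-langgraph

# kubectl set resources triggers a rolling restart; watch it finish
kubectl rollout status deployment/mcp-server-langgraph -n mcp-server-langgraph
```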
4
Verify fix
```bash
kubectl get pods -n mcp-server-langgraph -w
# Wait for pod to reach Running state

kubectl logs -f mcp-server-xxxx -n mcp-server-langgraph
# Check logs for successful startup
```
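A Running pod can still fail requests, so a request against the Service is a stronger check. A sketch; the service name, port 8000, and /health path are assumptions, so adjust them to the real manifest:

```bash
# Hit the service from a throwaway curl pod
# (service name, port, and /health path are assumptions)
kubectl run -it --rm curl-test --image=curlimages/curl --restart=Never -- \
  curl -s http://mcp-server-langgraph.mcp-server-langgraph.svc.cluster.local:8000/health
```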
Pods Stuck in Pending
1
Check pod events
```bash
kubectl describe pod POD_NAME -n mcp-server-langgraph | grep -A 10 Events
```
Common reasons (the events query below surfaces these directly):
- Insufficient CPU
- Insufficient memory
- No nodes available matching the node selector
- Taint toleration not satisfied
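Scheduling failures also land in namespace events, which is faster than describing pods one at a time:

```bash
# All FailedScheduling events in the namespace, newest last
kubectl get events -n mcp-server-langgraph \
  --field-selector reason=FailedScheduling \
  --sort-by=.lastTimestamp

# Compare pod requests against what nodes have left
kubectl describe nodes | grep -A 8 "Allocated resources"
```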
2
Check Cluster Autoscaler
```bash
# Check if autoscaler is adding nodes
kubectl logs -f deployment/cluster-autoscaler -n kube-system --tail=100

# Check autoscaler status
kubectl get nodes -l node.kubernetes.io/lifecycle=spot -o wide
```
3
Manual scaling if needed
```bash
# Scale node group via Terraform
cd terraform/environments/prod

# Edit terraform.tfvars:
#   general_node_group_desired_size = 5  # Increase from 3
terraform apply -target=module.eks

# Or via AWS CLI (temporary)
aws eks update-nodegroup-config \
  --cluster-name mcp-langgraph-prod \
  --nodegroup-name general-nodes \
  --scaling-config minSize=2,maxSize=10,desiredSize=5
```
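Either path takes a few minutes to take effect; watching nodes join confirms the change landed:

```bash
# New nodes appear NotReady, then flip to Ready
kubectl get nodes -w

# Confirm the node group accepted the new scaling config
aws eks describe-nodegroup \
  --cluster-name mcp-langgraph-prod \
  --nodegroup-name general-nodes \
  --query "nodegroup.scalingConfig.desiredSize"
```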
4
Check for taint issues
```bash
# If the pod requires a specific node group (e.g., compute-optimized),
# check what taints the target nodes carry
kubectl describe node NODE_NAME | grep Taints
```

Then add a matching toleration to the pod. In deployment.yaml:

```yaml
tolerations:
  - key: "workload"
    operator: "Equal"
    value: "llm"
    effect: "NoSchedule"
```
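To see at a glance which nodes carry which taints, and therefore which pods need tolerations, one listing covers the whole cluster:

```bash
# Every node with its taints (<none> if untainted)
kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints
```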
Pod-to-Pod Connectivity
```bash
# Deploy test pods
kubectl run test-source --image=busybox --restart=Never -- sleep 3600
kubectl run test-dest --image=nginx --restart=Never

# Get dest pod IP
DEST_IP=$(kubectl get pod test-dest -o jsonpath='{.status.podIP}')

# Test from source
kubectl exec test-source -- wget -O- http://$DEST_IP

# Clean up
kubectl delete pod test-source test-dest
```
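If the pod IP responds but service names do not, cluster DNS is the likely gap; the same test-source pod can check it (run this before the cleanup step above):

```bash
# Verify cluster DNS resolves service names
kubectl exec test-source -- nslookup kubernetes.default.svc.cluster.local
```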
Redis Connection Issues
2
Test Redis connectivity from a pod
```bash
kubectl run -it --rm redis-test --image=redis:7-alpine --restart=Never -- sh

# Inside the pod:
redis-cli -h REDIS_ENDPOINT -p 6379 -a AUTH_TOKEN

# Test commands:
PING
INFO
CLUSTER INFO   # If cluster mode
```
3
Check security group
```bash
# Get the ElastiCache primary endpoint
aws elasticache describe-replication-groups \
  --replication-group-id mcp-langgraph-prod \
  --query "ReplicationGroups[].NodeGroups[].PrimaryEndpoint"

# Verify EKS nodes can reach Redis on port 6379
# (check the cache's security group inbound rules)
```
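The security group check can also be scripted. A sketch, assuming you have the cache's security group ID (SG_ID is a placeholder; find it on the replication group's nodes in the console or CLI):

```bash
# Inspect inbound rules on the ElastiCache security group
# (SG_ID is a placeholder)
aws ec2 describe-security-groups --group-ids SG_ID \
  --query "SecurityGroups[].IpPermissions"
```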
Cluster Autoscaler Not Scaling
1
Check autoscaler logs
```bash
kubectl logs -f deployment/cluster-autoscaler -n kube-system --tail=200 | grep -E "(scale|node)"

# Look for:
# - "ScaleUp: group X -> Y nodes"
# - "ScaleDown: removing node Z"
# - Errors about IAM permissions
```
2
Verify IRSA permissions
```bash
# Check service account annotation
kubectl describe serviceaccount cluster-autoscaler -n kube-system | grep role-arn

# Check IAM role has autoscaling permissions
aws iam get-role-policy \
  --role-name mcp-langgraph-prod-cluster-autoscaler \
  --policy-name cluster-autoscaler-policy
```
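Note that `get-role-policy` only finds inline policies; if the permissions were attached as a managed policy instead, it returns an error, so check both:

```bash
# Covers the case where the policy is attached rather than inline
aws iam list-attached-role-policies \
  --role-name mcp-langgraph-prod-cluster-autoscaler
```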
3
Check node group limits
```bash
# Verify max_size is not reached
aws eks describe-nodegroup \
  --cluster-name mcp-langgraph-prod \
  --nodegroup-name general-nodes \
  --query "nodegroup.scalingConfig"

# Output should show: {minSize, maxSize, desiredSize}
```
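If desiredSize already equals maxSize, the autoscaler has nowhere to go; comparing against the live node count makes that obvious (this counts all node groups, so read it alongside the scaling config above):

```bash
# Nodes currently registered with the cluster
kubectl get nodes --no-headers | wc -l
```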
4
Check for pending pods
```bash
kubectl get pods -A | grep Pending

# Autoscaler only scales up if pods are pending
# Check pod events for the reason
kubectl describe pod PENDING_POD -n NAMESPACE
```
Control Plane Issues
1
Check cluster reachability
```bash
# Check if API server is reachable
kubectl cluster-info

# Check AWS service health
# https://health.aws.amazon.com/health/status

# Check for firing CloudWatch alarms
aws cloudwatch describe-alarms --state-value ALARM
```
2
Check control plane
```bash
# View control plane logs
aws logs tail /aws/eks/mcp-langgraph-prod/cluster --follow

# Check control plane status
aws eks describe-cluster --name mcp-langgraph-prod \
  --query "cluster.status"
```
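When the full tail is too noisy, filtering the last hour for errors narrows things down; a sketch using CloudWatch Logs (term matching is case-sensitive, so both spellings are included):

```bash
# Search the last hour of control plane logs for error lines
aws logs filter-log-events \
  --log-group-name /aws/eks/mcp-langgraph-prod/cluster \
  --start-time $(($(date +%s) - 3600))000 \
  --filter-pattern "?error ?Error"
```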