Runbooks
Operational runbooks for common procedures and troubleshooting.
Purpose
Runbooks provide step-by-step instructions for:
- Routine operations
- Emergency procedures
- Troubleshooting guides
- Maintenance tasks
Database Operations
Backup Database
# Manual backup (no -it: allocating a TTY can inject CR characters and corrupt the redirected dump)
kubectl exec postgres-0 -- pg_dump -U postgres mydatabase > backup.sql
# Restore from backup
kubectl exec -i postgres-0 -- psql -U postgres mydatabase < backup.sql
# Verify restore (lists the tables now present in the database)
kubectl exec -it postgres-0 -- psql -U postgres mydatabase -c "\dt"
Check Database Connections
# View active connections
kubectl exec -it postgres-0 -- psql -U postgres -c "
SELECT count(*) as connections, usename
FROM pg_stat_activity
GROUP BY usename;
"
# Kill long-running queries
kubectl exec -it postgres-0 -- psql -U postgres -c "
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'active'
AND query_start < NOW() - INTERVAL '5 minutes';
"
Database Performance
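# Check slow queries (requires the pg_stat_statements extension;
# on PostgreSQL 12 and earlier the columns are named total_time / mean_time)
kubectl exec -it postgres-0 -- psql -U postgres -c "
SELECT query, calls, total_exec_time, mean_exec_time
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;
"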
# Check index usage
kubectl exec -it postgres-0 -- psql -U postgres -c "
SELECT schemaname, relname, indexrelname, idx_scan
FROM pg_stat_user_indexes
WHERE idx_scan = 0;
"
Kubernetes Operations
Restart Deployment
# Restart specific deployment
kubectl rollout restart deployment/[product]-backend -n [namespace]
# Check rollout status
kubectl rollout status deployment/[product]-backend -n [namespace]
# View rollout history
kubectl rollout history deployment/[product]-backend -n [namespace]
Scale Deployment
# Scale up
kubectl scale deployment/[product]-backend --replicas=5 -n [namespace]
# Scale down
kubectl scale deployment/[product]-backend --replicas=2 -n [namespace]
# Autoscaling
kubectl autoscale deployment/[product]-backend \
--cpu-percent=70 \
--min=3 \
--max=10 \
-n [namespace]
Check Pod Status
# List pods
kubectl get pods -n [namespace]
# Describe pod
kubectl describe pod [pod-name] -n [namespace]
# View logs
kubectl logs [pod-name] -n [namespace] --tail=100 -f
# Execute into pod
kubectl exec -it [pod-name] -n [namespace] -- /bin/sh
Debug Failing Pods
# Check pod events
kubectl get events -n [namespace] --sort-by='.lastTimestamp'
# Check resource usage
kubectl top pods -n [namespace]
# Check node status
kubectl top nodes
# Describe node
kubectl describe node [node-name]
Deployment Operations
Deploy New Version
# Using Helm
helm upgrade [product] ./helm \
-f helm/values-alpha.yaml \
--namespace [namespace] \
--set image.tag=[new-tag]
# Verify deployment
kubectl get pods -n [namespace] -w
# Check rollout status
kubectl rollout status deployment/[product] -n [namespace]
Rollback Deployment
# Rollback to previous version
kubectl rollout undo deployment/[product] -n [namespace]
# Rollback to specific revision
kubectl rollout undo deployment/[product] --to-revision=3 -n [namespace]
# Using Helm
helm rollback [product] [revision] -n [namespace]
# Verify rollback
kubectl rollout status deployment/[product] -n [namespace]
Blue-Green Deployment
# Deploy green version
kubectl apply -f deployment-green.yaml -n [namespace]
# Test green version
kubectl port-forward svc/[product]-green 8080:80 -n [namespace]
# Switch traffic to green
kubectl patch service [product] -n [namespace] -p '{"spec":{"selector":{"version":"green"}}}'
# Delete blue version (after verification)
kubectl delete deployment [product]-blue -n [namespace]
Incident Response
High CPU Usage
# Check pod CPU usage
kubectl top pods -n [namespace] --sort-by=cpu
# Check container metrics
kubectl exec -it [pod-name] -n [namespace] -- top
# Scale up if needed
kubectl scale deployment/[product] --replicas=5 -n [namespace]
# Investigate CPU-intensive processes (non-interactive, so no TTY is needed)
kubectl exec [pod-name] -n [namespace] -- ps aux --sort=-%cpu | head
High Memory Usage
# Check pod memory usage
kubectl top pods -n [namespace] --sort-by=memory
# Check memory details
kubectl exec -it [pod-name] -n [namespace] -- free -h
# Check for memory leaks
kubectl exec -it [pod-name] -n [namespace] -- cat /proc/meminfo
# Restart pod if needed
kubectl delete pod [pod-name] -n [namespace]
Service Unavailable
# Check service status
kubectl get svc [service-name] -n [namespace]
# Check endpoints
kubectl get endpoints [service-name] -n [namespace]
# Test service connectivity
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \
curl http://[service-name].[namespace].svc.cluster.local
# Check ingress
kubectl get ingress -n [namespace]
kubectl describe ingress [ingress-name] -n [namespace]
Certificate Issues
# Check certificate expiration
echo | openssl s_client -servername [domain] -connect [domain]:443 2>/dev/null | \
openssl x509 -noout -dates
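# Renew Let's Encrypt certificate
# Preferred: trigger re-issuance in place (requires the cert-manager cmctl CLI)
cmctl renew [cert-name] -n [namespace]
# Alternative: delete the Certificate resource
kubectl delete certificate [cert-name] -n [namespace]
# The Certificate is only re-created automatically when it is managed by an
# Ingress annotation (ingress-shim); otherwise re-apply its manifest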
# Check cert-manager logs
kubectl logs -n cert-manager -l app=cert-manager -f
Monitoring Operations
Check Prometheus
# Port-forward to Prometheus
kubectl port-forward -n monitoring svc/prometheus 9090:9090
# Query metrics
curl 'http://localhost:9090/api/v1/query?query=up'
# Check targets
curl 'http://localhost:9090/api/v1/targets'
Check Grafana
# Port-forward to Grafana
kubectl port-forward -n monitoring svc/grafana 3000:80
# Access: http://localhost:3000
# Default credentials: admin/admin (change the password on first login)
Check Alerts
# Port-forward to AlertManager
kubectl port-forward -n monitoring svc/alertmanager 9093:9093
# View active alerts (API v1 has been removed from current Alertmanager releases; use v2)
curl http://localhost:9093/api/v2/alerts
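# Silence alert (set startsAt/endsAt to the actual maintenance window)
curl -X POST http://localhost:9093/api/v2/silences \
-H 'Content-Type: application/json' \
-d '{
"matchers": [{"name": "alertname", "value": "HighErrorRate", "isRegex": false}],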
"startsAt": "2024-01-01T00:00:00Z",
"endsAt": "2024-01-01T01:00:00Z",
"createdBy": "oncall",
"comment": "Planned maintenance"
}'
Security Operations
Rotate Secrets
# Update Kubernetes secret (in the same namespace as the deployment that consumes it)
kubectl create secret generic [secret-name] \
--from-literal=key=new-value \
-n [namespace] \
--dry-run=client -o yaml | \
kubectl apply -n [namespace] -f -
# Restart pods to pick up new secret
kubectl rollout restart deployment/[product] -n [namespace]
Check Security Scan Results
# Scan Docker image
trivy image [image:tag]
# Scan Kubernetes manifests
trivy config ./k8s/
# Check pod security policies (PodSecurityPolicy was removed in Kubernetes 1.25;
# these commands only work on older clusters)
kubectl get psp
kubectl describe psp [policy-name]
Backup and Restore
Backup Kubernetes Resources
# Backup workload resources in namespace
# (note: "get all" does not include ConfigMaps, Secrets, Ingresses, or PVCs)
kubectl get all -n [namespace] -o yaml > backup-[namespace].yaml
# Backup specific resources (output contains Secret data, which is only base64-encoded; store it securely)
kubectl get deployment,service,configmap,secret -n [namespace] -o yaml > backup.yaml
Restore from Backup
# Restore resources
kubectl apply -f backup-[namespace].yaml
# Verify restoration
kubectl get all -n [namespace]
Maintenance Tasks
Update Node.js Dependencies
cd [product]-frontend
npm outdated
npm update
npm audit fix
npm test
git commit -am "Update dependencies"
Update Python Dependencies
cd [product]-backend
poetry update
poetry run pytest
git commit -am "Update dependencies"
Clean Up Old Images
# List images
kubectl get pods -n [namespace] -o jsonpath='{.items[*].spec.containers[*].image}' | tr ' ' '\n' | sort -u
# Clean up ACR
az acr repository list --name burdenoff
az acr repository delete --name burdenoff --image [image:tag]
Emergency Contacts
| Role | Contact | Primary | Backup |
|---|---|---|---|
| On-Call Engineer | PagerDuty | - | - |
| Team Lead | Slack/Phone | [Name] | [Name] |
| DevOps Lead | Slack/Phone | [Name] | [Name] |
| Security Lead | Slack/Phone | [Name] | [Name] |
| CTO | Phone | [Name] | - |