Runbooks

Operational runbooks for common procedures and troubleshooting.

Purpose

Runbooks provide step-by-step instructions for:

Routine operations
Emergency procedures
Troubleshooting guides
Maintenance tasks

Database Operations

Backup Database

# Manual backup (no -it: allocating a TTY rewrites newlines to CRLF and corrupts the redirected dump)
kubectl exec postgres-0 -- pg_dump -U postgres mydatabase > backup.sql

# Restore from backup
kubectl exec -i postgres-0 -- psql -U postgres mydatabase < backup.sql

# Verify restore (lists the tables present in the database; this does not inspect backup.sql itself)
kubectl exec -it postgres-0 -- psql -U postgres mydatabase -c "\dt"

Check Database Connections

# View active connections
kubectl exec -it postgres-0 -- psql -U postgres -c "
  SELECT count(*) as connections, usename
  FROM pg_stat_activity
  GROUP BY usename;
"

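# Kill long-running queries (terminates the whole client connection, not just the query;
# the current session is excluded)
kubectl exec -it postgres-0 -- psql -U postgres -c "
  SELECT pg_terminate_backend(pid)
  FROM pg_stat_activity
  WHERE state = 'active'
  AND pid <> pg_backend_pid()
  AND query_start < NOW() - INTERVAL '5 minutes';
"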

Database Performance

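# Check slow queries (requires the pg_stat_statements extension; on PostgreSQL 13+
# the columns are named total_exec_time and mean_exec_time instead)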
kubectl exec -it postgres-0 -- psql -U postgres -c "
  SELECT query, calls, total_time, mean_time
  FROM pg_stat_statements
  ORDER BY mean_time DESC
  LIMIT 10;
"

# Check index usage
kubectl exec -it postgres-0 -- psql -U postgres -c "
  SELECT schemaname, relname, indexrelname, idx_scan
  FROM pg_stat_user_indexes
  WHERE idx_scan = 0;
"

Kubernetes Operations

Restart Deployment

# Restart specific deployment
kubectl rollout restart deployment/[product]-backend -n [namespace]

# Check rollout status
kubectl rollout status deployment/[product]-backend -n [namespace]

# View rollout history
kubectl rollout history deployment/[product]-backend -n [namespace]

Scale Deployment

# Scale up
kubectl scale deployment/[product]-backend --replicas=5 -n [namespace]

# Scale down
kubectl scale deployment/[product]-backend --replicas=2 -n [namespace]

# Autoscaling
kubectl autoscale deployment/[product]-backend \
  --cpu-percent=70 \
  --min=3 \
  --max=10 \
  -n [namespace]

Check Pod Status

# List pods
kubectl get pods -n [namespace]

# Describe pod
kubectl describe pod [pod-name] -n [namespace]

# View logs
kubectl logs [pod-name] -n [namespace] --tail=100 -f

# Execute into pod
kubectl exec -it [pod-name] -n [namespace] -- /bin/sh

Debug Failing Pods

# Check pod events
kubectl get events -n [namespace] --sort-by='.lastTimestamp'

# Check resource usage
kubectl top pods -n [namespace]

# Check node status
kubectl top nodes

# Describe node
kubectl describe node [node-name]

Deployment Operations

Deploy New Version

# Using Helm
helm upgrade [product] ./helm \
  -f helm/values-alpha.yaml \
  --namespace [namespace] \
  --set image.tag=[new-tag]

# Verify deployment
kubectl get pods -n [namespace] -w

# Check rollout status
kubectl rollout status deployment/[product] -n [namespace]

Rollback Deployment

# Rollback to previous version
kubectl rollout undo deployment/[product] -n [namespace]

# Rollback to specific revision
kubectl rollout undo deployment/[product] --to-revision=3 -n [namespace]

# Using Helm
helm rollback [product] [revision] -n [namespace]

# Verify rollback
kubectl rollout status deployment/[product] -n [namespace]

Blue-Green Deployment

# Deploy green version
kubectl apply -f deployment-green.yaml

# Test green version
kubectl port-forward svc/[product]-green 8080:80

# Switch traffic to green
kubectl patch service [product] -p '{"spec":{"selector":{"version":"green"}}}'

# Delete blue version (after verification)
kubectl delete deployment [product]-blue

Incident Response

High CPU Usage

# Check pod CPU usage
kubectl top pods -n [namespace] --sort-by=cpu

# Check container metrics
kubectl exec -it [pod-name] -n [namespace] -- top

# Scale up if needed
kubectl scale deployment/[product] --replicas=5 -n [namespace]

# Investigate CPU-intensive processes (no TTY, since the output is piped)
kubectl exec [pod-name] -n [namespace] -- ps aux --sort=-%cpu | head

High Memory Usage

# Check pod memory usage
kubectl top pods -n [namespace] --sort-by=memory

# Check memory details
kubectl exec -it [pod-name] -n [namespace] -- free -h

# Check for memory leaks
kubectl exec -it [pod-name] -n [namespace] -- cat /proc/meminfo

# Restart pod if needed
kubectl delete pod [pod-name] -n [namespace]

Service Unavailable

# Check service status
kubectl get svc [service-name] -n [namespace]

# Check endpoints
kubectl get endpoints [service-name] -n [namespace]

# Test service connectivity
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \
  curl http://[service-name].[namespace].svc.cluster.local

# Check ingress
kubectl get ingress -n [namespace]
kubectl describe ingress [ingress-name] -n [namespace]

Certificate Issues

# Check certificate expiration
echo | openssl s_client -servername [domain] -connect [domain]:443 2>/dev/null | \
  openssl x509 -noout -dates

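# Renew Let's Encrypt certificate
kubectl delete certificate [cert-name] -n [namespace]
# NOTE: a new certificate is issued only if the Certificate resource is re-created,
# e.g. automatically from an annotated Ingress or by re-applying its manifest.
# Confirm that it has been re-created and is Ready:
#   kubectl get certificate -n [namespace]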

# Check cert-manager logs
kubectl logs -n cert-manager -l app=cert-manager -f

Monitoring Operations

Check Prometheus

# Port-forward to Prometheus
kubectl port-forward -n monitoring svc/prometheus 9090:9090

# Query metrics
curl 'http://localhost:9090/api/v1/query?query=up'

# Check targets
curl 'http://localhost:9090/api/v1/targets'

Check Grafana

# Port-forward to Grafana
kubectl port-forward -n monitoring svc/grafana 3000:80

# Access: http://localhost:3000
# Default credentials: admin/admin

Check Alerts

# Port-forward to AlertManager
kubectl port-forward -n monitoring svc/alertmanager 9093:9093

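# View active alerts (API v2; the v1 API has been removed from current Alertmanager releases)
curl http://localhost:9093/api/v2/alerts

# Silence alert (replace startsAt/endsAt with the real maintenance window; endsAt must be in the future)
curl -X POST http://localhost:9093/api/v2/silences \
  -H 'Content-Type: application/json' \
  -d '{
    "matchers": [{"name": "alertname", "value": "HighErrorRate", "isRegex": false}],
    "startsAt": "2024-01-01T00:00:00Z",
    "endsAt": "2024-01-01T01:00:00Z",
    "createdBy": "oncall",
    "comment": "Planned maintenance"
  }'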

Security Operations

Rotate Secrets

# Update Kubernetes secret (in the same namespace as the deployment that consumes it)
kubectl create secret generic [secret-name] \
  --from-literal=key=new-value \
  -n [namespace] \
  --dry-run=client -o yaml | \
  kubectl apply -f -

# Restart pods to pick up new secret
kubectl rollout restart deployment/[product] -n [namespace]

Check Security Scan Results

# Scan Docker image
trivy image [image:tag]

# Scan Kubernetes manifests
trivy config ./k8s/

# Check pod security policies (PodSecurityPolicy was removed in Kubernetes 1.25;
# these commands only work on older clusters)
kubectl get psp
kubectl describe psp [policy-name]

Backup and Restore

Backup Kubernetes Resources

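# Backup common workload resources in namespace
# ("get all" covers pods, services, deployments, etc., but NOT ConfigMaps, Secrets or Ingresses)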
kubectl get all -n [namespace] -o yaml > backup-[namespace].yaml

# Backup specific resources (the file contains Secret data that is only base64-encoded, not encrypted; store it securely)
kubectl get deployment,service,configmap,secret -n [namespace] -o yaml > backup.yaml

Restore from Backup

# Restore resources
kubectl apply -f backup-[namespace].yaml

# Verify restoration
kubectl get all -n [namespace]

Maintenance Tasks

Update Node.js Dependencies

cd [product]-frontend
npm outdated
npm update
npm audit fix
npm test
git commit -am "Update dependencies"

Update Python Dependencies

cd [product]-backend
poetry update
poetry run pytest
git commit -am "Update dependencies"

Clean Up Old Images

# List images
kubectl get pods -n [namespace] -o jsonpath='{.items[*].spec.containers[*].image}' | tr ' ' '\n' | sort -u

# Clean up ACR
az acr repository list --name burdenoff
az acr repository delete --name burdenoff --image [image:tag]

Emergency Contacts

Role	Contact	Primary	Backup
On-Call Engineer	PagerDuty	-	-
Team Lead	Slack/Phone	[Name]	[Name]
DevOps Lead	Slack/Phone	[Name]	[Name]
Security Lead	Slack/Phone	[Name]	[Name]
CTO	Phone	[Name]	-

Purpose​

Database Operations​

Backup Database​

Check Database Connections​

Database Performance​

Kubernetes Operations​

Restart Deployment​

Scale Deployment​

Check Pod Status​

Debug Failing Pods​

Deployment Operations​

Deploy New Version​

Rollback Deployment​

Blue-Green Deployment​

Incident Response​

High CPU Usage​

High Memory Usage​

Service Unavailable​

Certificate Issues​

Monitoring Operations​

Check Prometheus​

Check Grafana​

Check Alerts​

Security Operations​

Rotate Secrets​

Check Security Scan Results​

Backup and Restore​

Backup Kubernetes Resources​

Restore from Backup​

Maintenance Tasks​

Update Node.js Dependencies​

Update Python Dependencies​

Clean Up Old Images​

Emergency Contacts​

References​

Purpose

Database Operations

Backup Database

Check Database Connections

Database Performance

Kubernetes Operations

Restart Deployment

Scale Deployment

Check Pod Status

Debug Failing Pods

Deployment Operations

Deploy New Version

Rollback Deployment

Blue-Green Deployment

Incident Response

High CPU Usage

High Memory Usage

Service Unavailable

Certificate Issues

Monitoring Operations

Check Prometheus

Check Grafana

Check Alerts

Security Operations

Rotate Secrets

Check Security Scan Results

Backup and Restore

Backup Kubernetes Resources

Restore from Backup

Maintenance Tasks

Update Node.js Dependencies

Update Python Dependencies

Clean Up Old Images

Emergency Contacts

References