Skip to main content

Runbooks

Operational runbooks for common procedures and troubleshooting.

Purpose

Runbooks provide step-by-step instructions for:

  • Routine operations
  • Emergency procedures
  • Troubleshooting guides
  • Maintenance tasks

Database Operations

Backup Database

# Manual backup
# No -i/-t: a TTY rewrites line endings in the redirected dump, and pg_dump
# reads nothing from stdin.
kubectl exec postgres-0 -- pg_dump -U postgres mydatabase > backup.sql

# Verify backup: the file must be non-empty and end with pg_dump's
# plain-format completion marker
test -s backup.sql \
  && grep -q 'PostgreSQL database dump complete' backup.sql \
  && echo "backup.sql looks complete"

# Restore from backup (-i only: the dump is streamed through stdin)
kubectl exec -i postgres-0 -- psql -U postgres mydatabase < backup.sql

# Verify restore: list the tables now present in the database
kubectl exec postgres-0 -- psql -U postgres mydatabase -c "\dt"

Check Database Connections

# View active connections (grouped per database user)
kubectl exec postgres-0 -- psql -U postgres -c "
SELECT count(*) as connections, usename
FROM pg_stat_activity
GROUP BY usename;
"

# Kill long-running queries
# Only client backends are targeted so replication senders and background
# workers are left alone, and the session running this statement is excluded.
# backend_type requires PostgreSQL 10 or newer.
kubectl exec postgres-0 -- psql -U postgres -c "
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'active'
AND backend_type = 'client backend'
AND pid <> pg_backend_pid()
AND query_start < NOW() - INTERVAL '5 minutes';
"

Database Performance

# Check slow queries (requires the pg_stat_statements extension)
# PostgreSQL 13+ names the timing columns total_exec_time / mean_exec_time;
# on PostgreSQL 12 and older use total_time / mean_time instead.
kubectl exec postgres-0 -- psql -U postgres -c "
SELECT query, calls, total_exec_time, mean_exec_time
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;
"

# Check index usage: indexes that have never been scanned
# pg_stat_user_indexes exposes relname / indexrelname; they are aliased so the
# output keeps the tablename / indexname headers.
kubectl exec postgres-0 -- psql -U postgres -c "
SELECT schemaname, relname AS tablename, indexrelname AS indexname, idx_scan
FROM pg_stat_user_indexes
WHERE idx_scan = 0;
"

Kubernetes Operations

Restart Deployment

# Trigger a rolling restart of the backend deployment
kubectl --namespace=[namespace] rollout restart deployment/[product]-backend

# Follow the rollout until it finishes
kubectl --namespace=[namespace] rollout status deployment/[product]-backend

# List the recorded revisions of the deployment
kubectl --namespace=[namespace] rollout history deployment/[product]-backend

Scale Deployment

# Set the backend to 5 replicas
kubectl --namespace=[namespace] scale --replicas=5 deployment/[product]-backend

# Set the backend to 2 replicas
kubectl --namespace=[namespace] scale --replicas=2 deployment/[product]-backend

# Create an autoscaler: 3 to 10 replicas, targeting 70% CPU
kubectl --namespace=[namespace] autoscale deployment/[product]-backend \
  --min=3 --max=10 --cpu-percent=70

Check Pod Status

# Show all pods in the namespace
kubectl --namespace=[namespace] get pods

# Show details and recent events for one pod
kubectl --namespace=[namespace] describe pod [pod-name]

# Stream logs, starting from the last 100 lines
kubectl --namespace=[namespace] logs --tail=100 --follow [pod-name]

# Open an interactive shell inside the pod
kubectl --namespace=[namespace] exec --stdin --tty [pod-name] -- /bin/sh

Debug Failing Pods

# Namespace events ordered by last-seen timestamp
kubectl --namespace=[namespace] get events --sort-by=.lastTimestamp

# Resource usage per pod
kubectl --namespace=[namespace] top pods

# Resource usage per node
kubectl top nodes

# Details for a single node
kubectl describe node [node-name]

Deployment Operations

Deploy New Version

# Upgrade the release with Helm, overriding the image tag
helm upgrade [product] ./helm \
  --namespace [namespace] \
  --values helm/values-alpha.yaml \
  --set image.tag=[new-tag]

# Watch pods as they roll over (Ctrl-C to stop watching)
kubectl --namespace=[namespace] get pods --watch

# Follow the rollout until it finishes
kubectl --namespace=[namespace] rollout status deployment/[product]

Rollback Deployment

# Revert the deployment to its previous revision
kubectl --namespace=[namespace] rollout undo deployment/[product]

# Revert the deployment to revision 3
kubectl --namespace=[namespace] rollout undo --to-revision=3 deployment/[product]

# Revert a Helm release to a given revision
helm rollback [product] [revision] --namespace [namespace]

# Follow the rollout until it finishes
kubectl --namespace=[namespace] rollout status deployment/[product]

Blue-Green Deployment

# Every command names the namespace explicitly, like the rest of this runbook,
# so nothing lands in whatever namespace the current context points at.

# Deploy green version
kubectl apply -f deployment-green.yaml -n [namespace]

# Test green version (blocks; browse http://localhost:8080 from another terminal)
kubectl port-forward svc/[product]-green 8080:80 -n [namespace]

# Switch traffic to green by repointing the service selector
kubectl patch service [product] -n [namespace] -p '{"spec":{"selector":{"version":"green"}}}'

# Delete blue version (after verification)
kubectl delete deployment [product]-blue -n [namespace]

Incident Response

High CPU Usage

# Check pod CPU usage
kubectl top pods -n [namespace] --sort-by=cpu

# Check container metrics (interactive; press q to quit)
kubectl exec -it [pod-name] -n [namespace] -- top

# Scale up if needed
# Check the current replica count first: --replicas sets an absolute number,
# so 5 is a scale-DOWN when more than 5 replicas are already running.
kubectl get deployment/[product] -n [namespace]
kubectl scale deployment/[product] --replicas=5 -n [namespace]

# Investigate CPU-intensive processes
# No -it: the output is piped to a local head, so no TTY is wanted.
kubectl exec [pod-name] -n [namespace] -- ps aux --sort=-%cpu | head

High Memory Usage

# Check pod memory usage
kubectl top pods -n [namespace] --sort-by=memory

# Check memory details
# NOTE(review): inside a container, free and /proc/meminfo usually report the
# node's memory rather than the container's limit - confirm before acting on them.
kubectl exec [pod-name] -n [namespace] -- free -h

# Check for memory leaks (raw kernel memory counters)
kubectl exec [pod-name] -n [namespace] -- cat /proc/meminfo

# Restart pod if needed (its controller creates a replacement)
kubectl delete pod [pod-name] -n [namespace]

Service Unavailable

# Show the service definition
kubectl --namespace=[namespace] get service [service-name]

# Show the pod endpoints backing the service
kubectl --namespace=[namespace] get endpoints [service-name]

# Call the service from a throwaway curl pod inside the cluster
kubectl run debug --stdin --tty --rm --restart=Never --image=curlimages/curl -- \
  curl http://[service-name].[namespace].svc.cluster.local

# List ingresses, then inspect one
kubectl --namespace=[namespace] get ingress
kubectl --namespace=[namespace] describe ingress [ingress-name]

Certificate Issues

# Check certificate expiration (prints notBefore / notAfter of the served cert)
echo | openssl s_client -servername [domain] -connect [domain]:443 2>/dev/null | \
  openssl x509 -noout -dates

# Renew Let's Encrypt certificate
# Ask cert-manager to re-issue rather than deleting the Certificate resource:
# deleting it removes the request itself, and it only comes back if something
# else (such as an annotated Ingress) re-creates it.
# Requires the cmctl CLI from the cert-manager project.
cmctl renew [cert-name] -n [namespace]

# Watch issuance progress
kubectl get certificate [cert-name] -n [namespace] -w

# Check cert-manager logs
kubectl logs -n cert-manager -l app=cert-manager -f

Monitoring Operations

Check Prometheus

# Forward local port 9090 to Prometheus (blocks; run the queries from another terminal)
kubectl --namespace=monitoring port-forward service/prometheus 9090:9090

# Run an instant query for the "up" metric
curl --get --data-urlencode 'query=up' 'http://localhost:9090/api/v1/query'

# List scrape targets
curl 'http://localhost:9090/api/v1/targets'

Check Grafana

# Forward local port 3000 to the Grafana service (port 80)
kubectl --namespace=monitoring port-forward service/grafana 3000:80

# Then open http://localhost:3000 in a browser.
# Default credentials: admin/admin (change them if they still work)

Check Alerts

# Port-forward to AlertManager (blocks; run the calls below from another terminal)
kubectl port-forward -n monitoring svc/alertmanager 9093:9093

# View active alerts (API v2; the v1 API has been removed from current releases)
curl http://localhost:9093/api/v2/alerts

# Silence alert for one hour starting now
# The window is computed at run time because a silence that has already ended
# is of no use. GNU date syntax; on macOS/BSD use: date -u -v+1H +%Y-%m-%dT%H:%M:%SZ
starts_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
ends_at=$(date -u -d '+1 hour' +%Y-%m-%dT%H:%M:%SZ)
# The v2 API expects a JSON content type and an isRegex flag on every matcher.
curl -X POST http://localhost:9093/api/v2/silences \
  -H 'Content-Type: application/json' \
  --data @- <<EOF
{
  "matchers": [{"name": "alertname", "value": "HighErrorRate", "isRegex": false}],
  "startsAt": "${starts_at}",
  "endsAt": "${ends_at}",
  "createdBy": "oncall",
  "comment": "Planned maintenance"
}
EOF

Security Operations

Rotate Secrets

# Update Kubernetes secret
# Both kubectl calls name the namespace so the secret lands where the
# deployment restarted below will read it.
# The generated manifest replaces the whole secret, so list every key it must
# contain. A literal value on the command line also ends up in shell history;
# --from-file or --from-env-file avoids that.
kubectl create secret generic [secret-name] \
  --from-literal=key=new-value \
  -n [namespace] \
  --dry-run=client -o yaml | \
  kubectl apply -n [namespace] -f -

# Restart pods to pick up new secret
kubectl rollout restart deployment/[product] -n [namespace]

Check Security Scan Results

# Scan Docker image
trivy image [image:tag]

# Scan Kubernetes manifests
trivy config ./k8s/

# Check pod security (Kubernetes 1.25+): PodSecurityPolicy no longer exists
# there; the Pod Security Admission level is a label on each namespace.
kubectl get namespaces -L pod-security.kubernetes.io/enforce

# Check pod security policies (clusters older than 1.25 only)
kubectl get psp
kubectl describe psp [policy-name]

Backup and Restore

Backup Kubernetes Resources

# Dump the resources covered by the "all" category to YAML.
# "all" is a fixed set of workload types; ConfigMaps, Secrets and several
# other kinds are not part of it.
kubectl --namespace=[namespace] get all --output=yaml > backup-[namespace].yaml

# Dump selected resource types to YAML.
# The file includes Secret data (base64-encoded only), so store it securely.
kubectl --namespace=[namespace] get deployment,service,configmap,secret --output=yaml > backup.yaml

Restore from Backup

# Re-apply the manifests saved in the backup file
kubectl apply --filename=backup-[namespace].yaml

# Confirm the resources are present again
kubectl --namespace=[namespace] get all

Maintenance Tasks

Update Node.js Dependencies

# Update frontend dependencies and commit only if the test suite passes.
cd [product]-frontend
# Informational: npm outdated exits non-zero whenever anything is outdated,
# so it is deliberately not chained to the next step.
npm outdated
npm update
npm audit fix
# Chained with && so a failing test run never produces a commit.
npm test && git commit -am "Update dependencies"

Update Python Dependencies

# Update backend dependencies and commit only if the test suite passes.
cd [product]-backend
poetry update
# Chained with && so a failing test run never produces a commit.
poetry run pytest && git commit -am "Update dependencies"

Clean Up Old Images

# Unique container images referenced by pods in the namespace, one per line
kubectl --namespace=[namespace] get pods \
  --output=jsonpath='{.items[*].spec.containers[*].image}' \
  | tr ' ' '\n' \
  | sort -u

# List the repositories in the burdenoff registry, then delete one image
az acr repository list --name burdenoff
az acr repository delete --name burdenoff --image [image:tag]

Emergency Contacts

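| Role | Contact | Primary | Backup |
|------|---------|---------|--------|
| On-Call Engineer | PagerDuty | - | - |
| Team Lead | Slack/Phone | [Name] | [Name] |
| DevOps Lead | Slack/Phone | [Name] | [Name] |
| Security Lead | Slack/Phone | [Name] | [Name] |
| CTO | Phone | [Name] | - |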

References