8. Mantenimiento y Operaciones
Esta sección cubre los procedimientos de mantenimiento, actualizaciones, gestión de incidentes y operaciones diarias del sistema DTEM.
8.1. Mantenimiento Preventivo
8.1.1. Calendario de Mantenimiento
Daily Tasks
#!/bin/bash
# daily-maintenance.sh
#
# Daily maintenance for the DTEM system: service health checks, log
# rotation, Redis/temp-file cleanup, DB statistics refresh and a sanity
# check on the latest backup. Exits non-zero on any critical failure.
set -euo pipefail

echo "Starting daily maintenance tasks..."

# 1. Health checks — any failing endpoint aborts the run (set -e).
echo "Performing health checks..."
curl -fsS http://localhost:3000/health > /dev/null
curl -fsS http://localhost:3000/health/db > /dev/null
curl -fsS http://localhost:3000/health/redis > /dev/null

# 2. Log rotation
echo "Rotating logs..."
logrotate -f /etc/logrotate.d/dtem

# 3. Cache cleanup — xargs -r skips DEL entirely when no keys match
#    (the original ran "redis-cli del" with no arguments and errored).
echo "Cleaning Redis cache..."
redis-cli --scan --pattern "temp:*" | xargs -r -n 100 redis-cli del

# 4. Temporary file cleanup (regular files older than 1 day)
echo "Cleaning temporary files..."
find /tmp -name "dtem_*" -mtime +1 -delete
find /app/uploads/temp -type f -mtime +1 -delete

# 5. Database statistics
echo "Updating database statistics..."
psql -U postgres -d dtem_prod -c "ANALYZE;"

# 6. Backup verification — pick the newest regular file without parsing
#    `ls` output (breaks on unusual filenames); -s also rejects an empty dump.
echo "Verifying last backup..."
LATEST_BACKUP=$(find /backups/daily -maxdepth 1 -type f -printf '%T@ %p\n' 2>/dev/null \
  | sort -rn | head -n1 | cut -d' ' -f2- || true)
if [ -n "$LATEST_BACKUP" ] && [ -s "$LATEST_BACKUP" ]; then
  echo "✅ Latest backup verified: $LATEST_BACKUP"
else
  echo "❌ No backup found!"
  exit 1
fi

echo "Daily maintenance completed successfully"
Weekly Tasks
#!/bin/bash
# weekly-maintenance.sh
#
# Weekly maintenance: security-update report, certificate expiry check,
# database bloat/index-usage analysis and error-log review.
set -euo pipefail

echo "Starting weekly maintenance tasks..."

# 1. Security updates — report only; zero matches is not a failure.
echo "Checking for security updates..."
apt list --upgradable 2>/dev/null | grep -i security || true

# 2. Certificate expiry check
echo "Checking certificate expiry..."
shopt -s nullglob  # an empty certs dir must not iterate the literal "*.crt"
for cert in /app/certs/*.crt; do
  expiry=$(openssl x509 -in "$cert" -noout -enddate | cut -d= -f2)
  expiry_epoch=$(date -d "$expiry" +%s)
  current_epoch=$(date +%s)
  days_until_expiry=$(( (expiry_epoch - current_epoch) / 86400 ))
  if [ "$days_until_expiry" -lt 30 ]; then
    echo "⚠️ Certificate $cert expires in $days_until_expiry days"
  fi
done
shopt -u nullglob

# 3. Performance analysis — tables with >10% dead tuples.
#    pg_stat_user_tables exposes the table name as "relname", not
#    "tablename" (the original query referenced a non-existent column).
echo "Running performance analysis..."
psql -U postgres -d dtem_prod -c "
SELECT
  schemaname,
  relname,
  n_tup_ins,
  n_tup_upd,
  n_tup_del,
  n_live_tup,
  n_dead_tup
FROM pg_stat_user_tables
WHERE n_dead_tup > n_live_tup * 0.1;
"

# 4. Index maintenance — large (>10MB) but rarely used (<100 scans) indexes.
#    pg_stat_user_indexes uses "relname" / "indexrelname".
echo "Checking index usage..."
psql -U postgres -d dtem_prod -c "
SELECT
  schemaname,
  relname,
  indexrelname,
  idx_scan,
  pg_size_pretty(pg_relation_size(indexrelid)) AS size
FROM pg_stat_user_indexes
WHERE idx_scan < 100
  AND pg_relation_size(indexrelid) > 10485760;
"

# 5. Log analysis — today's log may not exist yet, and grep -c exits
#    non-zero on zero matches; treat both as "0 errors".
echo "Analyzing error logs..."
LOG_FILE="/var/log/dtem/app.log.$(date +%Y-%m-%d)"
ERROR_COUNT=0
if [ -f "$LOG_FILE" ]; then
  ERROR_COUNT=$(grep -c "ERROR" "$LOG_FILE" || true)
fi
if [ "$ERROR_COUNT" -gt 10 ]; then
  echo "⚠️ High error count detected: $ERROR_COUNT errors"
fi

echo "Weekly maintenance completed"
Monthly Tasks
#!/bin/bash
# monthly-maintenance.sh
#
# Monthly maintenance: full backup, vacuum, audit-log archiving and the
# heavier tuning/audit scripts. Any failed step aborts the run.
set -euo pipefail

echo "Starting monthly maintenance tasks..."

# 1. Full system backup — must succeed before any destructive step below.
echo "Performing full system backup..."
./scripts/backup-full-system.sh

# 2. Database vacuum
echo "Running database vacuum..."
psql -U postgres -d dtem_prod -c "VACUUM ANALYZE;"

# 3. Archive old data — CREATE + DELETE run inside one explicit
#    transaction (with ON_ERROR_STOP) so rows can never be deleted
#    without having been copied to the archive table first.
echo "Archiving old data..."
ARCHIVE_SUFFIX=$(date +%Y_%m)
psql -U postgres -d dtem_prod -v ON_ERROR_STOP=1 <<SQL
BEGIN;
CREATE TABLE audit_logs_archive_${ARCHIVE_SUFFIX} AS
SELECT * FROM audit_logs
WHERE timestamp < NOW() - INTERVAL '1 year';
DELETE FROM audit_logs
WHERE timestamp < NOW() - INTERVAL '1 year';
COMMIT;
SQL

# 4. Performance tuning
echo "Running performance tuning..."
./scripts/performance-tuning.sh

# 5. Security audit
echo "Running security audit..."
./scripts/security-audit.sh

echo "Monthly maintenance completed"
8.1.2. Automated Maintenance
Kubernetes CronJobs
# maintenance-cronjobs.yaml
#
# Scheduled maintenance jobs for the dtem-prod namespace. Indentation
# restored (the flattened form was not valid YAML); concurrencyPolicy
# added so a slow run can never overlap the next one.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: daily-maintenance
  namespace: dtem-prod
spec:
  schedule: "0 2 * * *" # Daily at 2 AM
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: maintenance
              image: dtem/maintenance:latest
              command: ["/bin/bash", "-c", "./scripts/daily-maintenance.sh"]
              env:
                - name: DB_HOST
                  value: "postgres-service"
                - name: REDIS_HOST
                  value: "redis-service"
              volumeMounts:
                - name: scripts
                  mountPath: /app/scripts
                - name: logs
                  mountPath: /var/log/dtem
          volumes:
            - name: scripts
              configMap:
                name: maintenance-scripts
            - name: logs
              persistentVolumeClaim:
                claimName: logs-pvc
          restartPolicy: OnFailure
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: weekly-maintenance
  namespace: dtem-prod
spec:
  schedule: "0 3 * * 0" # Weekly on Sunday at 3 AM
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: maintenance
              image: dtem/maintenance:latest
              command: ["/bin/bash", "-c", "./scripts/weekly-maintenance.sh"]
              env:
                - name: DB_HOST
                  value: "postgres-service"
              volumeMounts:
                - name: scripts
                  mountPath: /app/scripts
          volumes:
            - name: scripts
              configMap:
                name: maintenance-scripts
          restartPolicy: OnFailure
8.2. Gestión de Actualizaciones
8.2.1. Update Process
Update Workflow
flowchart TD
A[Update Available] --> B[Schedule Maintenance Window]
B --> C[Create Backup]
C --> D[Deploy to Staging]
D --> E[Run Tests]
E --> F{Tests Pass?}
F -->|Yes| G[Deploy to Production]
F -->|No| H[Rollback Staging]
H --> I[Fix Issues]
I --> D
G --> J[Monitor Health]
J --> K{Health OK?}
K -->|Yes| L[Update Complete]
K -->|No| M[Rollback Production]
M --> N[Investigate Issues]
Automated Update Script
#!/bin/bash
# update-system.sh
#
# Orchestrates a production update: pre-checks, backup, maintenance mode,
# rolling image update, smoke tests, and automatic rollback on failure.
#
# Usage: ./update-system.sh [version]   (defaults to "latest")
set -euo pipefail

# Configuration
NEW_VERSION="${1:-latest}"
readonly BACKUP_DIR="/backups/pre-update-$(date +%Y%m%d_%H%M%S)"
readonly MAINTENANCE_MODE_FILE="/tmp/maintenance.mode"

# Colors for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

# log MESSAGE — timestamped info line (green) on stdout.
log() {
  echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}"
}

# warn MESSAGE — timestamped warning (yellow); does not abort.
warn() {
  echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}"
}

# error MESSAGE — timestamped error (red) to stderr, then exit 1.
# (The original wrote diagnostics to stdout.)
error() {
  echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" >&2
  exit 1
}
# Verify the host is healthy enough to update: disk, memory, DB load.
# Aborts (via error) when disk or memory limits are violated; high
# connection counts only produce a warning.
pre_update_checks() {
  log "Performing pre-update checks..."

  # Disk space on the root filesystem
  local disk_usage
  disk_usage=$(df / | awk 'NR==2 {print $5}' | tr -d '%')
  if [ "$disk_usage" -gt 80 ]; then
    error "Disk usage is ${disk_usage}%. Please free up space before updating."
  fi

  # Available memory ($7 of `free -m` line 2 is the "available" column)
  local available_mem
  available_mem=$(free -m | awk 'NR==2{printf "%.0f", $7}')
  if [ "$available_mem" -lt 1024 ]; then
    error "Available memory is ${available_mem}MB. Minimum 1GB required."
  fi

  # Database connections — -A (unaligned) added because `psql -t` alone
  # pads the value with whitespace, which is fragile in the comparison.
  local db_connections
  db_connections=$(psql -U postgres -d dtem_prod -t -A -c "SELECT count(*) FROM pg_stat_activity;")
  if [ "$db_connections" -gt 50 ]; then
    warn "High database connections: $db_connections"
  fi

  log "Pre-update checks completed"
}
# Snapshot the database and key configuration before updating.
# The dump is only considered valid when pg_dump itself succeeded AND
# the output file is non-empty — the original only tested existence,
# which is true even for a failed, partial dump.
create_backup() {
  log "Creating backup in $BACKUP_DIR..."
  mkdir -p "$BACKUP_DIR"

  # Database backup (custom format, restorable with pg_restore)
  if ! pg_dump -U postgres -d dtem_prod -Fc > "$BACKUP_DIR/database.dump"; then
    error "pg_dump failed"
  fi

  # Configuration backup
  cp -r /app/config "$BACKUP_DIR/"
  cp -r /app/certs "$BACKUP_DIR/"

  # Verify backup (-s: exists and non-empty)
  if [ -s "$BACKUP_DIR/database.dump" ]; then
    log "Backup created successfully"
  else
    error "Backup creation failed"
  fi
}
# Put the platform into maintenance mode: drop a marker file that other
# tooling can check, then take the public frontend fully offline.
enable_maintenance_mode() {
  log "Enabling maintenance mode..."

  touch "$MAINTENANCE_MODE_FILE"

  # Scale the frontend to zero and block until all its pods are gone.
  kubectl -n dtem-prod scale deployment web-frontend --replicas=0
  kubectl -n dtem-prod wait --for=delete pod -l app=web-frontend --timeout=300s

  log "Maintenance mode enabled"
}
# Pull the target images and roll every deployment to $NEW_VERSION,
# blocking until each rollout reports success (10-minute timeout each).
# Version is now quoted everywhere it is expanded.
update_application() {
  log "Updating application to version $NEW_VERSION..."

  local svc
  # Pre-pull images so the rollout does not stall on registry pulls.
  for svc in api-gateway dte-service web-frontend; do
    docker pull "dtem/${svc}:${NEW_VERSION}"
  done

  # Update deployments (the frontend's container is named "web").
  kubectl set image deployment/api-gateway "api-gateway=dtem/api-gateway:${NEW_VERSION}" -n dtem-prod
  kubectl set image deployment/dte-service "dte-service=dtem/dte-service:${NEW_VERSION}" -n dtem-prod
  kubectl set image deployment/web-frontend "web=dtem/web-frontend:${NEW_VERSION}" -n dtem-prod

  # Wait for every rollout to complete
  for svc in api-gateway dte-service web-frontend; do
    kubectl rollout status "deployment/${svc}" -n dtem-prod --timeout=600s
  done

  log "Application updated successfully"
}
# Smoke-test the freshly updated stack: HTTP health endpoint (retried),
# database connectivity and Redis connectivity. Aborts on any failure.
post_update_tests() {
  log "Running post-update tests..."

  # Health check — retry for up to 5 minutes (30 attempts x 10s).
  # -fsS keeps the response body out of the maintenance log.
  local attempt
  for attempt in {1..30}; do
    if curl -fsS http://api.dtem.empresa.cl/health > /dev/null; then
      log "Health check passed"
      break
    fi
    if [ "$attempt" -eq 30 ]; then
      error "Health check failed after 30 attempts"
    fi
    sleep 10
  done

  # Database connectivity
  if psql -U postgres -d dtem_prod -c "SELECT 1;" > /dev/null 2>&1; then
    log "Database connectivity test passed"
  else
    error "Database connectivity test failed"
  fi

  # Redis connectivity
  if redis-cli ping > /dev/null 2>&1; then
    log "Redis connectivity test passed"
  else
    error "Redis connectivity test failed"
  fi

  log "Post-update tests completed"
}
# Leave maintenance mode: remove the marker file and bring the public
# frontend back to its normal replica count.
disable_maintenance_mode() {
  log "Disabling maintenance mode..."

  rm -f "$MAINTENANCE_MODE_FILE"

  # Restore the frontend and block until its pods are Ready again.
  kubectl -n dtem-prod scale deployment web-frontend --replicas=3
  kubectl -n dtem-prod wait --for=condition=ready pod -l app=web-frontend --timeout=300s

  log "Maintenance mode disabled"
}
# Roll the system back after a failed update: restore the DB dump,
# re-point deployments at the previous image tag, re-open the frontend,
# then exit non-zero. Registered as the ERR trap in main().
rollback() {
  # Disable the ERR trap first — otherwise any failing command inside
  # this function would re-enter rollback recursively.
  trap - ERR
  warn "Rolling back update..."

  # Restore database from the pre-update dump
  pg_restore -U postgres -d dtem_prod --clean --if-exists "$BACKUP_DIR/database.dump"

  # Restore previous image versions.
  # NOTE(review): assumes a "previous" tag exists in the registry —
  # `kubectl rollout undo` would need no special tag; confirm which is intended.
  kubectl set image deployment/api-gateway api-gateway=dtem/api-gateway:previous -n dtem-prod
  kubectl set image deployment/dte-service dte-service=dtem/dte-service:previous -n dtem-prod
  kubectl set image deployment/web-frontend web=dtem/web-frontend:previous -n dtem-prod

  # Wait for the rollback rollout to complete
  local svc
  for svc in api-gateway dte-service web-frontend; do
    kubectl rollout status "deployment/${svc}" -n dtem-prod --timeout=600s
  done

  disable_maintenance_mode
  error "Update rolled back due to errors"
}
# Entry point: run the whole update pipeline; any failing step triggers
# the rollback trap. The backup is only deleted after full success.
main() {
  log "Starting system update to version $NEW_VERSION"

  # Trap errors and roll back automatically
  trap rollback ERR

  pre_update_checks
  create_backup
  enable_maintenance_mode
  update_application
  post_update_tests
  disable_maintenance_mode

  log "System update completed successfully"

  # Cleanup — drop the trap first; ${VAR:?} aborts instead of expanding
  # to "" (which would make this `rm -rf /...`-adjacent) if unset.
  trap - ERR
  rm -rf -- "${BACKUP_DIR:?}"
}

# Execute main function
main "$@"
8.2.2. Blue-Green Deployment
Blue-Green Strategy
# blue-green-deployment.yaml
#
# Argo Rollouts blue-green strategy for the API gateway. Promotion is
# manual (autoPromotionEnabled: false) and gated by the success-rate
# analysis template before and after the traffic switch.
# Indentation restored — the flattened form was not valid YAML.
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: dtem-api-gateway
  namespace: dtem-prod
spec:
  replicas: 5
  strategy:
    blueGreen:
      activeService: api-gateway-active
      previewService: api-gateway-preview
      autoPromotionEnabled: false
      scaleDownDelaySeconds: 30
      prePromotionAnalysis:
        templates:
          - templateName: success-rate
        args:
          - name: service-name
            value: api-gateway-preview
      postPromotionAnalysis:
        templates:
          - templateName: success-rate
        args:
          - name: service-name
            value: api-gateway-active
  selector:
    matchLabels:
      app: api-gateway
  template:
    metadata:
      labels:
        app: api-gateway
    spec:
      containers:
        - name: api-gateway
          # Helm-templated tag — rendered before this manifest is applied
          image: dtem/api-gateway:{{ .Values.image.tag }}
          ports:
            - containerPort: 3000
          livenessProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
          resources:
            requests:
              memory: "256Mi"
              cpu: "250m"
            limits:
              memory: "512Mi"
              cpu: "500m"
8.3. Gestión de Incidentes
8.3.1. Incident Response Process
Incident Classification
| Severity | Response Time | Escalation | Impact |
|---|---|---|---|
| P0 - Critical | 15 minutes | Immediate | System down, data loss |
| P1 - High | 1 hour | 4 hours | Major functionality affected |
| P2 - Medium | 4 hours | 24 hours | Partial functionality affected |
| P3 - Low | 24 hours | 72 hours | Minor issues, cosmetic |
Incident Response Workflow
flowchart TD
A[Incident Detected] --> B{Triage & Classify}
B --> C[Create Incident Ticket]
C --> D[Notify On-call Team]
D --> E[Investigate & Diagnose]
E --> F[Implement Fix]
F --> G[Monitor Resolution]
G --> H{Resolved?}
H -->|Yes| I[Post-Incident Review]
H -->|No| E
I --> J[Documentation]
J --> K[Close Incident]
Incident Management Script
#!/bin/bash
# incident-response.sh
#
# Bootstraps the response to a production incident: records it on disk,
# notifies the on-call Slack channel and opens a debugging session.
#
# Usage: ./incident-response.sh INCIDENT_ID SEVERITY DESCRIPTION
# Env:   SLACK_WEBHOOK_URL — webhook for notifications (optional).
set -euo pipefail

if [ "$#" -lt 3 ]; then
  echo "Usage: $0 INCIDENT_ID SEVERITY DESCRIPTION" >&2
  exit 2
fi

INCIDENT_ID="$1"
SEVERITY="$2"
DESCRIPTION="$3"

# Create incident directory
INCIDENT_DIR="/incidents/$(date +%Y-%m-%d)/$INCIDENT_ID"
mkdir -p "$INCIDENT_DIR"

# Create incident log
cat > "$INCIDENT_DIR/incident.log" << EOF
Incident ID: $INCIDENT_ID
Severity: $SEVERITY
Description: $DESCRIPTION
Start Time: $(date)
Status: Open
EOF

# send_notification MESSAGE [CHANNEL] — post to Slack. The original
# accepted a channel argument but never used it; it is now part of the
# payload. A missing webhook URL downgrades to a warning instead of
# expanding to an empty curl target.
send_notification() {
  local message="$1"
  local channel="${2:-#incidents}"
  if [ -z "${SLACK_WEBHOOK_URL:-}" ]; then
    echo "WARNING: SLACK_WEBHOOK_URL not set; skipping notification" >&2
    return 0
  fi
  curl -X POST -H 'Content-type: application/json' \
    --data "{\"channel\":\"$channel\",\"text\":\"$message\"}" \
    "$SLACK_WEBHOOK_URL"
}

# Send initial notification
send_notification "🚨 Incident $INCIDENT_ID ($SEVERITY): $DESCRIPTION" "#incidents"

# Open a war-room namespace, a debug pod and a logged shell session.
start_incident_session() {
  echo "Starting incident response session for $INCIDENT_ID"

  # Create war-room namespace (idempotent via dry-run + apply)
  kubectl create namespace "incident-$INCIDENT_ID" --dry-run=client -o yaml | kubectl apply -f -

  # Deploy debugging tools.
  # NOTE(review): -it needs an attached TTY — this step only works when
  # the script is run interactively, not from automation.
  kubectl run debug-pod --image=nicolaka/netshoot --rm -it --namespace="incident-$INCIDENT_ID" -- bash

  # Record the whole terminal session
  script -a "$INCIDENT_DIR/session.log"
}

# Assemble the final incident report from the per-section note files.
# A missing section is marked "(not recorded)" instead of aborting.
generate_report() {
  section() { if [ -f "$1" ]; then cat "$1"; else echo "_(not recorded)_"; fi; }
  cat > "$INCIDENT_DIR/report.md" << EOF
# Incident Report: $INCIDENT_ID
## Summary
- **ID**: $INCIDENT_ID
- **Severity**: $SEVERITY
- **Start Time**: $(date)
- **Description**: $DESCRIPTION
## Timeline
$(section "$INCIDENT_DIR/timeline.log")
## Root Cause Analysis
$(section "$INCIDENT_DIR/root-cause.md")
## Resolution
$(section "$INCIDENT_DIR/resolution.md")
## Lessons Learned
$(section "$INCIDENT_DIR/lessons-learned.md")
## Action Items
$(section "$INCIDENT_DIR/action-items.md")
EOF
}

start_incident_session
8.3.2. Post-Mortem Process
Post-Mortem Template
# Post-Mortem: Incident [INCIDENT_ID]
## Executive Summary
[Brief summary of the incident and its impact]
## Timeline
- **HH:MM**: Incident detected
- **HH:MM**: Investigation started
- **HH:MM**: Root cause identified
- **HH:MM**: Fix implemented
- **HH:MM**: Service restored
- **HH:MM**: Incident resolved
## Impact Assessment
- **Affected Services**: [List of services]
- **Duration**: [Total downtime]
- **Users Affected**: [Number of users]
- **Business Impact**: [Financial/operational impact]
## Root Cause Analysis
### What happened?
[Detailed description of the incident]
### Why did it happen?
[Root cause analysis]
### Contributing factors
[Factors that contributed to the incident]
## Resolution
### Immediate Actions
[Actions taken to resolve the incident]
### Permanent Fixes
[Long-term fixes implemented]
## Prevention Measures
### Short-term (1-4 weeks)
- [ ] Action item 1
- [ ] Action item 2
### Long-term (1-6 months)
- [ ] Action item 3
- [ ] Action item 4
## Lessons Learned
### What went well
- [Positive aspects of the response]
### What could be improved
- [Areas for improvement]
### Action Items
| Item | Owner | Due Date | Status |
|------|-------|----------|--------|
| [Action 1] | [Owner] | [Date] | [Status] |
| [Action 2] | [Owner] | [Date] | [Status] |
8.4. Performance Tuning
8.4.1. Database Performance
Performance Optimization Script
#!/bin/bash
# db-performance-tuning.sh
#
# Analyzes slow queries, candidate indexes and statistics for dtem_prod,
# and rebuilds large, heavily-used indexes on bloated tables.
set -euo pipefail

PSQL=(psql -U postgres -d dtem_prod)

echo "Starting database performance tuning..."

# Analyze slow queries (requires the pg_stat_statements extension).
# NOTE(review): total_time/mean_time were renamed to total_exec_time/
# mean_exec_time in PostgreSQL 13 — confirm the server version.
echo "Analyzing slow queries..."
"${PSQL[@]}" -c "
SELECT
  query,
  calls,
  total_time,
  mean_time,
  rows
FROM pg_stat_statements
WHERE mean_time > 1000
ORDER BY mean_time DESC
LIMIT 10;
"

# High-cardinality columns that may benefit from an index.
echo "Checking for missing indexes..."
"${PSQL[@]}" -c "
SELECT
  schemaname,
  tablename,
  attname,
  n_distinct,
  correlation
FROM pg_stats
WHERE schemaname = 'public'
  AND n_distinct > 100
ORDER BY n_distinct DESC;
"

# Update table statistics
echo "Updating table statistics..."
"${PSQL[@]}" -c "ANALYZE;"

# Rebuild busy indexes (>1000 scans, >100MB) on tables with >10% dead
# tuples. Fixes two bugs in the original:
#   * pg_stat_user_indexes/_tables expose relname/indexrelname, not
#     tablename/indexname, so the query errored;
#   * the loop parsed psql's pretty-printed table (headers and "|"
#     separators) as identifiers. -At emits unaligned tuples-only rows.
echo "Rebuilding fragmented indexes..."
"${PSQL[@]}" -At -F' ' -c "
SELECT
  schemaname,
  relname,
  indexrelname,
  pg_size_pretty(pg_relation_size(indexrelid))
FROM pg_stat_user_indexes
WHERE idx_scan > 1000
  AND pg_relation_size(indexrelid) > 104857600
  AND (schemaname, relname) IN (
    SELECT schemaname, relname
    FROM pg_stat_user_tables
    WHERE n_dead_tup > n_live_tup * 0.1
  );
" | while read -r schema table index size; do
  [ -n "$index" ] || continue
  echo "Rebuilding index $schema.$table.$index ($size)"
  # Quoted identifiers; CONCURRENTLY avoids blocking writes (PG >= 12).
  "${PSQL[@]}" -c "REINDEX INDEX CONCURRENTLY \"$schema\".\"$index\";"
done

# Informational only — recommended settings, not applied automatically.
echo "Checking configuration optimization..."
cat << EOF
Recommended PostgreSQL configuration changes:
shared_buffers = 25% of RAM
effective_cache_size = 75% of RAM
work_mem = 4MB per connection
maintenance_work_mem = 10% of RAM
checkpoint_completion_target = 0.9
wal_buffers = 16MB
default_statistics_target = 100
random_page_cost = 1.1
effective_io_concurrency = 200
EOF

echo "Database performance tuning completed"
8.4.2. Application Performance
Application Optimization
// Express middleware: time each request with the monotonic hrtime clock,
// warn about requests slower than 1s, and record a Prometheus histogram
// observation (in seconds) when the response finishes.
const performanceMonitor = (req, res, next) => {
  const startedAt = process.hrtime.bigint();

  res.on('finish', () => {
    const elapsedMs = Number(process.hrtime.bigint() - startedAt) / 1e6;

    // Log slow requests
    if (elapsedMs > 1000) {
      console.warn(`Slow request: ${req.method} ${req.path} - ${elapsedMs}ms`);
    }

    // Record metrics — prefer the matched route template over the raw path
    const labels = {
      method: req.method,
      route: req.route?.path || req.path,
      status_code: res.statusCode.toString()
    };
    httpRequestDuration.observe(labels, elapsedMs / 1000);
  });

  next();
};
// Database query optimization
class QueryOptimizer {
  // Fetch DTE documents matching the given optional filters, newest
  // first, with limit/offset pagination (defaults: 50 rows, first page).
  // Every user-supplied value is bound as a positional parameter —
  // nothing is interpolated into the SQL text.
  static async optimizedDTEQuery(filters) {
    const params = [];

    // Base projection; WHERE 1=1 lets each filter append "AND ...".
    let query = `
    SELECT
    d.id,
    d.document_type,
    d.folio,
    d.rut_receptor,
    d.fecha_emision,
    d.monto_total,
    d.status,
    d.created_at
    FROM dte_documents d
    WHERE 1=1
    `;

    // Optional filters as [SQL fragment, value] pairs; only truthy
    // values are added, mirroring the original if-chain.
    const optional = [
      ['d.company_id =', filters.companyId],
      ['d.document_type =', filters.documentType],
      ['d.fecha_emision >=', filters.startDate],
      ['d.fecha_emision <=', filters.endDate],
    ];
    for (const [fragment, value] of optional) {
      if (value) {
        params.push(value);
        query += ` AND ${fragment} $${params.length}`;
      }
    }

    // Pagination
    params.push(filters.limit || 50);
    params.push(filters.offset || 0);
    query += ` ORDER BY d.created_at DESC LIMIT $${params.length - 1} OFFSET $${params.length}`;

    const result = await pool.query(query, params);
    return result.rows;
  }
}
// Cache optimization
// Thin Redis-backed JSON cache with TTL and pattern invalidation.
// All Redis errors are logged and swallowed so a cache outage can never
// break callers.
class CacheManager {
  constructor(redisClient) {
    this.redis = redisClient;
    this.defaultTTL = 300; // seconds (5 minutes)
  }

  // Return the parsed cached value, or null on miss or any Redis error.
  async get(key) {
    try {
      const cached = await this.redis.get(key);
      return cached ? JSON.parse(cached) : null;
    } catch (error) {
      console.error('Cache get error:', error);
      return null;
    }
  }

  // Store value as JSON with a TTL in seconds. Failures are logged only.
  async set(key, value, ttl = this.defaultTTL) {
    try {
      await this.redis.setex(key, ttl, JSON.stringify(value));
    } catch (error) {
      console.error('Cache set error:', error);
    }
  }

  // Delete all keys matching pattern. Uses incremental SCAN instead of
  // KEYS: KEYS is O(N) over the whole keyspace and blocks the Redis
  // server for its full duration, which is unsafe in production.
  // NOTE(review): assumes an ioredis-style scan(cursor, 'MATCH', ...,
  // 'COUNT', ...) API returning [nextCursor, keys] — confirm the client.
  async invalidate(pattern) {
    try {
      let cursor = '0';
      do {
        const [nextCursor, keys] = await this.redis.scan(
          cursor, 'MATCH', pattern, 'COUNT', 100
        );
        cursor = nextCursor;
        if (keys.length > 0) {
          await this.redis.del(...keys);
        }
      } while (cursor !== '0');
    } catch (error) {
      console.error('Cache invalidate error:', error);
    }
  }
}
8.5. Capacity Planning
8.5.1. Resource Monitoring
Capacity Analysis Script
#!/bin/bash
# capacity-analysis.sh
#
# Point-in-time capacity snapshot: host CPU/memory/disk usage, database
# size, 30-day growth trend and connection metrics for dtem_prod.
set -euo pipefail

echo "Starting capacity analysis..."

# CPU usage analysis — compute usage as 100 - idle. The original grabbed
# field $2 of top's "Cpu(s)" line, whose layout is version/locale
# dependent, and its sed pattern never matched modern "%Cpu(s)" output.
echo "=== CPU Usage Analysis ==="
CPU_IDLE=$(top -bn1 | awk '/Cpu\(s\)/ { for (i = 1; i < NF; i++) if ($(i+1) ~ /^id/) print $i }' | tr -d ',%' || true)
CPU_USAGE=$(awk -v idle="${CPU_IDLE:-0}" 'BEGIN { printf "%.1f", 100 - idle }')
echo "Current CPU usage: $CPU_USAGE%"

# Memory usage analysis (human-readable values from free)
echo "=== Memory Usage Analysis ==="
read -r _ TOTAL_MEM USED_MEM _ < <(free -h | awk '/^Mem:/')
echo "Memory usage: $USED_MEM / $TOTAL_MEM"

# Disk usage analysis — real devices only. (The original piped df
# through a while/read loop that merely echoed each line back.)
echo "=== Disk Usage Analysis ==="
df -h | grep -E "^/dev/" || true

# Database size analysis
echo "=== Database Size Analysis ==="
psql -U postgres -d dtem_prod -c "
SELECT
  pg_size_pretty(pg_database_size('dtem_prod')) as database_size,
  (SELECT count(*) FROM users) as users_count,
  (SELECT count(*) FROM dte_documents) as dte_count,
  (SELECT count(*) FROM cafs) as caf_count;
"

# Growth trends — daily new-DTE counts, most recent 7 days shown
echo "=== Growth Trends (Last 30 Days) ==="
psql -U postgres -d dtem_prod -c "
SELECT
  DATE_TRUNC('day', created_at) as date,
  COUNT(*) as new_dtes
FROM dte_documents
WHERE created_at >= NOW() - INTERVAL '30 days'
GROUP BY DATE_TRUNC('day', created_at)
ORDER BY date DESC
LIMIT 7;
"

# Performance metrics
echo "=== Performance Metrics ==="
psql -U postgres -d dtem_prod -c "
SELECT
  (SELECT count(*) FROM pg_stat_activity WHERE state = 'active') as active_connections,
  (SELECT count(*) FROM pg_stat_activity) as total_connections,
  (SELECT pg_size_pretty(pg_database_size('dtem_prod'))) as db_size;
"

echo "Capacity analysis completed"
8.5.2. Scaling Recommendations
Auto-scaling Configuration
# hpa-recommendations.yaml
#
# autoscaling/v2 HPA for the API gateway: scales on CPU, memory and a
# custom requests-per-second pod metric, with conservative scale-down
# and aggressive scale-up behavior.
# Indentation restored — the flattened form was not valid YAML.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: dtem-api-gateway-hpa
  namespace: dtem-prod
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: api-gateway
  minReplicas: 3
  maxReplicas: 20
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
    - type: Pods
      pods:
        metric:
          name: http_requests_per_second
        target:
          type: AverageValue
          averageValue: "100"
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300  # wait 5 min of low load before shrinking
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Percent
          value: 50
          periodSeconds: 60
        - type: Pods
          value: 2
          periodSeconds: 60
      selectPolicy: Max
Próxima sección: 9. Disaster Recovery