8. Mantenimiento y Operaciones

Esta sección cubre los procedimientos de mantenimiento, actualizaciones, gestión de incidentes y operaciones diarias del sistema DTEM.

8.1. Mantenimiento Preventivo

8.1.1. Schedule de Mantenimiento

Daily Tasks

#!/bin/bash
# daily-maintenance.sh
#
# Daily maintenance for the DTEM system: health checks, log rotation,
# Redis/temp-file cleanup, DB statistics refresh and backup verification.
# Exits non-zero on the first failed critical step.

set -euo pipefail

echo "Starting daily maintenance tasks..."

# 1. Health checks — abort early if any endpoint is down.
echo "Performing health checks..."
curl -fsS http://localhost:3000/health || exit 1
curl -fsS http://localhost:3000/health/db || exit 1
curl -fsS http://localhost:3000/health/redis || exit 1

# 2. Log rotation
echo "Rotating logs..."
logrotate -f /etc/logrotate.d/dtem

# 3. Cache cleanup
# xargs -r prevents running `redis-cli del` with no arguments (an error)
# when no temp:* keys exist.
echo "Cleaning Redis cache..."
redis-cli --scan --pattern "temp:*" | xargs -r redis-cli del

# 4. Temporary file cleanup (regular files older than 1 day)
echo "Cleaning temporary files..."
find /tmp -name "dtem_*" -mtime +1 -delete
find /app/uploads/temp -type f -mtime +1 -delete

# 5. Database statistics
echo "Updating database statistics..."
psql -U postgres -d dtem_prod -c "ANALYZE;"

# 6. Backup verification — find the newest regular file by mtime
# instead of parsing `ls -t` output (breaks on unusual filenames).
echo "Verifying last backup..."
LATEST_BACKUP=$(find /backups/daily -maxdepth 1 -type f -printf '%T@ %p\n' 2>/dev/null \
    | sort -rn | awk 'NR==1 { sub(/^[^ ]+ /, ""); print }')
if [ -n "$LATEST_BACKUP" ] && [ -f "$LATEST_BACKUP" ]; then
    echo "✅ Latest backup verified: ${LATEST_BACKUP##*/}"
else
    echo "❌ No backup found!"
    exit 1
fi

echo "Daily maintenance completed successfully"

Weekly Tasks

#!/bin/bash
# weekly-maintenance.sh
#
# Weekly maintenance: security-update report, certificate expiry check,
# table bloat and index-usage analysis, and error-log review.

set -euo pipefail

echo "Starting weekly maintenance tasks..."

# 1. Security updates (report only — grep finding nothing is not an error)
echo "Checking for security updates..."
apt list --upgradable 2>/dev/null | grep -i security || true

# 2. Certificate expiry check — warn when less than 30 days remain.
echo "Checking certificate expiry..."
shopt -s nullglob   # loop runs zero times when no .crt files exist
for cert in /app/certs/*.crt; do
    expiry=$(openssl x509 -in "$cert" -noout -enddate | cut -d= -f2)
    expiry_epoch=$(date -d "$expiry" +%s)
    current_epoch=$(date +%s)
    days_until_expiry=$(( (expiry_epoch - current_epoch) / 86400 ))

    if [ "$days_until_expiry" -lt 30 ]; then
        echo "⚠️  Certificate $cert expires in $days_until_expiry days"
    fi
done
shopt -u nullglob

# 3. Performance analysis — tables with more than 10% dead tuples.
echo "Running performance analysis..."
psql -U postgres -d dtem_prod -c "
    SELECT 
        schemaname,
        tablename,
        n_tup_ins,
        n_tup_upd,
        n_tup_del,
        n_live_tup,
        n_dead_tup
    FROM pg_stat_user_tables 
    WHERE n_dead_tup > n_live_tup * 0.1;
"

# 4. Index maintenance — rarely used (<100 scans) indexes larger than 10MB.
echo "Checking index usage..."
psql -U postgres -d dtem_prod -c "
    SELECT 
        schemaname,
        tablename,
        indexname,
        idx_scan,
        pg_size_pretty(pg_relation_size(indexrelid)) as size
    FROM pg_stat_user_indexes 
    WHERE idx_scan < 100 
        AND pg_relation_size(indexrelid) > 10485760;
"

# 5. Log analysis — today's log may not exist yet; treat that as zero
# errors instead of letting grep abort the script.
echo "Analyzing error logs..."
LOG_FILE="/var/log/dtem/app.log.$(date +%Y-%m-%d)"
ERROR_COUNT=0
if [ -f "$LOG_FILE" ]; then
    ERROR_COUNT=$(grep -c "ERROR" "$LOG_FILE" || true)
fi
if [ "$ERROR_COUNT" -gt 10 ]; then
    echo "⚠️  High error count detected: $ERROR_COUNT errors"
fi

echo "Weekly maintenance completed"

Monthly Tasks

#!/bin/bash
# monthly-maintenance.sh
#
# Monthly maintenance: full backup, database vacuum, audit-log
# archiving, and the performance/security audit scripts.

set -euo pipefail

echo "Starting monthly maintenance tasks..."

# 1. Full system backup
echo "Performing full system backup..."
./scripts/backup-full-system.sh

# 2. Database vacuum
echo "Running database vacuum..."
psql -U postgres -d dtem_prod -c "VACUUM ANALYZE;"

# 3. Archive old data. Explicit transaction with ON_ERROR_STOP so the
# DELETE only runs if the archive table was created successfully —
# otherwise a failed CREATE followed by DELETE would lose data.
echo "Archiving old data..."
ARCHIVE_TABLE="audit_logs_archive_$(date +%Y_%m)"
psql -U postgres -d dtem_prod -v ON_ERROR_STOP=1 <<SQL
BEGIN;
CREATE TABLE ${ARCHIVE_TABLE} AS
SELECT * FROM audit_logs
WHERE timestamp < NOW() - INTERVAL '1 year';

DELETE FROM audit_logs
WHERE timestamp < NOW() - INTERVAL '1 year';
COMMIT;
SQL

# 4. Performance tuning
echo "Running performance tuning..."
./scripts/performance-tuning.sh

# 5. Security audit
echo "Running security audit..."
./scripts/security-audit.sh

echo "Monthly maintenance completed"

8.1.2. Automated Maintenance

Kubernetes CronJobs

# maintenance-cronjobs.yaml
# CronJobs that run the maintenance shell scripts inside the cluster.
# Scripts are mounted from a ConfigMap; logs persist on a PVC.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: daily-maintenance
  namespace: dtem-prod
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: maintenance
            image: dtem/maintenance:latest
            command: ["/bin/bash", "-c", "./scripts/daily-maintenance.sh"]
            env:
            # Service DNS names resolved inside the dtem-prod namespace
            - name: DB_HOST
              value: "postgres-service"
            - name: REDIS_HOST
              value: "redis-service"
            volumeMounts:
            - name: scripts
              mountPath: /app/scripts
            - name: logs
              mountPath: /var/log/dtem
          volumes:
          - name: scripts
            configMap:
              name: maintenance-scripts
          - name: logs
            persistentVolumeClaim:
              claimName: logs-pvc
          # Re-run the pod if the maintenance script exits non-zero
          restartPolicy: OnFailure

---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: weekly-maintenance
  namespace: dtem-prod
spec:
  schedule: "0 3 * * 0"  # Weekly on Sunday at 3 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: maintenance
            image: dtem/maintenance:latest
            command: ["/bin/bash", "-c", "./scripts/weekly-maintenance.sh"]
            env:
            - name: DB_HOST
              value: "postgres-service"
            volumeMounts:
            - name: scripts
              mountPath: /app/scripts
          volumes:
          - name: scripts
            configMap:
              name: maintenance-scripts
          restartPolicy: OnFailure

8.2. Gestión de Actualizaciones

8.2.1. Update Process

Update Workflow

flowchart TD
    A[Update Available] --> B[Schedule Maintenance Window]
    B --> C[Create Backup]
    C --> D[Deploy to Staging]
    D --> E[Run Tests]
    E --> F{Tests Pass?}
    F -->|Yes| G[Deploy to Production]
    F -->|No| H[Rollback Staging]
    H --> I[Fix Issues]
    I --> D
    G --> J[Monitor Health]
    J --> K{Health OK?}
    K -->|Yes| L[Update Complete]
    K -->|No| M[Rollback Production]
    M --> N[Investigate Issues]

Automated Update Script

#!/bin/bash
# update-system.sh
#
# Orchestrates a full DTEM production update: pre-checks, backup,
# maintenance mode, rolling update, smoke tests and automatic rollback.
# Usage: ./update-system.sh [version]   (defaults to "latest")

set -euo pipefail

# Configuration (readonly: nothing below should mutate these)
NEW_VERSION="${1:-latest}"
readonly NEW_VERSION
BACKUP_DIR="/backups/pre-update-$(date +%Y%m%d_%H%M%S)"
readonly BACKUP_DIR
readonly MAINTENANCE_MODE_FILE="/tmp/maintenance.mode"

# Colors for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

# Informational message to stdout.
log() {
    echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}"
}

# Non-fatal warning; written to stderr so it is not mixed with data output.
warn() {
    echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}" >&2
}

# Fatal error: written to stderr, then aborts the script with status 1.
error() {
    echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" >&2
    exit 1
}

# Pre-update checks: disk, memory and DB connection pressure.
# Aborts (via error) when the host is not in a safe state to update.
pre_update_checks() {
    log "Performing pre-update checks..."
    
    # Check disk space on the root filesystem
    local disk_usage
    disk_usage=$(df / | awk 'NR==2 {print $5}' | tr -d '%')
    if [ "$disk_usage" -gt 80 ]; then
        error "Disk usage is ${disk_usage}%. Please free up space before updating."
    fi
    
    # Check available memory ($7 of `free -m` row 2 = "available";
    # assumes procps-ng output layout — TODO confirm on target hosts)
    local available_mem
    available_mem=$(free -m | awk 'NR==2{printf "%.0f", $7}')
    if [ "$available_mem" -lt 1024 ]; then
        error "Available memory is ${available_mem}MB. Minimum 1GB required."
    fi
    
    # Check database connections; strip psql -t's padding whitespace so
    # the numeric comparison always receives a clean integer.
    local db_connections
    db_connections=$(psql -U postgres -d dtem_prod -t -c "SELECT count(*) FROM pg_stat_activity;" | tr -d '[:space:]')
    if [ "$db_connections" -gt 50 ]; then
        warn "High database connections: $db_connections"
    fi
    
    log "Pre-update checks completed"
}

# Create a pre-update backup of the database and key config directories.
create_backup() {
    log "Creating backup in $BACKUP_DIR..."
    mkdir -p "$BACKUP_DIR"
    
    # Database backup (custom format, restorable with pg_restore)
    pg_dump -U postgres -d dtem_prod -Fc > "$BACKUP_DIR/database.dump"
    
    # Configuration backup
    cp -r /app/config "$BACKUP_DIR/"
    cp -r /app/certs "$BACKUP_DIR/"
    
    # Verify backup: the redirect above creates the file even when
    # pg_dump fails partway, so require it to be non-empty (-s) rather
    # than merely present (-f).
    if [ -s "$BACKUP_DIR/database.dump" ]; then
        log "Backup created successfully"
    else
        error "Backup creation failed"
    fi
}

# Enable maintenance mode
# Marks the system as under maintenance and takes the user-facing
# frontend offline so no traffic arrives during the update.
enable_maintenance_mode() {
    log "Enabling maintenance mode..."
    # Marker file. NOTE(review): its consumers are not visible in this
    # script — confirm what reads /tmp/maintenance.mode.
    touch "$MAINTENANCE_MODE_FILE"
    
    # Scale down frontend
    kubectl scale deployment web-frontend --replicas=0 -n dtem-prod
    
    # Wait for pods to terminate
    kubectl wait --for=delete pod -l app=web-frontend -n dtem-prod --timeout=300s
    
    log "Maintenance mode enabled"
}

# Update all application containers to $NEW_VERSION and wait for the
# rollouts to finish (each up to 10 minutes).
update_application() {
    log "Updating application to version $NEW_VERSION..."
    
    local component
    # Pre-pull images so the rollout does not stall on slow registries.
    for component in api-gateway dte-service web-frontend; do
        docker pull "dtem/${component}:${NEW_VERSION}"
    done
    
    # Update deployments (note: the web-frontend container is named "web")
    kubectl set image deployment/api-gateway "api-gateway=dtem/api-gateway:${NEW_VERSION}" -n dtem-prod
    kubectl set image deployment/dte-service "dte-service=dtem/dte-service:${NEW_VERSION}" -n dtem-prod
    kubectl set image deployment/web-frontend "web=dtem/web-frontend:${NEW_VERSION}" -n dtem-prod
    
    # Wait for each rollout to complete before declaring success
    for component in api-gateway dte-service web-frontend; do
        kubectl rollout status "deployment/${component}" -n dtem-prod --timeout=600s
    done
    
    log "Application updated successfully"
}

# Post-update smoke tests: API health endpoint (with retries), then
# database and Redis connectivity. Any failure aborts via error(),
# which triggers the rollback trap installed in main().
post_update_tests() {
    log "Running post-update tests..."
    
    # Health checks — up to 30 attempts, 10s apart (~5 minutes total).
    # -fsS: fail on HTTP errors, silence the progress bar but keep errors.
    local attempt
    for attempt in {1..30}; do
        if curl -fsS http://api.dtem.empresa.cl/health > /dev/null; then
            log "Health check passed"
            break
        fi
        
        if [ "$attempt" -eq 30 ]; then
            error "Health check failed after 30 attempts"
        fi
        
        sleep 10
    done
    
    # Database connectivity
    if psql -U postgres -d dtem_prod -c "SELECT 1;" > /dev/null 2>&1; then
        log "Database connectivity test passed"
    else
        error "Database connectivity test failed"
    fi
    
    # Redis connectivity
    if redis-cli ping > /dev/null 2>&1; then
        log "Redis connectivity test passed"
    else
        error "Redis connectivity test failed"
    fi
    
    log "Post-update tests completed"
}

# Disable maintenance mode
# Removes the maintenance marker and restores frontend capacity.
disable_maintenance_mode() {
    log "Disabling maintenance mode..."
    rm -f "$MAINTENANCE_MODE_FILE"
    
    # Scale up frontend
    # NOTE(review): replica count is hardcoded to 3 — confirm it matches
    # the deployment's normal scale / HPA minimum.
    kubectl scale deployment web-frontend --replicas=3 -n dtem-prod
    
    # Wait for pods to be ready
    kubectl wait --for=condition=ready pod -l app=web-frontend -n dtem-prod --timeout=300s
    
    log "Maintenance mode disabled"
}

# Rollback function
# Invoked by the ERR trap installed in main(): restores the pre-update
# database dump, reverts the deployments, re-opens traffic and then
# exits non-zero via error().
rollback() {
    warn "Rolling back update..."
    
    # Restore database from the dump taken by create_backup
    pg_restore -U postgres -d dtem_prod --clean --if-exists "$BACKUP_DIR/database.dump"
    
    # Restore previous image versions
    # NOTE(review): assumes a "previous" tag exists in the registry —
    # confirm; `kubectl rollout undo` may be a safer alternative.
    kubectl set image deployment/api-gateway api-gateway=dtem/api-gateway:previous -n dtem-prod
    kubectl set image deployment/dte-service dte-service=dtem/dte-service:previous -n dtem-prod
    kubectl set image deployment/web-frontend web=dtem/web-frontend:previous -n dtem-prod
    
    # Wait for rollout
    kubectl rollout status deployment/api-gateway -n dtem-prod --timeout=600s
    kubectl rollout status deployment/dte-service -n dtem-prod --timeout=600s
    kubectl rollout status deployment/web-frontend -n dtem-prod --timeout=600s
    
    disable_maintenance_mode
    
    # error() exits 1, marking the overall update run as failed
    error "Update rolled back due to errors"
}

# Main execution: run the update pipeline with automatic rollback on
# any failure, then clean up the pre-update backup on success.
main() {
    log "Starting system update to version $NEW_VERSION"
    
    # Any failure from here on triggers an automatic rollback.
    trap rollback ERR
    
    pre_update_checks
    create_backup
    enable_maintenance_mode
    update_application
    post_update_tests
    disable_maintenance_mode
    
    # Update succeeded: clear the rollback trap so a failure in the
    # cleanup below cannot trigger a spurious rollback.
    trap - ERR
    
    log "System update completed successfully"
    
    # Cleanup: the pre-update backup is only needed for rollback.
    rm -rf "$BACKUP_DIR"
}

# Execute main function
main "$@"

8.2.2. Blue-Green Deployment

Blue-Green Strategy

# blue-green-deployment.yaml
# Argo Rollouts blue/green strategy for the API gateway: the new
# version is deployed behind the preview service, gated by analysis,
# and promoted manually.
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: dtem-api-gateway
  namespace: dtem-prod
spec:
  replicas: 5
  strategy:
    blueGreen:
      # Live-traffic service vs. candidate (green) service
      activeService: api-gateway-active
      previewService: api-gateway-preview
      # Require a human to promote; old pods drain 30s after promotion
      autoPromotionEnabled: false
      scaleDownDelaySeconds: 30
      # Success-rate analysis runs both before and after promotion
      prePromotionAnalysis:
        templates:
        - templateName: success-rate
        args:
        - name: service-name
          value: api-gateway-preview
      postPromotionAnalysis:
        templates:
        - templateName: success-rate
        args:
        - name: service-name
          value: api-gateway-active
  selector:
    matchLabels:
      app: api-gateway
  template:
    metadata:
      labels:
        app: api-gateway
    spec:
      containers:
      - name: api-gateway
        # Image tag is templated by Helm at render time
        image: dtem/api-gateway:{{ .Values.image.tag }}
        ports:
        - containerPort: 3000
        livenessProbe:
          httpGet:
            path: /health
            port: 3000
          initialDelaySeconds: 30
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /ready
            port: 3000
          initialDelaySeconds: 5
          periodSeconds: 5
        resources:
          requests:
            memory: "256Mi"
            cpu: "250m"
          limits:
            memory: "512Mi"
            cpu: "500m"

8.3. Gestión de Incidentes

8.3.1. Incident Response Process

Incident Classification

| Severity | Response Time | Escalation | Impact |
|---------------|---------------|------------|--------------------------------|
| P0 - Critical | 15 minutes | Immediate | System down, data loss |
| P1 - High | 1 hour | 4 hours | Major functionality affected |
| P2 - Medium | 4 hours | 24 hours | Partial functionality affected |
| P3 - Low | 24 hours | 72 hours | Minor issues, cosmetic |

Incident Response Workflow

flowchart TD
    A[Incident Detected] --> B{Triage & Classify}
    B --> C[Create Incident Ticket]
    C --> D[Notify On-call Team]
    D --> E[Investigate & Diagnose]
    E --> F[Implement Fix]
    F --> G[Monitor Resolution]
    G --> H{Resolved?}
    H -->|Yes| I[Post-Incident Review]
    H -->|No| E
    I --> J[Documentation]
    J --> K[Close Incident]

Incident Management Script

#!/bin/bash
# incident-response.sh
#
# Bootstraps an incident: records metadata, notifies the team on Slack
# and opens an interactive war-room session.
# Usage: ./incident-response.sh <incident-id> <severity> <description>

set -euo pipefail

INCIDENT_ID="${1:?usage: incident-response.sh <incident-id> <severity> <description>}"
SEVERITY="${2:?severity is required (P0-P3)}"
DESCRIPTION="${3:?description is required}"

# Create incident directory (one folder per day, one per incident)
INCIDENT_DIR="/incidents/$(date +%Y-%m-%d)/$INCIDENT_ID"
mkdir -p "$INCIDENT_DIR"

# Create incident log
cat > "$INCIDENT_DIR/incident.log" << EOF
Incident ID: $INCIDENT_ID
Severity: $SEVERITY
Description: $DESCRIPTION
Start Time: $(date)
Status: Open
EOF

# Post a message to Slack. Fix: the original ignored its channel
# argument; it is now included in the webhook payload. A missing
# webhook URL or a failed POST must not abort incident bootstrap.
send_notification() {
    local message="$1"
    local channel="$2"
    
    if [ -z "${SLACK_WEBHOOK_URL:-}" ]; then
        echo "SLACK_WEBHOOK_URL not set; skipping notification" >&2
        return 0
    fi
    
    curl -X POST -H 'Content-type: application/json' \
        --data "{\"channel\":\"$channel\",\"text\":\"$message\"}" \
        "$SLACK_WEBHOOK_URL" || echo "notification failed" >&2
}

# Send initial notification
send_notification "🚨 Incident $INCIDENT_ID ($SEVERITY): $DESCRIPTION" "#incidents"

# Start incident response session: dedicated namespace, debug pod and
# a recorded terminal session.
start_incident_session() {
    echo "Starting incident response session for $INCIDENT_ID"
    
    # Create a war-room namespace (idempotent via dry-run | apply).
    # NOTE(review): namespace names must be lowercase RFC 1123 — confirm
    # the incident ID format satisfies this.
    kubectl create namespace "incident-$INCIDENT_ID" --dry-run=client -o yaml | kubectl apply -f -
    
    # Deploy debugging tools (interactive: blocks until the shell exits)
    kubectl run debug-pod --image=nicolaka/netshoot --rm -it --namespace="incident-$INCIDENT_ID" -- bash
    
    # Record everything typed from here on into the session log
    script -a "$INCIDENT_DIR/session.log"
}

# Generate the final incident report from the artifacts collected in
# $INCIDENT_DIR. Missing sections become "(pending)" placeholders
# instead of aborting the whole report.
generate_report() {
    _section() { [ -f "$INCIDENT_DIR/$1" ] && cat "$INCIDENT_DIR/$1" || echo "(pending)"; }
    
    cat > "$INCIDENT_DIR/report.md" << EOF
# Incident Report: $INCIDENT_ID

## Summary
- **ID**: $INCIDENT_ID
- **Severity**: $SEVERITY
- **Start Time**: $(date)
- **Description**: $DESCRIPTION

## Timeline
$(_section timeline.log)

## Root Cause Analysis
$(_section root-cause.md)

## Resolution
$(_section resolution.md)

## Lessons Learned
$(_section lessons-learned.md)

## Action Items
$(_section action-items.md)
EOF
}

start_incident_session

8.3.2. Post-Mortem Process

Post-Mortem Template

# Post-Mortem: Incident [INCIDENT_ID]

## Executive Summary
[Brief summary of the incident and its impact]

## Timeline
- **HH:MM**: Incident detected
- **HH:MM**: Investigation started
- **HH:MM**: Root cause identified
- **HH:MM**: Fix implemented
- **HH:MM**: Service restored
- **HH:MM**: Incident resolved

## Impact Assessment
- **Affected Services**: [List of services]
- **Duration**: [Total downtime]
- **Users Affected**: [Number of users]
- **Business Impact**: [Financial/operational impact]

## Root Cause Analysis
### What happened?
[Detailed description of the incident]

### Why did it happen?
[Root cause analysis]

### Contributing factors
[Factors that contributed to the incident]

## Resolution
### Immediate Actions
[Actions taken to resolve the incident]

### Permanent Fixes
[Long-term fixes implemented]

## Prevention Measures
### Short-term (1-4 weeks)
- [ ] Action item 1
- [ ] Action item 2

### Long-term (1-6 months)
- [ ] Action item 3
- [ ] Action item 4

## Lessons Learned
### What went well
- [Positive aspects of the response]

### What could be improved
- [Areas for improvement]

### Action Items
| Item | Owner | Due Date | Status |
|------|-------|----------|--------|
| [Action 1] | [Owner] | [Date] | [Status] |
| [Action 2] | [Owner] | [Date] | [Status] |

8.4. Performance Tuning

8.4.1. Database Performance

Performance Optimization Script

#!/bin/bash
# db-performance-tuning.sh
#
# Database performance pass: report slow queries, report candidate
# columns for indexing, refresh planner statistics, rebuild hot indexes
# on bloated tables, and print configuration recommendations.

set -euo pipefail

PSQL=(psql -U postgres -d dtem_prod)

echo "Starting database performance tuning..."

# Analyze slow queries (requires the pg_stat_statements extension).
# NOTE(review): on PostgreSQL >= 13 these columns are named
# total_exec_time / mean_exec_time — confirm the server version.
echo "Analyzing slow queries..."
"${PSQL[@]}" -c "
    SELECT 
        query,
        calls,
        total_time,
        mean_time,
        rows
    FROM pg_stat_statements 
    WHERE mean_time > 1000 
    ORDER BY mean_time DESC 
    LIMIT 10;
"

# Report high-cardinality columns as potential indexing candidates.
echo "Checking for missing indexes..."
"${PSQL[@]}" -c "
    SELECT 
        schemaname,
        tablename,
        attname,
        n_distinct,
        correlation
    FROM pg_stats 
    WHERE schemaname = 'public'
        AND n_distinct > 100
    ORDER BY n_distinct DESC;
"

# Update table statistics
echo "Updating table statistics..."
"${PSQL[@]}" -c "ANALYZE;"

# Rebuild indexes that are heavily scanned (>1000 scans), larger than
# 100MB and live on bloated tables (>10% dead tuples).
# -At -F'|' yields unaligned, machine-parseable rows; the original
# piped psql's aligned, header-decorated table into `read`, which
# produced garbage fields and ran REINDEX on non-identifiers.
echo "Rebuilding fragmented indexes..."
"${PSQL[@]}" -At -F'|' -c "
    SELECT 
        schemaname,
        indexname,
        pg_size_pretty(pg_relation_size(indexrelid)) as size
    FROM pg_stat_user_indexes 
    WHERE idx_scan > 1000 
        AND pg_relation_size(indexrelid) > 104857600
        AND (schemaname, tablename) IN (
            SELECT schemaname, tablename 
            FROM pg_stat_user_tables 
            WHERE n_dead_tup > n_live_tup * 0.1
        );
" | while IFS='|' read -r schema index size; do
    echo "Rebuilding index $schema.$index ($size)"
    # Quote identifiers; CONCURRENTLY avoids blocking writes
    "${PSQL[@]}" -c "REINDEX INDEX CONCURRENTLY \"$schema\".\"$index\";"
done

# Print configuration optimization guidance.
echo "Checking configuration optimization..."
cat << EOF
Recommended PostgreSQL configuration changes:

shared_buffers = 25% of RAM
effective_cache_size = 75% of RAM
work_mem = 4MB per connection
maintenance_work_mem = 10% of RAM
checkpoint_completion_target = 0.9
wal_buffers = 16MB
default_statistics_target = 100
random_page_cost = 1.1
effective_io_concurrency = 200
EOF

echo "Database performance tuning completed"

8.4.2. Application Performance

Application Optimization

// Performance monitoring middleware
// Measures the wall-clock duration of every request, warns about
// requests slower than one second and records a histogram observation
// labelled by method, route and status code (value in seconds).
const performanceMonitor = (req, res, next) => {
    const startedAt = process.hrtime.bigint();
    
    res.on('finish', () => {
        const elapsedNs = Number(process.hrtime.bigint() - startedAt);
        const duration = elapsedNs / 1000000; // Convert to milliseconds
        
        // Flag anything that took longer than one second
        if (duration > 1000) {
            console.warn(`Slow request: ${req.method} ${req.path} - ${duration}ms`);
        }
        
        // Record metrics (histogram expects seconds, hence / 1000)
        const labels = {
            method: req.method,
            route: req.route?.path || req.path,
            status_code: res.statusCode.toString()
        };
        httpRequestDuration.observe(labels, duration / 1000);
    });
    
    next();
};

// Database query optimization
// Builds a parameterized (SQL-injection-safe) listing query for DTE
// documents, adding only the WHERE clauses for filters actually present.
class QueryOptimizer {
    /**
     * Fetch DTE documents matching the given filters, newest first.
     *
     * @param {Object} filters - optional criteria:
     *   companyId, documentType, startDate / endDate (inclusive bounds
     *   on fecha_emision), limit (default 50), offset (default 0).
     * @returns {Promise<Array>} matching rows from dte_documents.
     */
    static async optimizedDTEQuery(filters) {
        // Base query; `WHERE 1=1` lets every filter append with `AND`.
        let query = `
            SELECT 
                d.id,
                d.document_type,
                d.folio,
                d.rut_receptor,
                d.fecha_emision,
                d.monto_total,
                d.status,
                d.created_at
            FROM dte_documents d
            WHERE 1=1
        `;
        
        const params = [];
        let paramIndex = 1; // pg placeholders are 1-based ($1, $2, ...)
        
        // Add filters efficiently
        if (filters.companyId) {
            query += ` AND d.company_id = $${paramIndex++}`;
            params.push(filters.companyId);
        }
        
        if (filters.documentType) {
            query += ` AND d.document_type = $${paramIndex++}`;
            params.push(filters.documentType);
        }
        
        if (filters.startDate) {
            query += ` AND d.fecha_emision >= $${paramIndex++}`;
            params.push(filters.startDate);
        }
        
        if (filters.endDate) {
            query += ` AND d.fecha_emision <= $${paramIndex++}`;
            params.push(filters.endDate);
        }
        
        // Add pagination
        query += ` ORDER BY d.created_at DESC LIMIT $${paramIndex++} OFFSET $${paramIndex++}`;
        params.push(filters.limit || 50, filters.offset || 0);
        
        // Execute the query. NOTE(review): despite the original comment
        // ("with timeout"), no statement timeout is applied here —
        // `pool` is presumably a pg connection pool; confirm a timeout
        // is configured at the pool or server level.
        const result = await pool.query(query, params);
        return result.rows;
    }
}

// Cache optimization
// Thin JSON-serializing wrapper around a Redis client. Every operation
// swallows Redis errors (logging them) so a cache outage never breaks
// the request path.
class CacheManager {
    constructor(redisClient) {
        this.redis = redisClient;
        this.defaultTTL = 300; // 5 minutes
    }
    
    // Fetch and deserialize a cached value; null on miss or error.
    async get(key) {
        try {
            const raw = await this.redis.get(key);
            if (!raw) {
                return null;
            }
            return JSON.parse(raw);
        } catch (error) {
            console.error('Cache get error:', error);
            return null;
        }
    }
    
    // Serialize and store a value with a TTL in seconds.
    async set(key, value, ttl = this.defaultTTL) {
        try {
            const payload = JSON.stringify(value);
            await this.redis.setex(key, ttl, payload);
        } catch (error) {
            console.error('Cache set error:', error);
        }
    }
    
    // Delete every key matching a glob pattern.
    // NOTE(review): KEYS blocks Redis on large keyspaces — consider
    // SCAN-based iteration in production.
    async invalidate(pattern) {
        try {
            const matched = await this.redis.keys(pattern);
            if (matched.length > 0) {
                await this.redis.del(...matched);
            }
        } catch (error) {
            console.error('Cache invalidate error:', error);
        }
    }
}

8.5. Capacity Planning

8.5.1. Resource Monitoring

Capacity Analysis Script

#!/bin/bash
# capacity-analysis.sh
#
# Point-in-time capacity snapshot: CPU, memory, disk, database sizes,
# DTE growth trend and connection counts.

set -euo pipefail

echo "Starting capacity analysis..."

# CPU usage analysis
echo "=== CPU Usage Analysis ==="
# Derive usage as 100 - idle. The original `sed 's/%us,//'` never
# matched awk's numeric $2 field. Field layout of top varies between
# procps versions — TODO confirm on the target hosts.
CPU_USAGE=$(top -bn1 | awk -F'[ ,]+' '/Cpu\(s\)/ { for (i = 1; i < NF; i++) if ($(i + 1) ~ /^id/) { printf "%.1f", 100 - $i; exit } }')
echo "Current CPU usage: ${CPU_USAGE}%"

# Memory usage analysis
echo "=== Memory Usage Analysis ==="
read -r _ TOTAL_MEM USED_MEM _ < <(free -h | grep "Mem:")
echo "Memory usage: $USED_MEM / $TOTAL_MEM"

# Disk usage analysis (the original while-loop only re-echoed each
# line; print the filtered output directly; no /dev mounts is not an error)
echo "=== Disk Usage Analysis ==="
df -h | grep -E "^/dev/" || true

# Database size analysis
echo "=== Database Size Analysis ==="
psql -U postgres -d dtem_prod -c "
    SELECT 
        pg_size_pretty(pg_database_size('dtem_prod')) as database_size,
        (SELECT count(*) FROM users) as users_count,
        (SELECT count(*) FROM dte_documents) as dte_count,
        (SELECT count(*) FROM cafs) as caf_count;
"

# Growth trends (new DTEs per day, last week of a 30-day window)
echo "=== Growth Trends (Last 30 Days) ==="
psql -U postgres -d dtem_prod -c "
    SELECT 
        DATE_TRUNC('day', created_at) as date,
        COUNT(*) as new_dtes
    FROM dte_documents 
    WHERE created_at >= NOW() - INTERVAL '30 days'
    GROUP BY DATE_TRUNC('day', created_at)
    ORDER BY date DESC
    LIMIT 7;
"

# Performance metrics
echo "=== Performance Metrics ==="
psql -U postgres -d dtem_prod -c "
    SELECT 
        (SELECT count(*) FROM pg_stat_activity WHERE state = 'active') as active_connections,
        (SELECT count(*) FROM pg_stat_activity) as total_connections,
        (SELECT pg_size_pretty(pg_database_size('dtem_prod'))) as db_size;
"

echo "Capacity analysis completed"

8.5.2. Scaling Recommendations

Auto-scaling Configuration

# hpa-recommendations.yaml
# Horizontal Pod Autoscaler for the API gateway: scales on CPU, memory
# and a custom requests-per-second metric.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: dtem-api-gateway-hpa
  namespace: dtem-prod
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: api-gateway
  minReplicas: 3
  maxReplicas: 20
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  - type: Resource
    resource:
      name: memory
      target:
        type: Utilization
        averageUtilization: 80
  # Custom pod metric — requires a metrics adapter (e.g. Prometheus
  # adapter) exposing http_requests_per_second per pod.
  - type: Pods
    pods:
      metric:
        name: http_requests_per_second
      target:
        type: AverageValue
        averageValue: "100"
  behavior:
    # Scale down conservatively: at most 10%/min, after 5 min of stability
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
      - type: Percent
        value: 10
        periodSeconds: 60
    # Scale up aggressively: the larger of +50% or +2 pods per minute
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
      - type: Percent
        value: 50
        periodSeconds: 60
      - type: Pods
        value: 2
        periodSeconds: 60
      selectPolicy: Max

Próxima sección: 9. Disaster Recovery