9. Disaster Recovery
Esta sección cubre las estrategias de recuperación ante desastres, incluyendo RTO/RPO, planes de recuperación, testing y comunicación durante crisis.
9.1. Estrategia de Disaster Recovery
9.1.1. Definiciones y Objetivos
RTO/RPO por Componente
| Componente | RTO (Recovery Time Objective) | RPO (Recovery Point Objective) | Estrategia |
|---|---|---|---|
| Base de Datos Principal | 1 hora | 15 minutos | Hot Standby con replicación síncrona |
| Aplicaciones Web | 30 minutos | 0 minutos | Multi-region deployment |
| Cola de Mensajes | 15 minutos | 0 minutos | Multi-AZ con replicación |
| Cache (Redis) | 30 minutos | 1 hora | Backup diario + rebuild |
| Almacenamiento de Archivos | 2 horas | 1 hora | Cross-region replication |
| Configuración y Secretos | 15 minutos | 0 minutos | GitOps con múltiples réplicas |
Clasificación de Desastres
| Tipo | Descripción | Probabilidad | Impacto |
|---|---|---|---|
| Nivel 1 - Menor | Fallo de componente individual | Alta | Bajo |
| Nivel 2 - Mayor | Fallo de AZ completa | Media | Medio |
| Nivel 3 - Crítico | Fallo de región completa | Baja | Alto |
| Nivel 4 - Catastrófico | Múltiples regiones afectadas | Muy Baja | Crítico |
9.1.2. Arquitectura de Alta Disponibilidad
Multi-Region Architecture
┌─────────────────────────────────────────────────────────────┐
│ PRIMARY REGION │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Web App │ │ API GW │ │ Services │ │
│ │ (3x AZ) │ │ (3x AZ) │ │ (3x AZ) │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
│ │ │ │ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ DATABASE CLUSTER │ │
│ │ ┌─────────────┬─────────────┬─────────────┐ │ │
│ │ │ Primary │ Replica 1 │ Replica 2 │ │ │
│ │ │ (AZ-A) │ (AZ-B) │ (AZ-C) │ │ │
│ │ └─────────────┴─────────────┴─────────────┘ │ │
│ └─────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────┘
│
┌─────────────────┐
│ Replication │
│ (Async) │
└─────────────────┘
│
┌─────────────────────────────────────────────────────────────┐
│ DR REGION │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Web App │ │ API GW │ │ Services │ │
│ │ (Standby) │ │ (Standby) │ │ (Standby) │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
│ │ │ │ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ DATABASE CLUSTER │ │
│ │ ┌─────────────┬─────────────┬─────────────┐ │ │
│ │ │ Replica 1 │ Replica 2 │ Replica 3 │ │ │
│ │ │ (AZ-D) │ (AZ-E) │ (AZ-F) │ │ │
│ │ └─────────────┴─────────────┴─────────────┘ │ │
│ └─────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────┘
9.2. Plan de Recuperación
9.2.1. Procedimientos de Recuperación
Database Recovery Procedure
#!/bin/bash
# database-recovery.sh
#
# Settings shared by the database recovery steps defined further down.
# Positional arguments read here:
#   $1  point in time to recover to; when omitted, the current local time
#       formatted as "YYYY-MM-DD HH:MM:SS" is used.
set -euo pipefail

# --- Static settings ---
PRIMARY_REGION='us-east-1'
DR_REGION='us-west-2'
DB_INSTANCE='dtem-postgres'
BACKUP_BUCKET='dtem-backups'

# First CLI argument wins over "now".
RECOVERY_TIME=${1:-$(date '+%Y-%m-%d %H:%M:%S')}

# --- Colour escape sequences (stored literally, expanded by `echo -e`) ---
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
# Print "$1" on stdout as a green line prefixed with the current timestamp.
log() {
  local now
  now=$(date +'%Y-%m-%d %H:%M:%S')
  echo -e "${GREEN}[${now}] $1${NC}"
}
# Print "$1" on stdout as a yellow, timestamped "WARNING:" line.
warn() {
  local now
  now=$(date +'%Y-%m-%d %H:%M:%S')
  echo -e "${YELLOW}[${now}] WARNING: $1${NC}"
}
# Print "$1" as a red, timestamped "ERROR:" line and terminate the script
# with exit status 1.
# The message goes to stderr so that it is not mixed into captured stdout
# (e.g. inside a command substitution) and stays visible when stdout is
# redirected.
error() {
  echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" >&2
  exit 1
}
# Step 1: Assess situation
# Probes both regions with `aws rds describe-db-clusters`.
#   - Primary reachable   -> warning only (recovery still continues).
#   - Primary unreachable -> informational log.
#   - DR unreachable      -> fatal, via error().
# Globals read: PRIMARY_REGION, DR_REGION, DB_INSTANCE
# NOTE(review): the probes use the *cluster* API while the later steps use
# *instance* APIs (promote-read-replica, --db-instance-identifier) — confirm
# that $DB_INSTANCE really names a DB cluster in both regions.
assess_situation() {
  log "Assessing disaster situation..."

  # Primary region probe
  if ! aws rds describe-db-clusters --region "$PRIMARY_REGION" --db-cluster-identifier "$DB_INSTANCE" >/dev/null 2>&1; then
    log "Primary region not accessible - proceeding with DR activation"
  else
    warn "Primary region still accessible"
  fi

  # DR region probe; error() terminates the script when it fails
  aws rds describe-db-clusters --region "$DR_REGION" --db-cluster-identifier "$DB_INSTANCE" >/dev/null 2>&1 \
    || error "DR region not accessible"
  log "DR region accessible"
}
# Step 2: Promote DR database
# Promotes the read replica "${DB_INSTANCE}-dr-replica" in the DR region to a
# standalone instance, waits for it to become available, then applies the DNS
# change batch in dns-update-dr.json.
# Globals read: DR_REGION, DB_INSTANCE, HOSTED_ZONE_ID
# Exits through error() when HOSTED_ZONE_ID is missing.
promote_dr_database() {
  log "Promoting DR database to primary..."

  # Promotion cannot be undone, so make sure the DNS step that follows can
  # actually run before touching the replica.
  [ -n "${HOSTED_ZONE_ID:-}" ] \
    || error "HOSTED_ZONE_ID must be set before promoting the DR database"

  # Promote read replica to primary.
  # promote-read-replica does not accept --apply-immediately; passing it
  # makes the CLI reject the command, so it is not used here.
  aws rds promote-read-replica \
    --region "$DR_REGION" \
    --db-instance-identifier "${DB_INSTANCE}-dr-replica" \
    --backup-retention-period 7

  # Wait for promotion to complete
  # NOTE(review): this waiter may return immediately if the instance still
  # reports "available" right after the promote call — confirm whether an
  # extra status check is needed.
  log "Waiting for database promotion to complete..."
  aws rds wait db-instance-available \
    --region "$DR_REGION" \
    --db-instance-identifier "${DB_INSTANCE}-dr-replica"

  # Update DNS to point to DR database
  log "Updating DNS to point to DR database..."
  aws route53 change-resource-record-sets \
    --hosted-zone-id "$HOSTED_ZONE_ID" \
    --change-batch file://dns-update-dr.json

  log "DR database promotion completed"
}
# Step 3: Restore from backup if needed
# Restores "${DB_INSTANCE}-restored" in the DR region from the newest snapshot
# of $DB_INSTANCE taken before the given time.
# Arguments: $1 - cutoff time, e.g. "2024-01-31 12:00:00"
# Globals read:    DR_REGION, DB_INSTANCE
# Globals written: BACKUP_ID (identifier of the chosen snapshot)
# Exits through error() when no snapshot qualifies.
# NOTE(review): this restores a whole snapshot; it is not an RDS
# point-in-time restore even though the log message says "point-in-time".
restore_from_backup() {
  local target_time="$1"
  # Snapshot times are compared as strings in the query below. They use a
  # "T" between date and time, so a cutoff written with a space would sort
  # before every snapshot of the same day.
  # NOTE(review): the default cutoff is local time — confirm the time zone
  # matches the SnapshotTime values returned by the API.
  local cutoff="${target_time/ /T}"

  log "Restoring database to point-in-time: $target_time"

  # Find latest backup before target time. The result set is sorted
  # explicitly because the API does not promise chronological order.
  BACKUP_ID=$(aws rds describe-db-snapshots \
    --region "$DR_REGION" \
    --db-instance-identifier "$DB_INSTANCE" \
    --query "sort_by(DBSnapshots[?SnapshotTime<'${cutoff}'], &SnapshotTime)[-1].DBSnapshotIdentifier" \
    --output text)

  # With --output text an empty result is printed as the literal "None".
  if [ -z "$BACKUP_ID" ] || [ "$BACKUP_ID" = "None" ]; then
    error "No backup found before $target_time"
  fi
  log "Using backup: $BACKUP_ID"

  # Restore from backup
  # NOTE(review): --publicly-accessible exposes the restored production
  # database on a public endpoint — confirm this is intended.
  aws rds restore-db-instance-from-db-snapshot \
    --region "$DR_REGION" \
    --db-instance-identifier "${DB_INSTANCE}-restored" \
    --db-snapshot-identifier "$BACKUP_ID" \
    --db-instance-class db.m5.large \
    --multi-az \
    --publicly-accessible \
    --no-copy-tags-to-snapshot

  # Wait for restore to complete
  log "Waiting for database restore to complete..."
  aws rds wait db-instance-available \
    --region "$DR_REGION" \
    --db-instance-identifier "${DB_INSTANCE}-restored"

  log "Database restore completed"
}
# Step 4: Update application configuration
# Re-creates the db-credentials secret in namespace dtem-prod with the
# endpoint of the recovered database and restarts the deployments using it.
# Arguments: $1 - (optional) identifier of the recovered DB instance. When
#                 omitted, "${DB_INSTANCE}-restored" is tried first and
#                 "${DB_INSTANCE}-dr-replica" second, so both the restore and
#                 the promote path end up with a host that exists.
# Globals read: DR_REGION, DB_INSTANCE, DB_PASSWORD
# Exits through error() when DB_PASSWORD is missing or no endpoint resolves.
update_application_config() {
  local requested_id="${1:-}"
  local -a candidates
  local candidate
  local db_host=""

  log "Updating application configuration..."

  [ -n "${DB_PASSWORD:-}" ] \
    || error "DB_PASSWORD must be set to update the db-credentials secret"

  if [ -n "$requested_id" ]; then
    candidates=("$requested_id")
  else
    # NOTE(review): a leftover "-restored" instance from an earlier run would
    # win here; pass the identifier explicitly to avoid the ambiguity.
    candidates=("${DB_INSTANCE}-restored" "${DB_INSTANCE}-dr-replica")
  fi

  # Ask RDS for the real endpoint instead of guessing the hostname from the
  # instance identifier and region.
  for candidate in "${candidates[@]}"; do
    db_host=$(aws rds describe-db-instances \
      --region "$DR_REGION" \
      --db-instance-identifier "$candidate" \
      --query 'DBInstances[0].Endpoint.Address' \
      --output text 2>/dev/null) || db_host=""
    # --output text prints "None" when the attribute is missing.
    if [ -n "$db_host" ] && [ "$db_host" != "None" ]; then
      log "Using database endpoint of ${candidate}: ${db_host}"
      break
    fi
    db_host=""
  done

  [ -n "$db_host" ] \
    || error "Could not resolve a database endpoint in ${DR_REGION} (tried: ${candidates[*]})"

  # Update Kubernetes secrets
  # NOTE(review): the password is passed on the command line and is visible
  # in the process list while kubectl runs.
  kubectl create secret generic db-credentials \
    --from-literal=host="$db_host" \
    --from-literal=port=5432 \
    --from-literal=database=dtem_prod \
    --from-literal=username=dtem_user \
    --from-literal=password="$DB_PASSWORD" \
    --namespace dtem-prod \
    --dry-run=client -o yaml | kubectl apply -f -

  # Restart applications
  kubectl rollout restart deployment/api-gateway -n dtem-prod
  kubectl rollout restart deployment/dte-service -n dtem-prod

  log "Application configuration updated"
}
# Step 5: Verify recovery
# Waits for the api-gateway and dte-service deployments to become available,
# then polls the in-pod /health/db endpoint (at most 30 attempts, 10 seconds
# apart) and finally checks /health once. Any failed check ends the script
# through error().
verify_recovery() {
  log "Verifying recovery..."

  # Wait for applications to be ready
  local deployment
  for deployment in api-gateway dte-service; do
    kubectl wait --for=condition=available "deployment/${deployment}" -n dtem-prod --timeout=600s
  done

  # Test database connectivity
  local attempt=1
  until kubectl exec -n dtem-prod deployment/api-gateway -- curl -f http://localhost:3000/health/db; do
    # The 30th failed attempt is fatal; error() does not return.
    (( attempt < 30 )) || error "Database connectivity verification failed"
    sleep 10
    attempt=$((attempt + 1))
  done
  log "Database connectivity verified"

  # Test application functionality
  kubectl exec -n dtem-prod deployment/api-gateway -- curl -f http://localhost:3000/health \
    || error "Application health verification failed"
  log "Application health verified"

  log "Recovery verification completed successfully"
}
# Step 6: Notify stakeholders
# Sends the recovery summary to Slack (incoming webhook) and by e-mail.
# Arguments: $1 - (optional) identifier of the recovered database shown in
#                 the message; defaults to "${DB_INSTANCE}-restored".
# Globals read: DR_REGION, DB_INSTANCE, SLACK_WEBHOOK_URL
# Notifications are best effort: by the time this runs the recovery itself is
# finished, so a failed delivery only produces a warning.
notify_stakeholders() {
  local db_identifier="${1:-${DB_INSTANCE}-restored}"

  log "Notifying stakeholders..."

  local message="🚨 Disaster Recovery Completed
System: DTEM
Recovery Time: $(date)
DR Region: $DR_REGION
Database: ${db_identifier}
Status: Operational
Next Steps:
- Monitor system performance
- Validate all functionality
- Plan return to primary region when safe"

  # JSON strings may not contain raw newlines, quotes or backslashes, so the
  # message is escaped before being embedded in the payload.
  local escaped=$message
  escaped=${escaped//\\/\\\\}
  escaped=${escaped//\"/\\\"}
  escaped=${escaped//$'\n'/\\n}
  escaped=${escaped//$'\r'/\\r}
  escaped=${escaped//$'\t'/\\t}

  # Send Slack notification
  if [ -z "${SLACK_WEBHOOK_URL:-}" ]; then
    warn "SLACK_WEBHOOK_URL is not set - skipping Slack notification"
  elif ! curl -fsS -X POST -H 'Content-type: application/json' \
    --data "{\"text\":\"${escaped}\"}" \
    "$SLACK_WEBHOOK_URL"; then
    warn "Slack notification could not be delivered"
  fi

  # Send email notification
  if ! echo "$message" | mail -s "DTEM Disaster Recovery Completed" \
    ops-team@empresa.cl; then
    warn "E-mail notification could not be delivered"
  fi

  log "Stakeholders notified"
}
# Main execution
# Runs the recovery end to end: assess, promote or restore, reconfigure the
# applications, verify, notify.
# Arguments: $1 - recovery time (already captured in RECOVERY_TIME)
#            $2 - recovery type: "promote" (default) or "restore"
# Globals read: RECOVERY_TIME, DB_INSTANCE, DB_PASSWORD, HOSTED_ZONE_ID,
#               SLACK_WEBHOOK_URL
main() {
  local recovery_type="${2:-promote}"
  local recovered_instance

  log "Starting disaster recovery procedure..."
  log "Recovery time: $RECOVERY_TIME"
  log "Recovery type: $recovery_type"

  # Validate all inputs first. With `set -u` a missing variable would
  # otherwise abort the script in the middle of the recovery, possibly after
  # the replica has already been promoted.
  case "$recovery_type" in
    promote | restore) ;;
    *)
      error "Invalid recovery type: $recovery_type"
      ;;
  esac
  [ -n "${DB_PASSWORD:-}" ] \
    || error "DB_PASSWORD must be set before starting the recovery"
  if [ "$recovery_type" = "promote" ] && [ -z "${HOSTED_ZONE_ID:-}" ]; then
    error "HOSTED_ZONE_ID must be set for recovery type 'promote'"
  fi
  if [ -z "${SLACK_WEBHOOK_URL:-}" ]; then
    warn "SLACK_WEBHOOK_URL is not set - the Slack notification will not be delivered"
  fi

  assess_situation

  case "$recovery_type" in
    promote)
      promote_dr_database
      recovered_instance="${DB_INSTANCE}-dr-replica"
      ;;
    restore)
      restore_from_backup "$RECOVERY_TIME"
      recovered_instance="${DB_INSTANCE}-restored"
      ;;
  esac

  # The identifier of the recovered instance is handed over as $1; a callee
  # that takes no argument simply ignores it.
  update_application_config "$recovered_instance"
  verify_recovery
  notify_stakeholders "$recovered_instance"

  log "Disaster recovery completed successfully"
}
# Execute main function
main "$@"
Application Recovery Procedure
#!/bin/bash
# application-recovery.sh
#
# Settings for the application recovery steps defined below.
set -euo pipefail

# Kubernetes namespace that holds the application deployments.
NAMESPACE='dtem-prod'
# Region used for disaster recovery.
DR_REGION='us-west-2'
# Print "$1" on stdout, prefixed with the current timestamp in brackets.
log() {
  local now
  now=$(date +'%Y-%m-%d %H:%M:%S')
  echo "[${now}] $1"
}
# Step 1: Activate DR region
# Applies the DNS change batch in dns-update-dr.json and scales the
# application deployments in $NAMESPACE to their serving replica counts.
# Globals read: HOSTED_ZONE_ID, NAMESPACE
# Returns: 1 (after logging) when HOSTED_ZONE_ID is not set.
activate_dr_region() {
  log "Activating DR region..."

  # Under `set -u` an unset HOSTED_ZONE_ID would abort the script with a bare
  # "unbound variable" message; report it the same way as the other steps.
  if [ -z "${HOSTED_ZONE_ID:-}" ]; then
    log "ERROR: HOSTED_ZONE_ID is not set"
    return 1
  fi

  # Update DNS to point to DR region
  aws route53 change-resource-record-sets \
    --hosted-zone-id "$HOSTED_ZONE_ID" \
    --change-batch file://dns-update-dr.json

  # Scale up applications in DR region
  kubectl scale deployment api-gateway --replicas=5 -n "$NAMESPACE"
  kubectl scale deployment dte-service --replicas=10 -n "$NAMESPACE"
  kubectl scale deployment web-frontend --replicas=3 -n "$NAMESPACE"

  log "DR region activated"
}
# Step 2: Verify application health
# Waits for the three deployments in $NAMESPACE to become available and then
# polls each public health endpoint, at most 30 attempts spaced 10 seconds
# apart.
# Returns: 1 (after logging) as soon as one endpoint uses up its attempts.
verify_application_health() {
  log "Verifying application health..."

  # Wait for pods to be ready
  local deployment
  for deployment in api-gateway dte-service web-frontend; do
    kubectl wait --for=condition=available "deployment/${deployment}" -n "$NAMESPACE" --timeout=600s
  done

  # Test endpoints
  local base_url="https://api.dtem.empresa.cl"
  local path url attempt
  for path in /health /health/db /health/redis; do
    url="${base_url}${path}"
    attempt=1
    until curl -f "$url"; do
      if (( attempt >= 30 )); then
        log "ERROR: Endpoint unhealthy: $url"
        return 1
      fi
      sleep 10
      attempt=$((attempt + 1))
    done
    log "Endpoint healthy: $url"
  done

  log "Application health verified"
}
# Step 3: Test critical functionality
# Smoke-tests the public API: a login request followed by a DTE creation
# request authenticated with $TEST_TOKEN.
# Globals read: TEST_TOKEN
# Returns: 1 (after logging) when TEST_TOKEN is missing or a request fails.
# NOTE(review): the login credentials are hard-coded in this script, and the
# second request creates a document against the live API — confirm both are
# acceptable for this environment.
test_critical_functionality() {
  log "Testing critical functionality..."

  # Check the token up front. With `set -u` an unset TEST_TOKEN would
  # otherwise abort the script halfway through, after the login test.
  if [ -z "${TEST_TOKEN:-}" ]; then
    log "ERROR: TEST_TOKEN is not set; cannot run DTE creation test"
    return 1
  fi

  # Test user authentication
  if curl -X POST https://api.dtem.empresa.cl/api/auth/login \
    -H "Content-Type: application/json" \
    -d '{"email":"test@empresa.cl","password":"test123"}' \
    -f; then
    log "Authentication test passed"
  else
    log "ERROR: Authentication test failed"
    return 1
  fi

  # Test DTE creation
  if curl -X POST https://api.dtem.empresa.cl/api/v1/dte/create \
    -H "Authorization: Bearer $TEST_TOKEN" \
    -H "Content-Type: application/json" \
    -d '{"documentType":"33","rutReceptor":"12345678-9","montoTotal":10000}' \
    -f; then
    log "DTE creation test passed"
  else
    log "ERROR: DTE creation test failed"
    return 1
  fi

  log "Critical functionality tests passed"
}
# Main execution
# Runs the three recovery steps in order, framed by start and end log lines.
main() {
  log "Starting application recovery..."

  local step
  for step in activate_dr_region verify_application_health test_critical_functionality; do
    "$step"
  done

  log "Application recovery completed successfully"
}
main "$@"
9.2.2. Runbook de Recuperación
Disaster Recovery Runbook
# Disaster Recovery Runbook
## Pre-requisites
- Access to AWS console in both regions
- kubectl access to DR cluster
- Slack access for notifications
- Database credentials
## Level 1 Disaster (Component Failure)
### Symptoms
- Single service unavailable
- Error rate > 5% for specific endpoint
- Database connection issues
### Actions
1. **Identify failed component**
```bash
kubectl get pods -n dtem-prod
kubectl logs -f deployment/failed-service -n dtem-prod
```
2. **Restart failed service**
```bash
kubectl rollout restart deployment/failed-service -n dtem-prod
```
3. **Verify recovery**
```bash
kubectl wait --for=condition=available deployment/failed-service -n dtem-prod
```
## Level 2 Disaster (AZ Failure)
### Symptoms
- Multiple services unavailable
- High error rates across endpoints
- Database connectivity issues
### Actions
1. **Verify AZ status**
```bash
aws ec2 describe-instance-status --filters Name=availability-zone,Values=us-east-1a
```
2. **Failover to other AZs**
```bash
kubectl patch deployment api-gateway -p '{"spec":{"template":{"spec":{"affinity":{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"topology.kubernetes.io/zone","operator":"NotIn","values":["us-east-1a"]}]}]}}}}}}' -n dtem-prod
```
3. **Scale up services**
```bash
kubectl scale deployment api-gateway --replicas=10 -n dtem-prod
```
## Level 3 Disaster (Region Failure)
### Symptoms
- Complete region outage
- DNS resolution failures
- No access to primary infrastructure
### Actions
1. **Declare disaster**
   - Notify management
   - Activate war room
   - Start incident response
2. **Activate DR region**
```bash
./scripts/disaster-recovery.sh "$(date)" "promote"
```
3. **Update DNS**
```bash
aws route53 change-resource-record-sets --hosted-zone-id Z123456789 --change-batch file://dr-dns.json
```
4. **Verify services**
```bash
./scripts/application-recovery.sh
```
## Level 4 Disaster (Multi-Region)
### Symptoms
- Multiple regions affected
- Widespread infrastructure failure
- External provider issues
### Actions
1. **Assess impact**
   - Identify available infrastructure
   - Evaluate data loss
   - Estimate recovery time
2. **Activate emergency procedures**
   - Implement manual processes
   - Activate third-party backups
   - Establish communication channels
3. **Recovery planning**
   - Prioritize critical services
   - Plan phased recovery
   - Coordinate with vendors
9.3. Testing de Disaster Recovery
9.3.1. DR Testing Schedule
Testing Matrix
| Test Type | Frequency | Duration | Participants | Success Criteria |
|---|---|---|---|---|
| Tabletop Exercise | Quarterly | 2 hours | All teams | Plan reviewed |
| Component Failover | Monthly | 1 hour | Ops team | Service restored |
| AZ Failover | Quarterly | 4 hours | Ops + Dev | Full functionality |
| Region Failover | Semi-annually | 8 hours | All teams | RTO/RPO met |
| Full DR Test | Annually | 24 hours | All teams | Complete recovery |
DR Test Script
#!/bin/bash
# dr-test.sh
#
# Identifiers shared by the DR test phases defined below.
set -euo pipefail

# Unique id of this run, derived from the start time.
TEST_ID=$(printf 'DR-TEST-%s' "$(date '+%Y%m%d-%H%M%S')")
# Every log line of the run is appended to this file.
LOG_FILE="/var/log/dr-tests/${TEST_ID}.log"
# Print "$1" with a timestamp prefix on stdout and append the same line to
# $LOG_FILE.
log() {
  local line
  line="[$(date +'%Y-%m-%d %H:%M:%S')] $1"
  echo "$line" | tee -a "$LOG_FILE"
}
# Pre-test checks
# Confirms the preconditions of a DR test: the public health endpoint
# answers, `kubectl get nodes` succeeds, and the backup bucket listing is not
# empty.
# Globals written: LATEST_BACKUP (name taken from the last listing line)
# Returns: 1 (after logging) on the first failed check.
# NOTE(review): nodes are not namespaced, so `-n dtem-dr` does not select the
# DR cluster — confirm which kubectl context this is expected to run against.
pre_test_checks() {
  log "Starting pre-test checks..."

  # Verify primary region health
  curl -f https://api.dtem.empresa.cl/health || {
    log "ERROR: Primary region not healthy"
    return 1
  }

  # Verify DR region readiness
  kubectl get nodes -n dtem-dr || {
    log "ERROR: DR region not accessible"
    return 1
  }

  # Check backup availability
  LATEST_BACKUP=$(aws s3 ls s3://dtem-backups/database/ | tail -n1 | awk '{print $4}')
  [ -n "$LATEST_BACKUP" ] || {
    log "ERROR: No recent backup found"
    return 1
  }

  log "Pre-test checks passed"
}
# Simulate disaster
# Scales the three application deployments in dtem-prod down to zero replicas
# and waits up to 300 seconds for the api-gateway pods to be deleted.
simulate_disaster() {
  log "Simulating disaster scenario..."

  # Scale down primary region services
  local deployment
  for deployment in api-gateway dte-service web-frontend; do
    kubectl scale deployment "$deployment" --replicas=0 -n dtem-prod
  done

  # Wait for scale down (only the api-gateway pods are awaited)
  kubectl wait --for=delete pod -l app=api-gateway -n dtem-prod --timeout=300s

  log "Disaster simulation completed"
}
# Execute DR procedures
# Runs ./scripts/disaster-recovery.sh in "promote" mode, appending its output
# to $LOG_FILE, and measures the wall-clock duration against the one-hour
# (3600 s) RTO. An RTO violation is logged but does not change the return
# status.
# NOTE(review): the invoked script is asked to promote the DR replica; check
# whether that is reversible in a test before scheduling this.
execute_dr_procedures() {
  log "Executing DR procedures..."

  local started finished elapsed
  started=$(date +%s)

  # Activate DR region
  ./scripts/disaster-recovery.sh "$(date)" "promote" >> "$LOG_FILE" 2>&1

  finished=$(date +%s)
  elapsed=$((finished - started))
  log "DR procedures completed in ${elapsed} seconds"

  # Check RTO compliance (1 hour RTO)
  if (( elapsed > 3600 )); then
    log "❌ RTO violation: ${elapsed}s (target: 3600s)"
  else
    log "✅ RTO compliance: ${elapsed}s (target: 3600s)"
  fi
}
# Validate recovery
# Runs three probes against the public API (health, database health, DTE test
# creation), logs one "Test: <name>: PASS|FAIL" line per probe and a summary.
# Globals read: TEST_TOKEN
# Returns: 0 when every probe passed, 1 otherwise.
validate_recovery() {
  log "Validating recovery..."

  local base_url="https://api.dtem.empresa.cl"
  local -a outcomes=()
  local failures=0
  local verdict entry

  # Health check
  verdict=PASS
  curl -f "${base_url}/health" || { verdict=FAIL; failures=$((failures + 1)); }
  outcomes+=("Health: ${verdict}")

  # Database connectivity
  verdict=PASS
  curl -f "${base_url}/health/db" || { verdict=FAIL; failures=$((failures + 1)); }
  outcomes+=("Database: ${verdict}")

  # DTE creation test
  verdict=PASS
  curl -X POST "${base_url}/api/v1/dte/test" \
    -H "Authorization: Bearer $TEST_TOKEN" \
    -H "Content-Type: application/json" \
    -d '{"documentType":"33","test":true}' \
    -f || { verdict=FAIL; failures=$((failures + 1)); }
  outcomes+=("DTE Creation: ${verdict}")

  # Log results
  for entry in "${outcomes[@]}"; do
    log "Test: $entry"
  done

  # Check overall success
  if (( failures > 0 )); then
    log "❌ $failures validation tests failed"
    return 1
  fi
  log "✅ All validation tests passed"
  return 0
}
# Generate test report
# Writes "<TEST_ID>-report.md" next to $LOG_FILE, summarising the run from
# the lines already present in the log.
# Globals read: TEST_ID, LOG_FILE
# Sections with no matching log line are filled with "N/A" or "None".
generate_report() {
  log "Generating test report..."

  # Same directory as the log file (by default /var/log/dr-tests).
  local report_file
  report_file="$(dirname "$LOG_FILE")/${TEST_ID}-report.md"

  local duration rto_achieved rpo_achieved participants
  local results issues recommendations

  # "... DR procedures completed in 42 seconds" -> "42 seconds"
  duration=$(awk '/DR procedures completed in/ { v = $(NF-1) " " $NF } END { print v }' \
    "$LOG_FILE" 2>/dev/null) || duration=""

  # "... RTO compliance: 42s (target: 3600s)" -> "42s"; a violated RTO is
  # picked up as well, so the report never hides it.
  rto_achieved=$(awk '
    /RTO (compliance|violation):/ {
      for (i = 1; i < NF; i++) {
        if ($i == "compliance:" || $i == "violation:") { v = $(i + 1) }
      }
    }
    END { print v }' "$LOG_FILE" 2>/dev/null) || rto_achieved=""

  # NOTE(review): none of the test phases visible here logs a line containing
  # "Backup", so this is expected to stay empty until one does.
  rpo_achieved=$(grep "Backup" "$LOG_FILE" | tail -n1 | awk '{print $NF}') || rpo_achieved=""

  participants=$(git log -1 --pretty=format:'%an' 2>/dev/null) || participants=""

  # grep exits non-zero when nothing matches; treat that as "no entries".
  results=$(grep "Test:" "$LOG_FILE" | sed 's/^/- /') || true
  issues=$(grep "ERROR\|FAIL" "$LOG_FILE" | sed 's/^/- /') || true
  recommendations=$(grep "recommendation\|improvement" "$LOG_FILE" | sed 's/^/- /') || true

  cat > "$report_file" << EOF
# Disaster Recovery Test Report
## Test Information
- **Test ID**: $TEST_ID
- **Date**: $(date)
- **Duration**: ${duration:-N/A}
- **Participants**: ${participants:-Unknown}
## Test Results
${results:-None}
## RTO/RPO Metrics
- **RTO Achieved**: ${rto_achieved:-N/A}
- **RPO Achieved**: ${rpo_achieved:-N/A}
## Issues Identified
${issues:-None}
## Recommendations
${recommendations:-None}
## Next Steps
- Schedule follow-up meeting
- Update DR procedures
- Plan next test cycle
EOF

  log "Report generated: $report_file"
}
# Restore primary region
# Scales the three deployments in dtem-prod back up (api-gateway 5,
# dte-service 10, web-frontend 3), waits up to 600 seconds for each to become
# available and applies the DNS change batch in dns-update-primary.json.
# Globals read: HOSTED_ZONE_ID
restore_primary_region() {
  log "Restoring primary region..."

  local spec deployment

  # Scale up primary services; each spec is "<deployment>:<replicas>"
  for spec in api-gateway:5 dte-service:10 web-frontend:3; do
    kubectl scale deployment "${spec%%:*}" --replicas="${spec##*:}" -n dtem-prod
  done

  # Wait for services to be ready
  for deployment in api-gateway dte-service web-frontend; do
    kubectl wait --for=condition=available "deployment/${deployment}" -n dtem-prod --timeout=600s
  done

  # Update DNS back to primary
  aws route53 change-resource-record-sets \
    --hosted-zone-id "$HOSTED_ZONE_ID" \
    --change-batch file://dns-update-primary.json

  log "Primary region restored"
}
# Main execution
# Runs the DR test phases in order: pre-checks, simulated outage, DR
# procedures, validation, report, restoration of the primary region.
# Globals read: TEST_ID, LOG_FILE, SLACK_WEBHOOK_URL
# Exit status: 1 when the pre-checks or the validation fail.
main() {
  # Create log directory first: log() appends to $LOG_FILE through tee, which
  # fails (and aborts the script under pipefail) if the directory is missing.
  mkdir -p "$(dirname "$LOG_FILE")"

  log "Starting DR test: $TEST_ID"

  # Execute test phases
  pre_test_checks || exit 1

  # From here on the primary services are scaled down. Whatever makes the
  # script exit early, bring the primary region back before leaving.
  trap 'log "DR test aborted - restoring primary region"; restore_primary_region' EXIT

  simulate_disaster
  execute_dr_procedures

  # A failed validation is remembered rather than acted on immediately, so
  # the report is still written and the primary region is still restored.
  local validation_status=0
  validate_recovery || validation_status=$?

  generate_report

  trap - EXIT
  restore_primary_region

  if [ "$validation_status" -ne 0 ]; then
    log "DR test failed: $TEST_ID"
    exit 1
  fi

  log "DR test completed successfully: $TEST_ID"

  # Send notification (best effort; the test itself has already succeeded).
  # The report name follows the "<TEST_ID>-report.md" convention used by
  # generate_report, whose own file variable is local to that function.
  if [ -z "${SLACK_WEBHOOK_URL:-}" ]; then
    log "WARNING: SLACK_WEBHOOK_URL is not set - skipping Slack notification"
  elif ! curl -X POST -H 'Content-type: application/json' \
    --data "{\"text\":\"✅ DR Test Completed: $TEST_ID\nReport: ${TEST_ID}-report.md\"}" \
    "$SLACK_WEBHOOK_URL"; then
    log "WARNING: Slack notification could not be delivered"
  fi
}
# Execute main function
main "$@"
9.4. Comunicación durante Crisis
9.4.1. Plan de Comunicación
Communication Matrix
| Stakeholder | Communication Method | Frequency | Content |
|---|---|---|---|
| Executive Team | Phone call + Email | Immediate | Impact assessment, ETA |
| Technical Team | Slack + War room | Continuous | Technical updates, actions |
| Customers | Email + Status Page | Every 30 min | Service status, impact |
| Partners | Email + Phone | Every hour | Integration status |
| Regulators | Formal Report | Within 24h | Compliance notification |
Communication Templates
# Initial Incident Notification
**TO**: All Stakeholders
**SUBJECT**: 🚨 CRITICAL: DTEM Service Outage
**TIME**: [Timestamp]
## Situation
We are currently experiencing a critical service outage affecting all DTEM services.
## Impact
- All DTE creation and processing functions are unavailable
- Customer portal is inaccessible
- API endpoints are not responding
## Current Actions
- Incident response team activated
- Disaster recovery procedures initiated
- Root cause investigation in progress
## ETA
We expect to restore service within [X] hours.
## Next Update
Next update will be provided within 30 minutes.
## Contact
- Technical Lead: [Name] - [Phone]
- Incident Commander: [Name] - [Phone]
---
# Service Restoration Notification
**TO**: All Stakeholders
**SUBJECT**: ✅ RESOLVED: DTEM Service Restored
**TIME**: [Timestamp]
## Resolution
All DTEM services have been successfully restored following the critical outage that began at [start time].
## Actions Taken
- [Brief description of actions taken]
- [Systems restored]
- [Verification completed]
## Current Status
- All services are operational
- Performance monitoring active
- Data integrity verified
## Follow-up Actions
- Post-incident review scheduled
- Preventive measures being implemented
- Full incident report to follow
## Support
If you experience any issues, please contact:
- Support: [Email/Phone]
- Status Page: [URL]
---
# Post-Incident Report Summary
**TO**: Management Team
**SUBJECT**: Post-Incident Report: [Incident ID]
**DATE**: [Date]
## Executive Summary
[Brief summary of incident, impact, and resolution]
## Timeline
- [Start time]: Incident detected
- [Time]: DR procedures initiated
- [Time]: Service restored
- Total downtime: [Duration]
## Root Cause
[Primary cause of the incident]
## Impact Assessment
- Customers affected: [Number]
- Transactions lost: [Number]
- Financial impact: [Amount]
## Lessons Learned
- [Key takeaways]
- [Improvement opportunities]
## Action Items
| Item | Owner | Due Date | Status |
|------|-------|----------|--------|
| [Action 1] | [Owner] | [Date] | [Status] |
| [Action 2] | [Owner] | [Date] | [Status] |
9.4.2. Status Page Management
Status Page Configuration
// status-page-service.js

/**
 * In-memory model of the DTEM status page.
 *
 * Keeps a list of components and incidents, and after every change pushes
 * the current state to the status page API and to the notification channels.
 */
class StatusPageService {
  constructor() {
    // Status keys mapped to their human-readable labels.
    this.statuses = {
      operational: 'Operational',
      degraded_performance: 'Degraded Performance',
      partial_outage: 'Partial Outage',
      major_outage: 'Major Outage'
    };
    this.components = [
      { id: 'api', name: 'API Gateway', status: 'operational' },
      { id: 'database', name: 'Database', status: 'operational' },
      { id: 'webapp', name: 'Web Application', status: 'operational' },
      { id: 'queue', name: 'Message Queue', status: 'operational' },
      { id: 'cache', name: 'Cache', status: 'operational' }
    ];
    // Newest incident first (see createIncident).
    this.incidents = [];
  }

  /**
   * Set the status of one component and publish the new state.
   * An unknown componentId is ignored and nothing is published.
   *
   * @param {string} componentId - id of an entry in this.components
   * @param {string} status - one of the keys of this.statuses
   * @param {string} [description] - free text stored on the component
   */
  async updateComponentStatus(componentId, status, description = '') {
    const component = this.components.find(c => c.id === componentId);
    if (component) {
      component.status = status;
      component.description = description;
      await this.publishUpdate();
    }
  }

  /**
   * Open a new incident in state "investigating" and publish the new state.
   *
   * @param {string} title
   * @param {string} description
   * @param {string} impact
   * @returns {Promise<object>} the incident that was created
   */
  async createIncident(title, description, impact) {
    const incident = {
      // Relies on a global `crypto` object providing randomUUID().
      id: crypto.randomUUID(),
      title,
      description,
      impact,
      status: 'investigating',
      createdAt: new Date().toISOString(),
      updates: []
    };
    this.incidents.unshift(incident);
    await this.publishUpdate();
    return incident;
  }

  /**
   * Append an update to an existing incident and publish the new state.
   * A status of "resolved" also resolves the incident itself. An unknown
   * incidentId is ignored and nothing is published.
   *
   * @param {string} incidentId
   * @param {string} message
   * @param {string} status
   */
  async addIncidentUpdate(incidentId, message, status) {
    const incident = this.incidents.find(i => i.id === incidentId);
    if (incident) {
      incident.updates.push({
        message,
        status,
        timestamp: new Date().toISOString()
      });
      if (status === 'resolved') {
        incident.status = 'resolved';
        incident.resolvedAt = new Date().toISOString();
      }
      await this.publishUpdate();
    }
  }

  /**
   * Build the public snapshot (overall status, components, unresolved
   * incidents) and send it to the status page, then to the notification
   * channels.
   */
  async publishUpdate() {
    const statusData = {
      status: this.calculateOverallStatus(),
      components: this.components,
      incidents: this.incidents.filter(i => i.status !== 'resolved'),
      lastUpdated: new Date().toISOString()
    };
    // Update status page
    await this.updateStatusPage(statusData);
    // Send notifications
    await this.sendNotifications(statusData);
  }

  /**
   * Overall status is the most severe status found among the components.
   *
   * @returns {string} one of the keys of this.statuses
   */
  calculateOverallStatus() {
    const componentStatuses = this.components.map(c => c.status);
    if (componentStatuses.includes('major_outage')) {
      return 'major_outage';
    } else if (componentStatuses.includes('partial_outage')) {
      return 'partial_outage';
    } else if (componentStatuses.includes('degraded_performance')) {
      return 'degraded_performance';
    } else {
      return 'operational';
    }
  }

  /**
   * POST the snapshot to the status page API.
   * NOTE(review): the HTTP response status is not inspected, so a rejected
   * update goes unnoticed — confirm whether that is acceptable.
   *
   * @param {object} data - snapshot built by publishUpdate
   */
  async updateStatusPage(data) {
    // Update status page API
    await fetch('https://status.dtem.empresa.cl/api/update', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(data)
    });
  }

  /**
   * Notify Slack for any non-operational state and additionally send an
   * e-mail for a major outage.
   *
   * @param {object} data - snapshot built by publishUpdate
   */
  async sendNotifications(data) {
    // Send Slack notification
    if (data.status !== 'operational') {
      await this.sendSlackNotification(data);
    }
    // Send email notification for major incidents
    if (data.status === 'major_outage') {
      // This class does not define sendEmailNotification itself. Calling it
      // unconditionally would throw a TypeError exactly during a major
      // outage, so it is only used when something has provided it.
      if (typeof this.sendEmailNotification === 'function') {
        await this.sendEmailNotification(data);
      } else {
        console.warn('StatusPageService: sendEmailNotification is not implemented; e-mail notification skipped');
      }
    }
  }

  /**
   * Post a summary of the snapshot to the Slack webhook configured in
   * SLACK_WEBHOOK_URL.
   *
   * @param {object} data - snapshot built by publishUpdate
   */
  async sendSlackNotification(data) {
    const summaryFields = [
      { title: 'Overall Status', value: this.statuses[data.status], short: true },
      { title: 'Last Updated', value: new Date(data.lastUpdated).toLocaleString(), short: true }
    ];
    const componentFields = data.components.map(c => ({
      title: c.name,
      // Fall back to the raw key so a status without a label is still shown.
      value: this.statuses[c.status] || c.status,
      short: true
    }));
    const message = {
      text: `DTEM Status Update: ${this.statuses[data.status]}`,
      attachments: [{
        color: this.getStatusColor(data.status),
        // One combined list: an object literal with two `fields` keys keeps
        // only the last one, which silently dropped the summary fields.
        fields: [...summaryFields, ...componentFields]
      }]
    };
    await fetch(process.env.SLACK_WEBHOOK_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(message)
    });
  }

  /**
   * Map a status key to a Slack attachment colour name.
   *
   * @param {string} status
   * @returns {string} colour name; "warning" for unknown keys
   */
  getStatusColor(status) {
    const colors = {
      operational: 'good',
      degraded_performance: 'warning',
      partial_outage: 'danger',
      major_outage: 'danger'
    };
    return colors[status] || 'warning';
  }
}
Próxima sección: 10. Apéndices