Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
255 lines
7.7 KiB
Go
255 lines
7.7 KiB
Go
package logger
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// ContextKey is a dedicated string type for the keys this package stores in a
// context.Context, keeping them distinct from plain string keys.
type ContextKey string

// CorrelationIDKey is the context key under which the correlation ID is stored.
const CorrelationIDKey ContextKey = "correlation_id"
|
|
|
|
// StructuredLogger provides standardized logging for the meteor compute service
|
|
type StructuredLogger struct {
|
|
logger zerolog.Logger
|
|
service string
|
|
version string
|
|
}
|
|
|
|
// LogEntry describes the standardized shape of a single log record.
//
// NOTE(review): encoding/json has no "inline" tag option, so with the standard
// encoder Extra is emitted under the key "Extra" instead of being flattened
// into the record. Confirm which encoder, if any, consumes this type.
type LogEntry struct {
	Timestamp     string      `json:"timestamp"`
	Level         string      `json:"level"`
	ServiceName   string      `json:"service_name"`
	CorrelationID *string     `json:"correlation_id"` // nil when no correlation ID is set
	Message       string      `json:"message"`
	Extra         interface{} `json:",inline"`
}
|
|
|
|
// Field is one key/value pair attached to a structured log entry.
type Field struct {
	Key   string      // name the value is logged under
	Value interface{} // arbitrary payload logged for Key
}
|
|
|
|
// NewStructuredLogger creates a new structured logger instance
|
|
func NewStructuredLogger(service, version string) *StructuredLogger {
|
|
// Configure zerolog based on environment
|
|
if os.Getenv("NODE_ENV") == "development" {
|
|
// Pretty printing for development
|
|
log.Logger = log.Output(zerolog.ConsoleWriter{
|
|
Out: os.Stdout,
|
|
TimeFormat: time.RFC3339,
|
|
NoColor: false,
|
|
})
|
|
} else {
|
|
// JSON output for production
|
|
zerolog.TimeFieldFormat = time.RFC3339
|
|
}
|
|
|
|
// Set log level
|
|
logLevel := os.Getenv("LOG_LEVEL")
|
|
switch logLevel {
|
|
case "debug":
|
|
zerolog.SetGlobalLevel(zerolog.DebugLevel)
|
|
case "info":
|
|
zerolog.SetGlobalLevel(zerolog.InfoLevel)
|
|
case "warn":
|
|
zerolog.SetGlobalLevel(zerolog.WarnLevel)
|
|
case "error":
|
|
zerolog.SetGlobalLevel(zerolog.ErrorLevel)
|
|
default:
|
|
zerolog.SetGlobalLevel(zerolog.InfoLevel)
|
|
}
|
|
|
|
logger := log.With().
|
|
Str("service_name", service).
|
|
Str("version", version).
|
|
Logger()
|
|
|
|
return &StructuredLogger{
|
|
logger: logger,
|
|
service: service,
|
|
version: version,
|
|
}
|
|
}
|
|
|
|
// WithCorrelationID adds correlation ID to context
|
|
func WithCorrelationID(ctx context.Context, correlationID string) context.Context {
|
|
return context.WithValue(ctx, CorrelationIDKey, correlationID)
|
|
}
|
|
|
|
// GetCorrelationID retrieves correlation ID from context
|
|
func GetCorrelationID(ctx context.Context) *string {
|
|
if correlationID, ok := ctx.Value(CorrelationIDKey).(string); ok && correlationID != "" {
|
|
return &correlationID
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// createLogEvent creates a zerolog event with common fields
|
|
func (l *StructuredLogger) createLogEvent(level zerolog.Level, ctx context.Context) *zerolog.Event {
|
|
event := l.logger.WithLevel(level).
|
|
Timestamp().
|
|
Str("service_name", l.service)
|
|
|
|
if correlationID := GetCorrelationID(ctx); correlationID != nil {
|
|
event = event.Str("correlation_id", *correlationID)
|
|
}
|
|
|
|
return event
|
|
}
|
|
|
|
// Info logs an info level message
|
|
func (l *StructuredLogger) Info(ctx context.Context, message string, fields ...Field) {
|
|
event := l.createLogEvent(zerolog.InfoLevel, ctx)
|
|
for _, field := range fields {
|
|
event = event.Interface(field.Key, field.Value)
|
|
}
|
|
event.Msg(message)
|
|
}
|
|
|
|
// Warn logs a warning level message
|
|
func (l *StructuredLogger) Warn(ctx context.Context, message string, fields ...Field) {
|
|
event := l.createLogEvent(zerolog.WarnLevel, ctx)
|
|
for _, field := range fields {
|
|
event = event.Interface(field.Key, field.Value)
|
|
}
|
|
event.Msg(message)
|
|
}
|
|
|
|
// Error logs an error level message
|
|
func (l *StructuredLogger) Error(ctx context.Context, message string, err error, fields ...Field) {
|
|
event := l.createLogEvent(zerolog.ErrorLevel, ctx)
|
|
if err != nil {
|
|
event = event.Err(err)
|
|
}
|
|
for _, field := range fields {
|
|
event = event.Interface(field.Key, field.Value)
|
|
}
|
|
event.Msg(message)
|
|
}
|
|
|
|
// Debug logs a debug level message
|
|
func (l *StructuredLogger) Debug(ctx context.Context, message string, fields ...Field) {
|
|
event := l.createLogEvent(zerolog.DebugLevel, ctx)
|
|
for _, field := range fields {
|
|
event = event.Interface(field.Key, field.Value)
|
|
}
|
|
event.Msg(message)
|
|
}
|
|
|
|
// Business-specific logging methods
|
|
|
|
// ProcessingEvent logs event processing information
|
|
func (l *StructuredLogger) ProcessingEvent(ctx context.Context, eventID, stage string, fields ...Field) {
|
|
allFields := append(fields,
|
|
Field{Key: "event_id", Value: eventID},
|
|
Field{Key: "processing_stage", Value: stage},
|
|
)
|
|
l.Info(ctx, "Event processing stage", allFields...)
|
|
}
|
|
|
|
// ValidationEvent logs validation-related events
|
|
func (l *StructuredLogger) ValidationEvent(ctx context.Context, eventID, algorithm string, isValid bool, score float64, fields ...Field) {
|
|
allFields := append(fields,
|
|
Field{Key: "event_id", Value: eventID},
|
|
Field{Key: "validation_algorithm", Value: algorithm},
|
|
Field{Key: "is_valid", Value: isValid},
|
|
Field{Key: "validation_score", Value: score},
|
|
)
|
|
l.Info(ctx, "Event validation completed", allFields...)
|
|
}
|
|
|
|
// DatabaseEvent logs database operations
|
|
func (l *StructuredLogger) DatabaseEvent(ctx context.Context, operation string, duration time.Duration, fields ...Field) {
|
|
allFields := append(fields,
|
|
Field{Key: "database_operation", Value: operation},
|
|
Field{Key: "duration_ms", Value: duration.Milliseconds()},
|
|
)
|
|
l.Debug(ctx, "Database operation completed", allFields...)
|
|
}
|
|
|
|
// SQSEvent logs SQS-related events
|
|
func (l *StructuredLogger) SQSEvent(ctx context.Context, operation, messageID string, fields ...Field) {
|
|
allFields := append(fields,
|
|
Field{Key: "sqs_operation", Value: operation},
|
|
Field{Key: "sqs_message_id", Value: messageID},
|
|
)
|
|
l.Info(ctx, "SQS operation", allFields...)
|
|
}
|
|
|
|
// StartupEvent logs application startup events
|
|
func (l *StructuredLogger) StartupEvent(ctx context.Context, component string, fields ...Field) {
|
|
allFields := append(fields,
|
|
Field{Key: "startup_component", Value: component},
|
|
)
|
|
l.Info(ctx, "Component initialized", allFields...)
|
|
}
|
|
|
|
// HealthEvent logs health check events
|
|
func (l *StructuredLogger) HealthEvent(ctx context.Context, component string, healthy bool, fields ...Field) {
|
|
allFields := append(fields,
|
|
Field{Key: "health_component", Value: component},
|
|
Field{Key: "healthy", Value: healthy},
|
|
)
|
|
|
|
if healthy {
|
|
l.Debug(ctx, "Health check passed", allFields...)
|
|
} else {
|
|
l.Warn(ctx, "Health check failed", allFields...)
|
|
}
|
|
}
|
|
|
|
// SecurityEvent logs security-related events
|
|
func (l *StructuredLogger) SecurityEvent(ctx context.Context, event string, fields ...Field) {
|
|
allFields := append(fields,
|
|
Field{Key: "security_event", Value: event},
|
|
)
|
|
l.Warn(ctx, "Security event detected", allFields...)
|
|
}
|
|
|
|
// PerformanceEvent logs performance metrics
|
|
func (l *StructuredLogger) PerformanceEvent(ctx context.Context, operation string, duration time.Duration, fields ...Field) {
|
|
allFields := append(fields,
|
|
Field{Key: "performance_operation", Value: operation},
|
|
Field{Key: "duration_ms", Value: duration.Milliseconds()},
|
|
)
|
|
l.Info(ctx, "Performance metric", allFields...)
|
|
}
|
|
|
|
// MetricsEvent logs metrics and statistics
|
|
func (l *StructuredLogger) MetricsEvent(ctx context.Context, metric string, value interface{}, fields ...Field) {
|
|
allFields := append(fields,
|
|
Field{Key: "metric_name", Value: metric},
|
|
Field{Key: "metric_value", Value: value},
|
|
)
|
|
l.Info(ctx, "Metrics data", allFields...)
|
|
}
|
|
|
|
// WorkerEvent logs worker-specific events
|
|
func (l *StructuredLogger) WorkerEvent(ctx context.Context, workerID int, event string, fields ...Field) {
|
|
allFields := append(fields,
|
|
Field{Key: "worker_id", Value: workerID},
|
|
Field{Key: "worker_event", Value: event},
|
|
)
|
|
l.Info(ctx, "Worker event", allFields...)
|
|
}
|
|
|
|
// NewField creates a field for structured logging
|
|
func NewField(key string, value interface{}) Field {
|
|
return Field{Key: key, Value: value}
|
|
}
|
|
|
|
// GetZerologLogger returns the underlying zerolog.Logger for external integrations
|
|
func (l *StructuredLogger) GetZerologLogger() zerolog.Logger {
|
|
return l.logger
|
|
} |