grabbit ca7e92a1a1 🎉 Epic 3 Complete: Production Readiness & Observability
Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-03 23:42:01 +08:00

373 lines
9.8 KiB
Go

package metrics
import (
"context"
"fmt"
"time"
"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/cloudwatch"
"github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
"github.com/rs/zerolog"
)
// MetricsClient publishes service metrics to Amazon CloudWatch. It pairs the
// CloudWatch API client with the logger its Send* methods report through.
type MetricsClient struct {
	// cw is the CloudWatch client used for PutMetricData calls.
	cw *cloudwatch.Client
	// logger receives an error entry for each failed publish and a debug
	// entry for each successful one.
	logger zerolog.Logger
}
// NewMetricsClient returns a MetricsClient whose CloudWatch client is built
// from awsConfig and whose log output is written to logger.
func NewMetricsClient(awsConfig aws.Config, logger zerolog.Logger) *MetricsClient {
	client := new(MetricsClient)
	client.cw = cloudwatch.NewFromConfig(awsConfig)
	client.logger = logger
	return client
}
// MessageProcessingMetrics describes the outcome of processing a single
// message. It is the input to SendMessageProcessingMetrics.
type MessageProcessingMetrics struct {
	// ProcessingTime is how long processing took; it is published in
	// milliseconds.
	ProcessingTime time.Duration
	// Success reports whether processing succeeded. It selects between the
	// success and error counters and is published as the "Success" dimension.
	Success bool
	// MessageType is published as the "MessageType" dimension.
	MessageType string
	// ProviderName is published as the "ProviderName" dimension.
	ProviderName string
	// ErrorType is published as the "ErrorType" dimension, but only when
	// Success is false and the value is non-empty.
	ErrorType string
}
// SendMessageProcessingMetrics publishes the outcome of one message-processing
// attempt to CloudWatch under the "MeteorApp/ComputeService" namespace.
//
// A single PutMetricData call carries three data points that share one
// timestamp and one dimension set: MessageProcessingCount (1),
// MessageProcessingDuration (milliseconds), and either
// MessageProcessingSuccess or MessageProcessingError (1). The dimensions are
// MessageType, ProviderName and Success, plus ErrorType when processing failed
// and an error type was supplied.
//
// CloudWatch rejects the whole request when any dimension value is empty, so
// an empty MessageType or ProviderName is published as "Unknown" rather than
// losing every data point for the message.
//
// It returns a wrapped error when the PutMetricData call fails; the failure is
// also logged at error level. On success a debug entry is logged and nil is
// returned.
func (m *MetricsClient) SendMessageProcessingMetrics(ctx context.Context, metrics MessageProcessingMetrics) error {
	const (
		namespace        = "MeteorApp/ComputeService"
		unknownDimension = "Unknown"
	)
	timestamp := time.Now()

	// dimension substitutes a placeholder for empty values, which CloudWatch
	// would otherwise reject together with the rest of the request.
	dimension := func(name, value string) types.Dimension {
		if value == "" {
			value = unknownDimension
		}
		return types.Dimension{Name: aws.String(name), Value: aws.String(value)}
	}

	dimensions := []types.Dimension{
		dimension("MessageType", metrics.MessageType),
		dimension("ProviderName", metrics.ProviderName),
		dimension("Success", fmt.Sprintf("%v", metrics.Success)),
	}
	// The error type is only meaningful for failures that reported one.
	if !metrics.Success && metrics.ErrorType != "" {
		dimensions = append(dimensions, dimension("ErrorType", metrics.ErrorType))
	}

	// datum builds a data point carrying the shared timestamp and dimensions.
	datum := func(name string, value float64, unit types.StandardUnit) types.MetricDatum {
		return types.MetricDatum{
			MetricName: aws.String(name),
			Value:      aws.Float64(value),
			Unit:       unit,
			Timestamp:  &timestamp,
			Dimensions: dimensions,
		}
	}

	outcomeMetric := "MessageProcessingError"
	if metrics.Success {
		outcomeMetric = "MessageProcessingSuccess"
	}

	input := &cloudwatch.PutMetricDataInput{
		Namespace: aws.String(namespace),
		MetricData: []types.MetricDatum{
			datum("MessageProcessingCount", 1, types.StandardUnitCount),
			datum("MessageProcessingDuration", float64(metrics.ProcessingTime.Milliseconds()), types.StandardUnitMilliseconds),
			datum(outcomeMetric, 1, types.StandardUnitCount),
		},
	}

	if _, err := m.cw.PutMetricData(ctx, input); err != nil {
		m.logger.Error().
			Err(err).
			Str("namespace", namespace).
			Str("message_type", metrics.MessageType).
			Str("provider_name", metrics.ProviderName).
			Msg("Failed to send message processing metrics to CloudWatch")
		return fmt.Errorf("failed to send message processing metrics: %w", err)
	}

	m.logger.Debug().
		Str("namespace", namespace).
		Str("message_type", metrics.MessageType).
		Str("provider_name", metrics.ProviderName).
		Bool("success", metrics.Success).
		Dur("processing_time", metrics.ProcessingTime).
		Msg("Successfully sent message processing metrics to CloudWatch")
	return nil
}
// ValidationMetrics describes the outcome of one validation run. It is the
// input to SendValidationMetrics.
type ValidationMetrics struct {
	// ValidationTime is how long validation took; it is published in
	// milliseconds.
	ValidationTime time.Duration
	// Success reports whether validation succeeded. It selects between the
	// success and error counters and is published as the "Success" dimension.
	Success bool
	// ProviderName is published as the "ProviderName" dimension.
	ProviderName string
	// EventCount is published as the value of the "EventsProcessed" metric.
	EventCount int
	// ErrorType is published as the "ErrorType" dimension, but only when
	// Success is false and the value is non-empty.
	ErrorType string
}
// SendValidationMetrics publishes the outcome of one validation run to
// CloudWatch under the "MeteorApp/ComputeService" namespace.
//
// A single PutMetricData call carries four data points that share one
// timestamp and one dimension set: ValidationCount (1), ValidationDuration
// (milliseconds), EventsProcessed (metrics.EventCount), and either
// ValidationSuccess or ValidationError (1). The dimensions are ProviderName
// and Success, plus ErrorType when validation failed and an error type was
// supplied.
//
// CloudWatch rejects the whole request when any dimension value is empty, so
// an empty ProviderName is published as "Unknown" rather than losing every
// data point for the run.
//
// It returns a wrapped error when the PutMetricData call fails; the failure is
// also logged at error level. On success a debug entry is logged and nil is
// returned.
func (m *MetricsClient) SendValidationMetrics(ctx context.Context, metrics ValidationMetrics) error {
	const (
		namespace        = "MeteorApp/ComputeService"
		unknownDimension = "Unknown"
	)
	timestamp := time.Now()

	// An empty dimension value would make CloudWatch reject the request.
	providerName := metrics.ProviderName
	if providerName == "" {
		providerName = unknownDimension
	}

	dimensions := []types.Dimension{
		{Name: aws.String("ProviderName"), Value: aws.String(providerName)},
		{Name: aws.String("Success"), Value: aws.String(fmt.Sprintf("%v", metrics.Success))},
	}
	// The error type is only meaningful for failures that reported one.
	if !metrics.Success && metrics.ErrorType != "" {
		dimensions = append(dimensions, types.Dimension{
			Name:  aws.String("ErrorType"),
			Value: aws.String(metrics.ErrorType),
		})
	}

	// datum builds a data point carrying the shared timestamp and dimensions.
	datum := func(name string, value float64, unit types.StandardUnit) types.MetricDatum {
		return types.MetricDatum{
			MetricName: aws.String(name),
			Value:      aws.Float64(value),
			Unit:       unit,
			Timestamp:  &timestamp,
			Dimensions: dimensions,
		}
	}

	outcomeMetric := "ValidationError"
	if metrics.Success {
		outcomeMetric = "ValidationSuccess"
	}

	input := &cloudwatch.PutMetricDataInput{
		Namespace: aws.String(namespace),
		MetricData: []types.MetricDatum{
			datum("ValidationCount", 1, types.StandardUnitCount),
			datum("ValidationDuration", float64(metrics.ValidationTime.Milliseconds()), types.StandardUnitMilliseconds),
			datum("EventsProcessed", float64(metrics.EventCount), types.StandardUnitCount),
			datum(outcomeMetric, 1, types.StandardUnitCount),
		},
	}

	if _, err := m.cw.PutMetricData(ctx, input); err != nil {
		m.logger.Error().
			Err(err).
			Str("namespace", namespace).
			Str("provider_name", metrics.ProviderName).
			Msg("Failed to send validation metrics to CloudWatch")
		return fmt.Errorf("failed to send validation metrics: %w", err)
	}

	m.logger.Debug().
		Str("namespace", namespace).
		Str("provider_name", metrics.ProviderName).
		Bool("success", metrics.Success).
		Dur("validation_time", metrics.ValidationTime).
		Int("event_count", metrics.EventCount).
		Msg("Successfully sent validation metrics to CloudWatch")
	return nil
}
// DatabaseMetrics describes the outcome of one database operation. It is the
// input to SendDatabaseMetrics.
type DatabaseMetrics struct {
	// Operation is published as the "Operation" dimension.
	Operation string
	// Duration is how long the operation took; it is published in
	// milliseconds.
	Duration time.Duration
	// Success reports whether the operation succeeded; it is published as the
	// "Success" dimension.
	Success bool
	// RecordCount is published as "DatabaseRecordsProcessed", but only when it
	// is greater than zero.
	RecordCount int
	// ErrorType is published as the "ErrorType" dimension, but only when
	// Success is false and the value is non-empty.
	ErrorType string
}
// SendDatabaseMetrics publishes the outcome of one database operation to
// CloudWatch under the "MeteorApp/ComputeService" namespace.
//
// A single PutMetricData call carries DatabaseOperationCount (1) and
// DatabaseOperationDuration (milliseconds), plus DatabaseRecordsProcessed when
// metrics.RecordCount is greater than zero. All data points share one
// timestamp and one dimension set: Operation and Success, plus ErrorType when
// the operation failed and an error type was supplied.
//
// CloudWatch rejects the whole request when any dimension value is empty, so
// an empty Operation is published as "Unknown" rather than losing every data
// point for the operation.
//
// It returns a wrapped error when the PutMetricData call fails; the failure is
// also logged at error level. On success a debug entry is logged and nil is
// returned.
func (m *MetricsClient) SendDatabaseMetrics(ctx context.Context, metrics DatabaseMetrics) error {
	const (
		namespace        = "MeteorApp/ComputeService"
		unknownDimension = "Unknown"
	)
	timestamp := time.Now()

	// An empty dimension value would make CloudWatch reject the request.
	operation := metrics.Operation
	if operation == "" {
		operation = unknownDimension
	}

	dimensions := []types.Dimension{
		{Name: aws.String("Operation"), Value: aws.String(operation)},
		{Name: aws.String("Success"), Value: aws.String(fmt.Sprintf("%v", metrics.Success))},
	}
	// The error type is only meaningful for failures that reported one.
	if !metrics.Success && metrics.ErrorType != "" {
		dimensions = append(dimensions, types.Dimension{
			Name:  aws.String("ErrorType"),
			Value: aws.String(metrics.ErrorType),
		})
	}

	// datum builds a data point carrying the shared timestamp and dimensions.
	datum := func(name string, value float64, unit types.StandardUnit) types.MetricDatum {
		return types.MetricDatum{
			MetricName: aws.String(name),
			Value:      aws.Float64(value),
			Unit:       unit,
			Timestamp:  &timestamp,
			Dimensions: dimensions,
		}
	}

	metricData := []types.MetricDatum{
		datum("DatabaseOperationCount", 1, types.StandardUnitCount),
		datum("DatabaseOperationDuration", float64(metrics.Duration.Milliseconds()), types.StandardUnitMilliseconds),
	}
	// Record counts are optional; zero or negative means "not applicable".
	if metrics.RecordCount > 0 {
		metricData = append(metricData,
			datum("DatabaseRecordsProcessed", float64(metrics.RecordCount), types.StandardUnitCount))
	}

	input := &cloudwatch.PutMetricDataInput{
		Namespace:  aws.String(namespace),
		MetricData: metricData,
	}

	if _, err := m.cw.PutMetricData(ctx, input); err != nil {
		m.logger.Error().
			Err(err).
			Str("namespace", namespace).
			Str("operation", metrics.Operation).
			Msg("Failed to send database metrics to CloudWatch")
		return fmt.Errorf("failed to send database metrics: %w", err)
	}

	m.logger.Debug().
		Str("namespace", namespace).
		Str("operation", metrics.Operation).
		Bool("success", metrics.Success).
		Dur("duration", metrics.Duration).
		Int("record_count", metrics.RecordCount).
		Msg("Successfully sent database metrics to CloudWatch")
	return nil
}
// CustomMetric is a single caller-defined data point. It is the input to
// SendCustomMetric.
type CustomMetric struct {
	// Name is the CloudWatch metric name.
	Name string
	// Value is the data point value.
	Value float64
	// Unit is the CloudWatch unit attached to the data point.
	Unit types.StandardUnit
	// Dimensions maps dimension names to dimension values.
	Dimensions map[string]string
}
// SendCustomMetric publishes one caller-defined data point to CloudWatch under
// the "MeteorApp/ComputeService" namespace, stamped with the current time.
//
// Each entry of metric.Dimensions becomes a CloudWatch dimension. CloudWatch
// rejects the whole request when a dimension name or value is empty, so
// entries with an empty name are skipped and empty values are published as
// "Unknown" instead of losing the data point. Dimension order follows map
// iteration and is therefore unspecified.
//
// It returns a wrapped error when the PutMetricData call fails; the failure is
// also logged at error level. On success a debug entry is logged and nil is
// returned.
func (m *MetricsClient) SendCustomMetric(ctx context.Context, metric CustomMetric) error {
	const (
		namespace        = "MeteorApp/ComputeService"
		unknownDimension = "Unknown"
	)
	timestamp := time.Now()

	dimensions := make([]types.Dimension, 0, len(metric.Dimensions))
	for key, value := range metric.Dimensions {
		if key == "" {
			// A dimension without a name cannot be published at all.
			continue
		}
		if value == "" {
			value = unknownDimension
		}
		dimensions = append(dimensions, types.Dimension{
			Name:  aws.String(key),
			Value: aws.String(value),
		})
	}

	input := &cloudwatch.PutMetricDataInput{
		Namespace: aws.String(namespace),
		MetricData: []types.MetricDatum{
			{
				MetricName: aws.String(metric.Name),
				Value:      aws.Float64(metric.Value),
				Unit:       metric.Unit,
				Timestamp:  &timestamp,
				Dimensions: dimensions,
			},
		},
	}

	if _, err := m.cw.PutMetricData(ctx, input); err != nil {
		m.logger.Error().
			Err(err).
			Str("namespace", namespace).
			Str("metric_name", metric.Name).
			Msg("Failed to send custom metric to CloudWatch")
		return fmt.Errorf("failed to send custom metric: %w", err)
	}

	m.logger.Debug().
		Str("namespace", namespace).
		Str("metric_name", metric.Name).
		Float64("value", metric.Value).
		Msg("Successfully sent custom metric to CloudWatch")
	return nil
}