Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
373 lines
9.8 KiB
Go
373 lines
9.8 KiB
Go
package metrics
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/aws/aws-sdk-go-v2/aws"
|
|
"github.com/aws/aws-sdk-go-v2/service/cloudwatch"
|
|
"github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
|
|
"github.com/rs/zerolog"
|
|
)
|
|
|
|
// MetricsClient wraps CloudWatch metrics functionality
|
|
type MetricsClient struct {
|
|
cw *cloudwatch.Client
|
|
logger zerolog.Logger
|
|
}
|
|
|
|
// NewMetricsClient creates a new metrics client
|
|
func NewMetricsClient(awsConfig aws.Config, logger zerolog.Logger) *MetricsClient {
|
|
return &MetricsClient{
|
|
cw: cloudwatch.NewFromConfig(awsConfig),
|
|
logger: logger,
|
|
}
|
|
}
|
|
|
|
// MessageProcessingMetrics holds metrics for message processing. One value
// describes the outcome of a single processed message.
type MessageProcessingMetrics struct {
	// ProcessingTime is how long processing took; it is reported in
	// milliseconds.
	ProcessingTime time.Duration
	// Success reports whether processing succeeded.
	Success bool
	// MessageType is published as the "MessageType" dimension.
	MessageType string
	// ProviderName is published as the "ProviderName" dimension.
	ProviderName string
	// ErrorType is published as the "ErrorType" dimension, but only when
	// Success is false and the value is non-empty.
	ErrorType string
}
|
|
|
|
// SendMessageProcessingMetrics sends message processing metrics to CloudWatch
|
|
func (m *MetricsClient) SendMessageProcessingMetrics(ctx context.Context, metrics MessageProcessingMetrics) error {
|
|
namespace := "MeteorApp/ComputeService"
|
|
timestamp := time.Now()
|
|
|
|
dimensions := []types.Dimension{
|
|
{
|
|
Name: aws.String("MessageType"),
|
|
Value: aws.String(metrics.MessageType),
|
|
},
|
|
{
|
|
Name: aws.String("ProviderName"),
|
|
Value: aws.String(metrics.ProviderName),
|
|
},
|
|
{
|
|
Name: aws.String("Success"),
|
|
Value: aws.String(fmt.Sprintf("%v", metrics.Success)),
|
|
},
|
|
}
|
|
|
|
// Add error type dimension if processing failed
|
|
if !metrics.Success && metrics.ErrorType != "" {
|
|
dimensions = append(dimensions, types.Dimension{
|
|
Name: aws.String("ErrorType"),
|
|
Value: aws.String(metrics.ErrorType),
|
|
})
|
|
}
|
|
|
|
metricData := []types.MetricDatum{
|
|
// Message processing count
|
|
{
|
|
MetricName: aws.String("MessageProcessingCount"),
|
|
Value: aws.Float64(1),
|
|
Unit: types.StandardUnitCount,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
},
|
|
// Processing duration
|
|
{
|
|
MetricName: aws.String("MessageProcessingDuration"),
|
|
Value: aws.Float64(float64(metrics.ProcessingTime.Milliseconds())),
|
|
Unit: types.StandardUnitMilliseconds,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
},
|
|
}
|
|
|
|
// Add success/error specific metrics
|
|
if metrics.Success {
|
|
metricData = append(metricData, types.MetricDatum{
|
|
MetricName: aws.String("MessageProcessingSuccess"),
|
|
Value: aws.Float64(1),
|
|
Unit: types.StandardUnitCount,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
})
|
|
} else {
|
|
metricData = append(metricData, types.MetricDatum{
|
|
MetricName: aws.String("MessageProcessingError"),
|
|
Value: aws.Float64(1),
|
|
Unit: types.StandardUnitCount,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
})
|
|
}
|
|
|
|
input := &cloudwatch.PutMetricDataInput{
|
|
Namespace: aws.String(namespace),
|
|
MetricData: metricData,
|
|
}
|
|
|
|
_, err := m.cw.PutMetricData(ctx, input)
|
|
if err != nil {
|
|
m.logger.Error().
|
|
Err(err).
|
|
Str("namespace", namespace).
|
|
Str("message_type", metrics.MessageType).
|
|
Str("provider_name", metrics.ProviderName).
|
|
Msg("Failed to send message processing metrics to CloudWatch")
|
|
return fmt.Errorf("failed to send message processing metrics: %w", err)
|
|
}
|
|
|
|
m.logger.Debug().
|
|
Str("namespace", namespace).
|
|
Str("message_type", metrics.MessageType).
|
|
Str("provider_name", metrics.ProviderName).
|
|
Bool("success", metrics.Success).
|
|
Dur("processing_time", metrics.ProcessingTime).
|
|
Msg("Successfully sent message processing metrics to CloudWatch")
|
|
|
|
return nil
|
|
}
|
|
|
|
// ValidationMetrics holds metrics for validation operations. One value
// describes the outcome of a single validation run.
type ValidationMetrics struct {
	// ValidationTime is how long validation took; it is reported in
	// milliseconds.
	ValidationTime time.Duration
	// Success reports whether validation succeeded.
	Success bool
	// ProviderName is published as the "ProviderName" dimension.
	ProviderName string
	// EventCount is published as the value of the EventsProcessed metric.
	EventCount int
	// ErrorType is published as the "ErrorType" dimension, but only when
	// Success is false and the value is non-empty.
	ErrorType string
}
|
|
|
|
// SendValidationMetrics sends validation metrics to CloudWatch
|
|
func (m *MetricsClient) SendValidationMetrics(ctx context.Context, metrics ValidationMetrics) error {
|
|
namespace := "MeteorApp/ComputeService"
|
|
timestamp := time.Now()
|
|
|
|
dimensions := []types.Dimension{
|
|
{
|
|
Name: aws.String("ProviderName"),
|
|
Value: aws.String(metrics.ProviderName),
|
|
},
|
|
{
|
|
Name: aws.String("Success"),
|
|
Value: aws.String(fmt.Sprintf("%v", metrics.Success)),
|
|
},
|
|
}
|
|
|
|
if !metrics.Success && metrics.ErrorType != "" {
|
|
dimensions = append(dimensions, types.Dimension{
|
|
Name: aws.String("ErrorType"),
|
|
Value: aws.String(metrics.ErrorType),
|
|
})
|
|
}
|
|
|
|
metricData := []types.MetricDatum{
|
|
// Validation count
|
|
{
|
|
MetricName: aws.String("ValidationCount"),
|
|
Value: aws.Float64(1),
|
|
Unit: types.StandardUnitCount,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
},
|
|
// Validation duration
|
|
{
|
|
MetricName: aws.String("ValidationDuration"),
|
|
Value: aws.Float64(float64(metrics.ValidationTime.Milliseconds())),
|
|
Unit: types.StandardUnitMilliseconds,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
},
|
|
// Event count processed
|
|
{
|
|
MetricName: aws.String("EventsProcessed"),
|
|
Value: aws.Float64(float64(metrics.EventCount)),
|
|
Unit: types.StandardUnitCount,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
},
|
|
}
|
|
|
|
// Add success/error specific metrics
|
|
if metrics.Success {
|
|
metricData = append(metricData, types.MetricDatum{
|
|
MetricName: aws.String("ValidationSuccess"),
|
|
Value: aws.Float64(1),
|
|
Unit: types.StandardUnitCount,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
})
|
|
} else {
|
|
metricData = append(metricData, types.MetricDatum{
|
|
MetricName: aws.String("ValidationError"),
|
|
Value: aws.Float64(1),
|
|
Unit: types.StandardUnitCount,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
})
|
|
}
|
|
|
|
input := &cloudwatch.PutMetricDataInput{
|
|
Namespace: aws.String(namespace),
|
|
MetricData: metricData,
|
|
}
|
|
|
|
_, err := m.cw.PutMetricData(ctx, input)
|
|
if err != nil {
|
|
m.logger.Error().
|
|
Err(err).
|
|
Str("namespace", namespace).
|
|
Str("provider_name", metrics.ProviderName).
|
|
Msg("Failed to send validation metrics to CloudWatch")
|
|
return fmt.Errorf("failed to send validation metrics: %w", err)
|
|
}
|
|
|
|
m.logger.Debug().
|
|
Str("namespace", namespace).
|
|
Str("provider_name", metrics.ProviderName).
|
|
Bool("success", metrics.Success).
|
|
Dur("validation_time", metrics.ValidationTime).
|
|
Int("event_count", metrics.EventCount).
|
|
Msg("Successfully sent validation metrics to CloudWatch")
|
|
|
|
return nil
|
|
}
|
|
|
|
// DatabaseMetrics holds metrics for database operations. One value describes
// the outcome of a single database operation.
type DatabaseMetrics struct {
	// Operation is published as the "Operation" dimension.
	Operation string
	// Duration is how long the operation took; it is reported in
	// milliseconds.
	Duration time.Duration
	// Success reports whether the operation succeeded.
	Success bool
	// RecordCount is published as DatabaseRecordsProcessed, but only when it
	// is greater than zero.
	RecordCount int
	// ErrorType is published as the "ErrorType" dimension, but only when
	// Success is false and the value is non-empty.
	ErrorType string
}
|
|
|
|
// SendDatabaseMetrics sends database metrics to CloudWatch
|
|
func (m *MetricsClient) SendDatabaseMetrics(ctx context.Context, metrics DatabaseMetrics) error {
|
|
namespace := "MeteorApp/ComputeService"
|
|
timestamp := time.Now()
|
|
|
|
dimensions := []types.Dimension{
|
|
{
|
|
Name: aws.String("Operation"),
|
|
Value: aws.String(metrics.Operation),
|
|
},
|
|
{
|
|
Name: aws.String("Success"),
|
|
Value: aws.String(fmt.Sprintf("%v", metrics.Success)),
|
|
},
|
|
}
|
|
|
|
if !metrics.Success && metrics.ErrorType != "" {
|
|
dimensions = append(dimensions, types.Dimension{
|
|
Name: aws.String("ErrorType"),
|
|
Value: aws.String(metrics.ErrorType),
|
|
})
|
|
}
|
|
|
|
metricData := []types.MetricDatum{
|
|
// Database operation count
|
|
{
|
|
MetricName: aws.String("DatabaseOperationCount"),
|
|
Value: aws.Float64(1),
|
|
Unit: types.StandardUnitCount,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
},
|
|
// Operation duration
|
|
{
|
|
MetricName: aws.String("DatabaseOperationDuration"),
|
|
Value: aws.Float64(float64(metrics.Duration.Milliseconds())),
|
|
Unit: types.StandardUnitMilliseconds,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
},
|
|
}
|
|
|
|
// Add record count if applicable
|
|
if metrics.RecordCount > 0 {
|
|
metricData = append(metricData, types.MetricDatum{
|
|
MetricName: aws.String("DatabaseRecordsProcessed"),
|
|
Value: aws.Float64(float64(metrics.RecordCount)),
|
|
Unit: types.StandardUnitCount,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
})
|
|
}
|
|
|
|
input := &cloudwatch.PutMetricDataInput{
|
|
Namespace: aws.String(namespace),
|
|
MetricData: metricData,
|
|
}
|
|
|
|
_, err := m.cw.PutMetricData(ctx, input)
|
|
if err != nil {
|
|
m.logger.Error().
|
|
Err(err).
|
|
Str("namespace", namespace).
|
|
Str("operation", metrics.Operation).
|
|
Msg("Failed to send database metrics to CloudWatch")
|
|
return fmt.Errorf("failed to send database metrics: %w", err)
|
|
}
|
|
|
|
m.logger.Debug().
|
|
Str("namespace", namespace).
|
|
Str("operation", metrics.Operation).
|
|
Bool("success", metrics.Success).
|
|
Dur("duration", metrics.Duration).
|
|
Int("record_count", metrics.RecordCount).
|
|
Msg("Successfully sent database metrics to CloudWatch")
|
|
|
|
return nil
|
|
}
|
|
|
|
// CustomMetric holds custom metric data
|
|
type CustomMetric struct {
|
|
Name string
|
|
Value float64
|
|
Unit types.StandardUnit
|
|
Dimensions map[string]string
|
|
}
|
|
|
|
// SendCustomMetric sends a custom metric to CloudWatch
|
|
func (m *MetricsClient) SendCustomMetric(ctx context.Context, metric CustomMetric) error {
|
|
namespace := "MeteorApp/ComputeService"
|
|
timestamp := time.Now()
|
|
|
|
dimensions := make([]types.Dimension, 0, len(metric.Dimensions))
|
|
for key, value := range metric.Dimensions {
|
|
dimensions = append(dimensions, types.Dimension{
|
|
Name: aws.String(key),
|
|
Value: aws.String(value),
|
|
})
|
|
}
|
|
|
|
input := &cloudwatch.PutMetricDataInput{
|
|
Namespace: aws.String(namespace),
|
|
MetricData: []types.MetricDatum{
|
|
{
|
|
MetricName: aws.String(metric.Name),
|
|
Value: aws.Float64(metric.Value),
|
|
Unit: metric.Unit,
|
|
Timestamp: ×tamp,
|
|
Dimensions: dimensions,
|
|
},
|
|
},
|
|
}
|
|
|
|
_, err := m.cw.PutMetricData(ctx, input)
|
|
if err != nil {
|
|
m.logger.Error().
|
|
Err(err).
|
|
Str("namespace", namespace).
|
|
Str("metric_name", metric.Name).
|
|
Msg("Failed to send custom metric to CloudWatch")
|
|
return fmt.Errorf("failed to send custom metric: %w", err)
|
|
}
|
|
|
|
m.logger.Debug().
|
|
Str("namespace", namespace).
|
|
Str("metric_name", metric.Name).
|
|
Float64("value", metric.Value).
|
|
Msg("Successfully sent custom metric to CloudWatch")
|
|
|
|
return nil
|
|
} |