Implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across the Epic 3 stories:

**Story 3.5: Core Business Metrics Monitoring**
- Instrumented the NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented the Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with a modular design
- Built a 5-row CloudWatch dashboard covering application, error-rate, business, and infrastructure metrics
- Added error categorization and provider performance tracking

**Story 3.6: Critical System Alerts**
- Implemented SNS-based alerting infrastructure via Terraform
- Created a critical alarm for the NestJS 5xx error rate (>1% threshold)
- Created a Go service processing failure rate alarm (>5% threshold)
- Created an SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with a manual confirmation workflow (see the SNS sketch below)

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and a deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
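The alarms in the file below publish to an `aws_sns_topic.alerts` resource defined elsewhere in the module. A minimal sketch of that topic and its email subscription, assuming a `var.alert_email` variable (the names here are illustrative, not confirmed by this commit):

```hcl
# Sketch (assumed names): the alerts topic referenced by the alarms in this file.
resource "aws_sns_topic" "alerts" {
  name = "${local.name_prefix}-alerts"
  tags = local.common_tags
}

# Email endpoints stay "pending confirmation" until the recipient clicks the link
# AWS sends, which is the manual confirmation workflow mentioned above.
resource "aws_sns_topic_subscription" "alert_email" {
  topic_arn = aws_sns_topic.alerts.arn
  protocol  = "email"
  endpoint  = var.alert_email # assumed variable
}
```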
# CloudWatch Dashboard for Meteor Application Monitoring
resource "aws_cloudwatch_dashboard" "meteor_dashboard" {
  dashboard_name = "${local.name_prefix}-monitoring-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      # Row 1: Application Overview
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/WebBackend", "RequestCount", { "stat": "Sum" }],
            [".", "ErrorCount", { "stat": "Sum" }],
            ["MeteorApp/ComputeService", "MessageProcessingCount", { "stat": "Sum" }],
            [".", "MessageProcessingError", { "stat": "Sum" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Request and Processing Volume"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 12
        y      = 0
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/WebBackend", "RequestDuration", { "stat": "Average" }],
            [".", "RequestDuration", { "stat": "p95" }],
            ["MeteorApp/ComputeService", "MessageProcessingDuration", { "stat": "Average" }],
            [".", "MessageProcessingDuration", { "stat": "p95" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Response Time and Processing Latency"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },

      # Row 2: Error Rates and Success Metrics
      {
        type   = "metric"
        x      = 0
        y      = 6
        width  = 8
        height = 6

        properties = {
          # The hidden series m1-m4 exist only to feed the two metric math
          # expressions that render the error-rate lines.
          metrics = [
            [{ "expression": "m1/m2*100", "label": "Web Backend Error Rate %" }],
            [{ "expression": "m3/m4*100", "label": "Compute Service Error Rate %" }],
            ["MeteorApp/WebBackend", "ErrorCount", { "id": "m1", "visible": false }],
            [".", "RequestCount", { "id": "m2", "visible": false }],
            ["MeteorApp/ComputeService", "MessageProcessingError", { "id": "m3", "visible": false }],
            [".", "MessageProcessingCount", { "id": "m4", "visible": false }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Error Rates"
          period  = 300
          yAxis = {
            left = {
              min = 0
              max = 100
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 8
        y      = 6
        width  = 8
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/WebBackend", "AuthOperationCount", "Success", "true"],
            [".", "PaymentOperationCount", "Success", "true"],
            ["MeteorApp/ComputeService", "ValidationSuccess"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Successful Operations"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 16
        y      = 6
        width  = 8
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/ComputeService", "EventsProcessed", { "stat": "Sum" }],
            [".", "ValidationCount", { "stat": "Sum" }],
            ["MeteorApp/WebBackend", "EventProcessingCount", { "stat": "Sum" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Event Processing Volume"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },

      # Row 3: Infrastructure Metrics
      {
        type   = "metric"
        x      = 0
        y      = 12
        width  = 8
        height = 6

        properties = {
          metrics = concat(
            var.enable_rds ? [
              ["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", "${local.name_prefix}-postgres"],
              [".", "DatabaseConnections", "DBInstanceIdentifier", "${local.name_prefix}-postgres"]
            ] : [],
            [
              # Add external database metrics if available
            ]
          )
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Database Performance"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 8
        y      = 12
        width  = 8
        height = 6

        properties = {
          # NOTE: the AWS/SQS metric is named ApproximateNumberOfMessagesVisible;
          # the previously used "ApproximateNumberOfVisibleMessages" does not exist.
          metrics = [
            ["AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName", aws_sqs_queue.meteor_processing.name],
            [".", "ApproximateAgeOfOldestMessage", "QueueName", aws_sqs_queue.meteor_processing.name],
            [".", "ApproximateNumberOfMessagesVisible", "QueueName", aws_sqs_queue.meteor_processing_dlq.name]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "SQS Queue Metrics"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 16
        y      = 12
        width  = 8
        height = 6

        properties = {
          metrics = concat(
            var.enable_fargate ? [
              # NOTE: AWS/ECS utilization metrics are published with ClusterName and
              # ServiceName dimensions together; add the ClusterName dimension here
              # once the cluster resource is referenced, or these series stay empty.
              ["AWS/ECS", "CPUUtilization", "ServiceName", "${local.name_prefix}-web-backend"],
              [".", "MemoryUtilization", "ServiceName", "${local.name_prefix}-web-backend"],
              [".", "CPUUtilization", "ServiceName", "${local.name_prefix}-compute-service"],
              [".", "MemoryUtilization", "ServiceName", "${local.name_prefix}-compute-service"]
            ] : [],
            [
              # Placeholder for external container metrics
            ]
          )
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Container Resource Utilization"
          period  = 300
          yAxis = {
            left = {
              min = 0
              max = 100
            }
          }
        }
      },

      # Row 4: Business Metrics
      {
        type   = "metric"
        x      = 0
        y      = 18
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/ComputeService", "ValidationDuration", "ProviderName", "classic_cv", { "stat": "Average" }],
            [".", "ValidationDuration", "ProviderName", "mvp", { "stat": "Average" }],
            [".", "ValidationCount", "ProviderName", "classic_cv"],
            [".", "ValidationCount", "ProviderName", "mvp"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Validation Provider Performance"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 12
        y      = 18
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/ComputeService", "DatabaseOperationDuration", "Operation", "CreateValidatedEvent"],
            [".", "DatabaseOperationDuration", "Operation", "GetRawEventByID"],
            [".", "DatabaseOperationCount", "Operation", "CreateValidatedEvent"],
            [".", "DatabaseOperationCount", "Operation", "GetRawEventByID"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Database Operation Performance"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },

      # Row 5: Custom Metrics and Alerts
      {
        type   = "metric"
        x      = 0
        y      = 24
        width  = 8
        height = 6

        properties = {
          metrics = [
            ["AWS/S3", "BucketSizeBytes", "BucketName", aws_s3_bucket.meteor_events.bucket, "StorageType", "StandardStorage"],
            [".", "NumberOfObjects", "BucketName", aws_s3_bucket.meteor_events.bucket, "StorageType", "AllStorageTypes"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "S3 Storage Metrics"
          period  = 86400 # Daily
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "log"
        x      = 8
        y      = 24
        width  = 16
        height = 6

        properties = {
          # Query the ECS log groups defined below; the previous query pointed at a
          # Lambda log group that this stack never creates.
          query  = "SOURCE '/aws/ecs/${local.name_prefix}-web-backend' | SOURCE '/aws/ecs/${local.name_prefix}-compute-service' | fields @timestamp, @message | filter @message like /ERROR/ | sort @timestamp desc | limit 20"
          region = var.aws_region
          title  = "Recent Error Logs"
          view   = "table"
        }
      }
    ]
  })

  # NOTE: aws_cloudwatch_dashboard does not accept a tags argument, so the
  # dashboard itself is left untagged.
}
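
# The custom namespaces used above (MeteorApp/WebBackend, MeteorApp/ComputeService)
# only receive data if the services are allowed to call cloudwatch:PutMetricData.
# A sketch of that permission as a standalone policy document; the data source name
# is an assumption, and attaching it to the actual task roles happens with the rest
# of the IAM definitions in this module.
data "aws_iam_policy_document" "put_custom_metrics" {
  statement {
    sid       = "AllowMeteorCustomMetrics"
    actions   = ["cloudwatch:PutMetricData"]
    resources = ["*"] # PutMetricData does not support resource-level permissions
    condition {
      test     = "StringEquals"
      variable = "cloudwatch:namespace"
      values   = ["MeteorApp/WebBackend", "MeteorApp/ComputeService"]
    }
  }
}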

# CloudWatch Log Groups
resource "aws_cloudwatch_log_group" "web_backend" {
  name              = "/aws/ecs/${local.name_prefix}-web-backend"
  retention_in_days = var.cloudwatch_log_retention_days

  tags = merge(local.common_tags, {
    Name        = "${local.name_prefix}-web-backend-logs"
    Description = "Log group for web backend service"
  })
}

resource "aws_cloudwatch_log_group" "compute_service" {
  name              = "/aws/ecs/${local.name_prefix}-compute-service"
  retention_in_days = var.cloudwatch_log_retention_days

  tags = merge(local.common_tags, {
    Name        = "${local.name_prefix}-compute-service-logs"
    Description = "Log group for compute service"
  })
}
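
# The variables referenced in this file are declared elsewhere in the module
# (variables.tf). The following is a reference sketch of those declarations; the
# defaults are inferred from the alarm thresholds described in the commit (1% /
# 5% / 1000 messages over 5-minute periods) and may not match the real defaults.
variable "aws_region" {
  type = string
}

variable "cloudwatch_log_retention_days" {
  type    = number
  default = 30 # assumed
}

variable "alarm_evaluation_periods" {
  type    = number
  default = 1
}

variable "alarm_period_seconds" {
  type    = number
  default = 300 # 5 minutes, per the alarm descriptions
}

variable "nestjs_error_rate_threshold" {
  type    = number
  default = 1 # percent
}

variable "go_service_failure_rate_threshold" {
  type    = number
  default = 5 # percent
}

variable "sqs_queue_depth_threshold" {
  type    = number
  default = 1000 # visible messages
}

variable "enable_rds" {
  type    = bool
  default = false
}

variable "enable_fargate" {
  type    = bool
  default = false
}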

# CloudWatch Alarms for Critical System Health

# Alarm for NestJS 5xx Error Rate (>1% over 5 minutes)
resource "aws_cloudwatch_metric_alarm" "nestjs_5xx_error_rate" {
  alarm_name          = "${local.name_prefix}-nestjs-5xx-error-rate"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = var.alarm_evaluation_periods
  treat_missing_data  = "notBreaching"

  metric_query {
    id          = "e1"
    return_data = false

    metric {
      metric_name = "ErrorCount"
      namespace   = "MeteorApp/WebBackend"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  metric_query {
    id          = "e2"
    return_data = false

    metric {
      metric_name = "RequestCount"
      namespace   = "MeteorApp/WebBackend"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  # CloudWatch alarms cannot evaluate SEARCH() expressions, so the rate is
  # computed directly from the ErrorCount/RequestCount series above. This assumes
  # ErrorCount tracks 5xx responses; if it also counts 4xx responses, publish a
  # dedicated 5xx metric and reference it in e1 instead.
  metric_query {
    id          = "e3"
    expression  = "(e1/e2)*100"
    label       = "5xx Error Rate %"
    return_data = true
  }

  threshold         = var.nestjs_error_rate_threshold
  alarm_description = "CRITICAL: NestJS 5xx error rate exceeds ${var.nestjs_error_rate_threshold}% over 5 minutes. This indicates server errors that require immediate investigation. Check application logs and recent deployments."
  alarm_actions     = [aws_sns_topic.alerts.arn]
  ok_actions        = [aws_sns_topic.alerts.arn]

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-nestjs-5xx-error-rate"
    Severity = "Critical"
    Service  = "WebBackend"
  })
}

# Alarm for Go Service Processing Failure Rate (>5% over 5 minutes)
resource "aws_cloudwatch_metric_alarm" "go_service_failure_rate" {
  alarm_name          = "${local.name_prefix}-go-service-failure-rate"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = var.alarm_evaluation_periods
  treat_missing_data  = "notBreaching"

  metric_query {
    id          = "e1"
    return_data = false

    metric {
      metric_name = "MessageProcessingError"
      namespace   = "MeteorApp/ComputeService"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  metric_query {
    id          = "e2"
    return_data = false

    metric {
      metric_name = "MessageProcessingCount"
      namespace   = "MeteorApp/ComputeService"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  metric_query {
    id          = "e3"
    expression  = "(e1/e2)*100"
    label       = "Processing Failure Rate %"
    return_data = true
  }

  threshold         = var.go_service_failure_rate_threshold
  alarm_description = "CRITICAL: Go compute service processing failure rate exceeds ${var.go_service_failure_rate_threshold}% over 5 minutes. This indicates message processing issues. Check service logs, SQS dead letter queue, and validation providers."
  alarm_actions     = [aws_sns_topic.alerts.arn]
  ok_actions        = [aws_sns_topic.alerts.arn]

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-go-service-failure-rate"
    Severity = "Critical"
    Service  = "ComputeService"
  })
}

# Alarm for SQS Queue Depth (>1000 visible messages)
resource "aws_cloudwatch_metric_alarm" "sqs_queue_depth" {
  alarm_name          = "${local.name_prefix}-sqs-queue-depth"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = var.alarm_evaluation_periods
  # NOTE: the AWS/SQS metric is named ApproximateNumberOfMessagesVisible;
  # the previously used "ApproximateNumberOfVisibleMessages" does not exist.
  metric_name        = "ApproximateNumberOfMessagesVisible"
  namespace          = "AWS/SQS"
  period             = var.alarm_period_seconds
  statistic          = "Average"
  threshold          = var.sqs_queue_depth_threshold
  treat_missing_data = "notBreaching"
  alarm_description  = "CRITICAL: SQS queue depth exceeds ${var.sqs_queue_depth_threshold} messages. This indicates message processing backlog. Check compute service health, scaling, and processing capacity."
  alarm_actions      = [aws_sns_topic.alerts.arn]
  ok_actions         = [aws_sns_topic.alerts.arn]

  dimensions = {
    QueueName = aws_sqs_queue.meteor_processing.name
  }

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-sqs-queue-depth"
    Severity = "Critical"
    Service  = "SQS"
  })
}
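
# Not part of the original file: the dashboard above already tracks the dead
# letter queue, so a companion alarm on DLQ arrivals is a natural extension.
# Sketch only; the zero threshold ("any DLQ message pages") is an assumption,
# not a project decision.
resource "aws_cloudwatch_metric_alarm" "sqs_dlq_messages" {
  alarm_name          = "${local.name_prefix}-sqs-dlq-messages"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 1
  metric_name         = "ApproximateNumberOfMessagesVisible"
  namespace           = "AWS/SQS"
  period              = var.alarm_period_seconds
  statistic           = "Sum"
  threshold           = 0
  treat_missing_data  = "notBreaching"
  alarm_description   = "CRITICAL: Messages have landed in the dead letter queue, meaning processing failed after all retries. Inspect the DLQ payloads and compute service logs."
  alarm_actions       = [aws_sns_topic.alerts.arn]
  ok_actions          = [aws_sns_topic.alerts.arn]

  dimensions = {
    QueueName = aws_sqs_queue.meteor_processing_dlq.name
  }

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-sqs-dlq-messages"
    Severity = "Critical"
    Service  = "SQS"
  })
}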