Explorar o código

feat(logsys): 收敛Vector采集并增强异常日志可观测性

- 仅采集 shop-recycle 命名空间且 app.kubernetes.io/name=shop-recycle 的日志
- 过滤健康检查与探针噪声(kube-probe、actuator/health、actuator/prometheus)
- 精简落库字段并保留 kubernetes_pod_name/kubernetes_container_image
- 对 ERROR 级日志保留 stack_trace,长度截断为 8KB
- 调整 Loki labels 为 app/env/level/status/event_class
demo-user hai 2 meses
pai
achega
c7ff80226b
Modificouse 1 ficheiro con 110 adicións e 64 borrados
  1. 110 64
      k8s/helm/logsys/templates/configmap-vector.yaml

+ 110 - 64
k8s/helm/logsys/templates/configmap-vector.yaml

@@ -16,8 +16,15 @@ data:
     [transforms.parse_and_enrich]
     type = "remap"
     inputs = ["kubernetes_logs"]
-    drop_on_abort = false
+    drop_on_abort = true
     source = """
+    ns = to_string(.kubernetes.pod_namespace) ?? ""
+    pod_labels = .kubernetes.pod_labels
+    if pod_labels == null { abort }
+
+    app_name = to_string(.kubernetes.pod_labels."app.kubernetes.io/name") ?? ""
+    if ns != "shop-recycle" || app_name != "shop-recycle" { abort }
+
     log_line = .log
     if log_line == null { log_line = .message }
     if log_line == null { log_line = .msg }
@@ -29,6 +36,8 @@ data:
     }
 
     if parsed != null && is_object(parsed) {
+      if parsed.ts != null { .ts = parsed.ts }
+
       msg_val = parsed.message
       if msg_val == null { msg_val = parsed.msg }
       if msg_val != null { .message = msg_val }
@@ -36,89 +45,127 @@ data:
       if parsed.level != null { .level = parsed.level }
       if parsed.app != null { .app = parsed.app }
       if parsed.env != null { .env = parsed.env }
-      if parsed.event_class != null { .event_class = parsed.event_class }
-      if parsed.event != null { .event_class = parsed.event }
       if parsed.uri != null { .uri = parsed.uri }
-      if parsed.uri_group != null { .uri_group = parsed.uri_group }
       if parsed.status != null { .status = parsed.status }
-      if parsed.duration != null { .duration = parsed.duration }
+      if parsed.event_class != null { .event_class = parsed.event_class }
+      if parsed.event != null && .event_class == null { .event_class = parsed.event }
+
       if parsed.traceId != null { .traceId = parsed.traceId }
       if parsed.userId != null { .userId = parsed.userId }
       if parsed.orderId != null { .orderId = parsed.orderId }
       if parsed.error != null { .error = parsed.error }
-      if parsed.logger_name != null { .logger = parsed.logger_name }
-      if parsed.thread_name != null { .thread = parsed.thread_name }
+      if parsed.stack_trace != null { .stack_trace = parsed.stack_trace }
+
+      if parsed.duration_ms != null { .duration_ms = parsed.duration_ms }
+      if parsed.duration != null { .duration = parsed.duration }
     }
 
     if .message == null { .message = log_line }
 
-    app_val = .app
-    if app_val == null { app_val = .kubernetes.labels.app }
-    if app_val == null { app_val = .kubernetes.pod_labels.app }
-    if app_val == null { app_val = .kubernetes.container_name }
-    if app_val == null { app_val = .kubernetes.pod_name }
-    if app_val == null { app_val = "unknown" }
-    .app = app_val
+    if .app == null {
+      app_val = .kubernetes.pod_labels.app
+      if app_val == null { app_val = .kubernetes.container_name }
+      if app_val == null { app_val = "unknown" }
+      .app = app_val
+    }
 
     env_str = to_string(.env) ?? ""
-    if contains(env_str, "prod") {
-      .env = "prod"
-    } else if contains(env_str, "stage") {
-      .env = "stage"
-    } else if contains(env_str, "dev") {
-      .env = "dev"
-    } else if contains(env_str, "local") {
-      .env = "local"
-    } else if env_str != "" {
-      .env = env_str
-    } else {
-      .env = "unknown"
-    }
+    if env_str == "" { .env = "unknown" } else { .env = env_str }
+
+    level_str = to_string(.level) ?? ""
+    if level_str == "" { .level = "INFO" } else { .level = level_str }
 
     uri_str = to_string(.uri) ?? ""
-    if uri_str != "" { .uri = uri_str }
-
-    if !exists(.uri_group) || .uri_group == "" {
-      if uri_str != "" {
-        if starts_with(uri_str, "/order/") {
-          .uri_group = "/order/*"
-        } else if starts_with(uri_str, "/payment/") {
-          .uri_group = "/payment/*"
-        } else if starts_with(uri_str, "/gateway/") {
-          .uri_group = "/gateway/*"
-        } else {
-          .uri_group = "/other"
-        }
-      } else {
-        .uri_group = "/other"
-      }
-    }
+    if uri_str == "" { .uri = "-" } else { .uri = uri_str }
 
-    if !exists(.event_class) || .event_class == "" {
-      if uri_str != "" {
-        if contains(uri_str, "order") {
-          .event_class = "order"
-        } else if contains(uri_str, "payment") {
-          .event_class = "payment"
-        } else if contains(uri_str, "login") {
-          .event_class = "auth"
-        } else {
-          .event_class = "api"
-        }
+    status_str = to_string(.status) ?? ""
+    if status_str == "" { .status = "unknown" } else { .status = status_str }
+
+    event_class_str = to_string(.event_class) ?? ""
+    if event_class_str == "" {
+      if contains(uri_str, "order") {
+        .event_class = "order"
+      } else if contains(uri_str, "payment") {
+        .event_class = "payment"
+      } else if contains(uri_str, "login") {
+        .event_class = "auth"
       } else {
         .event_class = "api"
       }
+    } else {
+      .event_class = event_class_str
     }
 
-    level_str = to_string(.level) ?? ""
-    if level_str != "" { .level = level_str } else { .level = "INFO" }
+    if .ts == null { .ts = .timestamp }
+    .ts = to_string(.ts) ?? ""
 
-    status_str = to_string(.status) ?? ""
-    if status_str != "" { .status = status_str } else { .status = "unknown" }
+    if .duration_ms == null {
+      duration_str = to_string(.duration) ?? ""
+      duration_str = replace(duration_str, "ms", "")
+      .duration_ms = to_int(duration_str) ?? null
+    } else {
+      .duration_ms = to_int(.duration_ms) ?? null
+    }
+
+    app_str = to_string(.app) ?? "unknown"
+    if app_str == "" { .app = "unknown" } else { .app = app_str }
+    msg_str = to_string(.message) ?? log_line
+    .message = msg_str
+    msg_lower = downcase(msg_str)
+    uri_lower = downcase(uri_str)
+
+    # 过滤健康检查探针日志,避免污染业务检索视图
+    if contains(msg_lower, "kube-probe/") { abort }
+    if uri_lower == "/actuator/health" || starts_with(uri_lower, "/actuator/health/") { abort }
+    if uri_lower == "/actuator/prometheus" || starts_with(uri_lower, "/actuator/prometheus/") { abort }
+    if uri_lower == "/health" || uri_lower == "/healthz" || uri_lower == "/readyz" || uri_lower == "/livez" { abort }
+    if contains(msg_lower, "/actuator/prometheus") { abort }
+
+    pod_name = to_string(.kubernetes.pod_name) ?? "-"
+    image_name = to_string(.kubernetes.container_image) ?? "-"
+    if pod_name == "" { pod_name = "-" }
+    if image_name == "" { image_name = "-" }
 
-    duration_str = to_string(.duration) ?? ""
-    duration_str = replace!(duration_str, "ms", "")
-    .duration_ms = to_int(duration_str) ?? null
+    trace_val = to_string(.traceId) ?? ""
+    user_val = to_string(.userId) ?? ""
+    order_val = to_string(.orderId) ?? ""
+    error_val = to_string(.error) ?? ""
+
+    if trace_val == "" { .traceId = null } else { .traceId = trace_val }
+    if user_val == "" { .userId = null } else { .userId = user_val }
+    if order_val == "" { .orderId = null } else { .orderId = order_val }
+    if error_val == "" { .error = null } else { .error = error_val }
+    if .level != "ERROR" || !is_string(.stack_trace) || .stack_trace == "" {
+      .stack_trace = null
+    } else {
+      .stack_trace = truncate(.stack_trace, 8192)
+    }
+
+    . = {
+      "ts": .ts,
+      "level": .level,
+      "app": .app,
+      "env": .env,
+      "event_class": .event_class,
+      "uri": .uri,
+      "status": .status,
+      "duration_ms": .duration_ms,
+      "traceId": .traceId,
+      "userId": .userId,
+      "orderId": .orderId,
+      "kubernetes_pod_name": pod_name,
+      "kubernetes_container_image": image_name,
+      "message": .message,
+      "error": .error,
+      "stack_trace": .stack_trace
+    }
+
+    if .duration_ms == null { del(.duration_ms) }
+    if .traceId == null { del(.traceId) }
+    if .userId == null { del(.userId) }
+    if .orderId == null { del(.orderId) }
+    if .error == null { del(.error) }
+    if .stack_trace == null { del(.stack_trace) }
     """
 
     [sinks.loki]
@@ -132,6 +179,5 @@ data:
     env = '{{ "{{ env }}" }}'
     app = '{{ "{{ app }}" }}'
     level = '{{ "{{ level }}" }}'
-    event_class = '{{ "{{ event_class }}" }}'
-    uri_group = '{{ "{{ uri_group }}" }}'
     status = '{{ "{{ status }}" }}'
+    event_class = '{{ "{{ event_class }}" }}'