Browse Source

日志方案改动

DevOps Team 2 months ago
parent
commit
d1f96fce2f
1 changed files with 430 additions and 0 deletions
  1. 430 0
      logs/Log.md

+ 430 - 0
logs/Log.md

@@ -0,0 +1,430 @@
+# 生产级日志标签规划与监控方案(零侵入 · 防爆炸 · 生产就绪)
+
+---
+
+## ▸ 目标成果(结果导向)
+
+### 获得能力
+1. **自动 QPS 监控**:按照 `uri_group`、`event_class` 实时统计请求速率,支持 Grafana 告警
+2. **订单可观测性**:日志自动贴 `order` 事件标签,支持订单成功率/失败率/异常原因查询
+3. **延时分布洞察**:p50/p95/p99 延时指标直接从 `duration` 字段提取,无需额外 APM
+4. **基数防爆**:CI 阶段自动检测待上 label 的唯一值,超限即阻断发布
+5. **低侵入**:业务代码 0 改动,只在公共 Filter + logback 配置层统一处理 MDC 字段
+
+### 成本对标
+| 阶段 | CPU | 内存 | 采集延迟 |
+| --- | ---: | ---: | ---: |
+| 原始全量 JSON 解析(Promtail/无过滤) | 150-200% | 400MB | >30s |
+| 本方案(Vector + AsyncAppender) | 8-15% | 80-100MB | <2s |
+
+---
+
+## ▸ 日志监控架构图(ASCII 版)
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                        应用端(Spring 微服务)                  │
+│                                                             │
+│  请求进来 → LoggingFilter/Interceptor(统一设置 MDC)        │
+│              → TraceId、uri、uri_group、event、start_time     │
+│              → 业务代码(无改动)                            │
+│              → log.info 等出 JSON(Logback Async JSON)       │
+└─────────────────────────────────────────────────────────────┘
+                              ↓
+┌──────────────┐         ┌──────────────────────────────┐
+│   容器运行时   │ → 日志流  → │ 采集端(Vector Agent)         │
+└──────────────┘         │  parse_json → remap VRL → filter │
+                         │  ├─ 导出 Loki tags(env/app/event_class/uri_group/status)
+                         │  ├─ 聚合出 Prometheus metrics(requests_total/orders_failed)
+                         │  └─ 采样/限流/drop DEBUG TRACE
+                         └──────────────┬──────────────┘
+                                        ↓
+                          ┌───────────────────────────────┐
+                          │ Loki(日志存储)                │
+                          │ Prometheus(metric store)       │
+                          └──────────────┬────────────────┘
+                                         ↓
+                          ┌───────────────────────────────┐
+                          │ Grafana(Dashboards + Alerts) │
+                          │  • QPS / orders / latency        │
+                          │  • 基数门禁误差可视化            │
+                          └───────────────────────────────┘
+```
+
+---
+
+## ▸ 日志标签设计
+
+### 标签分类
+| 名称 | 说明 | 预期基数 | 是否做 labels |
+| --- | --- | ---: | --- |
+| `env` | 环境:prod/stage/dev | ~3 | 是 |
+| `app` | 服务名(shop-recycle-xxx) | <50 | 是 |
+| `level` | 日志级别 INFO/WARN/ERROR | ~5 | 是 |
+| `event_class` | 业务事件类别(order/api/audit) | <20 | 是 |
+| `uri_group` | 规范化 URI(/order/*) | <100 | 是 |
+| `status` | success/client_error/server_error | ~5 | 是 |
+
+### 高基数字段(保留在 JSON,而不做 label)
+- `userId`、`orderId`:高基数,留在 message 供查询
+- `traceId`、`spanId`:跟踪系统用,不做标签
+- `error`、`exception`:自由文本,保留原文
+- `msg`:应用日志内容,不做 label
+
+### 基数门禁(CI 阶段)
+业务部署前运行脚本,统计 candidate labels(`event_class`、`uri_group` 等)24h 唯一值,超过 5000 即阻断发布。
+
+```bash
+#!/bin/bash
+set -euo pipefail
+LABELS=("event_class" "uri_group" "status")
+for label in "${LABELS[@]}"; do
+  COUNT=$(logcli query '{env="prod"}' --since=24h -o raw |
+    jq -r ".[][$label]" | sort | uniq | wc -l)
+  if [ "$COUNT" -gt 5000 ]; then
+    echo "❌ label $label 基数 $COUNT 超过 5000,拒绝发布"
+    exit 1
+  fi
+done
+echo "✅ label 基数检查通过"
+```
+
+---
+
+## ▸ 实现方案(最小化代码入侵)
+
+### 第 1 步:应用层 Interceptor(统一 MDC)
+
+```java
+package com.ssm.shop.common.web.interceptor;
+
+import org.slf4j.MDC;
+import org.springframework.web.servlet.HandlerInterceptor;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import java.util.UUID;
+
+public class LoggingMdcInterceptor implements HandlerInterceptor {
+    @Override
+    public boolean preHandle(HttpServletRequest request, HttpServletResponse response, Object handler) {
+        String traceId = MDC.get("traceId");
+        if (traceId == null) {
+            traceId = request.getHeader("X-B3-TraceId");
+            if (traceId == null) {
+                traceId = UUID.randomUUID().toString();
+            }
+            MDC.put("traceId", traceId);
+        }
+        String uri = request.getRequestURI();
+        MDC.put("uri", uri);
+        MDC.put("uri_group", normalizeUri(uri));
+        String userId = request.getHeader("X-User-Id");
+        if (userId != null) {
+            MDC.put("userId", userId);
+        }
+        MDC.put("event_class", deriveEventClass(uri));
+        MDC.put("start_time", String.valueOf(System.currentTimeMillis()));
+        return true;
+    }
+
+    @Override
+    public void afterCompletion(HttpServletRequest request, HttpServletResponse response, Object handler, Exception ex) {
+        String startTime = MDC.get("start_time");
+        if (startTime != null) {
+            long duration = System.currentTimeMillis() - Long.parseLong(startTime);
+            MDC.put("duration", String.valueOf(duration));
+        }
+        if (ex != null || response.getStatus() >= 400) {
+            MDC.put("error", ex != null ? ex.toString() : "HTTP_" + response.getStatus());
+        }
+        if (response.getStatus() >= 400) {
+            MDC.put("status", response.getStatus() >= 500 ? "server_error" : "client_error");
+        } else {
+            MDC.put("status", "success");
+        }
+        MDC.clear();
+    }
+
+    private String normalizeUri(String uri) {
+        if (uri.startsWith("/order/")) {
+            return "/order/*";
+        }
+        if (uri.startsWith("/payment/")) {
+            return "/payment/*";
+        }
+        return uri;
+    }
+
+    private String deriveEventClass(String uri) {
+        if (uri.contains("order")) {
+            return "order";
+        }
+        if (uri.contains("login")) {
+            return "auth";
+        }
+        return "api";
+    }
+}
+```
+
+### 第 2 步:Logback 配置(统一 Async JSON)
+
+```xml
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+    <springProperty name="app" source="spring.application.name" />
+    <springProperty name="env" source="spring.profiles.active" defaultValue="prod" />
+
+    <appender name="ASYNC_JSON" class="ch.qos.logback.classic.AsyncAppender">
+        <queueSize>1024</queueSize>
+        <discardingThreshold>0</discardingThreshold>
+        <appender-ref ref="JSON" />
+    </appender>
+
+    <appender name="JSON" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder class="net.logstash.logback.encoder.LogstashEncoder">
+            <includeContext>true</includeContext>
+            <customFields>{"app":"${app}","env":"${env}"}</customFields>
+            <provider class="net.logstash.logback.composite.loggingevent.LoggingEventPatternJsonProvider">
+                <pattern>
+{
+  "ts":"%d{yyyy-MM-dd'T'HH:mm:ss.SSS'Z'}",
+  "level":"%level",
+  "logger":"%logger",
+  "msg":"%msg",
+  "traceId":"%X{traceId:-}",
+  "uri":"%X{uri:-}",
+  "uri_group":"%X{uri_group:-}",
+  "duration":"%X{duration:-}",
+  "userId":"%X{userId:-}",
+  "event":"%X{event_class:-}",
+  "error":"%X{error:-}",
+  "status":"%X{status:-}",
+  "thread":"%thread"
+}
+                </pattern>
+            </provider>
+        </encoder>
+    </appender>
+
+    <root level="INFO">
+        <appender-ref ref="ASYNC_JSON" />
+    </root>
+</configuration>
+```
+
+### 第 3 步:Vector 配置(采集端写在一个 toml)
+
+```toml
+[sources.kubernetes_logs]
+type = "kubernetes_logs"
+labels_to_fields = ["namespace", "pod" ]
+
+[transforms.parse_json]
+type = "remap"
+inputs = ["kubernetes_logs"]
+source = '''
+parsed = parse_json!(.message)
+.ts = parsed.ts
+.level = parsed.level
+.app = parsed.app
+.env = parsed.env
+.traceId = parsed.traceId
+.uri = parsed.uri
+.uri_group = parsed.uri_group
+.duration_ms = to_int!(parsed.duration, 0)
+.userId = parsed.userId
+.event = parsed.event
+.error = parsed.error
+.status = parsed.status
+.event_class = parsed.event
+
+if starts_with(.uri_group, "/order") {
+  .event_class = "order"
+}
+'''
+
+[transforms.filter_levels]
+type = "filter"
+inputs = ["parse_json"]
+condition = '.level != "DEBUG" && .level != "TRACE"'
+
+[sinks.loki]
+type = "loki"
+inputs = ["filter_levels"]
+endpoint = "http://loki:3100"
+encoding.codec = "json"
+labels.env = "{{ env }}"
+labels.app = "{{ app }}"
+labels.level = "{{ level }}"
+labels.event_class = "{{ event_class }}"
+labels.uri_group = "{{ uri_group }}"
+labels.status = "{{ status }}"
+
+[transforms.to_metrics]
+type = "log_to_metric"
+inputs = ["filter_levels"]
+
+[[transforms.to_metrics.metrics]]
+type = "counter"
+field = "message"
+name = "shop_recycle_requests_total"
+tags.app = "{{ app }}"
+tags.env = "{{ env }}"
+tags.uri_group = "{{ uri_group }}"
+
+[[transforms.to_metrics.metrics]]
+type = "counter"
+field = "message"
+name = "shop_recycle_requests_errors_total"
+filter.condition = '.status == "error"'
+tags.app = "{{ app }}"
+tags.env = "{{ env }}"
+
+[[transforms.to_metrics.metrics]]
+type = "histogram"
+field = "duration_ms"
+name = "shop_recycle_request_duration_ms"
+tags.app = "{{ app }}"
+tags.uri_group = "{{ uri_group }}"
+
+[[transforms.to_metrics.metrics]]
+type = "counter"
+field = "message"
+name = "shop_recycle_orders_total"
+filter.condition = '.event_class == "order"'
+tags.app = "{{ app }}"
+
+[[transforms.to_metrics.metrics]]
+type = "counter"
+field = "message"
+name = "shop_recycle_orders_failed_total"
+filter.condition = '.event_class == "order" && status == "error"'
+tags.app = "{{ app }}"
+
+[sinks.prometheus]
+type = "prometheus_exporter"
+inputs = ["to_metrics"]
+address = "0.0.0.0:9598"
+default_namespace = "shop_recycle"
+```
+
+### 第 4 步:业务代码无需改动
+
+```java
+@Service
+public class OrderService {
+    private static final Logger log = LoggerFactory.getLogger(OrderService.class);
+
+    public void createOrder(OrderCreateRequest req) {
+        log.info("订单创建逻辑开始");
+        try {
+            log.info("订单创建完成");
+            MDC.put("event", "order_created");
+            log.info("订单已入库");
+        } catch (Exception e) {
+            MDC.put("event", "order_create_failed");
+            log.error("订单创建失败", e);
+            throw e;
+        }
+    }
+}
+```
+
+---
+
+## ▸ 获得的监控查询与告警
+
+### Loki LogQL 示例
+
+**QPS(按 uri_group):**
+```logql
+sum by (uri_group) (
+  rate({app="shop-recycle-order-center", level!="DEBUG"}[1m])
+)
+```
+
+**订单成功率:**
+```logql
+sum(rate({event_class="order", status="success"}[5m]))
+/
+sum(rate({event_class="order"}[5m]))
+```
+
+**延时分布(P95):**
+```logql
+quantile_over_time(0.95, {app="shop-recycle-order-center"} | json | unwrap duration_ms [5m]) by (uri_group)
+```
+
+### Prometheus 告警规则示例
+
+```yaml
+groups:
+- name: shop_recycle_alerts
+  rules:
+  - alert: HighErrorRate
+    expr: (
+      rate(shop_recycle_requests_errors_total[5m])
+      /
+      rate(shop_recycle_requests_total[5m])
+    ) > 0.05
+    for: 2m
+    annotations:
+      summary: "{{ $labels.app }} 错误率超过 5%"
+  - alert: OrderCreationFailureRate
+    expr: (
+      rate(shop_recycle_orders_failed_total[5m])
+      /
+      rate(shop_recycle_orders_total[5m])
+    ) > 0.01
+    for: 2m
+    annotations:
+      summary: "订单失败率超过 1%"
+  - alert: SlowRequests
+    expr: histogram_quantile(0.99, rate(shop_recycle_request_duration_ms_bucket[5m])) > 2
+    for: 3m
+    annotations:
+      summary: "{{ $labels.uri_group }} P99 延迟超过 2 秒"
+```
+
+---
+
+## ▸ 迁移步骤(低风险)
+
+1. **Phase 1 - Staging 双写(72h)**:部署 Vector,双写旧采集器 + Loki+Prometheus;对比查询结果;观察 `parsing_errors`。
+2. **Phase 2 - Canary(10% 节点)**:小范围部署,监控 orders/latency/error 指标;确认稳定后扩大。
+3. **Phase 3 - 全量切换**:蓝绿推进,旧采集器并行运行 24h;异常即 `helm rollback`  回退。
+
+```bash
+helm rollback vector -n logging
+kubectl rollout undo deployment/vector -n logging
+```
+
+---
+
+## ▸ 基数防爆清单
+
+| 字段 | 预期基数 | 检测方式 | 上 label 限制 |
+| --- | ---: | --- | --- |
+| `env` | ~3 | 枚举 | ✅ |
+| `app` | <50 | 服务列表 | ✅ |
+| `level` | ~5 | 枚举 | ✅ |
+| `event_class` | <20 | 业务分类 | ✅ |
+| `uri_group` | <100 | 规则化 | ✅ |
+| `status` | ~5 | 枚举 | ✅ |
+| `userId` | ~百万 | 保留 JSON | ❌ |
+| `orderId` | ~千万 | 保留 JSON | ❌ |
+
+---
+
+## ▸ 资源对标
+
+**应用端开销(Interceptor + AsyncAppender):**
+- CPU 增量:<1%
+- 内存:+<10MB
+- 延时:<1ms(异步)
+
+**采集端(Vector vs Promtail):**
+- Vector CPU:10-15%,Promtail 30-40%
+- Vector 内存:80-100MB,Promtail 120-150MB
+- 吞吐:Vector >100K events/s(4C 8G 节点)