Quellcode durchsuchen

Move logs into logs/ and add examples; fix CI script

DevOps Team vor 2 Monaten
Ursprung
Commit
a2fd98901c
6 geänderte Dateien mit 149 neuen und 111 gelöschten Zeilen
  1. 24 0
      ci/check_cardinality.sh
  2. 16 0
      config/logback-spring.xml
  3. 43 0
      examples/LoggingFilter.java
  4. 0 111
      Log-operation.md
  5. 45 0
      logs/LoggingFilter.java
  6. 21 0
      logs/logback-spring.xml

+ 24 - 0
ci/check_cardinality.sh

@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# CI helper: check cardinality of a given label field over the last 24h using logcli
+# Usage: ./check_cardinality.sh event 5000
+
+FIELD=${1:-event}
+THRESHOLD=${2:-5000}
+
+if ! command -v logcli >/dev/null 2>&1; then
+  echo "logcli not found in PATH"
+  exit 2
+fi
+
+echo "Checking cardinality for label: $FIELD"
+
+RESULT=$(logcli query '{env="prod"}' --since=24h | jq -r ".[].${FIELD}" | sort | uniq -c | awk -v t=$THRESHOLD '$1>t{print $2, $1}')
+
+if [ -n "$RESULT" ]; then
+  echo "High cardinality detected for field '$FIELD':"
+  echo "$RESULT"
+  exit 1
+else
+  echo "Cardinality OK"
+  exit 0
+fi

+ 16 - 0
config/logback-spring.xml

@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+  <appender name="JSON" class="ch.qos.logback.core.ConsoleAppender">
+    <encoder class="net.logstash.logback.encoder.LogstashEncoder">
+      <includeContext>true</includeContext>
+    </encoder>
+  </appender>
+
+  <appender name="ASYNC_JSON" class="ch.qos.logback.classic.AsyncAppender">
+    <appender-ref ref="JSON" />
+  </appender>
+
+  <root level="INFO">
+    <appender-ref ref="ASYNC_JSON" />
+  </root>
+</configuration>

+ 43 - 0
examples/LoggingFilter.java

@@ -0,0 +1,43 @@
+package com.fx.monitor.examples;
+
+import javax.servlet.*;
+import javax.servlet.http.HttpServletRequest;
+import java.io.IOException;
+import org.slf4j.MDC;
+
+/**
+ * 简单的 Servlet Filter 示例:提取/生成 traceId、记录 uri 与 duration 到 MDC。
+ * 业务需将 MDC 中的字段通过 logback encoder 输出为 JSON 字段。
+ */
+public class LoggingFilter implements Filter {
+
+    @Override
+    public void init(FilterConfig filterConfig) throws ServletException { }
+
+    @Override
+    public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain) throws IOException, ServletException {
+        HttpServletRequest req = (HttpServletRequest) request;
+        String traceId = extractOrGenerate(req);
+        try {
+            MDC.put("traceId", traceId);
+            MDC.put("uri", req.getRequestURI());
+            long start = System.currentTimeMillis();
+            chain.doFilter(request, response);
+            long duration = System.currentTimeMillis() - start;
+            MDC.put("duration", String.valueOf(duration));
+        } finally {
+            MDC.remove("traceId");
+            MDC.remove("uri");
+            MDC.remove("duration");
+        }
+    }
+
+    @Override
+    public void destroy() { }
+
+    private String extractOrGenerate(HttpServletRequest req) {
+        String incoming = req.getHeader("X-Trace-Id");
+        if (incoming != null && !incoming.isEmpty()) return incoming;
+        return java.util.UUID.randomUUID().toString();
+    }
+}

+ 0 - 111
Log-operation.md

@@ -102,114 +102,3 @@ public class LoggingFilter implements Filter {
 ```
 
 ---
-
-## Agent 选择与 Promtail 指南
-
-- 何时用 Vector:当每节点日志吞吐大(数十 MB/s)或对 CPU/内存敏感,优先使用 Vector 做高效 JSON 解析与标签抽取。
-- Promtail 场景:路径收集或小规模集群。若用 Promtail,尽量把复杂解析移到 Vector/后端。
-
-Promtail 配置要点(原则):
-- 先用非常廉价的判断(按 `level` 快速 drop DEBUG/TRACE),避免复杂正则。
-- 在深度解析前完成 drop/采样,减少 JSON parse 次数。
-- 只将低基数字段作为 label(例:`level`、`namespace`、`event`、`exception_type`)。
-- 把 `duration` 作为 numeric 字段供 `unwrap`/`unpack` 使用,不作为标签。
-
-(保留原方案“三段减压 + 哈希采样 + 异常/慢调用/审计分支”思路,详见 `Log.md`。)
-
----
-
-## 资源占用参考(4C8G 节点)
-
-阶段 | CPU | 内存 | 说明
----|---:|---:|---
-原始全量 JSON 解析 | 150-200% | 400 MB | 无过滤
-加三段减压阀后 | 30-40% | 120 MB | 同集群实测
-换 Vector 解析 | 10-15% | 100 MB | Promtail 仅转发
-
-测试提示:在 staging 用真实流量做 A/B 对比(72 小时)量化效果。
-
----
-
-## 存储与爆炸兜底
-
-1. 标签基数门禁:上线前运行基数扫描脚本,任一计划作为 label 的字段 24h 唯一值 > 5000 则阻断发布。
-
-2. Loki 整流:
-
-```yaml
-limits_config:
-  per_stream_rate_limit: 3MB
-  per_stream_rate_limit_burst: 5MB
-  ingestion_rate_mb: 10
-  ingestion_burst_size_mb: 20
-```
-
-超限将被丢弃并暴露 `rate_limit_discarded_bytes` 指标。
-
-3. Retention:`audit` 流 365d,其余 7d,持久化到 S3/OSS。
-
-基数扫描示例(部署前/CI)
-
-```bash
-logcli query '{env="prod"}' --since=24h | jq -r '.[].event' | sort | uniq -c | awk '$1>5000{print $2, $1}'
-```
-
----
-
-## 告警模板(LogQL)
-
-保留原有:异常突增、慢调用 P99、审计事件下降。
-
-补充监控:agent 解析错误与丢弃量
-
-```logql
-# agent parsing errors
-sum(rate(promtail_parsing_errors_total[5m])) > 0
-
-# Loki 丢弃流量
-rate(loki_ingester_discarded_bytes_total[5m]) > 0
-
-# Promtail/Vector 本地丢弃量(采样)
-rate(promtail_dropped_bytes_total[5m]) > 0
-```
-
----
-
-## 运维 Checklist
-
-每日
-- `kubectl top pod -l app=promtail` 单核 CPU <500m
-- 部署前运行基数扫描脚本(CI/PR 阶段)
-
-每周
-- 检查 `rate(promtail_dropped_bytes_total[5m]) > 0`(确认采样生效)
-
-每月
-- 回顾 `per_stream_rate_limit` 丢弃量并调整采样策略
-
-新增:在 Helm/Chart CI 中加入基数检测脚本,发现高基数阻断发布并在 PR 中给出异常样例。
-
----
-
-## 迁移与回滚简要流程
-
-阶段:
-1. Staging:双写(旧流 & 新流)72 小时,验证告警与检索一致性。
-2. Canary:10% 节点,逐步增加采样并监控 `parsing_errors`、`dropped_bytes`、Loki ingress。
-3. 全量切换并监控 24-72 小时;若异常立即回滚并恢复双写。
-
-回滚:通过 Git/Helm release 快速回退到上一个成功版本。
-
----
-
-## 附录:脚本与示例文件
-
-如果需要我可以把以下内容作为独立文件加入仓库:
-- `LoggingFilter.java`(完整示例)
-- `logback-spring.xml` 模板
-- 基数检测脚本(CI 用)
-- 完整迁移 Playbook(含 Helm 示例)
-
----
-
-如果希望我现在把示例文件写入仓库,请回复我需要的文件清单(例如:`LoggingFilter.java`, `logback-spring.xml`, `ci/check_cardinality.sh`)。

+ 45 - 0
logs/LoggingFilter.java

@@ -0,0 +1,45 @@
+package com.example.logging;
+
+import javax.servlet.*;
+import javax.servlet.http.HttpServletRequest;
+import java.io.IOException;
+import org.slf4j.MDC;
+
+public class LoggingFilter implements Filter {
+
+    @Override
+    public void init(FilterConfig filterConfig) throws ServletException {}
+
+    @Override
+    public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain)
+            throws IOException, ServletException {
+        HttpServletRequest req = (HttpServletRequest) request;
+        String traceId = extractOrGenerate(req);
+        try {
+            MDC.put("traceId", traceId);
+            MDC.put("uri", req.getRequestURI());
+            // 如果能从认证/上下文拿到 userId,填入 MDC
+            String userId = req.getHeader("X-User-Id");
+            if (userId != null) MDC.put("userId", userId);
+
+            long start = System.currentTimeMillis();
+            chain.doFilter(request, response);
+            long duration = System.currentTimeMillis() - start;
+            MDC.put("duration", String.valueOf(duration));
+        } finally {
+            MDC.remove("traceId");
+            MDC.remove("uri");
+            MDC.remove("userId");
+            MDC.remove("duration");
+        }
+    }
+
+    @Override
+    public void destroy() {}
+
+    private String extractOrGenerate(HttpServletRequest req) {
+        String t = req.getHeader("X-Trace-Id");
+        if (t != null && !t.isEmpty()) return t;
+        return java.util.UUID.randomUUID().toString();
+    }
+}

+ 21 - 0
logs/logback-spring.xml

@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+
+  <appender name="JSON" class="ch.qos.logback.core.ConsoleAppender">
+    <encoder class="net.logstash.logback.encoder.LogstashEncoder">
+      <includeContext>true</includeContext>
+      <provider class="net.logstash.logback.composite.loggingevent.LoggingEventPatternJsonProvider">
+        <pattern>{"ts":"%d{yyyy-MM-dd'T'HH:mm:ss.SSS'Z'}","level":"%level","logger":"%logger","msg":"%msg","traceId":"%X{traceId:-}","uri":"%X{uri:-}","duration":"%X{duration:-0}","userId":"%X{userId:-}","event":"%X{event:-}","error":"%X{error:-}"}</pattern>
+      </provider>
+    </encoder>
+  </appender>
+
+  <appender name="ASYNC_JSON" class="ch.qos.logback.classic.AsyncAppender">
+    <appender-ref ref="JSON" />
+  </appender>
+
+  <root level="INFO">
+    <appender-ref ref="ASYNC_JSON"/>
+  </root>
+
+</configuration>