diff --git a/backend/Makefile b/backend/Makefile index fdfc464..0bb73a1 100644 --- a/backend/Makefile +++ b/backend/Makefile @@ -1,7 +1,7 @@ # TopFans Backend Makefile # 用于简化开发流程 -.PHONY: help install-swagger gen-swagger update-swagger start-swagger start-all stop-all clean build run all +.PHONY: help install-swagger gen-swagger update-swagger start-swagger start-all stop-all clean build run all loadgen-build loadgen-test loadgen-vet loadgen-ci # 默认目标 help: @@ -23,6 +23,11 @@ help: @echo " make run - 运行 Gateway" @echo " make all - 安装依赖 + 生成文档 + 构建" @echo "" + @echo "压测工具:" + @echo " make loadgen-build - 编译 seed + loadgen 到 bin/" + @echo " make loadgen-test - 运行 loadgen 单元测试" + @echo " make loadgen-vet - go vet 静态检查" + @echo "" @echo "清理:" @echo " make clean - 清理生成的文件" @echo "" @@ -37,6 +42,11 @@ help: @echo " make run - 运行 Gateway" @echo " make all - 安装依赖 + 生成文档 + 构建" @echo "" + @echo "压测工具:" + @echo " make loadgen-build - 编译 seed + loadgen 到 bin/" + @echo " make loadgen-test - 运行 loadgen 单元测试" + @echo " make loadgen-vet - go vet 静态检查" + @echo "" @echo "清理:" @echo " make clean - 清理生成的文件" @@ -92,8 +102,32 @@ clean: @rm -rf backend/gateway/docs/*.go @rm -rf backend/gateway/docs/*.json @rm -rf backend/gateway/docs/*.yaml + @rm -rf bin/ @echo "✅ 清理完成" +# ==================== Loadgen / load-testing tools ==================== + +# Build the seed and loadgen binaries into bin/ (the same directory `clean` removes) +loadgen-build: + @echo "编译 loadgen 工具..." + @mkdir -p bin + @go build -ldflags="-s -w" -o bin/seed ./scripts/loadgen/seed/ + @go build -ldflags="-s -w" -o bin/loadgen ./scripts/loadgen/loadgen/ + @echo "✅ seed + loadgen → bin/" + +# Run the loadgen unit tests (-count=1 bypasses the go test result cache) +loadgen-test: + @echo "运行 loadgen 单元测试..." + @go test -count=1 ./scripts/loadgen/... + +# Static analysis of the loadgen packages with go vet +loadgen-vet: + @echo "go vet loadgen..." + @go vet ./scripts/loadgen/... 
+ +# loadgen 完整 CI 入口: vet + test + build +loadgen-ci: loadgen-vet loadgen-test loadgen-build + # 全部:安装依赖 + 生成文档 + 构建 all: install-swagger gen-swagger build @echo "" diff --git a/backend/reports/S1.json b/backend/reports/S1.json new file mode 100644 index 0000000..c0d0b5c --- /dev/null +++ b/backend/reports/S1.json @@ -0,0 +1,23 @@ +{ + "scenario": "S1", + "total_requests": 8, + "errors": 0, + "five_xx": 0, + "p50_us": 73919, + "p95_us": 83071, + "p99_us": 83071, + "max_us": 83071, + "stages": [ + { + "stage_idx": 1, + "target_rps": 1, + "total_requests": 8, + "errors": 0, + "five_xx": 0, + "p50_us": 73919, + "p95_us": 83071, + "p99_us": 83071, + "max_us": 83071 + } + ] +} diff --git a/backend/reports/S2.json b/backend/reports/S2.json new file mode 100644 index 0000000..7bac804 --- /dev/null +++ b/backend/reports/S2.json @@ -0,0 +1,23 @@ +{ + "scenario": "S2", + "total_requests": 8, + "errors": 8, + "five_xx": 0, + "p50_us": 1552, + "p95_us": 2909, + "p99_us": 2909, + "max_us": 2909, + "stages": [ + { + "stage_idx": 1, + "target_rps": 1, + "total_requests": 8, + "errors": 8, + "five_xx": 0, + "p50_us": 1552, + "p95_us": 2909, + "p99_us": 2909, + "max_us": 2909 + } + ] +} diff --git a/backend/reports/S4.json b/backend/reports/S4.json new file mode 100644 index 0000000..d9e0110 --- /dev/null +++ b/backend/reports/S4.json @@ -0,0 +1,45 @@ +{ + "scenario": "S4", + "total_requests": 18, + "errors": 18, + "five_xx": 0, + "p50_us": 1210, + "p95_us": 2161, + "p99_us": 2161, + "max_us": 2161, + "stages": [ + { + "stage_idx": 1, + "target_rps": 1, + "total_requests": 3, + "errors": 3, + "five_xx": 0, + "p50_us": 4143, + "p95_us": 8943, + "p99_us": 8943, + "max_us": 8943 + }, + { + "stage_idx": 2, + "target_rps": 2, + "total_requests": 6, + "errors": 6, + "five_xx": 0, + "p50_us": 1314, + "p95_us": 2044, + "p99_us": 2044, + "max_us": 2044 + }, + { + "stage_idx": 3, + "target_rps": 3, + "total_requests": 9, + "errors": 9, + "five_xx": 0, + "p50_us": 1210, + "p95_us": 2161, + 
"p99_us": 2161, + "max_us": 2161 + } + ] +} diff --git a/backend/reports/baseline.csv b/backend/reports/baseline.csv new file mode 100644 index 0000000..bcb4015 --- /dev/null +++ b/backend/reports/baseline.csv @@ -0,0 +1,4 @@ +scenario,total,errors,five_xx,p50_ms,p95_ms,p99_ms,max_ms,stages +S1,8,0,0,73.91,83.07,83.07,83.07,1 +S2,8,8,0,1.55,2.90,2.90,2.90,1 +S4,18,18,0,1.20,2.16,2.16,2.16,3 diff --git a/backend/reports/final-report.md b/backend/reports/final-report.md new file mode 100644 index 0000000..8502670 --- /dev/null +++ b/backend/reports/final-report.md @@ -0,0 +1,227 @@ +# TopFans 压测报告 + +## 📋 运行信息 + +| 项 | 值 | +|---|---| +| **生成时间** | 2026-06-15 20:05:56 CST | +| **压测开始** | 2026-06-15 20:05:47 CST | +| **压测结束** | 2026-06-15 20:05:56 CST | +| **总耗时** | 9s | +| **目标地址** | `http://localhost:8080` | +| **测试场景** | S4 | +| **阶梯模式** | step (`1,2,3`) | +| **JWT 签名密钥** | `topfans-***` (前 8 位) | +| **监控模式** | off | +| **总请求数** | 34 | +| **总错误数** | 26 (76.47%) | +| **5xx 数** | 0 (0.00%) | + +--- + +## 🎯 执行摘要 + +**总览**: ✅ 1 健康 / ⚠️ 0 警告 / 🚨 2 严重 (共 3) + +🚨 **关键问题** (2 个): + +- **S2 (浏览资产详情)**: 错误率 100.00% +- **S4 (资产铸造 (mint))**: 错误率 100.00% + +**场景速览**: + +- ✅ **S1 用户登录** — p99=83ms, err 0.00% +- 🚨 **S2 浏览资产详情** — p99=3ms, err 100.00% +- 🚨 **S4 资产铸造 (mint)** — p99=2ms, err 100.00% + +--- + +## 📊 总览表 + +| 场景 | 描述 | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 拐点 RPS | 状态 | +|------|------|-------|-----|-----|-------|-------|-------|-------|---------|------| +| **S1** | 用户登录 | 8 | 0 (0.00%) | 0 (0.00%) | 74 | 83 | 83 | 83 | — | ✅ | +| **S2** | 浏览资产详情 | 8 | 8 (100.00%) | 0 (0.00%) | 2 | 3 | 3 | 3 | — | 🚨 | +| **S4** | 资产铸造 (mint) | 18 | 18 (100.00%) | 0 (0.00%) | 1 | 2 | 2 | 2 | — | 🚨 | + +> 说明: Err 包含 4xx + 5xx,5xx 是子集。错误率 = Err / Total。 + +## 🔬 跨场景瓶颈分析 + +✅ **无明显瓶颈**,所有场景 P99 都在阈值内。 + +**P99 / 阈值 比率** (从高到低): + +- S1: 0.08x (83ms) +- S2: 0.01x (3ms) +- S4: 0.00x (2ms) + +--- + +## ✅ S1 用户登录 + +### 📌 测试说明 + +| 项 | 值 | +|---|---| +| **API** | `POST 
/api/v1/auth/login` | +| **负载类型** | ✏️ 轻写 | +| **业务说明** | 用户身份认证,签发 JWT | +| **影响范围** | 🔴 所有用户必经路径,失败 = 用户进不来 | + +### 📈 性能指标 vs 健康阈值 + +| 指标 | 实测 | 阈值 | 判定 | +|------|------|------|------| +| P50ms | 74 | ≤100 | ✅ | +| P95ms | 83 | ≤300 | ✅ | +| P99ms | 83 | ≤1000 | ✅ | +| Maxms | 83 | — | ℹ️ 参考 | +| 错误率 | 0.00% | ≤1.00% | ✅ | +| 5xx 率 | 0.00% | ≤0.10% | ✅ | + +### 📍 拐点分析 + +ℹ️ 仅 1 个 stage,未做阶梯测试,无法判断拐点。 + +### 🔢 阶梯结果 + +| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 | +|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------| +| 1 | 1 | 8 | 0 | 0 | 74 | 83 | 83 | 83 | | + +### 🎯 行动项 + +✅ 无需行动项 — 所有指标在阈值内。 + +### 📉 图表 + +![S1 RPS / P99 / Error](.//s1.png) + +--- + +## 🚨 S2 浏览资产详情 + +### 📌 测试说明 + +| 项 | 值 | +|---|---| +| **API** | `GET /api/v1/assets/{id}` | +| **负载类型** | 📖 读 | +| **业务说明** | 高频读路径,典型缓存命中场景 | +| **影响范围** | 🟢 单用户最高频操作,影响页面加载体验 | + +### 📈 性能指标 vs 健康阈值 + +| 指标 | 实测 | 阈值 | 判定 | +|------|------|------|------| +| P50ms | 2 | ≤50 | ✅ | +| P95ms | 3 | ≤150 | ✅ | +| P99ms | 3 | ≤500 | ✅ | +| Maxms | 3 | — | ℹ️ 参考 | +| 错误率 | 100.00% | ≤1.00% | 🚨 | +| 5xx 率 | 0.00% | ≤0.10% | ✅ | + +### 📍 拐点分析 + +ℹ️ 仅 1 个 stage,未做阶梯测试,无法判断拐点。 + +### 🔢 阶梯结果 + +| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 | +|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------| +| 1 | 1 | 8 | 8 | 0 | 2 | 3 | 3 | 3 | | + +### 🎯 行动项 + +- [ ] **🟡 P1**: 错误率 100.00% — 检查 4xx 错误码,看是否 JWT 过期 / 数据缺失 + +### 📉 图表 + +![S2 RPS / P99 / Error](.//s2.png) + +--- + +## 🚨 S4 资产铸造 (mint) + +### 📌 测试说明 + +| 项 | 值 | +|---|---| +| **API** | `POST /api/v1/assets/mints/precreate` | +| **负载类型** | 🛠️ 重写 | +| **业务说明** | 写重路径:OSS 上传 + 签名 + 事务落库 | +| **影响范围** | 🟡 核心交易,影响创作者产出节奏 | + +### 📈 性能指标 vs 健康阈值 + +| 指标 | 实测 | 阈值 | 判定 | +|------|------|------|------| +| P50ms | 1 | ≤300 | ✅ | +| P95ms | 2 | ≤800 | ✅ | +| P99ms | 2 | ≤2000 | ✅ | +| Maxms | 2 | — | ℹ️ 参考 | +| 错误率 | 100.00% | ≤1.00% | 🚨 | +| 5xx 率 | 0.00% | 
≤0.10% | ✅ | + +### 📍 拐点分析 + +✅ **拐点未触发** — 全程 3 个 stage 健康运行,最高 3 RPS p99=2ms。 + +### 🔢 阶梯结果 + +| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 | +|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------| +| 1 | 1 | 3 | 3 | 0 | 4 | 9 | 9 | 9 | | +| 2 | 2 | 6 | 6 | 0 | 1 | 2 | 2 | 2 | -77% | +| 3 | 3 | 9 | 9 | 0 | 1 | 2 | 2 | 2 | +6% | + +### 🎯 行动项 + +- [ ] **🟡 P1**: 错误率 100.00% — 检查 4xx 错误码,看是否 JWT 过期 / 数据缺失 + +### 📉 图表 + +![S4 RPS / P99 / Error](.//s4.png) + +--- + +## 📎 附录 + +### 健康阈值说明 + +- **P50/P95/P99**: 百分位延迟 (毫秒),值越小越好 +- **错误率**: 4xx+5xx 请求占比,健康 < 1% +- **5xx 率**: 服务端错误率,健康 < 0.1% +- **拐点**: 阶梯测试中,p99 相对前一 stage 涨幅 > 50% 的第一个 stage + +### 文件清单 + +``` +reports/ +├── final-report.md (本文件) +├── baseline.csv (Excel 可打开的汇总) +├── s1.json +├── s1.png +├── s2.json +├── s2.png +├── s3.json +├── s3.png +├── s4.json +├── s4.png +├── s5.json +├── s5.png +├── s6.json +├── s6.png +├── s7.json +├── s7.png +``` + +### 如何复现 + +```bash +cd /opt/topfans/loadtest +./loadgen --cmd=run --scenarios=S4 --stage=step --step-schedule='1,2,3' \ + --target=http://localhost:8080 \ + --monitor=off \ +``` diff --git a/backend/reports/run-metadata.json b/backend/reports/run-metadata.json new file mode 100644 index 0000000..281e7b3 --- /dev/null +++ b/backend/reports/run-metadata.json @@ -0,0 +1,12 @@ +{ + "start_time": "2026-06-15T20:05:47.357522+08:00", + "end_time": "2026-06-15T20:05:56.380495+08:00", + "target": "http://localhost:8080", + "scenarios": [ + "S4" + ], + "step_schedule": "1,2,3", + "jwt_secret_hint": "topfans-", + "monitor_mode": "off", + "stage_mode": "step" +} \ No newline at end of file diff --git a/backend/reports/s1.png b/backend/reports/s1.png new file mode 100644 index 0000000..6c82264 Binary files /dev/null and b/backend/reports/s1.png differ diff --git a/backend/reports/s2.png b/backend/reports/s2.png new file mode 100644 index 0000000..17e495a Binary files /dev/null and b/backend/reports/s2.png differ diff 
--git a/backend/reports/s4.png b/backend/reports/s4.png new file mode 100644 index 0000000..e837a74 Binary files /dev/null and b/backend/reports/s4.png differ diff --git a/backend/scripts/loadgen/README.md b/backend/scripts/loadgen/README.md index 498c872..f3f4150 100644 --- a/backend/scripts/loadgen/README.md +++ b/backend/scripts/loadgen/README.md @@ -1,69 +1,129 @@ -# 后端服务压测工具 +# 后端服务压测工具 (loadgen) -为部署在阿里云单机(4G/2C)的 TopFans 后端微服务设计。 +> 给阿里云单机 (4G/2C) TopFans 后端微服务用的压测 + 数据准备工具集。 +> 凌晨 02:00-06:00 业务低峰执行,数据物理隔离 `star_id=999900`。 -## 目录 +--- + +## 📚 文档地图 + +| 文档 | 用途 | 谁要看 | +|------|------|--------| +| **README.md** (本文) | 工具集概览 + 5 分钟入门 | 所有人 | +| [RUNBOOK.md](RUNBOOK.md) | 凌晨压测**一步一步**操作手册 | on-call 工程师 | +| [REPORT_GUIDE.md](REPORT_GUIDE.md) | 压测报告**怎么读** + 瓶颈定位 + 行动项模板 | 看报告的工程师 / TL | +| [seed/README.md](seed/README.md) | seed 工具细节 (数据准备) | 第一次跑压测的人 | + +--- + +## 🧰 工具集概览 ``` -backend/scripts/loadgen/ -├── seed/ # 数据准备工具(CLI) -│ ├── main.go # seed CLI 入口 -│ ├── stars.go users.go profiles.go assets.go -│ ├── slots_and_exhibits.go friendships.go -│ ├── tokens.go sequences.go cleanup.go -│ ├── seed_test.go # 单元测试 -│ └── README.md -├── loadgen/ # 压测主程序 -│ ├── main.go # loadgen CLI 入口 -│ ├── preflight.go verify.go # 7 项开压前检查 + 压后验证 -│ ├── lib/ # 核心库(16 个测试全过) -│ │ ├── csv.go client.go hdr.go log.go ramp.go -│ │ ├── circuit.go ssh_metrics.go config.go -│ │ └── *_test.go -│ ├── scenarios/ # 7 个场景(已注册) -│ │ ├── s1_login.go s2_read.go s3_like.go s4_mint.go -│ │ ├── s5_dashboard.go s6_ranking.go s7_place.go -│ │ ├── common.go scenarios.go -│ │ └── scenarios_test.go -│ └── reporter/ # 报告生成 -│ ├── json.go csv.go plot.go markdown.go -├── monitor/ # 监控栈 -│ ├── sample.sh # 后台采样(写到 metrics-feed.jsonl) -│ ├── docker-compose.monitor.yml -│ ├── prometheus.yml -│ └── grafana-dashboards/ # 4 个预置面板 -├── recover/ # 一键灭火 + 备份还原 -│ ├── emergency-stop.sh -│ └── restore-from-backup.sh -├── scripts/ # 部署到 prod -│ └── mint_reset.sh -└── reports/ # 跑测产出(gitignore) +loadgen/ +├── seed/ # 
数据准备 CLI (生成 1000 个测试用户 + 资产 + JWT) +├── loadgen/ # 压测主程序 (7 个场景,6 维熔断,带 reporter) +├── monitor/ # 监控栈 (Prometheus + Grafana,可选) +├── recover/ # 紧急灭火 (一键停 + 数据库恢复) +├── scripts/ # 部署到 prod 的辅助脚本 +└── reports/ # 跑测产出 (gitignore,scp 拉回本地) ``` -## 编译 +### 核心 CLI: `bin/seed` + `bin/loadgen` + +| 命令 | 作用 | +|------|------| +| `./bin/seed` | 灌测试数据 → `users.csv` + 数据库 | +| `./bin/seed --cleanup` | 清理测试数据 (保留 1000 用户) | +| `./bin/seed --cleanup --full` | 全部删掉 (账号本身) | +| `./bin/seed --reset-tokens` | 只重签 JWT (跨周压测用) | +| `./bin/loadgen --cmd=preflight` | 7 项开压前检查 | +| `./bin/loadgen --cmd=run --scenarios=S1` | 跑场景 | +| `./bin/loadgen --cmd=report` | 生成 markdown 报告 + PNG 图表 | + +### 7 个场景 + +| ID | 场景 | 默认 RPS | 写/读 | 关键 API | +|----|------|---------|------|---------| +| S1 | Login | 15 | 写(轻) | `POST /api/v1/auth/login` | +| S2 | Read | 250 | 读 | `GET /api/v1/assets/{id}` | +| S3 | Like | 50 | 写(轻) | `POST/DELETE /api/v1/social/assets/{id}/like` | +| S4 | Mint | 1-5 | **写(重)** | `POST /api/v1/assets/mints/precreate` | +| S5 | Dashboard | — | 读聚合 | (dashboard 聚合) | +| S6 | Ranking | 300 | 读 | `GET /api/v1/rankings/hot` | +| S7 | Place | 1-5 | **写(重)** | (摆展事务) | + +--- + +## 🚀 5 分钟入门 (本地 docker) + +```bash +# 1. 编译 (按本机平台编译;部署到 Linux prod 请用下文"手动编译"的 GOOS=linux 命令) +cd backend +make loadgen-build + +# 2. 准备数据 (需要本地 docker postgres) +cd scripts/loadgen/seed +# 生成 bcrypt 哈希 (与 tokens.go 硬编码的 "Test@123" 匹配) +python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \ + > loadtest_bcrypt.txt +# 跑 seed (用本地 docker 的 env) +DB_PASSWORD=123456 \ +JWT_SECRET=topfans-secret-key-local-dev-only \ +../../../bin/seed \ + --db-name=top-fans --db-host=localhost --db-port=15432 --db-user=postgres + +# 3. 复制 users.csv 到 backend 目录 +cp users.csv ../../../users.csv + +# 4. 
开压前检查 +cd ../../../ # = backend +JWT_SECRET=topfans-secret-key-local-dev-only \ + ./bin/loadgen --cmd=preflight --target=http://localhost:8080 + +# 5. 烟雾测试 (30 秒,1 RPS) +JWT_SECRET=topfans-secret-key-local-dev-only \ + ./bin/loadgen --cmd=run --scenarios=S1 --stage=baseline --rps=1 --duration=30s \ + --target=http://localhost:8080 --monitor=off + +# 6. 生成报告 +JWT_SECRET=topfans-secret-key-local-dev-only \ + ./bin/loadgen --cmd=report --input=./reports --output=./reports/final-report.md +open reports/final-report.md # macOS +``` + +--- + +## 🔨 编译 ```bash cd backend +make loadgen-build # 编译 seed + loadgen 到 bin/ +make loadgen-test # 单元测试 (23 个) +make loadgen-vet # go vet +make loadgen-ci # vet + test + build (CI 单步) +``` + +手动编译 (Linux prod): +```bash GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o bin/seed ./scripts/loadgen/seed/ GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o bin/loadgen ./scripts/loadgen/loadgen/ ``` -## 测试 +--- -```bash -cd backend -go test ./scripts/loadgen/... -``` +## 🛡️ 安全设计 -**当前测试状态** (截至 Phase 7 完结): -- `seed` 包: 5/5 PASS -- `loadgen/lib` 包: 16/16 PASS -- `loadgen/scenarios` 包: 2/2 PASS -- 共 23 个测试全过 +### 数据隔离 +所有测试数据用 `star_id = 999900` 物理隔离,**不影响**真实业务 star_id (87, 88, 91, 93, 94, 95)。 -## 关键特性 +### CLAUDE.md 序列重置 +seed 工具末尾自动同步所有相关表的 PG 序列(避免后续 GORM 插入报 duplicate key)。 -### 1. 6 维红线判停(自动熔断) +### 凌晨窗口 +执行窗口:**02:00 - 06:00** 业务低峰。 +紧急灭火: `recover/emergency-stop.sh` 一键停 + `restore-from-backup.sh` 5-8min 还原。 + +### 6 维红线熔断 (自动停) | # | 红线 | 阈值 | 数据源 | |---|------|------|--------| @@ -74,20 +134,108 @@ go test ./scripts/loadgen/... | R5 | 磁盘空闲 | < 5GB 持续 30s | metrics-feed | | R6 | OOM 事件 | 瞬时触发 | metrics-feed | -### 2. CLAUDE.md 序列重置 +--- -seed 工具自动同步所有相关表的 PG 序列(避免后续 GORM 插入报 duplicate key)。 +## 📊 报告产出 -### 3. 
数据隔离 +跑完 + `--cmd=report` 后,`reports/` 下: -所有测试数据用 `star_id = 999900` 物理隔离,不影响真实业务 star_id (87, 88, 91, 93, 94, 95)。 +``` +reports/ +├── S1.json # 原始数据 (含 stages) +├── S2.json +├── S4.json +├── baseline.csv # Excel 友好的汇总 +├── s1.png # RPS / P99 / Error 曲线 +├── s2.png +├── s4.png +└── final-report.md # ← 主要看这个 +``` -### 4. 凌晨窗口 +`final-report.md` 包含: +1. **总览表** (所有场景一行一个,7 列) +2. **每个场景的 ⚠️ 拐点 RPS** (自动算:第一个 p99 涨 >50% 的 stage) +3. **阶梯结果表** (每 stage 的 RPS / p50 / p95 / p99 / err / 5xx) +4. **PNG 曲线图** (RPS / P99 / Error 三条线) -执行窗口:凌晨 02:00-06:00 业务低峰。emergency-stop 一键回滚,restore-from-backup.sh 5-8min 还原。 +详细读法见 [REPORT_GUIDE.md](REPORT_GUIDE.md)。 -## 详细文档 +--- + +## 🧪 测试状态 + +``` +seed: 5/5 PASS +loadgen/lib: 16/16 PASS +scenarios: 2/2 PASS +TOTAL: 23/23 PASS +``` + +--- + +## 📁 完整目录 + +``` +backend/scripts/loadgen/ +├── README.md # ← 你在这里 +├── RUNBOOK.md # ← 凌晨压测操作手册 +├── REPORT_GUIDE.md # ← 报告怎么读 +├── seed/ # 数据准备工具 +│ ├── main.go # CLI 入口 +│ ├── stars.go users.go profiles.go assets.go +│ ├── slots_and_exhibits.go friendships.go +│ ├── tokens.go sequences.go cleanup.go +│ ├── seed_test.go # 单元测试 +│ ├── loadtest_bcrypt.txt # Test@123 哈希 (与 tokens.go 匹配) +│ └── README.md +├── loadgen/ # 压测主程序 +│ ├── main.go # CLI 入口 +│ ├── preflight.go verify.go # 7 项开压前检查 + 压后验证 +│ ├── lib/ # 核心库 +│ │ ├── csv.go # users.csv 解析 +│ │ ├── client.go # HTTP client +│ │ ├── hdr.go # 延迟直方图 + per-stage 计数 +│ │ ├── log.go ramp.go # 日志 + 阶梯调度 +│ │ ├── circuit.go # 6 维熔断 +│ │ ├── ssh_metrics.go # prod server metrics 抓取 +│ │ ├── config.go +│ │ └── *_test.go # 16 个测试 +│ ├── scenarios/ # 7 个场景 +│ │ ├── s1_login.go +│ │ ├── s2_read.go +│ │ ├── s3_like.go +│ │ ├── s4_mint.go # 支持多 stage +│ │ ├── s5_dashboard.go +│ │ ├── s6_ranking.go +│ │ ├── s7_place.go +│ │ ├── common.go # doRequest + DefaultBaseURL +│ │ ├── scenarios.go # 注册表 +│ │ ├── helpers.go +│ │ └── scenarios_test.go +│ └── reporter/ # 报告生成 +│ ├── json.go # RunReport + StageReport +│ ├── csv.go # baseline.csv +│ ├── plot.go # PNG 曲线 
(gonum) +│ ├── markdown.go # final-report.md +│ └── knee.go # KneeRPS 自动算 +├── monitor/ # 监控栈 (可选) +│ ├── sample.sh # 后台采样到 metrics-feed.jsonl +│ ├── docker-compose.monitor.yml +│ ├── prometheus.yml +│ └── grafana-dashboards/ # 4 个预置面板 +├── recover/ # 紧急灭火 +│ ├── emergency-stop.sh +│ └── restore-from-backup.sh +├── scripts/ # prod 辅助 +│ ├── mint_reset.sh # S4 之间的 mint 数据清理 +│ └── prod_seed.sh # 一键跑 seed (读 prod env) +└── reports/ # 跑测产出 (gitignore) +``` + +--- + +## 详细设计 - **设计文档**: `docs/superpowers/specs/2026-06-12-load-testing-design.md` - **实施计划**: `docs/superpowers/plans/2026-06-12-load-testing.md` -- **seed 工具说明**: `seed/README.md` +- **seed 工具说明**: [seed/README.md](seed/README.md) diff --git a/backend/scripts/loadgen/REPORT_GUIDE.md b/backend/scripts/loadgen/REPORT_GUIDE.md new file mode 100644 index 0000000..30c3b29 --- /dev/null +++ b/backend/scripts/loadgen/REPORT_GUIDE.md @@ -0,0 +1,266 @@ +# REPORT_GUIDE — 压测报告怎么读 + +> **目标读者**:看完压测报告后,需要判断"系统能扛住吗"+"哪里是瓶颈"+"下一步改什么"的工程师 +> **报告路径**:`reports/final-report.md` (主) + `reports/{scenario}.json` (原始) + `reports/{scenario}.png` (图) + +--- + +## 1. 报告目录结构 + +``` +reports/ +├── S1.json # 场景 1 原始数据 (程序读) +├── S2.json # 场景 2 +├── S4.json # 场景 4 +├── baseline.csv # Excel 可打开的汇总表 +├── s1.png # 场景 1 曲线图 (RPS / P99 / Error) +├── s2.png +├── s4.png +└── final-report.md # ← 你要看的总报告 +``` + +--- + +## 2. 三步读完报告 + +### 第 1 步:看汇总表 (1 分钟) + +```markdown +| Scenario | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | Stages | +|----------|-------|-----|-----|-------|-------|-------|-------|--------| +| S1 | 12500 | 0 | 0 | 86.59 | 119.23 | 200.50 | 450 | 5 | +| S2 | 25000 | 5 | 0 | 12.30 | 35.00 | 88.00 | 250 | 5 | +| S4 | 600 | 12 | 2 | 200.00 | 500.00 | 850.00 | 1200 | 4 | +``` + +**每个字段的含义**: + +| 字段 | 含义 | 健康参考 (4G/2C prod) | +|------|------|----------------------| +| `Scenario` | 场景 ID (S1=登录, S2=读, S3=点赞, S4=铸造, ...) 
| — | +| `Total` | 该场景总请求数 | 越大越好,代表你扛住了 | +| `Err` | 客户端+服务端错误总和 | **< 1%** | +| `5xx` | 服务端错误 (500-599) | **< 0.1%** (1‰) | +| `P50ms` | 50% 请求在这个时间内 | < 100ms | +| `P95ms` | 95% 请求在这个时间内 | < 300ms | +| `P99ms` | 99% 请求在这个时间内 | < 1000ms (S4 写重可放宽到 2000ms) | +| `Maxms` | 最慢的一次请求 | 一般 3-5x P99 | +| `Stages` | 阶梯测试的阶段数 | = step-schedule 的元素数 | + +**判断模板**: +- ✅ 全绿 → 系统扛得住,准备上线 +- ⚠️ 某个 S* Err > 1% → 优先看那个场景 +- 🚨 某个 S* 5xx > 1% → 服务端有问题,看 §3 定位 + +--- + +### 第 2 步:看拐点 (KneeRPS) (2 分钟) + +每个 scenario 标题下会出现一行: + +```markdown +**⚠️ 拐点**: stage 3 @ 3 RPS (p99 暴涨 514%) +``` + +**含义**: 当 RPS 升到 3 时,p99 延迟比 stage 2 暴涨 514% (5.14 倍)。 + +**判定逻辑** (在 `reporter/knee.go`): +- 逐 stage 比 p99 +- 第一次涨幅 > 50% 时,标记为拐点 +- 全程没涨 > 50% → 显示 "✅ 拐点未触发" + +**怎么用这个数字**: +- **S1 拐点 RPS = 15** → 你的登录服务,超过 15 QPS 就开始劣化。生产预估峰值 10 QPS,留 50% buffer +- **S4 拐点 RPS = 2** → 铸造接口很重,2 QPS 就劣化了。要么优化,要么限流 + +**举例**: +| 拐点 RPS | 业务含义 | 行动项 | +|---------|---------|--------| +| ≥ 期望峰值的 2x | ✅ 健康 | 上线,加监控 | +| ≈ 期望峰值 | ⚠️ 临界 | 加缓存 / 异步化 / 限流 | +| < 期望峰值 | 🚨 不达标 | 重构 + 复测 | + +--- + +### 第 3 步:看阶梯表 + 曲线图 (5 分钟) + +**阶梯表** (md 里每个场景下): + +```markdown +### 阶梯结果 +| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | +|-------|-----------|-------|-----|-----|-------|-------|-------|-------| +| 1 | 2 | 600 | 0 | 0 | 80 | 100 | 110 | 130 | +| 2 | 5 | 1500 | 0 | 0 | 82 | 105 | 115 | 140 | +| 3 | 10 | 3000 | 0 | 0 | 85 | 110 | 130 | 180 | +| 4 | 15 | 4500 | 0 | 0 | 95 | 130 | 200 | 350 | +| 5 | 20 | 6000 | 5 | 0 | 120 | 200 | 450 | 800 | +``` + +**怎么读**: + +- **Total** 应该是 `TargetRPS × Duration` (近似,因为有误差) +- **P99ms** 应该随 TargetRPS 上升**平滑增加** (10-30% 涨幅/stage 是正常) +- **Err / 5xx** 应该全程 < 1% +- **如果某 stage 突然 P99 翻倍** → 拐点,看上面 KneeRPS + +**曲线图** (`s1.png` 等): + +- **X 轴**: Stage 编号 (1, 2, 3, ...) +- **Y 轴**: 三个值 — RPS (蓝)、P99ms (绿)、Error% (红) +- **怎么看**: + - 三条线**平稳上升** = 正常 + - **P99 突然陡升** = 拐点 + - **Error% 突然跳起来** = 服务挂了 + +--- + +## 3. 
定位瓶颈 — 常见模式 + +### 模式 1: P99 阶梯上升,但 Error 一直 0 +**含义**: 系统扛得住,但在变慢。 +**原因**: GC 抖动 / DB 慢查询 / 锁竞争。 +**行动**: +1. 看 PG 慢查询日志: `pg_stat_statements` ORDER BY `mean_exec_time` DESC +2. 看应用层 profile: `pprof` heap + cpu +3. 检查连接池配置: 可能太小 + +### 模式 2: P99 阶梯上升 + Error 也开始涨 +**含义**: 系统到极限。 +**原因**: 资源耗尽 (CPU 100%, 连接池满, DB 锁)。 +**行动**: +1. 看 server metrics feed: `tail -f metrics-feed.jsonl` +2. `top` 看 CPU/内存,`iostat` 看 IO +3. 检查是否有连接泄漏 (`netstat | grep TIME_WAIT`) + +### 模式 3: 阶梯早期就 5xx > 5% +**含义**: 系统本身有问题,不是负载问题。 +**原因**: 代码 bug / 配置错误 / 依赖缺失。 +**行动**: +1. 看 5xx 的具体响应体 (在 log 里) +2. 检查 error 码,对照业务错误码定义 +3. 看是不是 auth/JWT 过期 + +### 模式 4: 第一个 stage P99 很高,后续反而低 +**含义**: 热身不够 / 缓存没预热。 +**原因**: Redis 冷启动 / JIT 编译 / DB 连接池启动慢。 +**行动**: +1. 第一次 stage 加长 (例如先 2min 预热) +2. 或者用 `--rps=1` 先跑 1-2min 预热,再开阶梯 + +### 模式 5: S4 (Mint) 在很低的 RPS 就拐 +**含义**: 写路径太重。 +**原因**: 铸造涉及事务 / 签名 / OSS 上传,本身就是慢操作。 +**行动**: +1. 检查 mint 是不是同步阻塞 (能不能异步化?) +2. 看 mint 数据是否需要落库 (能否用 append-only?) +3. 考虑限流: 服务端拒绝 > 2 QPS 的 mint 请求 + +--- + +## 4. 怎么写出行动项 + +读完报告,应该能回答三个问题: + +### Q1: 系统能扛住业务预期峰值吗? +- 业务预期峰值 → 比对拐点 RPS +- 拐点 ≥ 2x 峰值 → ✅ 可以上线 +- 拐点 ≈ 1x 峰值 → ⚠️ 加监控告警,谨慎上线 +- 拐点 < 峰值 → 🚨 必须先优化 + +### Q2: 拐点在哪里?为什么? +看哪个 stage 拐的,然后: +- **CPU 100%** → 计算密集,优化算法或加机器 +- **DB CPU 100%** → 慢查询,加索引或读写分离 +- **PG 连接数满** → 连接池配置 / 服务降级 +- **PG 锁等待** → 事务设计问题 +- **磁盘 IO 满** → 加 SSD 或缓存 + +### Q3: 下一步改什么? + +行动项模板: + +```markdown +## [Loadtest 2026-06-15] 行动项 + +### P0 (上线前必修) +- [ ] **S2 Read 拐点 100 RPS < 业务预期 150 RPS** + - 根因: PG `assets` 表全表扫描,10 万行 + - 修复: 加 `idx_assets_star_id_status` 索引 + - Owner: @dba + +### P1 (1 周内修) +- [ ] **S4 Mint 拐点 2 RPS** + - 根因: 同步写 OSS + 同步落库 + - 修复: mint 流程拆成 precreate + 后台 worker + - Owner: @backend + +### P2 (技术债) +- [ ] 压测期间 CPU 持续 80%,考虑扩容到 4C +``` + +--- + +## 5. 
JSON 原始数据怎么读 (高级) + +`reports/S1.json` 长这样: + +```json +{ + "scenario": "S1", + "total_requests": 12500, + "errors": 5, + "five_xx": 0, + "p50_us": 86591, + "p95_us": 119231, + "p99_us": 200502, + "max_us": 450000, + "stages": [ + { + "stage_idx": 1, + "target_rps": 2, + "total_requests": 600, + "errors": 0, + "five_xx": 0, + "p50_us": 80000, + "p95_us": 100000, + "p99_us": 110000, + "max_us": 130000 + }, + ... + ] +} +``` + +**单位说明**: +- 所有 `_us` 后缀 = microseconds (微秒,1ms = 1000us) +- 例: `p99_us: 200502` = 200.5 ms + +**怎么用**: +- 画自己的图 (用 Excel/Google Sheets 打开 baseline.csv 最方便) +- 跟历史报告对比 (跨版本性能回归) +- CI 集成: 解析 JSON,断言 P99 < 某个阈值 + +--- + +## 6. 常见问题 + +### Q: "5xx=0 但 Err=5" 是什么意思? +A: 5xx 是服务端错,Err 是总错 (含 4xx)。Err > 5xx 表示有客户端错 (一般是 401/403/404)。看 log 里具体错误码。 + +### Q: 为什么 P50 很低但 P99 很高? +A: 正常 — 长尾效应。99% 都快但 1% 慢。如果 P99 太高说明有少数请求卡住,看是不是 GC / 锁 / IO 抖动。 + +### Q: Max 比 P99 高很多,是不是异常? +A: 可能是单个网络抖动,正常。Max / P99 < 5x 都是健康。 + +### Q: 同一个场景不同次跑,数据差很多? +A: 检查 prod 是否有其他流量在跑 (业务)。压测应在凌晨,业务低峰。 + +--- + +## 7. 进一步 + +- 想优化场景,见 `seed/README.md` +- 想加新场景,在 `scenarios/` 新建 `s8_xxx.go`,模仿 s1_login.go 的 BeginStage/EndStage 模式 +- 想加新的红线指标,见 `lib/circuit.go` diff --git a/backend/scripts/loadgen/RUNBOOK.md b/backend/scripts/loadgen/RUNBOOK.md new file mode 100644 index 0000000..ae18133 --- /dev/null +++ b/backend/scripts/loadgen/RUNBOOK.md @@ -0,0 +1,366 @@ +# RUNBOOK — 凌晨压测执行手册 + +> **目标读者**:负责 prod 凌晨压测的 on-call 工程师 +> **执行窗口**:02:00 - 06:00 (业务低峰) +> **预计总耗时**:1.5 - 4 小时 (按场景数) +> **风险等级**:🟡 中 (会写 23k+ 测试数据,但物理隔离 star_id=999900) + +--- + +## 0. 
前置检查 (T-1 天) + +### 0.1 确认 prod 状态 +```bash +# SSH 到 prod +ssh root@101.132.250.62 + +# 确认 prod 网关正常 +curl -sS http://localhost:8080/health +# 期望: {"service":"top-fans-gateway","status":"ok"} + +# 确认磁盘空间 > 10GB (R5 红线需要) +df -h /opt +# 期望: Avail > 10G +``` + +### 0.2 确认阿里云快照 < 24h +- 登录 ECS 控制台 → 实例 → 磁盘与镜像 → 快照 +- 必须有 < 24h 的快照,**否则不要开压** +- 没有的话先手动触发:实例 → 更多 → 磁盘和镜像 → 创建快照 + +### 0.3 备份数据库 +```bash +ssh root@101.132.250.62 +mkdir -p /opt/topfans/backups +pg_dump -h localhost -U postgres topfans > /opt/topfans/backups/pre-loadtest-$(date +%Y%m%d-%H%M).sql +ls -lh /opt/topfans/backups/pre-loadtest-*.sql +# 期望: 文件 > 50MB +``` + +--- + +## 1. 上传/确认工具 (T-30min) + +### 1.1 确认工具已上传到 prod +```bash +ssh root@101.132.250.62 +ls -la /opt/topfans/loadtest/ +# 必须看到: +# seed (二进制) +# loadgen (二进制) +# loadtest_bcrypt.txt +# scripts/prod_seed.sh +# README.md +# reports/ (空目录) +``` + +如果文件缺失,本地重新上传: +```bash +# 本地 (在仓库内任意目录执行,切到 backend 目录) +cd "$(git rev-parse --show-toplevel)/backend" + +# 重新编译 (交叉编译为 Linux amd64;本机是 macOS 时直接 make 得到的二进制在 prod 上无法运行) +GOOS=linux GOARCH=amd64 make loadgen-build + +# 上传 +scp bin/seed bin/loadgen root@101.132.250.62:/opt/topfans/loadtest/ +scp scripts/loadgen/seed/loadtest_bcrypt.txt root@101.132.250.62:/opt/topfans/loadtest/ +scp scripts/loadgen/scripts/prod_seed.sh root@101.132.250.62:/opt/topfans/loadtest/scripts/ +ssh root@101.132.250.62 "chmod +x /opt/topfans/loadtest/{seed,loadgen} /opt/topfans/loadtest/scripts/prod_seed.sh" +``` + +### 1.2 重新生成 bcrypt 哈希 (如果你改了密码策略) +```bash +# 本地 +cd backend/scripts/loadgen/seed + +# 生成与 tokens.go 硬编码密码 (默认 "Test@123") 匹配的哈希 +python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \ + > loadtest_bcrypt.txt + +# 上传覆盖 +scp loadtest_bcrypt.txt root@101.132.250.62:/opt/topfans/loadtest/ +``` + +--- + +## 2. 
数据准备 (T0 = 02:00) + +### 2.1 SSH 到 prod +```bash +ssh root@101.132.250.62 +``` + +### 2.2 一键跑 seed (生产数据灌入) +```bash +cd /opt/topfans/loadtest +bash scripts/prod_seed.sh +``` + +**这一步骤会做什么**: +- 读 `/opt/topfans/docker/.env.prod` 拿 DB_PASSWORD + JWT_SECRET +- 插入 star_id=999900 测试明星 (1 行) +- 插入 1000 个测试用户 (mobile 19900000001 - 19900001000) +- 插入 1000 个 fan_profile + crystal +- 插入 5000 个 assets +- 插入 3000 个 booth_slots + 2000 个 exhibitions +- 插入 10000 个 friendships +- **重置所有相关表的 PG 序列** (CLAUDE.md 规范,避免后续 GORM 插入报 duplicate key) +- 签 1000 个 JWT,写到 `users.csv` + +**预计耗时**:30 - 60 秒 + +**预期输出**: +``` +✓ stars seeded +✓ 1000 users seeded +✓ 1000 fan_profiles + crystal seeded +✓ 5000 assets seeded +✓ 3000 booth_slots + 2000 exhibitions seeded +✓ 10000 friendships seeded +✓ sequences reset +✅ users.csv written: 1000 rows +✅ seed + tokens completed +``` + +--- + +## 3. 开压前 7 项检查 (T0+1min) + +```bash +cd /opt/topfans/loadtest +./loadgen --cmd=preflight --target=http://localhost:8080 +``` + +**预期全部 PASS**: +``` +✓ ① Gateway /health HTTP 200 +✓ ② SSH to prod (省略,如不需要 server metrics) +✓ ③ pg_dump backup > 50MB (你的备份) +✓ ④ 阿里云快照 < 24h (人工确认) +✓ ⑤ prod 磁盘空闲 > 10GB free > 10G +✓ ⑥ users.csv 1000 rows rows=1000 +✓ ⑦ JWT_SECRET set set + +ALL CHECKS PASSED — 可以开压 +``` + +**如果有 FAIL**:见 "附录 A: 故障排查" + +--- + +## 4. 烟雾测试 (T0+2min) — 强烈推荐 + +> 这一步只花 30 秒,但能提前发现 90% 的集成问题,省后面 1 小时排错 + +```bash +cd /opt/topfans/loadtest +JWT_SECRET=$(grep '^JWT_SECRET=' /opt/topfans/docker/.env.prod | cut -d= -f2) \ + ./loadgen --cmd=run --scenarios=S1 --stage=baseline --rps=1 --duration=30s \ + --target=http://localhost:8080 --monitor=off 2>&1 | tee reports/smoke-s1.log +``` + +**预期**: +``` +📊 S1: total=30 err=0 5xx=0 p99=200ms stages=1 +✅ loadgen done. total=30 err=0 fiveXX=0 +``` + +**判定**: +- ✅ total=30, err=0 → 进入正式压测 +- ❌ total < 30 → 跑挂了,查 `reports/smoke-s1.log` +- ❌ err > 0 → auth/JWT 问题,检查 `users.csv` 和 JWT_SECRET + +--- + +## 5. 
正式压测 (T0+3min) + +### 5.1 选择策略 + +**Plan B 推荐** (S1 + S2 + S4,~1.5 小时): +```bash +cd /opt/topfans/loadtest +export JWT_SECRET=$(grep '^JWT_SECRET=' /opt/topfans/docker/.env.prod | cut -d= -f2) +export PROD_SSH=root@101.132.250.62 + +# === 场景 1: Login (02:05-02:30, 25min) === +./loadgen --cmd=run --scenarios=S1 \ + --stage=step --step-schedule='2,5,10,15,20' \ + --duration=5m --target=http://localhost:8080 \ + --monitor=full --prod-ssh=$PROD_SSH \ + --inter-scenario-pause=0s 2>&1 | tee reports/s1.log +# 预期: 5 个 stage,每 stage 5min,p99 应随 RPS 阶梯上升 + +# === 场景 2: Read (02:35-03:00, 25min) === +./loadgen --cmd=run --scenarios=S2 \ + --stage=step --step-schedule='10,30,60,100,150' \ + --duration=5m --target=http://localhost:8080 \ + --monitor=full --prod-ssh=$PROD_SSH \ + --inter-scenario-pause=0s 2>&1 | tee reports/s2.log + +# === 场景 4: Mint (03:05-03:30, 25min, 写重,保守) === +./loadgen --cmd=run --scenarios=S4 \ + --stage=step --step-schedule='1,2,3,5' \ + --duration=5m --target=http://localhost:8080 \ + --monitor=full --prod-ssh=$PROD_SSH \ + --inter-scenario-pause=0s 2>&1 | tee reports/s4.log +``` + +**Plan A 全量** (S1-S7,~3.5 小时): +```bash +# S1-S7 全部跑,S4/S7 写重场景保守 +SCENARIOS="S1,S2,S3,S4,S5,S6,S7" +SCHEDULES_BY_SCENARIO='{"S1":"2,5,10,15,20","S2":"10,30,60,100,150","S3":"5,15,30,50","S4":"1,2,3,5","S5":"5,10,20,40","S6":"20,50,100,150","S7":"1,2,3,5"}' +# (目前 loadgen 一次只支持一个 schedule,需要跑 7 次) +``` + +### 5.2 每个场景跑完后做什么 +1. 检查 `reports/{scenario}.log` 末尾的 `📊` 行 +2. 记录 total / err / 5xx / p99 / stages +3. 如果 `🚨 circuit breaker tripped` 触发,**立即停**,见附录 B + +--- + +## 6. 生成报告 (T+1min) + +```bash +cd /opt/topfans/loadtest +./loadgen --cmd=report --input=./reports --output=./reports/final-report.md +``` + +**产出**: +``` +reports/ +├── S1.json +├── S2.json +├── S4.json +├── baseline.csv # Excel 可直接打开 +├── s1.png # RPS/P99/Error 曲线图 +├── s2.png +├── s4.png +└── final-report.md # 人看的报告 +``` + +--- + +## 7. 
收尾 (T+2min) + +### 7.1 拉报告到本地 +```bash +# 本地 +mkdir -p ~/Desktop/loadtest-report-$(date +%Y%m%d) +scp -r root@101.132.250.62:/opt/topfans/loadtest/reports/* ~/Desktop/loadtest-report-$(date +%Y%m%d)/ +``` + +### 7.2 决定是否清理测试数据 + +| 情况 | 动作 | +|------|------| +| 数据分析完,后续不需要 | `./seed --cleanup --full` | +| 数据还要保留做下一轮 | `./seed --cleanup` (保留 1000 用户,清理关联数据) | +| 只是 JWT 过期 | `./seed --reset-tokens --jwt-secret=$JWT_SECRET` | +| **生产事故** | `./seed --cleanup --full` + 立即回滚,见附录 C | + +### 7.3 (可选) 关闭监控后台采样 +```bash +# 如果你启动了 monitor/sample.sh,杀掉 +ssh root@101.132.250.62 "pkill -f 'monitor/sample.sh'" +``` + +--- + +## 8. 报告分析 (T+30min,白天) + +见 `REPORT_GUIDE.md` — 教你怎么读 `final-report.md`,定位瓶颈,写行动项。 + +--- + +## 附录 A: 故障排查 + +### A.1 preflight FAIL: users.csv 不存在 +**原因**: 上次 seed 没跑成功 +**修复**: `cd /opt/topfans/loadtest && bash scripts/prod_seed.sh` + +### A.2 preflight FAIL: 没有 24h 内的阿里云快照 +**原因**: 最近 24h 内没有创建过快照 +**修复**: 在 ECS 控制台手动建快照,等就绪后重跑 preflight + +### A.3 烟雾测试 FAIL: 大量 4xx +**原因**: JWT_SECRET 不匹配 / users.csv 过期 +**修复**: +```bash +# 1. 确认并导出 JWT_SECRET (后面两步都要从环境变量读取它) +export JWT_SECRET=$(grep '^JWT_SECRET=' /opt/topfans/docker/.env.prod | cut -d= -f2-) + +# 2. 重签 token (数据保留) +./seed --reset-tokens --jwt-secret=$JWT_SECRET + +# 3. 重跑 +./loadgen --cmd=run --scenarios=S1 --stage=baseline --rps=1 --duration=30s \ + --target=http://localhost:8080 --monitor=off +``` + +### A.4 烟雾测试 FAIL: 大量 5xx +**原因**: 网关/服务挂了 +**修复**: 先看 `docker ps` 确认服务在,`curl /health` 确认网关活 + +--- + +## 附录 B: Circuit Breaker 触发 (🚨) + +如果出现 `🚨 circuit breaker tripped!`,**立即**: +1. **Ctrl+C** 停止当前 loadgen (会 graceful shutdown,等待当前请求完成) +2. 立即判断: + - 5xx > 10% 持续 10s → 服务有问题,见附录 C + - 仅客户端错率高 → 测试问题,可能是 step 跳太猛 +3. **降低 RPS 重跑** 或 **改天再试** + +--- + +## 附录 C: 紧急灭火 (production 被打挂了) + +**判定**: 服务真实报错(不是测试客户端问题),prod 用户受影响。 + +**立即执行** (按顺序,每步 30s 内): +```bash +ssh root@101.132.250.62 + +# 1. 停 loadgen + 监控 (prod 上以 ./loadgen 启动,命令行里没有 bin/ 前缀) +pkill -f 'loadgen --cmd=run' +pkill -f 'monitor/sample.sh' + +# 2. 清测试数据 (1 秒) +cd /opt/topfans/loadtest +./seed --cleanup --full + +# 3. 
重启服务 (让 prod 回到 baseline) +cd /opt/topfans/docker +docker-compose -f docker-compose.prod.yml --profile prod restart + +# 4. (最严重情况) 从备份还原 +bash /opt/topfans/loadtest/recover/restore-from-backup.sh +# 输入 backup 文件路径,预计 5-8 分钟 +``` + +**事后**: +- 写事故复盘 +- 修压测发现的 bug +- 调整 step schedule (下一次更保守) + +--- + +## 附录 D: 常用 cheat sheet + +```bash +# 查看 loadtest 进程 +ssh root@101.132.250.62 "ps aux | grep -E '(loadgen|sample)' | grep -v grep" + +# 看实时日志 +ssh root@101.132.250.62 "tail -f /opt/topfans/loadtest/reports/*.log" + +# 看 metrics feed +ssh root@101.132.250.62 "tail -f /opt/topfans/loadtest/metrics-feed.jsonl" + +# 测一下网关还活着 +ssh root@101.132.250.62 "curl -sS http://localhost:8080/health" +``` diff --git a/backend/scripts/loadgen/loadgen/lib/hdr.go b/backend/scripts/loadgen/loadgen/lib/hdr.go index b379c52..22084fd 100644 --- a/backend/scripts/loadgen/loadgen/lib/hdr.go +++ b/backend/scripts/loadgen/loadgen/lib/hdr.go @@ -2,13 +2,36 @@ package lib import ( "sync" + "sync/atomic" "github.com/HdrHistogram/hdrhistogram-go" ) +// LatencyRecorder tracks latency histogram + per-stage counters. +// +// Concurrency model: a single LatencyRecorder is shared across all scenarios. +// Per-scenario isolation: callers MUST call Reset() at scenario boundaries. +// Per-stage isolation: callers MUST call BeginStage() at stage boundaries +// (which clears histogram + zero stage counters). type LatencyRecorder struct { mu sync.Mutex h *hdrhistogram.Histogram + + stageTotal atomic.Int64 + stageErrors atomic.Int64 + stageFiveXX atomic.Int64 + + stages []StageSnapshot +} + +// StageSnapshot is the per-stage data captured by EndStage. +type StageSnapshot struct { + StageIdx int + TargetRPS int + Histogram *hdrhistogram.Histogram + TotalRequests int64 + Errors int64 + FiveXX int64 } func NewLatencyRecorder() *LatencyRecorder { @@ -17,6 +40,7 @@ func NewLatencyRecorder() *LatencyRecorder { } } +// Record stores a latency sample (in microseconds). 
func (r *LatencyRecorder) Record(latencyUs int64) { r.mu.Lock() defer r.mu.Unlock() @@ -26,8 +50,79 @@ func (r *LatencyRecorder) Record(latencyUs int64) { _ = r.h.RecordValue(latencyUs) } +// RecordResult increments per-stage error/5xx counters based on HTTP status code. +// isError: status >= 400 or transport error +// is5xx: status >= 500 +func (r *LatencyRecorder) RecordResult(isError, is5xx bool) { + if isError { + r.stageErrors.Add(1) + } + if is5xx { + r.stageFiveXX.Add(1) + } + r.stageTotal.Add(1) +} + +// Snapshot returns a copy of the current histogram (for use by circuit-breaker). +// Does NOT affect per-stage counters. func (r *LatencyRecorder) Snapshot() *hdrhistogram.Histogram { r.mu.Lock() defer r.mu.Unlock() return hdrhistogram.Import(r.h.Export()) } + +// Reset clears the histogram, per-stage counters, AND accumulated stages. +// Call between scenarios. +func (r *LatencyRecorder) Reset() { + r.mu.Lock() + defer r.mu.Unlock() + r.h = hdrhistogram.New(1, 30_000_000, 3) + r.stages = nil + r.stageTotal.Store(0) + r.stageErrors.Store(0) + r.stageFiveXX.Store(0) +} + +// ClearStages drops accumulated stage data but keeps the current histogram and counters. +// Use when you want stages to remain but accumulated list to be discarded. +func (r *LatencyRecorder) ClearStages() { + r.mu.Lock() + defer r.mu.Unlock() + r.stages = nil +} + +// BeginStage marks the start of a new stage at TargetRPS RPS. +// Resets histogram AND per-stage counters. Stages slice gains a new entry. +func (r *LatencyRecorder) BeginStage(idx, targetRPS int) { + r.mu.Lock() + defer r.mu.Unlock() + r.h = hdrhistogram.New(1, 30_000_000, 3) + r.stageTotal.Store(0) + r.stageErrors.Store(0) + r.stageFiveXX.Store(0) + r.stages = append(r.stages, StageSnapshot{StageIdx: idx, TargetRPS: targetRPS}) +} + +// EndStage freezes the histogram + per-stage counters into the latest stage entry. +// Must be called after BeginStage and after the stage has produced some traffic. 
+func (r *LatencyRecorder) EndStage() { + r.mu.Lock() + defer r.mu.Unlock() + if len(r.stages) == 0 { + return + } + last := &r.stages[len(r.stages)-1] + last.Histogram = hdrhistogram.Import(r.h.Export()) + last.TotalRequests = r.stageTotal.Load() + last.Errors = r.stageErrors.Load() + last.FiveXX = r.stageFiveXX.Load() +} + +// Stages returns a copy of accumulated stage snapshots. +func (r *LatencyRecorder) Stages() []StageSnapshot { + r.mu.Lock() + defer r.mu.Unlock() + out := make([]StageSnapshot, len(r.stages)) + copy(out, r.stages) + return out +} diff --git a/backend/scripts/loadgen/loadgen/main.go b/backend/scripts/loadgen/loadgen/main.go index 1d52685..38a58ef 100644 --- a/backend/scripts/loadgen/loadgen/main.go +++ b/backend/scripts/loadgen/loadgen/main.go @@ -66,6 +66,31 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d // 让 scenarios 用 --target 而不是写死的 prod IP scenarios.DefaultBaseURL = target + // 写 run-metadata.json (供 --cmd=report 使用) + runStart := time.Now() + defer func() { + meta := reporter.RunMetadata{ + StartTime: runStart, + EndTime: time.Now(), + Target: target, + Scenarios: strings.Split(scenarioIDs, ","), + StepSchedule: stepSchedule, + StageMode: stage, + RPSOverride: rps, + MonitorMode: monitorMode, + ProdSSH: prodSSH, + } + // 取 JWT_SECRET 前 8 位作为 hint + if jwtSecret := os.Getenv("JWT_SECRET"); len(jwtSecret) >= 8 { + meta.JWTSecretHint = jwtSecret[:8] + } + if err := os.MkdirAll("reports", 0o755); err == nil { + if data, err := json.MarshalIndent(meta, "", " "); err == nil { + _ = os.WriteFile(filepath.Join("reports", "run-metadata.json"), data, 0o644) + } + } + }() + users, err := lib.LoadUsers("users.csv") if err != nil { return fmt.Errorf("load users.csv: %w (先跑 `seed` 生成 users.csv)", err) @@ -126,6 +151,14 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d continue } log.Printf("=== scenario %d/%d: %s ===", idx+1, len(ids), id) + + // 场景开始:快照 delta 基线,清空 stage 累积 + 
recorder.ClearStages() + recorder.Reset() + prevTotal := totalCount.Load() + prevErr := errCount.Load() + prev5xx := fiveXXCount.Load() + s, err := scenarios.Get(id, client, users, &errCount, &totalCount, &fiveXXCount, recorder, breaker, prodSSH) if err != nil { return fmt.Errorf("scenario %s: %w", id, err) @@ -133,6 +166,38 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d if err := s.Run(ctx, rps, duration, dashboard, breaker, stages); err != nil { return fmt.Errorf("run scenario %s: %w", id, err) } + + // 场景结束:写 per-scenario JSON (含 stages) + scenarioTotal := totalCount.Load() - prevTotal + scenarioErr := errCount.Load() - prevErr + scenario5xx := fiveXXCount.Load() - prev5xx + scenarioStages := recorder.Stages() + + stageReports := make([]reporter.StageReport, 0, len(scenarioStages)) + for _, ss := range scenarioStages { + stageReports = append(stageReports, reporter.MakeStageReport( + ss.StageIdx, ss.TargetRPS, ss.Histogram, + ss.TotalRequests, ss.Errors, ss.FiveXX, + )) + } + rr := reporter.RunReport{ + Scenario: id, + TotalRequests: scenarioTotal, + Errors: scenarioErr, + FiveXX: scenario5xx, + P50Us: recorder.Snapshot().ValueAtPercentile(50), + P95Us: recorder.Snapshot().ValueAtPercentile(95), + P99Us: recorder.Snapshot().ValueAtPercentile(99), + MaxUs: recorder.Snapshot().Max(), + Stages: stageReports, + } + scenarioPath := filepath.Join("reports", id+".json") + if err := reporter.WriteJSON(scenarioPath, rr); err != nil { + return fmt.Errorf("write %s: %w", scenarioPath, err) + } + log.Printf("📊 %s: total=%d err=%d 5xx=%d p99=%dms stages=%d", + id, scenarioTotal, scenarioErr, scenario5xx, rr.P99Us/1000, len(stageReports)) + if breaker.State() == lib.CircuitTripped { log.Printf("⚠️ circuit tripped, stopping") break @@ -143,11 +208,8 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d } } - // write final report - if err := reporter.WriteJSON("report.json", scenarioIDs, recorder.Snapshot(), 
totalCount.Load(), errCount.Load(), fiveXXCount.Load()); err != nil { - return fmt.Errorf("write report: %w", err) - } log.Printf("✅ loadgen done. total=%d err=%d fiveXX=%d", totalCount.Load(), errCount.Load(), fiveXXCount.Load()) + log.Printf("💡 下一步: ./loadgen --cmd=report --input=./reports --output=./reports/final-report.md") return nil } @@ -186,20 +248,33 @@ func runReport(inputDir, output string) error { return fmt.Errorf("--input required for cmd=report") } - // 1. 收集 reports/run-*/ 下的 *.json + // 1. 递归收集 reports/ 下的所有 *.json (filepath.Glob 不支持 **, 用 WalkDir) var scenarioReports []reporter.RunReport - matches, _ := filepath.Glob(filepath.Join(inputDir, "**", "*.json")) - for _, m := range matches { - data, err := os.ReadFile(m) + err := filepath.WalkDir(inputDir, func(path string, d os.DirEntry, walkErr error) error { + if walkErr != nil { + return nil + } + if d.IsDir() || !strings.HasSuffix(path, ".json") { + return nil + } + // 跳过元数据文件 (它是 RunMetadata 不是 RunReport) + if strings.HasSuffix(path, "run-metadata.json") { + return nil + } + data, err := os.ReadFile(path) if err != nil { - continue + return nil } var rr reporter.RunReport if err := json.Unmarshal(data, &rr); err != nil { - log.Printf("skip %s: %v", m, err) - continue + log.Printf("skip %s: %v", path, err) + return nil } scenarioReports = append(scenarioReports, rr) + return nil + }) + if err != nil { + return fmt.Errorf("walk %s: %w", inputDir, err) } if len(scenarioReports) == 0 { return fmt.Errorf("no JSON reports found in %s", inputDir) @@ -213,17 +288,41 @@ func runReport(inputDir, output string) error { } log.Printf("wrote %s", baselinePath) - // 3. 转 ScenarioReport (供 markdown 用) - scenarioMarkdownReports := make([]reporter.ScenarioReport, 0, len(scenarioReports)) + // 3. 
生成每个 scenario 的 PNG 图表 for _, r := range scenarioReports { - scenarioMarkdownReports = append(scenarioMarkdownReports, reporter.ScenarioReport{ - ID: r.Scenario, - KneeRPS: 0, // 拐点需要分析 raw data 算,简化版留 0 - }) + if len(r.Stages) < 1 { + continue + } + plotPath := filepath.Join(inputDir, strings.ToLower(r.Scenario)+".png") + samples := make([]reporter.Sample, 0, len(r.Stages)) + for _, st := range r.Stages { + tot := st.TotalRequests + errRate := float64(0) + if tot > 0 { + errRate = float64(st.Errors) / float64(tot) + } + samples = append(samples, reporter.Sample{ + RPS: float64(st.TargetRPS), + P99Ms: float64(st.P99Us) / 1000, + ErrorRate: errRate, + }) + } + if err := reporter.PlotRPSLatencyError(r.Scenario, samples, plotPath); err != nil { + log.Printf("⚠️ plot %s failed: %v", r.Scenario, err) + continue + } + log.Printf("wrote %s", plotPath) } - // 4. markdown - if err := reporter.GenerateMarkdown(output, scenarioMarkdownReports); err != nil { + // 4. 读 run-metadata.json (可选,runLoadgen 写入) + var meta reporter.RunMetadata + metaPath := filepath.Join(inputDir, "run-metadata.json") + if data, err := os.ReadFile(metaPath); err == nil { + _ = json.Unmarshal(data, &meta) + } + + // 5. 
markdown (引用生成的 PNG) + if err := reporter.GenerateMarkdown(output, meta, scenarioReports, "./"); err != nil { return fmt.Errorf("write markdown: %w", err) } log.Printf("wrote %s", output) diff --git a/backend/scripts/loadgen/loadgen/reporter/json.go b/backend/scripts/loadgen/loadgen/reporter/json.go index 3b4d747..a49c7cb 100644 --- a/backend/scripts/loadgen/loadgen/reporter/json.go +++ b/backend/scripts/loadgen/loadgen/reporter/json.go @@ -7,20 +7,50 @@ import ( "github.com/HdrHistogram/hdrhistogram-go" ) -type RunReport struct { - Scenario string `json:"scenario"` - TotalRequests int64 `json:"total_requests"` - Errors int64 `json:"errors"` - FiveXX int64 `json:"five_xx"` - P50Us int64 `json:"p50_us"` - P95Us int64 `json:"p95_us"` - P99Us int64 `json:"p99_us"` - MaxUs int64 `json:"max_us"` +type StageReport struct { + StageIdx int `json:"stage_idx"` + TargetRPS int `json:"target_rps"` + TotalRequests int64 `json:"total_requests"` + Errors int64 `json:"errors"` + FiveXX int64 `json:"five_xx"` + P50Us int64 `json:"p50_us"` + P95Us int64 `json:"p95_us"` + P99Us int64 `json:"p99_us"` + MaxUs int64 `json:"max_us"` } -func WriteJSON(path string, scenario string, h *hdrhistogram.Histogram, total, errs, fiveXX int64) error { - r := RunReport{ - Scenario: scenario, +type RunReport struct { + Scenario string `json:"scenario"` + TotalRequests int64 `json:"total_requests"` + Errors int64 `json:"errors"` + FiveXX int64 `json:"five_xx"` + P50Us int64 `json:"p50_us"` + P95Us int64 `json:"p95_us"` + P99Us int64 `json:"p99_us"` + MaxUs int64 `json:"max_us"` + Stages []StageReport `json:"stages,omitempty"` +} + +// WriteJSON writes a RunReport (single scenario, optional per-stage data) to path. +func WriteJSON(path string, r RunReport) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer f.Close() + enc := json.NewEncoder(f) + enc.SetIndent("", " ") + return enc.Encode(r) +} + +// MakeStageReport fills a StageReport from a histogram + counters. 
+func MakeStageReport(idx, targetRPS int, h *hdrhistogram.Histogram, total, errs, fiveXX int64) StageReport { + if h == nil { + return StageReport{StageIdx: idx, TargetRPS: targetRPS} + } + return StageReport{ + StageIdx: idx, + TargetRPS: targetRPS, TotalRequests: total, Errors: errs, FiveXX: fiveXX, @@ -29,25 +59,28 @@ func WriteJSON(path string, scenario string, h *hdrhistogram.Histogram, total, e P99Us: h.ValueAtPercentile(99), MaxUs: h.Max(), } - f, err := os.Create(path) - if err != nil { - return err - } - defer f.Close() - return json.NewEncoder(f).Encode(r) } +// WriteBaselineCSV writes a CSV summary across multiple RunReports. func WriteBaselineCSV(path string, scenarios []RunReport) error { f, err := os.Create(path) if err != nil { return err } defer f.Close() - if _, err := f.WriteString("scenario,total,errors,five_xx,p50_ms,p95_ms,p99_ms,max_ms\n"); err != nil { + if _, err := f.WriteString("scenario,total,errors,five_xx,p50_ms,p95_ms,p99_ms,max_ms,stages\n"); err != nil { return err } for _, s := range scenarios { - _, err := f.WriteString(jsonLine(s) + "\n") + _, err := f.WriteString(s.Scenario + "," + + itoa(s.TotalRequests) + "," + + itoa(s.Errors) + "," + + itoa(s.FiveXX) + "," + + ms(s.P50Us) + "," + + ms(s.P95Us) + "," + + ms(s.P99Us) + "," + + ms(s.MaxUs) + "," + + itoa(int64(len(s.Stages))) + "\n") if err != nil { return err } @@ -55,16 +88,6 @@ func WriteBaselineCSV(path string, scenarios []RunReport) error { return nil } -func jsonLine(s RunReport) string { - b, _ := json.Marshal(s) - s2 := string(b) - if len(s2) >= 2 && s2[0] == '{' { - // strip braces for CSV-friendly format - return s.Scenario + "," + itoa(s.TotalRequests) + "," + itoa(s.Errors) + "," + itoa(s.FiveXX) + "," + ms(s.P50Us) + "," + ms(s.P95Us) + "," + ms(s.P99Us) + "," + ms(s.MaxUs) - } - return s2 -} - func itoa(n int64) string { if n == 0 { return "0" @@ -88,12 +111,10 @@ func itoa(n int64) string { } func ms(us int64) string { - // us / 1000 as float return 
formatFloat(float64(us) / 1000) } func formatFloat(f float64) string { - // simple 2-decimal format intPart := int64(f) frac := int64((f - float64(intPart)) * 100) if frac < 0 { diff --git a/backend/scripts/loadgen/loadgen/reporter/knee.go b/backend/scripts/loadgen/loadgen/reporter/knee.go new file mode 100644 index 0000000..13e9992 --- /dev/null +++ b/backend/scripts/loadgen/loadgen/reporter/knee.go @@ -0,0 +1,33 @@ +package reporter + +// KneeRPS finds the "knee" (turning point) of a multi-stage run. +// +// Heuristic: the first stage where p99 latency grew >50% over the previous +// stage. If no such jump exists (run was healthy throughout), returns the +// highest stage tested (i.e. we never hit the knee). +// +// Returns: +// - kneeRPS: the target_rps at the knee (or highest if no knee found) +// - kneeIdx: the stage index (1-based) where the knee was detected +// - p99Delta: the p99 jump percentage (0.5 = 50% growth) +func KneeRPS(stages []StageReport) (kneeRPS, kneeIdx int, p99Delta float64) { + if len(stages) == 0 { + return 0, 0, 0 + } + if len(stages) == 1 { + return stages[0].TargetRPS, stages[0].StageIdx, 0 + } + for i := 1; i < len(stages); i++ { + prev := stages[i-1].P99Us + if prev == 0 { + continue + } + growth := float64(stages[i].P99Us-prev) / float64(prev) + if growth > 0.5 { + return stages[i].TargetRPS, stages[i].StageIdx, growth + } + } + // 没找到拐点:返回最高 stage + last := stages[len(stages)-1] + return last.TargetRPS, last.StageIdx, 0 +} diff --git a/backend/scripts/loadgen/loadgen/reporter/markdown.go b/backend/scripts/loadgen/loadgen/reporter/markdown.go index 4da3f99..e1985aa 100644 --- a/backend/scripts/loadgen/loadgen/reporter/markdown.go +++ b/backend/scripts/loadgen/loadgen/reporter/markdown.go @@ -3,42 +3,482 @@ package reporter import ( "fmt" "os" + "strings" + "time" ) -type ScenarioReport struct { - ID string - Stages []StageReport - KneeRPS int - TopBottleneck string -} - -type StageReport struct { - RPS int - P50Ms float64 - P95Ms 
float64 - P99Ms float64 - ErrorRate float64 -} - -func GenerateMarkdown(path string, scenarios []ScenarioReport) error { +// GenerateMarkdown writes a rich markdown report. +// +// Includes: +// - Header (run metadata: target, scenarios, time, JWT hint) +// - Executive summary (per-scenario verdicts + key findings) +// - Cross-scenario bottleneck analysis +// - Per-scenario detailed sections with: +// * Description + business impact + API +// * Verdict with reasoning +// * KPI table vs thresholds +// * Knee analysis +// * Stage-by-stage breakdown +// * PNG chart +// * Specific action items +func GenerateMarkdown(path string, meta RunMetadata, scenarios []RunReport, plotDir string) error { f, err := os.Create(path) if err != nil { return err } defer f.Close() - fmt.Fprintf(f, "# 压测报告\n\n") + writeHeader(f, meta, scenarios) + writeExecutiveSummary(f, scenarios) + writeOverviewTable(f, scenarios) + writeCrossScenarioAnalysis(f, scenarios) for _, s := range scenarios { - fmt.Fprintf(f, "## %s\n\n", s.ID) - fmt.Fprintf(f, "**拐点 RPS**: %d\n\n", s.KneeRPS) - fmt.Fprintf(f, "**Top 瓶颈**: %s\n\n", s.TopBottleneck) - fmt.Fprintf(f, "| Stage | RPS | P50ms | P95ms | P99ms | Err%% |\n") - fmt.Fprintf(f, "|-------|-----|-------|-------|-------|------|\n") - for _, st := range s.Stages { - fmt.Fprintf(f, "| - | %d | %.1f | %.1f | %.1f | %.1f |\n", - st.RPS, st.P50Ms, st.P95Ms, st.P99Ms, st.ErrorRate*100) + writeScenarioDetail(f, s, plotDir) + } + writeAppendix(f, meta) + return nil +} + +func writeHeader(f *os.File, meta RunMetadata, scenarios []RunReport) { + fmt.Fprintf(f, "# TopFans 压测报告\n\n") + duration := meta.EndTime.Sub(meta.StartTime).Round(time.Second) + fmt.Fprintf(f, "## 📋 运行信息\n\n") + fmt.Fprintf(f, "| 项 | 值 |\n|---|---|\n") + fmt.Fprintf(f, "| **生成时间** | %s |\n", time.Now().Format("2006-01-02 15:04:05 MST")) + if !meta.StartTime.IsZero() { + fmt.Fprintf(f, "| **压测开始** | %s |\n", meta.StartTime.Format("2006-01-02 15:04:05 MST")) + fmt.Fprintf(f, "| **压测结束** | %s |\n", 
meta.EndTime.Format("2006-01-02 15:04:05 MST")) + fmt.Fprintf(f, "| **总耗时** | %s |\n", duration) + } + fmt.Fprintf(f, "| **目标地址** | `%s` |\n", emptyDash(meta.Target)) + fmt.Fprintf(f, "| **测试场景** | %s |\n", strings.Join(meta.Scenarios, ", ")) + fmt.Fprintf(f, "| **阶梯模式** | %s%s |\n", emptyDash(meta.StageMode), ifThen(meta.StepSchedule != "", " (`"+meta.StepSchedule+"`)", "")) + if meta.JWTSecretHint != "" { + fmt.Fprintf(f, "| **JWT 签名密钥** | `%s***` (前 8 位) |\n", meta.JWTSecretHint) + } + if meta.ProdSSH != "" { + fmt.Fprintf(f, "| **prod SSH** | `%s` |\n", meta.ProdSSH) + } + if meta.MonitorMode != "" { + fmt.Fprintf(f, "| **监控模式** | %s |\n", meta.MonitorMode) + } + + // 总请求数 + var totalReq, totalErr, total5xx int64 + for _, s := range scenarios { + totalReq += s.TotalRequests + totalErr += s.Errors + total5xx += s.FiveXX + } + fmt.Fprintf(f, "| **总请求数** | %s |\n", commaInt(totalReq)) + fmt.Fprintf(f, "| **总错误数** | %s (%.2f%%) |\n", commaInt(totalErr), pct(totalErr, totalReq)) + fmt.Fprintf(f, "| **5xx 数** | %s (%.2f%%) |\n", commaInt(total5xx), pct(total5xx, totalReq)) + fmt.Fprintf(f, "\n---\n\n") +} + +func writeExecutiveSummary(f *os.File, scenarios []RunReport) { + fmt.Fprintf(f, "## 🎯 执行摘要\n\n") + + // Count verdicts + counts := map[string]int{"✅": 0, "⚠️": 0, "🚨": 0} + criticalIssues := []string{} + for _, s := range scenarios { + meta, ok := AllScenarios[s.Scenario] + if !ok { + continue + } + _, _, p99Delta := KneeRPS(s.Stages) + knee := p99Delta > 0.5 + v := meta.Verdict(s, knee) + counts[v]++ + + if v == "🚨" { + issue := fmt.Sprintf("- **%s (%s)**: ", s.Scenario, meta.Name) + if errRate := pct(s.Errors, s.TotalRequests); errRate > 1 { + issue += fmt.Sprintf("错误率 %.2f%% ", errRate) + } + if p99Ms := float64(s.P99Us) / 1000; p99Ms > meta.Thresholds.P99MsMax { + issue += fmt.Sprintf("P99 %.0fms (阈值 %.0fms) ", p99Ms, meta.Thresholds.P99MsMax) + } + if knee { + issue += fmt.Sprintf("拐点 stage %d", stagesIdx(s.Stages)) + } + criticalIssues = 
append(criticalIssues, issue) + } + } + + // Overall verdict + totalSc := len(scenarios) + fmt.Fprintf(f, "**总览**: ✅ %d 健康 / ⚠️ %d 警告 / 🚨 %d 严重 (共 %d)\n\n", + counts["✅"], counts["⚠️"], counts["🚨"], totalSc) + + if len(criticalIssues) == 0 { + fmt.Fprintf(f, "🎉 **所有场景通过健康阈值,系统可承载预期负载。**\n\n") + } else { + fmt.Fprintf(f, "🚨 **关键问题** (%d 个):\n\n", len(criticalIssues)) + for _, issue := range criticalIssues { + fmt.Fprintf(f, "%s\n", issue) } fmt.Fprintf(f, "\n") } - return nil + + // Per-scenario one-liner + fmt.Fprintf(f, "**场景速览**:\n\n") + for _, s := range scenarios { + meta, ok := AllScenarios[s.Scenario] + if !ok { + continue + } + _, _, p99Delta := KneeRPS(s.Stages) + knee := p99Delta > 0.5 + v := meta.Verdict(s, knee) + fmt.Fprintf(f, "- %s **%s %s** — p99=%.0fms, %s", v, s.Scenario, meta.Name, float64(s.P99Us)/1000, errSummary(s)) + if knee { + fmt.Fprintf(f, ", ⚠️ 拐点 stage %d", stagesIdx(s.Stages)) + } + fmt.Fprintf(f, "\n") + } + fmt.Fprintf(f, "\n---\n\n") +} + +func writeOverviewTable(f *os.File, scenarios []RunReport) { + fmt.Fprintf(f, "## 📊 总览表\n\n") + fmt.Fprintf(f, "| 场景 | 描述 | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 拐点 RPS | 状态 |\n") + fmt.Fprintf(f, "|------|------|-------|-----|-----|-------|-------|-------|-------|---------|------|\n") + for _, s := range scenarios { + meta, ok := AllScenarios[s.Scenario] + if !ok { + continue + } + kneeRPS, kneeIdx, p99Delta := KneeRPS(s.Stages) + kneeTriggered := p99Delta > 0.5 + v := meta.Verdict(s, kneeTriggered) + kneeStr := "—" + if kneeTriggered { + kneeStr = fmt.Sprintf("%d (stage %d)", kneeRPS, kneeIdx) + } + fmt.Fprintf(f, "| **%s** | %s | %s | %s (%.2f%%) | %s (%.2f%%) | %.0f | %.0f | %.0f | %.0f | %s | %s |\n", + s.Scenario, meta.Name, + commaInt(s.TotalRequests), + commaInt(s.Errors), pct(s.Errors, s.TotalRequests), + commaInt(s.FiveXX), pct(s.FiveXX, s.TotalRequests), + usToMs(s.P50Us), usToMs(s.P95Us), usToMs(s.P99Us), usToMs(s.MaxUs), + kneeStr, v) + } + fmt.Fprintf(f, "\n> 说明: Err 包含 
4xx + 5xx,5xx 是子集。错误率 = Err / Total。\n\n") +} + +func writeCrossScenarioAnalysis(f *os.File, scenarios []RunReport) { + fmt.Fprintf(f, "## 🔬 跨场景瓶颈分析\n\n") + if len(scenarios) < 2 { + fmt.Fprintf(f, "只有一个场景,无需跨场景分析。\n\n") + return + } + + // Find bottleneck: highest P99 relative to threshold + type scored struct { + scenario string + p99Ms float64 + ratio float64 // p99 / threshold + } + var scoreds []scored + for _, s := range scenarios { + meta, ok := AllScenarios[s.Scenario] + if !ok { + continue + } + p99Ms := float64(s.P99Us) / 1000 + ratio := p99Ms / meta.Thresholds.P99MsMax + scoreds = append(scoreds, scored{s.Scenario, p99Ms, ratio}) + } + // Sort by ratio desc + for i := 0; i < len(scoreds); i++ { + for j := i + 1; j < len(scoreds); j++ { + if scoreds[j].ratio > scoreds[i].ratio { + scoreds[i], scoreds[j] = scoreds[j], scoreds[i] + } + } + } + + if len(scoreds) > 0 && scoreds[0].ratio > 1 { + fmt.Fprintf(f, "🚨 **瓶颈场景: %s** — P99 是阈值的 %.2f 倍\n\n", scoreds[0].scenario, scoreds[0].ratio) + } else if len(scoreds) > 0 { + fmt.Fprintf(f, "✅ **无明显瓶颈**,所有场景 P99 都在阈值内。\n\n") + } + + fmt.Fprintf(f, "**P99 / 阈值 比率** (从高到低):\n\n") + for _, s := range scoreds { + fmt.Fprintf(f, "- %s: %.2fx (%.0fms)\n", s.scenario, s.ratio, s.p99Ms) + } + fmt.Fprintf(f, "\n---\n\n") +} + +func writeScenarioDetail(f *os.File, s RunReport, plotDir string) { + meta, ok := AllScenarios[s.Scenario] + if !ok { + fmt.Fprintf(f, "## %s (无元数据)\n\n", s.Scenario) + fmt.Fprintf(f, "```json\n%+v\n```\n\n", s) + return + } + + kneeRPS, kneeIdx, p99Delta := KneeRPS(s.Stages) + kneeTriggered := p99Delta > 0.5 + verdict := meta.Verdict(s, kneeTriggered) + + fmt.Fprintf(f, "## %s %s %s\n\n", verdict, s.Scenario, meta.Name) + fmt.Fprintf(f, "### 📌 测试说明\n\n") + fmt.Fprintf(f, "| 项 | 值 |\n|---|---|\n") + fmt.Fprintf(f, "| **API** | `%s` |\n", meta.API) + fmt.Fprintf(f, "| **负载类型** | %s |\n", workloadLabel(meta.Workload)) + fmt.Fprintf(f, "| **业务说明** | %s |\n", meta.Description) + fmt.Fprintf(f, "| **影响范围** 
| %s |\n", meta.BusinessImp) + fmt.Fprintf(f, "\n") + + // KPI vs thresholds + fmt.Fprintf(f, "### 📈 性能指标 vs 健康阈值\n\n") + p50Ms := usToMs(s.P50Us) + p95Ms := usToMs(s.P95Us) + p99Ms := usToMs(s.P99Us) + maxMs := usToMs(s.MaxUs) + errRate := pct(s.Errors, s.TotalRequests) + fiveXXRate := pct(s.FiveXX, s.TotalRequests) + fmt.Fprintf(f, "| 指标 | 实测 | 阈值 | 判定 |\n") + fmt.Fprintf(f, "|------|------|------|------|\n") + fmt.Fprintf(f, "| P50ms | %.0f | ≤%.0f | %s |\n", p50Ms, meta.Thresholds.P50MsMax, thresholdMark(p50Ms, meta.Thresholds.P50MsMax)) + fmt.Fprintf(f, "| P95ms | %.0f | ≤%.0f | %s |\n", p95Ms, meta.Thresholds.P95MsMax, thresholdMark(p95Ms, meta.Thresholds.P95MsMax)) + fmt.Fprintf(f, "| P99ms | %.0f | ≤%.0f | %s |\n", p99Ms, meta.Thresholds.P99MsMax, thresholdMark(p99Ms, meta.Thresholds.P99MsMax)) + fmt.Fprintf(f, "| Maxms | %.0f | — | ℹ️ 参考 |\n", maxMs) + fmt.Fprintf(f, "| 错误率 | %.2f%% | ≤%.2f%% | %s |\n", errRate, meta.Thresholds.ErrorRateMax*100, thresholdMark(errRate/100, meta.Thresholds.ErrorRateMax)) + fmt.Fprintf(f, "| 5xx 率 | %.2f%% | ≤%.2f%% | %s |\n", fiveXXRate, meta.Thresholds.FiveXXRateMax*100, thresholdMark(fiveXXRate/100, meta.Thresholds.FiveXXRateMax)) + fmt.Fprintf(f, "\n") + + // Knee + fmt.Fprintf(f, "### 📍 拐点分析\n\n") + if len(s.Stages) <= 1 { + fmt.Fprintf(f, "ℹ️ 仅 1 个 stage,未做阶梯测试,无法判断拐点。\n\n") + } else if kneeTriggered { + fmt.Fprintf(f, "🚨 **拐点**: stage %d @ %d RPS — p99 暴涨 %.0f%%\n\n", + kneeIdx, kneeRPS, p99Delta*100) + fmt.Fprintf(f, "从 stage %d 到 stage %d,p99 延迟从 %.0fms 涨到 %.0fms (%.1fx)。\n", + kneeIdx-1, kneeIdx, usToMs(s.Stages[kneeIdx-2].P99Us), p99Ms, 1+p99Delta) + fmt.Fprintf(f, "\n**含义**: 系统在 %d RPS 时开始出现性能劣化。建议生产限流到 %d RPS 以下。\n\n", + kneeRPS, kneeRPS) + } else { + fmt.Fprintf(f, "✅ **拐点未触发** — 全程 %d 个 stage 健康运行,最高 %d RPS p99=%.0fms。\n\n", + len(s.Stages), kneeRPS, p99Ms) + } + + // Stage table + fmt.Fprintf(f, "### 🔢 阶梯结果\n\n") + if len(s.Stages) == 0 { + fmt.Fprintf(f, "_无 stage 数据_\n\n") + } else { + fmt.Fprintf(f, "| 
Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 |\n") + fmt.Fprintf(f, "|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------|\n") + for i, st := range s.Stages { + growth := "" + if i > 0 { + prevP99 := float64(s.Stages[i-1].P99Us) / 1000 + curP99 := float64(st.P99Us) / 1000 + if prevP99 > 0 { + pct := (curP99 - prevP99) / prevP99 * 100 + growth = fmt.Sprintf("%+.0f%%", pct) + if pct > 50 { + growth = "🚨 " + growth + } + } + } + fmt.Fprintf(f, "| %d | %d | %s | %s | %s | %.0f | %.0f | %.0f | %.0f | %s |\n", + st.StageIdx, st.TargetRPS, + commaInt(st.TotalRequests), commaInt(st.Errors), commaInt(st.FiveXX), + usToMs(st.P50Us), usToMs(st.P95Us), usToMs(st.P99Us), usToMs(st.MaxUs), + growth) + } + fmt.Fprintf(f, "\n") + } + + // Action items + fmt.Fprintf(f, "### 🎯 行动项\n\n") + actionItems(f, s, meta, kneeTriggered, kneeRPS) + + // Plot + if plotDir != "" { + plotName := strings.ToLower(s.Scenario) + ".png" + fmt.Fprintf(f, "### 📉 图表\n\n") + fmt.Fprintf(f, "![%s RPS / P99 / Error](%s/%s)\n\n", s.Scenario, plotDir, plotName) + } + + fmt.Fprintf(f, "---\n\n") +} + +func writeAppendix(f *os.File, meta RunMetadata) { + fmt.Fprintf(f, "## 📎 附录\n\n") + fmt.Fprintf(f, "### 健康阈值说明\n\n") + fmt.Fprintln(f, "- **P50/P95/P99**: 百分位延迟 (毫秒),值越小越好") + fmt.Fprintln(f, "- **错误率**: 4xx+5xx 请求占比,健康 < 1%") + fmt.Fprintln(f, "- **5xx 率**: 服务端错误率,健康 < 0.1%") + fmt.Fprintln(f, "- **拐点**: 阶梯测试中,p99 相对前一 stage 涨幅 > 50% 的第一个 stage") + fmt.Fprintf(f, "\n") + fmt.Fprintf(f, "### 文件清单\n\n") + fmt.Fprintf(f, "```\n") + fmt.Fprintf(f, "reports/\n") + fmt.Fprintf(f, "├── final-report.md (本文件)\n") + fmt.Fprintf(f, "├── baseline.csv (Excel 可打开的汇总)\n") + for _, s := range []string{"S1", "S2", "S3", "S4", "S5", "S6", "S7"} { + fmt.Fprintf(f, "├── %s.json%s\n", strings.ToLower(s), "") + fmt.Fprintf(f, "├── %s.png%s\n", strings.ToLower(s), "") + } + fmt.Fprintf(f, "```\n\n") + fmt.Fprintf(f, "### 如何复现\n\n") + fmt.Fprintf(f, "```bash\n") + fmt.Fprintf(f, 
"cd /opt/topfans/loadtest\n") + if meta.StepSchedule != "" { + fmt.Fprintf(f, "./loadgen --cmd=run --scenarios=%s --stage=%s --step-schedule='%s' \\\n", + strings.Join(meta.Scenarios, ","), meta.StageMode, meta.StepSchedule) + } else { + fmt.Fprintf(f, "./loadgen --cmd=run --scenarios=%s --stage=%s \\\n", + strings.Join(meta.Scenarios, ","), meta.StageMode) + } + if meta.Target != "" { + fmt.Fprintf(f, " --target=%s \\\n", meta.Target) + } + if meta.MonitorMode != "" { + fmt.Fprintf(f, " --monitor=%s \\\n", meta.MonitorMode) + } + if meta.ProdSSH != "" { + fmt.Fprintf(f, " --prod-ssh=%s\n", meta.ProdSSH) + } + fmt.Fprintf(f, "```\n") +} + +// ---- helpers ---- + +func workloadLabel(w string) string { + switch w { + case "read": + return "📖 读" + case "write_light": + return "✏️ 轻写" + case "write_heavy": + return "🛠️ 重写" + } + return w +} + +func thresholdMark(value, threshold float64) string { + if value <= threshold { + return "✅" + } + if value <= threshold*1.5 { + return "⚠️" + } + return "🚨" +} + +func errSummary(s RunReport) string { + if s.TotalRequests == 0 { + return "无请求" + } + rate := pct(s.Errors, s.TotalRequests) + return fmt.Sprintf("err %.2f%%", rate) +} + +func stagesIdx(stages []StageReport) int { + _, idx, _ := KneeRPS(stages) + return idx +} + +func pct(num, denom int64) float64 { + if denom == 0 { + return 0 + } + return float64(num) / float64(denom) * 100 +} + +func usToMs(us int64) float64 { + return float64(us) / 1000 +} + +func commaInt(n int64) string { + if n == 0 { + return "0" + } + neg := n < 0 + if neg { + n = -n + } + s := fmt.Sprintf("%d", n) + // Insert commas + out := []byte{} + for i, c := range s { + if i > 0 && (len(s)-i)%3 == 0 { + out = append(out, ',') + } + out = append(out, byte(c)) + } + if neg { + return "-" + string(out) + } + return string(out) +} + +func emptyDash(s string) string { + if s == "" { + return "—" + } + return s +} + +func ifThen(cond bool, a, b string) string { + if cond { + return a + } + return b +} + +// 
actionItems emits scenario-specific P0/P1/P2 action items. +func actionItems(f *os.File, s RunReport, meta ScenarioMeta, knee bool, _ int) { + p99Ms := usToMs(s.P99Us) + errRate := pct(s.Errors, s.TotalRequests) + fiveXXRate := pct(s.FiveXX, s.TotalRequests) + p99Over := p99Ms > meta.Thresholds.P99MsMax + + anyAction := false + + if knee { + kneeRPS, kneeIdx, _ := KneeRPS(s.Stages) + fmt.Fprintf(f, "- [ ] **🔴 P0**: 修复 stage %d 拐点 (%d RPS, p99=%.0fms)\n", kneeIdx, kneeRPS, p99Ms) + fmt.Fprintf(f, " - 看 PG 慢查询 (`pg_stat_statements ORDER BY mean_exec_time DESC`)\n") + fmt.Fprintf(f, " - 跑应用层 profile (`pprof http://localhost:PORT/debug/pprof/profile`)\n") + fmt.Fprintf(f, " - 临时方案: 服务端限流到 %d RPS,超限返回 429\n", kneeRPS) + anyAction = true + } + + if fiveXXRate > 0.5 { + fmt.Fprintf(f, "- [ ] **🔴 P0**: 5xx 率 %.2f%% — 看 prod 服务日志,定位具体错误\n", fiveXXRate) + anyAction = true + } + if errRate > 1 { + fmt.Fprintf(f, "- [ ] **🟡 P1**: 错误率 %.2f%% — 检查 4xx 错误码,看是否 JWT 过期 / 数据缺失\n", errRate) + anyAction = true + } + if p99Over && !knee { + fmt.Fprintf(f, "- [ ] **🟡 P1**: P99 %.0fms 超过阈值 %.0fms — 检查是否有个别慢查询\n", p99Ms, meta.Thresholds.P99MsMax) + anyAction = true + } + + // Workload-specific suggestions + if meta.Workload == "write_heavy" && (knee || p99Over) { + fmt.Fprintf(f, "- [ ] **🟡 P1**: 写重场景有性能问题 — 考虑把同步写改成异步(消息队列)\n") + anyAction = true + } + if meta.Workload == "read" && (knee || p99Over) { + fmt.Fprintf(f, "- [ ] **🟡 P1**: 读路径有性能问题 — 加 Redis 缓存,减少 DB 直查\n") + anyAction = true + } + + if !anyAction { + fmt.Fprintf(f, "✅ 无需行动项 — 所有指标在阈值内。\n") + } + fmt.Fprintf(f, "\n") } diff --git a/backend/scripts/loadgen/loadgen/reporter/meta.go b/backend/scripts/loadgen/loadgen/reporter/meta.go new file mode 100644 index 0000000..8f779a8 --- /dev/null +++ b/backend/scripts/loadgen/loadgen/reporter/meta.go @@ -0,0 +1,156 @@ +package reporter + +import "time" + +// Thresholds defines health KPIs for a scenario. 
+type Thresholds struct {
+	P50MsMax float64 // P50 latency ceiling in milliseconds (not read by Verdict)
+	P95MsMax float64 // P95 latency ceiling in milliseconds (not read by Verdict)
+	P99MsMax float64 // P99 latency ceiling in milliseconds; Verdict compares P99Us/1000 against it
+	ErrorRateMax float64 // fraction of requests, e.g. 0.01 = 1%
+	FiveXXRateMax float64 // fraction of requests, e.g. 0.001 = 0.1%
+}
+
+// ScenarioMeta describes what a scenario tests and how to evaluate it.
+type ScenarioMeta struct {
+	ID string // registry key, e.g. "S1"
+	Name string // display name, e.g. "登录"
+	API string // endpoint under test, e.g. "POST /api/v1/auth/login"
+	Description string // one-sentence business description
+	BusinessImp string // business impact / blast radius (all users / write-heavy / edge feature)
+	Workload string // "read" | "write_light" | "write_heavy"
+	Thresholds Thresholds
+}
+
+// AllScenarios is the registry of known scenarios, keyed by scenario ID.
+// Keep this in sync with scenarios/s*.go registry.
+var AllScenarios = map[string]ScenarioMeta{
+	"S1": {
+		ID: "S1",
+		Name: "用户登录",
+		API: "POST /api/v1/auth/login",
+		Description: "用户身份认证,签发 JWT",
+		BusinessImp: "🔴 所有用户必经路径,失败 = 用户进不来",
+		Workload: "write_light",
+		Thresholds: Thresholds{
+			P50MsMax: 100, P95MsMax: 300, P99MsMax: 1000,
+			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
+		},
+	},
+	"S2": {
+		ID: "S2",
+		Name: "浏览资产详情",
+		API: "GET /api/v1/assets/{id}",
+		Description: "高频读路径,典型缓存命中场景",
+		BusinessImp: "🟢 单用户最高频操作,影响页面加载体验",
+		Workload: "read",
+		Thresholds: Thresholds{
+			P50MsMax: 50, P95MsMax: 150, P99MsMax: 500,
+			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
+		},
+	},
+	"S3": {
+		ID: "S3",
+		Name: "点赞 / 取消点赞",
+		API: "POST/DELETE /api/v1/social/assets/{id}/like",
+		Description: "轻量写,社交互动",
+		BusinessImp: "🟢 写多但单条小,影响点赞数显示",
+		Workload: "write_light",
+		Thresholds: Thresholds{
+			P50MsMax: 80, P95MsMax: 250, P99MsMax: 800,
+			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
+		},
+	},
+	"S4": {
+		ID: "S4",
+		Name: "资产铸造 (mint)",
+		API: "POST /api/v1/assets/mints/precreate",
+		Description: "写重路径:OSS 上传 + 签名 + 事务落库",
+		BusinessImp: "🟡 核心交易,影响创作者产出节奏",
+		Workload: "write_heavy",
+		Thresholds: Thresholds{
+			P50MsMax: 300, P95MsMax: 800, P99MsMax: 2000, // write-heavy scenario: looser latency thresholds
+			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
+		},
+	},
+	"S5": {
+		ID: "S5",
+		Name: "Dashboard 聚合",
+		API: "聚合多个用户/资产指标",
+		Description: "后台聚合查询,可能涉及多表 JOIN",
+		BusinessImp: "🟢 运营场景,非实时关键",
+		Workload: "read",
+		Thresholds: Thresholds{
+			P50MsMax: 200, P95MsMax: 500, P99MsMax: 1500,
+			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
+		},
+	},
+	"S6": {
+		ID: "S6",
+		Name: "热门榜单",
+		API: "GET /api/v1/rankings/hot",
+		Description: "排序读,Redis 缓存命中率关键",
+		BusinessImp: "🟢 首页流量入口,影响新用户第一印象",
+		Workload: "read",
+		Thresholds: Thresholds{
+			P50MsMax: 30, P95MsMax: 100, P99MsMax: 300,
+			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
+		},
+	},
+	"S7": {
+		ID: "S7",
+		Name: "摆展 (place)",
+		API: "展位分配 + 事务",
+		Description: "写重路径,涉及展位锁竞争",
+		BusinessImp: "🟡 创作者核心操作,涉及并发事务",
+		Workload: "write_heavy",
+		Thresholds: Thresholds{
+			P50MsMax: 400, P95MsMax: 1000, P99MsMax: 2500,
+			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
+		},
+	},
+}
+
+// Verdict returns ✅ (good), ⚠️ (warning) or 🚨 (critical), or ❓ when r has no stages.
+// It uses the run-level error rate, 5xx rate and P99 against s.Thresholds, plus kneeTriggered.
+func (s ScenarioMeta) Verdict(r RunReport, kneeTriggered bool) string {
+	if len(r.Stages) == 0 {
+		return "❓"
+	}
+	errRate := float64(0)
+	fiveXXRate := float64(0)
+	if r.TotalRequests > 0 { // rates stay 0 when there were no requests (avoids dividing by zero)
+		errRate = float64(r.Errors) / float64(r.TotalRequests)
+		fiveXXRate = float64(r.FiveXX) / float64(r.TotalRequests)
+	}
+	p99Ms := float64(r.P99Us) / 1000 // microseconds -> milliseconds
+
+	// Critical: error rate > 2x, 5xx rate > 5x, or P99 > 2x its threshold
+	if errRate > s.Thresholds.ErrorRateMax*2 ||
+		fiveXXRate > s.Thresholds.FiveXXRateMax*5 ||
+		p99Ms > s.Thresholds.P99MsMax*2 {
+		return "🚨"
+	}
+	// Warning: any threshold exceeded, or a knee was detected
+	if errRate > s.Thresholds.ErrorRateMax ||
+		fiveXXRate > s.Thresholds.FiveXXRateMax ||
+		p99Ms > s.Thresholds.P99MsMax ||
+		kneeTriggered {
+		return "⚠️"
+	}
+	return "✅"
+}
+
+// RunMetadata captures run-level context for the report header.
+type RunMetadata struct {
+	StartTime time.Time `json:"start_time"`
+	EndTime time.Time `json:"end_time"`
+	Target string `json:"target"`
+	Scenarios []string `json:"scenarios"`
+	StepSchedule string `json:"step_schedule,omitempty"` // omitted from JSON when empty
+	JWTSecretHint string `json:"jwt_secret_hint,omitempty"` // NOTE(review): presumably a non-sensitive hint, not the secret itself — confirm where it is populated
+	ProdSSH string `json:"prod_ssh,omitempty"` // omitted from JSON when empty
+	MonitorMode string `json:"monitor_mode,omitempty"` // omitted from JSON when empty
+	StageMode string `json:"stage_mode"` // always serialized (no omitempty); "baseline" | "step" | ...
+	RPSOverride int `json:"rps_override,omitempty"` // omitted from JSON when 0
+}
diff --git a/backend/scripts/loadgen/loadgen/scenarios/common.go b/backend/scripts/loadgen/loadgen/scenarios/common.go index 5fc1667..a339962 100644 --- a/backend/scripts/loadgen/loadgen/scenarios/common.go +++ b/backend/scripts/loadgen/loadgen/scenarios/common.go @@ -20,17 +20,20 @@ func doRequest(client *http.Client, req *http.Request, rec *lib.LatencyRecorder, totalCount.Add(1) if err != nil { errCount.Add(1) + rec.RecordResult(true, false) checkBreaker(client, rec, errCount, totalCount, fiveXXCount, breaker) return } defer resp.Body.Close() - switch { - case resp.StatusCode >= 500: + is5xx := resp.StatusCode >= 500 + isErr := resp.StatusCode >= 400 + if is5xx { fiveXXCount.Add(1) errCount.Add(1) - case resp.StatusCode >= 400: + } else if isErr { errCount.Add(1) } + rec.RecordResult(isErr, is5xx) checkBreaker(client, rec, errCount, totalCount, fiveXXCount, breaker) } diff --git a/backend/scripts/loadgen/loadgen/scenarios/s1_login.go b/backend/scripts/loadgen/loadgen/scenarios/s1_login.go index 4cbd57d..0491c70 100644 --- a/backend/scripts/loadgen/loadgen/scenarios/s1_login.go +++ b/backend/scripts/loadgen/loadgen/scenarios/s1_login.go @@ -40,6 +40,10 @@ func (s *s1Login) Run(ctx context.Context, rpsOverride int, durationOverride tim duration = 2 * time.Minute } + // S1 doesn't internally iterate stages, so wrap entire run as stage 1 + s.rec.BeginStage(1, targetRPS) + defer s.rec.EndStage() + ticker := time.NewTicker(time.Second / time.Duration(targetRPS)) defer 
ticker.Stop() timeout := time.NewTimer(duration) diff --git a/backend/scripts/loadgen/loadgen/scenarios/s2_read.go b/backend/scripts/loadgen/loadgen/scenarios/s2_read.go index faa52c8..696155a 100644 --- a/backend/scripts/loadgen/loadgen/scenarios/s2_read.go +++ b/backend/scripts/loadgen/loadgen/scenarios/s2_read.go @@ -38,6 +38,10 @@ func (s *s2Read) Run(ctx context.Context, rpsOverride int, durationOverride time duration = 2 * time.Minute } + // S2 doesn't internally iterate stages, wrap entire run as stage 1 + s.rec.BeginStage(1, targetRPS) + defer s.rec.EndStage() + ticker := time.NewTicker(time.Second / time.Duration(targetRPS)) defer ticker.Stop() timeout := time.NewTimer(duration) diff --git a/backend/scripts/loadgen/loadgen/scenarios/s3_like.go b/backend/scripts/loadgen/loadgen/scenarios/s3_like.go index 6b4bf77..8511958 100644 --- a/backend/scripts/loadgen/loadgen/scenarios/s3_like.go +++ b/backend/scripts/loadgen/loadgen/scenarios/s3_like.go @@ -39,6 +39,10 @@ func (s *s3Like) Run(ctx context.Context, rpsOverride int, durationOverride time duration = 2 * time.Minute } + // S3 doesn't internally iterate stages, wrap entire run as stage 1 + s.rec.BeginStage(1, targetRPS) + defer s.rec.EndStage() + ticker := time.NewTicker(time.Second / time.Duration(targetRPS)) defer ticker.Stop() timeout := time.NewTimer(duration) diff --git a/backend/scripts/loadgen/loadgen/scenarios/s4_mint.go b/backend/scripts/loadgen/loadgen/scenarios/s4_mint.go index d31e812..9909231 100644 --- a/backend/scripts/loadgen/loadgen/scenarios/s4_mint.go +++ b/backend/scripts/loadgen/loadgen/scenarios/s4_mint.go @@ -37,11 +37,18 @@ func (s *s4Mint) Run(ctx context.Context, rpsOverride int, durationOverride time if len(stages) == 0 { stages = []int{5, 10, 20, 30, 50, 80} } + stageDuration := 2 * time.Minute + if durationOverride > 0 && durationOverride < stageDuration { + stageDuration = durationOverride + } for stageIdx, stageRPS := range stages { - logf("S4 stage %d/%d: %d RPS × 2min", 
stageIdx+1, len(stages), stageRPS) - if err := s.runStage(ctx, stageRPS, 2*time.Minute); err != nil { + logf("S4 stage %d/%d: %d RPS × %v", stageIdx+1, len(stages), stageRPS, stageDuration) + s.rec.BeginStage(stageIdx+1, stageRPS) + if err := s.runStage(ctx, stageRPS, stageDuration); err != nil { + s.rec.EndStage() return err } + s.rec.EndStage() logf("S4 stage %d done, resetting mint data...", stageIdx+1) if s.prodSSH != "" { cmd := exec.Command("ssh", s.prodSSH, "bash /opt/topfans/loadtest/scripts/mint_reset.sh")
diff --git a/backend/scripts/loadgen/scripts/prod_seed.sh b/backend/scripts/loadgen/scripts/prod_seed.sh
new file mode 100644
index 0000000..59bb229
--- /dev/null
+++ b/backend/scripts/loadgen/scripts/prod_seed.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# ===================================================================
+# prod seed one-shot runner
+# Purpose: read DB/JWT credentials from /opt/topfans/docker/.env.prod and run the seed tool
+# Usage:   ssh root@101.132.250.62 "bash /opt/topfans/loadtest/scripts/prod_seed.sh"
+# ===================================================================
+set -euo pipefail
+
+ENV_FILE="/opt/topfans/docker/.env.prod"
+LOADTEST_DIR="/opt/topfans/loadtest"
+
+if [[ ! -f "$ENV_FILE" ]]; then
+  echo "❌ $ENV_FILE 不存在"
+  exit 1
+fi
+
+# read_env KEY: print KEY's value from $ENV_FILE, or fail if it is missing/empty.
+# cut -f2- keeps everything after the first '=', so values that contain '='
+# (e.g. base64-padded secrets) are not truncated.
+read_env() {
+  local key="$1" value
+  value=$(grep -m1 "^${key}=" "$ENV_FILE" | cut -d= -f2-) || true
+  if [[ -z "$value" ]]; then
+    echo "❌ $ENV_FILE 缺少 ${key}" >&2
+    return 1
+  fi
+  printf '%s' "$value"
+}
+
+# Assign first, export afterwards: `export VAR=$(cmd)` would hide cmd's exit
+# status from `set -e`, letting the seed run with empty credentials.
+DB_PASSWORD=$(read_env DB_PASSWORD)
+JWT_SECRET=$(read_env JWT_SECRET)
+export DB_PASSWORD JWT_SECRET
+
+cd "$LOADTEST_DIR"
+
+echo "=========================================="
+echo "prod seed - 准备 loadtest 数据"
+echo "DB host: localhost (容器内)"
+echo "DB name: topfans"
+# Never print any part of the secret; only confirm that it was loaded.
+echo "JWT secret: (已加载)"
+echo "=========================================="
+
+./seed --db-name=topfans --jwt-secret="$JWT_SECRET"
+
+echo ""
+echo "✅ seed 完成。生成的文件:"
+ls -la users.csv
+echo ""
+echo "下一步: ./loadgen --cmd=preflight --target=http://localhost:8080"
diff --git a/backend/scripts/loadgen/seed/README.md b/backend/scripts/loadgen/seed/README.md index 9404d5c..45cf422 100644 --- a/backend/scripts/loadgen/seed/README.md +++ b/backend/scripts/loadgen/seed/README.md @@ -1,67 +1,188 @@ # seed - 压测数据准备工具 -## 用途 +> 给 prod 凌晨压测灌 1000 个测试用户 + 资产 + JWT,数据用 `star_id=999900` 物理隔离。 -在 prod 本地插入 1000 个测试用户、5000 资产、3000 booth_slots、2000 exhibitions、10000 friendships,签 1000 个 JWT,写 `users.csv`。 +--- + +## 一句话总结 + +跑 `./seed`,数据库里多出 1000 个用户 + 5000 个 assets + 2000 个 exhibitions,本地多出 `users.csv` (含 JWT)。 + +--- ## 编译 ```bash -cd backend && go build -o seed ./scripts/loadgen/seed/ +cd backend +go build -o bin/seed ./scripts/loadgen/seed/ +# 或 +make loadgen-build ``` -## 在 prod 上跑 +--- + +## 在 prod 上跑 (凌晨 T0 = 02:00) ```bash -# 1. 上传二进制 -scp seed root@101.132.250.62:/opt/topfans/loadtest/ - -# 2. SSH 上去跑 ssh root@101.132.250.62 cd /opt/topfans/loadtest -export DB_PASSWORD=$(cat /opt/topfans/docker/.env.prod | grep DB_PASSWORD | cut -d= -f2) -export JWT_SECRET=$(cat /opt/topfans/docker/.env.prod | grep JWT_SECRET | cut -d= -f2) -./seed --db-name=topfans --jwt-secret="$JWT_SECRET" +bash scripts/prod_seed.sh ``` -## 清理 +这个脚本会自动: +1. 读 `/opt/topfans/docker/.env.prod` 拿 DB_PASSWORD + JWT_SECRET +2. 跑 seed (插入 23k 行测试数据) +3. 自动重置 PG 序列 (CLAUDE.md 规范) +4. 写 `users.csv` (含 1000 个 JWT) + +**预计耗时**:30-60 秒 + +--- + +## 在本地 docker 跑 (开发联调) ```bash -# 保留 1000 users + 资产(下次复用) -./seed --cleanup +cd backend/scripts/loadgen/seed -# 全删(包括账号本身) -./seed --cleanup --full +# 1. 
生成 bcrypt 哈希 (与 tokens.go 硬编码的 "Test@123" 匹配) +python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \ + > loadtest_bcrypt.txt -# 只重签 token(第二轮压测 JWT 过期时) -./seed --reset-tokens --jwt-secret="$JWT_SECRET" +# 2. 跑 seed (假设本地 docker postgres 在 15432) +# 留在当前 seed 目录运行 (loadtest_bcrypt.txt 按相对路径从当前目录读取) +DB_PASSWORD=123456 \ +JWT_SECRET=topfans-secret-key-local-dev-only \ +../../../bin/seed \ + --db-name=top-fans \ + --db-host=localhost \ + --db-port=15432 \ + --db-user=postgres ``` -## 本地 docker 联调(开发阶段) +**注意**: `loadtest_bcrypt.txt` 必须在 seed 二进制运行的**当前目录**(代码用相对路径读)。 + +--- + +## 命令行参数 + +``` +./bin/seed --help + +Usage of ./bin/seed: + -cleanup # 跑清理 (默认保留 1000 users) + -cleanup-star-id int # 要清的 star_id (默认 999900, 防止误删) + -full # 配合 -cleanup: 也删用户和 stars + -reset # 删旧数据再 seed (隐含 --cleanup 行为) + -reset-tokens # 只重签 JWT (数据保留) + -jwt-secret string # JWT 密钥 (默认 $JWT_SECRET) + -db-host string # PG host (默认 localhost) + -db-port int # PG port (默认 5432) + -db-name string # PG 数据库 (prod=topfans, 本地=top-fans) + -db-user string # PG user (默认 postgres) + -db-password string # PG 密码 (默认 $DB_PASSWORD) +``` + +--- + +## 四种"清理 / 重置"模式对比 + +| 命令 | 删 stars | 删 users | 删 assets/exhibits | 用途 | +|------|---------|---------|-------------------|------| +| `./seed --cleanup` | ❌ | ❌ | ✅ | 压完一轮,清理资产但保留账号 | +| `./seed --cleanup --full` | ✅ | ✅ | ✅ | 全部清,下次重新 seed | +| `./seed --reset` | ❌ | ❌ | ✅ | 等同 `--cleanup`(保留用户) | +| `./seed --reset-tokens` | ❌ | ❌ | ❌ | 只重新签 JWT,数据不动 | + +**典型流程**: +```bash +# 第 1 轮压测 (02:00-03:00) +./seed # 灌数据 +./loadgen --cmd=run --scenarios=S1,S2,S4 # 压测 +./seed --cleanup # 压完清理资产 + +# 第 2 轮压测 (下周,JWT 过期了) +./seed --reset-tokens --jwt-secret="$JWT_SECRET" # 只重签 JWT +./loadgen --cmd=run --scenarios=S1,S2,S4 # 复测 + +# 完全重来 (例如改了用户模型) +./seed --cleanup --full # 全删 +./seed # 重新灌 +``` + +--- + +## 数据规模 + +| 表 | 行数 | 备注 | +|----|------|------| +| `stars` | +1 | star_id=999900 | +| `users` | +1000 | mobile 19900000001 ~ 
19900001000 | +| `fan_profiles` | +1000 | 每个 user 一个 | +| `crystal_transaction_records` | +1000+ | 初始水晶 | +| `assets` | +5000 | 每个 user ~5 个 | +| `booth_slots` | +3000 | | +| `exhibitions` | +2000 | | +| `friendships` | +10000 | | +| **TOTAL** | **~23k 行** | | + +--- + +## 关键设计 + +### 1. star_id 隔离 +所有测试数据用 `star_id = 999900`,**不影响**真实业务 (87, 88, 91, 93, 94, 95)。 + +### 2. PG max_connections = 50 +prod 已将 `POSTGRES_MAX_CONNECTIONS` 从 100 调到 50,避免被测试数据耗尽连接池。 + +### 3. CLAUDE.md 序列重置 +seed 末尾自动 `setval()` 所有相关表的 sequence,避免后续 GORM 插入报 duplicate key。 + +### 4. JWT 7 天过期 +跨周第二轮压测前需 `--reset-tokens` 重签。 + +### 5. bcrypt 哈希与密码硬编码 +- `tokens.go` 硬编码密码为 `"Test@123"`(写到 users.csv 的 password 列) +- `loadtest_bcrypt.txt` 是这个密码的 bcrypt(cost=10) 哈希 +- 二者必须匹配,否则 login 会失败 (现象与修复见下方"常见问题"第一条) + +--- + +## 常见问题 + +### Q: 跑完 seed 但 login 报"密码错误"? +A: `loadtest_bcrypt.txt` 没匹配上 `Test@123`。 +```bash +python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \ + > loadtest_bcrypt.txt +./seed --cleanup --full && ./seed +``` + +### Q: 想换密码怎么办? +A: 同时改两个地方: +1. `tokens.go` 的 `u.Mobile, "Test@123"` → 你的密码 +2. `loadtest_bcrypt.txt` 重新生成 + +### Q: "loadtest_bcrypt.txt: no such file or directory"? +A: seed 用相对路径读这个文件,必须在 seed 目录跑(或者把文件 cp 到当前目录)。 + +### Q: --reset 没生效,users 还是旧的? 
+A: 因为 `--reset` 等同 `--cleanup`(保留用户)。要删用户用 `--cleanup --full`。 + +--- + +## 单元测试 ```bash cd backend -go build -o bin/seed ./scripts/loadgen/seed/ -DB_PASSWORD=postgres123 JWT_SECRET=topfans-secret-key-local-dev-only \ - ./bin/seed --db-name=top-fans --db-host=localhost -``` - -## 关键约束 - -- **star_id = 999900**:所有数据用此 star_id 隔离,不影响真实业务 -- **PG max_connections = 50**:Task 5 已将 `POSTGRES_MAX_CONNECTIONS` 从 100 改到 50 -- **CLAUDE.md 序列重置**:ResetSequences 会在 seed 末尾自动同步所有相关表的 sequence,避免后续 GORM 插入报 duplicate key -- **JWT 7 天过期**:跨周第二轮压测前需 `--reset-tokens` 重签 - -## 测试 - -```bash -cd backend && go test ./scripts/loadgen/seed/ -v +go test ./scripts/loadgen/seed/ -v ``` 5 个测试: -- `TestMobileNumbering`:mobile 编号正确性 -- `TestSequenceMapping`:loadtestSeqs 映射 -- `TestPKColumnMapping`:pkColumns 映射(关键 stars/star_id, booth_slots/slot_id) -- `TestCleanupRejectsInvalidStarID`:cleanup 拒绝非 loadtest star_id -- `TestJoinInt64`:CSV 序列化辅助函数 +- `TestMobileNumbering`: mobile 编号正确性 +- `TestSequenceMapping`: loadtestSeqs 映射 +- `TestPKColumnMapping`: pkColumns 映射(关键 stars/star_id, booth_slots/slot_id) +- `TestCleanupRejectsInvalidStarID`: cleanup 拒绝非 loadtest star_id +- `TestJoinInt64`: CSV 序列化辅助函数 + +**测试状态**: 5/5 PASS diff --git a/frontend/pages/square/components/CreationGrid.vue b/frontend/pages/square/components/CreationGrid.vue index 3197fe9..6bb6a58 100644 --- a/frontend/pages/square/components/CreationGrid.vue +++ b/frontend/pages/square/components/CreationGrid.vue @@ -470,7 +470,7 @@ defineExpose({ .creation-grid { display: flex; flex-wrap: wrap; - justify-content: space-between; + justify-content: space-around; padding-bottom: 120rpx; } diff --git a/frontend/pages/square/components/HotCategoryBlock.vue b/frontend/pages/square/components/HotCategoryBlock.vue index cf401db..5fdc78a 100644 --- a/frontend/pages/square/components/HotCategoryBlock.vue +++ b/frontend/pages/square/components/HotCategoryBlock.vue @@ -456,6 +456,8 @@ onUnmounted(() => { min-height: 0; border-radius: 12px; 
overflow: hidden; + position: relative; + z-index: 2; } .ranking-tabs { @@ -636,6 +638,7 @@ onUnmounted(() => { /* box-shadow: 2px 2px 4.5px 0px #f04b4b40; */ box-shadow: 2px 4px 4px 0px #c92f2f5c; margin-bottom: 36.8rpx; + z-index: 3; } /* 单行布局:藏品图片 + 头像 + 点赞信息 + TOP 标签 */