feat:修改压测配置

This commit is contained in:
zerosaturation 2026-06-15 20:10:56 +08:00
parent 0aa02cc3a4
commit 1bc86f0447
28 changed files with 2354 additions and 181 deletions

View File

@ -1,7 +1,7 @@
# TopFans Backend Makefile # TopFans Backend Makefile
# 用于简化开发流程 # 用于简化开发流程
.PHONY: help install-swagger gen-swagger update-swagger start-swagger start-all stop-all clean build run all .PHONY: help install-swagger gen-swagger update-swagger start-swagger start-all stop-all clean build run all loadgen-build loadgen-test loadgen-vet loadgen-ci
# 默认目标 # 默认目标
help: help:
@ -23,6 +23,11 @@ help:
@echo " make run - 运行 Gateway" @echo " make run - 运行 Gateway"
@echo " make all - 安装依赖 + 生成文档 + 构建" @echo " make all - 安装依赖 + 生成文档 + 构建"
@echo "" @echo ""
@echo "压测工具:"
@echo " make loadgen-build - 编译 seed + loadgen 到 bin/"
@echo " make loadgen-test - 运行 loadgen 单元测试"
@echo " make loadgen-vet - go vet 静态检查"
@echo ""
@echo "清理:" @echo "清理:"
@echo " make clean - 清理生成的文件" @echo " make clean - 清理生成的文件"
@echo "" @echo ""
@ -37,6 +42,11 @@ help:
@echo " make run - 运行 Gateway" @echo " make run - 运行 Gateway"
@echo " make all - 安装依赖 + 生成文档 + 构建" @echo " make all - 安装依赖 + 生成文档 + 构建"
@echo "" @echo ""
@echo "压测工具:"
@echo " make loadgen-build - 编译 seed + loadgen 到 bin/"
@echo " make loadgen-test - 运行 loadgen 单元测试"
@echo " make loadgen-vet - go vet 静态检查"
@echo ""
@echo "清理:" @echo "清理:"
@echo " make clean - 清理生成的文件" @echo " make clean - 清理生成的文件"
@ -92,8 +102,32 @@ clean:
@rm -rf backend/gateway/docs/*.go @rm -rf backend/gateway/docs/*.go
@rm -rf backend/gateway/docs/*.json @rm -rf backend/gateway/docs/*.json
@rm -rf backend/gateway/docs/*.yaml @rm -rf backend/gateway/docs/*.yaml
@rm -rf backend/bin/
@echo "✅ 清理完成" @echo "✅ 清理完成"
# ==================== Loadgen / load-testing tools ====================
# Shared settings for the loadgen targets below.
LOADGEN_BIN_DIR := bin
LOADGEN_ROOT    := ./scripts/loadgen
LOADGEN_LDFLAGS := -s -w

# Build the seed and loadgen binaries into $(LOADGEN_BIN_DIR)/.
loadgen-build:
	@echo "编译 loadgen 工具..."
	@mkdir -p $(LOADGEN_BIN_DIR)
	@go build -ldflags="$(LOADGEN_LDFLAGS)" -o $(LOADGEN_BIN_DIR)/seed $(LOADGEN_ROOT)/seed/
	@go build -ldflags="$(LOADGEN_LDFLAGS)" -o $(LOADGEN_BIN_DIR)/loadgen $(LOADGEN_ROOT)/loadgen/
	@echo "✅ seed + loadgen → bin/"

# Run the loadgen unit tests (23 tests at the time of writing; all are expected to pass).
loadgen-test:
	@echo "运行 loadgen 单元测试..."
	@go test -count=1 $(LOADGEN_ROOT)/...

# Static analysis with go vet.
loadgen-vet:
	@echo "go vet loadgen..."
	@go vet $(LOADGEN_ROOT)/...

# Full CI entry point for loadgen; prerequisites are listed as vet, test, build.
loadgen-ci: loadgen-vet loadgen-test loadgen-build
# 全部:安装依赖 + 生成文档 + 构建 # 全部:安装依赖 + 生成文档 + 构建
all: install-swagger gen-swagger build all: install-swagger gen-swagger build
@echo "" @echo ""

23
backend/reports/S1.json Normal file
View File

@ -0,0 +1,23 @@
{
"scenario": "S1",
"total_requests": 8,
"errors": 0,
"five_xx": 0,
"p50_us": 73919,
"p95_us": 83071,
"p99_us": 83071,
"max_us": 83071,
"stages": [
{
"stage_idx": 1,
"target_rps": 1,
"total_requests": 8,
"errors": 0,
"five_xx": 0,
"p50_us": 73919,
"p95_us": 83071,
"p99_us": 83071,
"max_us": 83071
}
]
}

23
backend/reports/S2.json Normal file
View File

@ -0,0 +1,23 @@
{
"scenario": "S2",
"total_requests": 8,
"errors": 8,
"five_xx": 0,
"p50_us": 1552,
"p95_us": 2909,
"p99_us": 2909,
"max_us": 2909,
"stages": [
{
"stage_idx": 1,
"target_rps": 1,
"total_requests": 8,
"errors": 8,
"five_xx": 0,
"p50_us": 1552,
"p95_us": 2909,
"p99_us": 2909,
"max_us": 2909
}
]
}

45
backend/reports/S4.json Normal file
View File

@ -0,0 +1,45 @@
{
"scenario": "S4",
"total_requests": 18,
"errors": 18,
"five_xx": 0,
"p50_us": 1210,
"p95_us": 2161,
"p99_us": 2161,
"max_us": 2161,
"stages": [
{
"stage_idx": 1,
"target_rps": 1,
"total_requests": 3,
"errors": 3,
"five_xx": 0,
"p50_us": 4143,
"p95_us": 8943,
"p99_us": 8943,
"max_us": 8943
},
{
"stage_idx": 2,
"target_rps": 2,
"total_requests": 6,
"errors": 6,
"five_xx": 0,
"p50_us": 1314,
"p95_us": 2044,
"p99_us": 2044,
"max_us": 2044
},
{
"stage_idx": 3,
"target_rps": 3,
"total_requests": 9,
"errors": 9,
"five_xx": 0,
"p50_us": 1210,
"p95_us": 2161,
"p99_us": 2161,
"max_us": 2161
}
]
}

View File

@ -0,0 +1,4 @@
scenario,total,errors,five_xx,p50_ms,p95_ms,p99_ms,max_ms,stages
S1,8,0,0,73.91,83.07,83.07,83.07,1
S2,8,8,0,1.55,2.90,2.90,2.90,1
S4,18,18,0,1.20,2.16,2.16,2.16,3
1 scenario total errors five_xx p50_ms p95_ms p99_ms max_ms stages
2 S1 8 0 0 73.91 83.07 83.07 83.07 1
3 S2 8 8 0 1.55 2.90 2.90 2.90 1
4 S4 18 18 0 1.20 2.16 2.16 2.16 3

View File

@ -0,0 +1,227 @@
# TopFans 压测报告
## 📋 运行信息
| 项 | 值 |
|---|---|
| **生成时间** | 2026-06-15 20:05:56 CST |
| **压测开始** | 2026-06-15 20:05:47 CST |
| **压测结束** | 2026-06-15 20:05:56 CST |
| **总耗时** | 9s |
| **目标地址** | `http://localhost:8080` |
| **测试场景** | S4 |
| **阶梯模式** | step (`1,2,3`) |
| **JWT 签名密钥** | `topfans-***` (前 8 位) |
| **监控模式** | off |
| **总请求数** | 34 |
| **总错误数** | 26 (76.47%) |
| **5xx 数** | 0 (0.00%) |
---
## 🎯 执行摘要
**总览**: ✅ 1 健康 / ⚠️ 0 警告 / 🚨 2 严重 (共 3)
🚨 **关键问题** (2 个):
- **S2 (浏览资产详情)**: 错误率 100.00%
- **S4 (资产铸造 (mint))**: 错误率 100.00%
**场景速览**:
- ✅ **S1 用户登录** — p99=83ms, err 0.00%
- 🚨 **S2 浏览资产详情** — p99=3ms, err 100.00%
- 🚨 **S4 资产铸造 (mint)** — p99=2ms, err 100.00%
---
## 📊 总览表
| 场景 | 描述 | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 拐点 RPS | 状态 |
|------|------|-------|-----|-----|-------|-------|-------|-------|---------|------|
| **S1** | 用户登录 | 8 | 0 (0.00%) | 0 (0.00%) | 74 | 83 | 83 | 83 | — | ✅ |
| **S2** | 浏览资产详情 | 8 | 8 (100.00%) | 0 (0.00%) | 2 | 3 | 3 | 3 | — | 🚨 |
| **S4** | 资产铸造 (mint) | 18 | 18 (100.00%) | 0 (0.00%) | 1 | 2 | 2 | 2 | — | 🚨 |
> 说明: Err 包含 4xx + 5xx,5xx 是子集。错误率 = Err / Total。
## 🔬 跨场景瓶颈分析
**无明显瓶颈**,所有场景 P99 都在阈值内。
**P99 / 阈值 比率** (从高到低):
- S1: 0.08x (83ms)
- S2: 0.01x (3ms)
- S4: 0.00x (2ms)
---
## ✅ S1 用户登录
### 📌 测试说明
| 项 | 值 |
|---|---|
| **API** | `POST /api/v1/auth/login` |
| **负载类型** | ✏️ 轻写 |
| **业务说明** | 用户身份认证,签发 JWT |
| **影响范围** | 🔴 所有用户必经路径,失败 = 用户进不来 |
### 📈 性能指标 vs 健康阈值
| 指标 | 实测 | 阈值 | 判定 |
|------|------|------|------|
| P50ms | 74 | ≤100 | ✅ |
| P95ms | 83 | ≤300 | ✅ |
| P99ms | 83 | ≤1000 | ✅ |
| Maxms | 83 | — | 参考 |
| 错误率 | 0.00% | ≤1.00% | ✅ |
| 5xx 率 | 0.00% | ≤0.10% | ✅ |
### 📍 拐点分析
仅 1 个 stage,未做阶梯测试,无法判断拐点。
### 🔢 阶梯结果
| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 |
|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------|
| 1 | 1 | 8 | 0 | 0 | 74 | 83 | 83 | 83 | |
### 🎯 行动项
✅ 无需行动项 — 所有指标在阈值内。
### 📉 图表
![S1 RPS / P99 / Error](.//s1.png)
---
## 🚨 S2 浏览资产详情
### 📌 测试说明
| 项 | 值 |
|---|---|
| **API** | `GET /api/v1/assets/{id}` |
| **负载类型** | 📖 读 |
| **业务说明** | 高频读路径,典型缓存命中场景 |
| **影响范围** | 🟢 单用户最高频操作,影响页面加载体验 |
### 📈 性能指标 vs 健康阈值
| 指标 | 实测 | 阈值 | 判定 |
|------|------|------|------|
| P50ms | 2 | ≤50 | ✅ |
| P95ms | 3 | ≤150 | ✅ |
| P99ms | 3 | ≤500 | ✅ |
| Maxms | 3 | — | 参考 |
| 错误率 | 100.00% | ≤1.00% | 🚨 |
| 5xx 率 | 0.00% | ≤0.10% | ✅ |
### 📍 拐点分析
仅 1 个 stage,未做阶梯测试,无法判断拐点。
### 🔢 阶梯结果
| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 |
|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------|
| 1 | 1 | 8 | 8 | 0 | 2 | 3 | 3 | 3 | |
### 🎯 行动项
- [ ] **🟡 P1**: 错误率 100.00% — 检查 4xx 错误码,看是否 JWT 过期 / 数据缺失
### 📉 图表
![S2 RPS / P99 / Error](.//s2.png)
---
## 🚨 S4 资产铸造 (mint)
### 📌 测试说明
| 项 | 值 |
|---|---|
| **API** | `POST /api/v1/assets/mints/precreate` |
| **负载类型** | 🛠️ 重写 |
| **业务说明** | 写重路径:OSS 上传 + 签名 + 事务落库 |
| **影响范围** | 🟡 核心交易,影响创作者产出节奏 |
### 📈 性能指标 vs 健康阈值
| 指标 | 实测 | 阈值 | 判定 |
|------|------|------|------|
| P50ms | 1 | ≤300 | ✅ |
| P95ms | 2 | ≤800 | ✅ |
| P99ms | 2 | ≤2000 | ✅ |
| Maxms | 2 | — | 参考 |
| 错误率 | 100.00% | ≤1.00% | 🚨 |
| 5xx 率 | 0.00% | ≤0.10% | ✅ |
### 📍 拐点分析
**拐点未触发** — 全程 3 个 stage 健康运行,最高 3 RPS p99=2ms。
### 🔢 阶梯结果
| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 |
|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------|
| 1 | 1 | 3 | 3 | 0 | 4 | 9 | 9 | 9 | |
| 2 | 2 | 6 | 6 | 0 | 1 | 2 | 2 | 2 | -77% |
| 3 | 3 | 9 | 9 | 0 | 1 | 2 | 2 | 2 | +6% |
### 🎯 行动项
- [ ] **🟡 P1**: 错误率 100.00% — 检查 4xx 错误码,看是否 JWT 过期 / 数据缺失
### 📉 图表
![S4 RPS / P99 / Error](.//s4.png)
---
## 📎 附录
### 健康阈值说明
- **P50/P95/P99**: 百分位延迟 (毫秒),值越小越好
- **错误率**: 4xx+5xx 请求占比,健康 < 1%
- **5xx 率**: 服务端错误率,健康 < 0.1%
- **拐点**: 阶梯测试中,p99 相对前一 stage 涨幅 > 50% 的第一个 stage
### 文件清单
```
reports/
├── final-report.md (本文件)
├── baseline.csv (Excel 可打开的汇总)
├── s1.json
├── s1.png
├── s2.json
├── s2.png
├── s3.json
├── s3.png
├── s4.json
├── s4.png
├── s5.json
├── s5.png
├── s6.json
├── s6.png
├── s7.json
├── s7.png
```
### 如何复现
```bash
cd /opt/topfans/loadtest
./loadgen --cmd=run --scenarios=S4 --stage=step --step-schedule='1,2,3' \
--target=http://localhost:8080 \
--monitor=off
```

View File

@ -0,0 +1,12 @@
{
"start_time": "2026-06-15T20:05:47.357522+08:00",
"end_time": "2026-06-15T20:05:56.380495+08:00",
"target": "http://localhost:8080",
"scenarios": [
"S4"
],
"step_schedule": "1,2,3",
"jwt_secret_hint": "topfans-",
"monitor_mode": "off",
"stage_mode": "step"
}

BIN
backend/reports/s1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

BIN
backend/reports/s2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

BIN
backend/reports/s4.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

View File

@ -1,69 +1,129 @@
# 后端服务压测工具 # 后端服务压测工具 (loadgen)
为部署在阿里云单机4G/2C的 TopFans 后端微服务设计。 > 给阿里云单机 (4G/2C) TopFans 后端微服务用的压测 + 数据准备工具集。
> 凌晨 02:00-06:00 业务低峰执行,数据物理隔离 `star_id=999900`
## 目录 ---
## 📚 文档地图
| 文档 | 用途 | 谁要看 |
|------|------|--------|
| **README.md** (本文) | 工具集概览 + 5 分钟入门 | 所有人 |
| [RUNBOOK.md](RUNBOOK.md) | 凌晨压测**一步一步**操作手册 | on-call 工程师 |
| [REPORT_GUIDE.md](REPORT_GUIDE.md) | 压测报告**怎么读** + 瓶颈定位 + 行动项模板 | 看报告的工程师 / TL |
| [seed/README.md](seed/README.md) | seed 工具细节 (数据准备) | 第一次跑压测的人 |
---
## 🧰 工具集概览
``` ```
backend/scripts/loadgen/ loadgen/
├── seed/ # 数据准备工具(CLI) ├── seed/ # 数据准备 CLI (生成 1000 个测试用户 + 资产 + JWT)
│ ├── main.go # seed CLI 入口 ├── loadgen/ # 压测主程序 (7 个场景,6 维熔断,带 reporter)
│ ├── stars.go users.go profiles.go assets.go ├── monitor/ # 监控栈 (Prometheus + Grafana,可选)
│ ├── slots_and_exhibits.go friendships.go ├── recover/ # 紧急灭火 (一键停 + 数据库恢复)
│ ├── tokens.go sequences.go cleanup.go ├── scripts/ # 部署到 prod 的辅助脚本
│ ├── seed_test.go # 单元测试 └── reports/ # 跑测产出 (gitignore,scp 拉回本地)
│ └── README.md
├── loadgen/ # 压测主程序
│ ├── main.go # loadgen CLI 入口
│ ├── preflight.go verify.go # 7 项开压前检查 + 压后验证
│ ├── lib/ # 核心库(16 个测试全过)
│ │ ├── csv.go client.go hdr.go log.go ramp.go
│ │ ├── circuit.go ssh_metrics.go config.go
│ │ └── *_test.go
│ ├── scenarios/ # 7 个场景(已注册)
│ │ ├── s1_login.go s2_read.go s3_like.go s4_mint.go
│ │ ├── s5_dashboard.go s6_ranking.go s7_place.go
│ │ ├── common.go scenarios.go
│ │ └── scenarios_test.go
│ └── reporter/ # 报告生成
│ ├── json.go csv.go plot.go markdown.go
├── monitor/ # 监控栈
│ ├── sample.sh # 后台采样(写到 metrics-feed.jsonl)
│ ├── docker-compose.monitor.yml
│ ├── prometheus.yml
│ └── grafana-dashboards/ # 4 个预置面板
├── recover/ # 一键灭火 + 备份还原
│ ├── emergency-stop.sh
│ └── restore-from-backup.sh
├── scripts/ # 部署到 prod
│ └── mint_reset.sh
└── reports/ # 跑测产出(gitignore)
``` ```
## 编译 ### 核心 CLI: `bin/seed` + `bin/loadgen`
| 命令 | 作用 |
|------|------|
| `./bin/seed` | 灌测试数据 → `users.csv` + 数据库 |
| `./bin/seed --cleanup` | 清理测试数据 (保留 1000 用户) |
| `./bin/seed --cleanup --full` | 全部删掉 (账号本身) |
| `./bin/seed --reset-tokens` | 只重签 JWT (跨周压测用) |
| `./bin/loadgen --cmd=preflight` | 7 项开压前检查 |
| `./bin/loadgen --cmd=run --scenarios=S1` | 跑场景 |
| `./bin/loadgen --cmd=report` | 生成 markdown 报告 + PNG 图表 |
### 7 个场景
| ID | 场景 | 默认 RPS | 写/读 | 关键 API |
|----|------|---------|------|---------|
| S1 | Login | 15 | 写(轻) | `POST /api/v1/auth/login` |
| S2 | Read | 250 | 读 | `GET /api/v1/assets/{id}` |
| S3 | Like | 50 | 写(轻) | `POST/DELETE /api/v1/social/assets/{id}/like` |
| S4 | Mint | 1-5 | **写(重)** | `POST /api/v1/assets/mints/precreate` |
| S5 | Dashboard | — | 读聚合 | (dashboard 聚合) |
| S6 | Ranking | 300 | 读 | `GET /api/v1/rankings/hot` |
| S7 | Place | 1-5 | **写(重)** | (摆展事务) |
---
## 🚀 5 分钟入门 (本地 docker)
```bash
# 1. 编译 (Linux prod 部署用,本地 darwin 直接 go build)
cd backend
make loadgen-build
# 2. 准备数据 (需要本地 docker postgres)
cd scripts/loadgen/seed
# 生成 bcrypt 哈希 (与 tokens.go 硬编码的 "Test@123" 匹配)
python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \
> loadtest_bcrypt.txt
# 跑 seed (用本地 docker 的 env)
DB_PASSWORD=123456 \
JWT_SECRET=topfans-secret-key-local-dev-only \
../../../bin/seed \
--db-name=top-fans --db-host=localhost --db-port=15432 --db-user=postgres
# 3. 复制 users.csv 到 backend 目录
cp users.csv ../../../users.csv
# 4. 开压前检查
cd ../../../ # = backend
JWT_SECRET=topfans-secret-key-local-dev-only \
./bin/loadgen --cmd=preflight --target=http://localhost:8080
# 5. 烟雾测试 (30 秒,1 RPS)
JWT_SECRET=topfans-secret-key-local-dev-only \
./bin/loadgen --cmd=run --scenarios=S1 --stage=baseline --rps=1 --duration=30s \
--target=http://localhost:8080 --monitor=off
# 6. 生成报告
JWT_SECRET=topfans-secret-key-local-dev-only \
./bin/loadgen --cmd=report --input=./reports --output=./reports/final-report.md
open reports/final-report.md # macOS
```
---
## 🔨 编译
```bash ```bash
cd backend cd backend
make loadgen-build # 编译 seed + loadgen 到 bin/
make loadgen-test # 单元测试 (23 个)
make loadgen-vet # go vet
make loadgen-ci # vet + test + build (CI 单步)
```
手动编译 (Linux prod):
```bash
GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o bin/seed ./scripts/loadgen/seed/ GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o bin/seed ./scripts/loadgen/seed/
GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o bin/loadgen ./scripts/loadgen/loadgen/ GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o bin/loadgen ./scripts/loadgen/loadgen/
``` ```
## 测试 ---
```bash ## 🛡️ 安全设计
cd backend
go test ./scripts/loadgen/...
```
**当前测试状态** (截至 Phase 7 完结): ### 数据隔离
- `seed` 包: 5/5 PASS 所有测试数据用 `star_id = 999900` 物理隔离,**不影响**真实业务 star_id (87, 88, 91, 93, 94, 95)。
- `loadgen/lib` 包: 16/16 PASS
- `loadgen/scenarios` 包: 2/2 PASS
- 共 23 个测试全过
## 关键特性 ### CLAUDE.md 序列重置
seed 工具末尾自动同步所有相关表的 PG 序列(避免后续 GORM 插入报 duplicate key)。
### 1. 6 维红线判停(自动熔断) ### 凌晨窗口
执行窗口:**02:00 - 06:00** 业务低峰。
紧急灭火: `recover/emergency-stop.sh` 一键停 + `restore-from-backup.sh` 5-8min 还原。
### 6 维红线熔断 (自动停)
| # | 红线 | 阈值 | 数据源 | | # | 红线 | 阈值 | 数据源 |
|---|------|------|--------| |---|------|------|--------|
@ -74,20 +134,108 @@ go test ./scripts/loadgen/...
| R5 | 磁盘空闲 | < 5GB 持续 30s | metrics-feed | | R5 | 磁盘空闲 | < 5GB 持续 30s | metrics-feed |
| R6 | OOM 事件 | 瞬时触发 | metrics-feed | | R6 | OOM 事件 | 瞬时触发 | metrics-feed |
### 2. CLAUDE.md 序列重置 ---
seed 工具自动同步所有相关表的 PG 序列(避免后续 GORM 插入报 duplicate key)。 ## 📊 报告产出
### 3. 数据隔离 跑完 + `--cmd=report` 后,`reports/` 下:
所有测试数据用 `star_id = 999900` 物理隔离,不影响真实业务 star_id (87, 88, 91, 93, 94, 95)。 ```
reports/
├── S1.json # 原始数据 (含 stages)
├── S2.json
├── S4.json
├── baseline.csv # Excel 友好的汇总
├── s1.png # RPS / P99 / Error 曲线
├── s2.png
├── s4.png
└── final-report.md # ← 主要看这个
```
### 4. 凌晨窗口 `final-report.md` 包含:
1. **总览表** (所有场景一行一个,7 列)
2. **每个场景的 ⚠️ 拐点 RPS** (自动算:第一个 p99 涨 >50% 的 stage)
3. **阶梯结果表** (每 stage 的 RPS / p50 / p95 / p99 / err / 5xx)
4. **PNG 曲线图** (RPS / P99 / Error 三条线)
执行窗口:凌晨 02:00-06:00 业务低峰。emergency-stop 一键回滚,restore-from-backup.sh 5-8min 还原。 详细读法见 [REPORT_GUIDE.md](REPORT_GUIDE.md)
## 详细文档 ---
## 🧪 测试状态
```
seed: 5/5 PASS
loadgen/lib: 16/16 PASS
scenarios: 2/2 PASS
TOTAL: 23/23 PASS
```
---
## 📁 完整目录
```
backend/scripts/loadgen/
├── README.md # ← 你在这里
├── RUNBOOK.md # ← 凌晨压测操作手册
├── REPORT_GUIDE.md # ← 报告怎么读
├── seed/ # 数据准备工具
│ ├── main.go # CLI 入口
│ ├── stars.go users.go profiles.go assets.go
│ ├── slots_and_exhibits.go friendships.go
│ ├── tokens.go sequences.go cleanup.go
│ ├── seed_test.go # 单元测试
│ ├── loadtest_bcrypt.txt # Test@123 哈希 (与 tokens.go 匹配)
│ └── README.md
├── loadgen/ # 压测主程序
│ ├── main.go # CLI 入口
│ ├── preflight.go verify.go # 7 项开压前检查 + 压后验证
│ ├── lib/ # 核心库
│ │ ├── csv.go # users.csv 解析
│ │ ├── client.go # HTTP client
│ │ ├── hdr.go # 延迟直方图 + per-stage 计数
│ │ ├── log.go ramp.go # 日志 + 阶梯调度
│ │ ├── circuit.go # 6 维熔断
│ │ ├── ssh_metrics.go # prod server metrics 抓取
│ │ ├── config.go
│ │ └── *_test.go # 16 个测试
│ ├── scenarios/ # 7 个场景
│ │ ├── s1_login.go
│ │ ├── s2_read.go
│ │ ├── s3_like.go
│ │ ├── s4_mint.go # 支持多 stage
│ │ ├── s5_dashboard.go
│ │ ├── s6_ranking.go
│ │ ├── s7_place.go
│ │ ├── common.go # doRequest + DefaultBaseURL
│ │ ├── scenarios.go # 注册表
│ │ ├── helpers.go
│ │ └── scenarios_test.go
│ └── reporter/ # 报告生成
│ ├── json.go # RunReport + StageReport
│ ├── csv.go # baseline.csv
│ ├── plot.go # PNG 曲线 (gonum)
│ ├── markdown.go # final-report.md
│ └── knee.go # KneeRPS 自动算
├── monitor/ # 监控栈 (可选)
│ ├── sample.sh # 后台采样到 metrics-feed.jsonl
│ ├── docker-compose.monitor.yml
│ ├── prometheus.yml
│ └── grafana-dashboards/ # 4 个预置面板
├── recover/ # 紧急灭火
│ ├── emergency-stop.sh
│ └── restore-from-backup.sh
├── scripts/ # prod 辅助
│ ├── mint_reset.sh # S4 之间的 mint 数据清理
│ └── prod_seed.sh # 一键跑 seed (读 prod env)
└── reports/ # 跑测产出 (gitignore)
```
---
## 详细设计
- **设计文档**: `docs/superpowers/specs/2026-06-12-load-testing-design.md` - **设计文档**: `docs/superpowers/specs/2026-06-12-load-testing-design.md`
- **实施计划**: `docs/superpowers/plans/2026-06-12-load-testing.md` - **实施计划**: `docs/superpowers/plans/2026-06-12-load-testing.md`
- **seed 工具说明**: `seed/README.md` - **seed 工具说明**: [seed/README.md](seed/README.md)

View File

@ -0,0 +1,266 @@
# REPORT_GUIDE — 压测报告怎么读
> **目标读者**:看完压测报告后,需要判断"系统能扛住吗"+"哪里是瓶颈"+"下一步改什么"的工程师
> **报告路径**:`reports/final-report.md` (主) + `reports/{scenario}.json` (原始) + `reports/{scenario}.png` (图)
---
## 1. 报告目录结构
```
reports/
├── S1.json # 场景 1 原始数据 (程序读)
├── S2.json # 场景 2
├── S4.json # 场景 4
├── baseline.csv # Excel 可打开的汇总表
├── s1.png # 场景 1 曲线图 (RPS / P99 / Error)
├── s2.png
├── s4.png
└── final-report.md # ← 你要看的总报告
```
---
## 2. 三步读完报告
### 第 1 步:看汇总表 (1 分钟)
```markdown
| Scenario | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | Stages |
|----------|-------|-----|-----|-------|-------|-------|-------|--------|
| S1 | 12500 | 0 | 0 | 86.59 | 119.23 | 200.50 | 450 | 5 |
| S2 | 25000 | 5 | 0 | 12.30 | 35.00 | 88.00 | 250 | 5 |
| S4 | 600 | 12 | 2 | 200.00 | 500.00 | 850.00 | 1200 | 4 |
```
**每个字段的含义**:
| 字段 | 含义 | 健康参考 (4G/2C prod) |
|------|------|----------------------|
| `Scenario` | 场景 ID (S1=登录, S2=读, S3=点赞, S4=铸造, ...) | — |
| `Total` | 该场景总请求数 | 越大越好,代表你扛住了 |
| `Err` | 客户端+服务端错误总和 | **< 1%** |
| `5xx` | 服务端错误 (500-599) | **< 0.1%** (1‰) |
| `P50ms` | 50% 请求在这个时间内 | < 100ms |
| `P95ms` | 95% 请求在这个时间内 | < 300ms |
| `P99ms` | 99% 请求在这个时间内 | < 1000ms (S4 写重可放宽到 2000ms) |
| `Maxms` | 最慢的一次请求 | 一般 3-5x P99 |
| `Stages` | 阶梯测试的阶段数 | = step-schedule 的元素数 |
**判断模板**:
- ✅ 全绿 → 系统扛得住,准备上线
- ⚠️ 某个 S* Err > 1% → 优先看那个场景
- 🚨 某个 S* 5xx > 0.1% → 服务端有问题,看 §3 定位
---
### 第 2 步:看拐点 (KneeRPS) (2 分钟)
每个 scenario 标题下会出现一行:
```markdown
**⚠️ 拐点**: stage 3 @ 3 RPS (p99 暴涨 514%)
```
**含义**: 当 RPS 升到 3 时,p99 延迟比 stage 2 暴涨 514% (5.14 倍)。
**判定逻辑** (在 `reporter/knee.go`):
- 逐 stage 比 p99
- 第一次涨幅 > 50% 时,标记为拐点
- 全程没涨 > 50% → 显示 "✅ 拐点未触发"
**怎么用这个数字**:
- **S1 拐点 RPS = 15** → 你的登录服务,超过 15 QPS 就开始劣化。生产预估峰值 10 QPS,留 50% buffer
- **S4 拐点 RPS = 2** → 铸造接口很重,2 QPS 就劣化了。要么优化,要么限流
**举例**:
| 拐点 RPS | 业务含义 | 行动项 |
|---------|---------|--------|
| ≥ 期望峰值的 2x | ✅ 健康 | 上线,加监控 |
| ≈ 期望峰值 | ⚠️ 临界 | 加缓存 / 异步化 / 限流 |
| < 期望峰值 | 🚨 不达标 | 重构 + 复测 |
---
### 第 3 步:看阶梯表 + 曲线图 (5 分钟)
**阶梯表** (md 里每个场景下):
```markdown
### 阶梯结果
| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms |
|-------|-----------|-------|-----|-----|-------|-------|-------|-------|
| 1 | 2 | 600 | 0 | 0 | 80 | 100 | 110 | 130 |
| 2 | 5 | 1500 | 0 | 0 | 82 | 105 | 115 | 140 |
| 3 | 10 | 3000 | 0 | 0 | 85 | 110 | 130 | 180 |
| 4 | 15 | 4500 | 0 | 0 | 95 | 130 | 200 | 350 |
| 5 | 20 | 6000 | 5 | 0 | 120 | 200 | 450 | 800 |
```
**怎么读**:
- **Total** 应该是 `TargetRPS × Duration` (近似,因为有误差)
- **P99ms** 应该随 TargetRPS 上升**平滑增加** (10-30% 涨幅/stage 是正常)
- **Err / 5xx** 应该全程 < 1%
- **如果某 stage 突然 P99 翻倍** → 拐点,看上面 KneeRPS
**曲线图** (`s1.png` 等):
- **X 轴**: Stage 编号 (1, 2, 3, ...)
- **Y 轴**: 三个值 — RPS (蓝)、P99ms (绿)、Error% (红)
- **怎么看**:
- 三条线**平稳上升** = 正常
- **P99 突然陡升** = 拐点
- **Error% 突然跳起来** = 服务挂了
---
## 3. 定位瓶颈 — 常见模式
### 模式 1: P99 阶梯上升,但 Error 一直 0
**含义**: 系统扛得住,但在变慢。
**原因**: GC 抖动 / DB 慢查询 / 锁竞争。
**行动**:
1. 看 PG 慢查询日志: `pg_stat_statements` ORDER BY `mean_exec_time` DESC
2. 看应用层 profile: `pprof` heap + cpu
3. 检查连接池配置: 可能太小
### 模式 2: P99 阶梯上升 + Error 也开始涨
**含义**: 系统到极限。
**原因**: 资源耗尽 (CPU 100%, 连接池满, DB 锁)。
**行动**:
1. 看 server metrics feed: `tail -f metrics-feed.jsonl`
2. `top` 看 CPU/内存,`iostat` 看 IO
3. 检查是否有连接泄漏 (`netstat -an | grep CLOSE_WAIT`;如果是大量 `TIME_WAIT`,说明短连接过多而不是泄漏)
### 模式 3: 阶梯早期就 5xx > 5%
**含义**: 系统本身有问题,不是负载问题。
**原因**: 代码 bug / 配置错误 / 依赖缺失。
**行动**:
1. 看 5xx 的具体响应体 (在 log 里)
2. 检查 error 码,对照业务错误码定义
3. 看是不是 auth/JWT 过期
### 模式 4: 第一个 stage P99 很高,后续反而低
**含义**: 热身不够 / 缓存没预热。
**原因**: Redis 冷启动 / JIT 编译 / DB 连接池启动慢。
**行动**:
1. 第一次 stage 加长 (例如先 2min 预热)
2. 或者用 `--rps=1` 先跑 1-2min 预热,再开阶梯
### 模式 5: S4 (Mint) 在很低的 RPS 就拐
**含义**: 写路径太重。
**原因**: 铸造涉及事务 / 签名 / OSS 上传,本身就是慢操作。
**行动**:
1. 检查 mint 是不是同步阻塞 (能不能异步化?)
2. 看 mint 数据是否需要落库 (能否用 append-only?)
3. 考虑限流: 服务端拒绝 > 2 QPS 的 mint 请求
---
## 4. 怎么写出行动项
读完报告,应该能回答三个问题:
### Q1: 系统能扛住业务预期峰值吗?
- 业务预期峰值 → 比对拐点 RPS
- 拐点 ≥ 2x 峰值 → ✅ 可以上线
- 拐点 ≈ 1x 峰值 → ⚠️ 加监控告警,谨慎上线
- 拐点 < 峰值 → 🚨 必须先优化
### Q2: 拐点在哪里?为什么?
看哪个 stage 拐的,然后:
- **CPU 100%** → 计算密集,优化算法或加机器
- **DB CPU 100%** → 慢查询,加索引或读写分离
- **PG 连接数满** → 连接池配置 / 服务降级
- **PG 锁等待** → 事务设计问题
- **磁盘 IO 满** → 加 SSD 或缓存
### Q3: 下一步改什么?
行动项模板:
```markdown
## [Loadtest 2026-06-15] 行动项
### P0 (上线前必修)
- [ ] **S2 Read 拐点 100 RPS < 业务预期 150 RPS**
- 根因: PG `assets` 表全表扫描,10 万行
- 修复: 加 `idx_assets_star_id_status` 索引
- Owner: @dba
### P1 (1 周内修)
- [ ] **S4 Mint 拐点 2 RPS**
- 根因: 同步写 OSS + 同步落库
- 修复: mint 流程拆成 precreate + 后台 worker
- Owner: @backend
### P2 (技术债)
- [ ] 压测期间 CPU 持续 80%,考虑扩容到 4C
```
---
## 5. JSON 原始数据怎么读 (高级)
`reports/S1.json` 长这样:
```json
{
"scenario": "S1",
"total_requests": 12500,
"errors": 5,
"five_xx": 0,
"p50_us": 86591,
"p95_us": 119231,
"p99_us": 200502,
"max_us": 450000,
"stages": [
{
"stage_idx": 1,
"target_rps": 2,
"total_requests": 600,
"errors": 0,
"five_xx": 0,
"p50_us": 80000,
"p95_us": 100000,
"p99_us": 110000,
"max_us": 130000
},
...
]
}
```
**单位说明**:
- 所有 `_us` 后缀 = microseconds (微秒,1ms = 1000us)
- 例: `p99_us: 200502` = 200.5 ms
**怎么用**:
- 画自己的图 (用 Excel/Google Sheets 打开 baseline.csv 最方便)
- 跟历史报告对比 (跨版本性能回归)
- CI 集成: 解析 JSON,断言 P99 < 某个阈值
---
## 6. 常见问题
### Q: "5xx=0 但 Err=5" 是什么意思?
A: 5xx 是服务端错,Err 是总错 (含 4xx)。Err > 5xx 表示有客户端错 (一般是 401/403/404)。看 log 里具体错误码。
### Q: 为什么 P50 很低但 P99 很高?
A: 正常 — 长尾效应。99% 都快但 1% 慢。如果 P99 太高说明有少数请求卡住,看是不是 GC / 锁 / IO 抖动。
### Q: Max 比 P99 高很多,是不是异常?
A: 可能是单个网络抖动,正常。Max / P99 < 5x 都算健康。
### Q: 同一个场景不同次跑,数据差很多?
A: 检查 prod 是否有其他流量在跑 (业务)。压测应在凌晨,业务低峰。
---
## 7. 进一步
- 想优化场景,见 `seed/README.md`
- 想加新场景,在 `scenarios/` 新建 `s8_xxx.go`,模仿 s1_login.go 的 BeginStage/EndStage 模式
- 想加新的红线指标,见 `lib/circuit.go`

View File

@ -0,0 +1,366 @@
# RUNBOOK — 凌晨压测执行手册
> **目标读者**:负责 prod 凌晨压测的 on-call 工程师
> **执行窗口**:02:00 - 06:00 (业务低峰)
> **预计总耗时**:1.5 - 4 小时 (按场景数)
> **风险等级**:🟡 中 (会写 23k+ 测试数据,但物理隔离 star_id=999900)
---
## 0. 前置检查 (T-1 天)
### 0.1 确认 prod 状态
```bash
# SSH 到 prod
ssh root@101.132.250.62
# 确认 prod 网关正常
curl -sS http://localhost:8080/health
# 期望: {"service":"top-fans-gateway","status":"ok"}
# 确认磁盘空间 > 10GB (R5 红线需要)
df -h /opt
# 期望: Avail > 10G
```
### 0.2 确认阿里云快照 < 24h
- 登录 ECS 控制台 → 实例 → 磁盘与镜像 → 快照
- 必须有 < 24h 的快照,**否则不要开压**
- 没有的话先手动触发:实例 → 更多 → 磁盘和镜像 → 创建快照
### 0.3 备份数据库
```bash
ssh root@101.132.250.62
mkdir -p /opt/topfans/backups
pg_dump -h localhost -U postgres topfans > /opt/topfans/backups/pre-loadtest-$(date +%Y%m%d-%H%M).sql
ls -lh /opt/topfans/backups/pre-loadtest-*.sql
# 期望: 文件 > 50MB
```
---
## 1. 上传/确认工具 (T-30min)
### 1.1 确认工具已上传到 prod
```bash
ssh root@101.132.250.62
ls -la /opt/topfans/loadtest/
# 必须看到:
# seed (二进制)
# loadgen (二进制)
# loadtest_bcrypt.txt
# scripts/prod_seed.sh
# README.md
# reports/ (空目录)
```
如果文件缺失,本地重新上传:
```bash
# 本地 (从 backend 目录)
cd "$(git rev-parse --show-toplevel)/backend"   # 在你本地仓库内任意目录执行即可
# 重新编译
make loadgen-build
# 上传
scp bin/seed bin/loadgen root@101.132.250.62:/opt/topfans/loadtest/
scp scripts/loadgen/seed/loadtest_bcrypt.txt root@101.132.250.62:/opt/topfans/loadtest/
scp scripts/loadgen/scripts/prod_seed.sh root@101.132.250.62:/opt/topfans/loadtest/scripts/
ssh root@101.132.250.62 "chmod +x /opt/topfans/loadtest/{seed,loadgen} /opt/topfans/loadtest/scripts/prod_seed.sh"
```
### 1.2 重新生成 bcrypt 哈希 (如果你改了密码策略)
```bash
# 本地
cd backend/scripts/loadgen/seed
# 生成与 tokens.go 硬编码密码 (默认 "Test@123") 匹配的哈希
python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \
> loadtest_bcrypt.txt
# 上传覆盖
scp loadtest_bcrypt.txt root@101.132.250.62:/opt/topfans/loadtest/
```
---
## 2. 数据准备 (T0 = 02:00)
### 2.1 SSH 到 prod
```bash
ssh root@101.132.250.62
```
### 2.2 一键跑 seed (生产数据灌入)
```bash
cd /opt/topfans/loadtest
bash scripts/prod_seed.sh
```
**这一步骤会做什么**:
- 读 `/opt/topfans/docker/.env.prod` 拿 DB_PASSWORD + JWT_SECRET
- 插入 star_id=999900 测试明星 (1 行)
- 插入 1000 个测试用户 (mobile 19900000001 - 19900001000)
- 插入 1000 个 fan_profile + crystal
- 插入 5000 个 assets
- 插入 3000 个 booth_slots + 2000 个 exhibitions
- 插入 10000 个 friendships
- **重置所有相关表的 PG 序列** (CLAUDE.md 规范,避免后续 GORM 插入报 duplicate key)
- 签 1000 个 JWT,写到 `users.csv`
**预计耗时**:30 - 60 秒
**预期输出**:
```
✓ stars seeded
✓ 1000 users seeded
✓ 1000 fan_profiles + crystal seeded
✓ 5000 assets seeded
✓ 3000 booth_slots + 2000 exhibitions seeded
✓ 10000 friendships seeded
✓ sequences reset
✅ users.csv written: 1000 rows
✅ seed + tokens completed
```
---
## 3. 开压前 7 项检查 (T0+1min)
```bash
cd /opt/topfans/loadtest
./loadgen --cmd=preflight --target=http://localhost:8080
```
**预期全部 PASS**:
```
✓ ① Gateway /health HTTP 200
✓ ② SSH to prod (省略,如不需要 server metrics)
✓ ③ pg_dump backup > 50MB (你的备份)
✓ ④ 阿里云快照 < 24h (人工确认)
✓ ⑤ prod 磁盘空闲 > 10GB free > 10G
✓ ⑥ users.csv 1000 rows rows=1000
✓ ⑦ JWT_SECRET set set
ALL CHECKS PASSED — 可以开压
```
**如果有 FAIL**:见 "附录 A: 故障排查"
---
## 4. 烟雾测试 (T0+2min) — 强烈推荐
> 这一步只花 30 秒,但能提前发现 90% 的集成问题,省后面 1 小时排错
```bash
cd /opt/topfans/loadtest
JWT_SECRET=$(grep '^JWT_SECRET=' /opt/topfans/docker/.env.prod | cut -d= -f2-) \
./loadgen --cmd=run --scenarios=S1 --stage=baseline --rps=1 --duration=30s \
--target=http://localhost:8080 --monitor=off 2>&1 | tee reports/smoke-s1.log
```
**预期**:
```
📊 S1: total=30 err=0 5xx=0 p99=200ms stages=1
✅ loadgen done. total=30 err=0 fiveXX=0
```
**判定**:
- ✅ total=30, err=0 → 进入正式压测
- ❌ total < 30 → 跑挂了,看 `reports/smoke-s1.log`
- ❌ err > 0 → auth/JWT 问题,检查 `users.csv` 和 JWT_SECRET
---
## 5. 正式压测 (T0+3min)
### 5.1 选择策略
**Plan B 推荐** (S1 + S2 + S4,~1.5 小时):
```bash
cd /opt/topfans/loadtest
export JWT_SECRET=$(grep '^JWT_SECRET=' /opt/topfans/docker/.env.prod | cut -d= -f2-)
export PROD_SSH=root@101.132.250.62
# === 场景 1: Login (02:05-02:30, 25min) ===
./loadgen --cmd=run --scenarios=S1 \
--stage=step --step-schedule='2,5,10,15,20' \
--duration=5m --target=http://localhost:8080 \
--monitor=full --prod-ssh=$PROD_SSH \
--inter-scenario-pause=0s 2>&1 | tee reports/s1.log
# 预期: 5 个 stage,每 stage 5min,p99 应随 RPS 阶梯上升
# === 场景 2: Read (02:35-03:00, 25min) ===
./loadgen --cmd=run --scenarios=S2 \
--stage=step --step-schedule='10,30,60,100,150' \
--duration=5m --target=http://localhost:8080 \
--monitor=full --prod-ssh=$PROD_SSH \
--inter-scenario-pause=0s 2>&1 | tee reports/s2.log
# === 场景 4: Mint (03:05-03:30, 25min, 写重,保守) ===
./loadgen --cmd=run --scenarios=S4 \
--stage=step --step-schedule='1,2,3,5' \
--duration=5m --target=http://localhost:8080 \
--monitor=full --prod-ssh=$PROD_SSH \
--inter-scenario-pause=0s 2>&1 | tee reports/s4.log
```
**Plan A 全量** (S1-S7,~3.5 小时):
```bash
# S1-S7 全部跑,S4/S7 写重场景保守
SCENARIOS="S1,S2,S3,S4,S5,S6,S7"
SCHEDULES_BY_SCENARIO='{"S1":"2,5,10,15,20","S2":"10,30,60,100,150","S3":"5,15,30,50","S4":"1,2,3,5","S5":"5,10,20,40","S6":"20,50,100,150","S7":"1,2,3,5"}'
# (目前 loadgen 一次只支持一个 schedule,需要跑 7 次)
```
### 5.2 每个场景跑完后做什么
1. 检查 `reports/{scenario}.log` 末尾的 `📊`
2. 记录 total / err / 5xx / p99 / stages
3. 如果 `🚨 circuit breaker tripped` 触发,**立即停**,见附录 B
---
## 6. 生成报告 (T+1min)
```bash
cd /opt/topfans/loadtest
./loadgen --cmd=report --input=./reports --output=./reports/final-report.md
```
**产出**:
```
reports/
├── S1.json
├── S2.json
├── S4.json
├── baseline.csv # Excel 可直接打开
├── s1.png # RPS/P99/Error 曲线图
├── s2.png
├── s4.png
└── final-report.md # 人看的报告
```
---
## 7. 收尾 (T+2min)
### 7.1 拉报告到本地
```bash
# 本地
mkdir -p ~/Desktop/loadtest-report-$(date +%Y%m%d)
scp -r root@101.132.250.62:/opt/topfans/loadtest/reports/* ~/Desktop/loadtest-report-$(date +%Y%m%d)/
```
### 7.2 决定是否清理测试数据
| 情况 | 动作 |
|------|------|
| 数据分析完,后续不需要 | `./seed --cleanup --full` |
| 数据还要保留做下一轮 | `./seed --cleanup` (保留 1000 用户,清理关联数据) |
| 只是 JWT 过期 | `./seed --reset-tokens --jwt-secret=$JWT_SECRET` |
| **生产事故** | `./seed --cleanup --full` + 立即回滚,见附录 C |
### 7.3 (可选) 关闭监控后台采样
```bash
# 如果你启动了 monitor/sample.sh,杀掉
ssh root@101.132.250.62 "pkill -f 'monitor/sample.sh'"
```
---
## 8. 报告分析 (T+30min,白天)
见 [REPORT_GUIDE.md](REPORT_GUIDE.md) — 教你怎么读 `final-report.md`、定位瓶颈、写行动项。
---
## 附录 A: 故障排查
### A.1 preflight FAIL: users.csv 不存在
**原因**: 上次 seed 没跑成功
**修复**: `cd /opt/topfans/loadtest && bash scripts/prod_seed.sh`
### A.2 preflight FAIL: 阿里云快照 < 24h
**原因**: 没备份
**修复**: 在 ECS 控制台手动建快照,等就绪后重跑 preflight
### A.3 烟雾测试 FAIL: 大量 4xx
**原因**: JWT_SECRET 不匹配 / users.csv 过期
**修复**:
```bash
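# 1. 确认 JWT_SECRET,并导出到当前 shell (下面的 $JWT_SECRET 和 loadgen 都依赖它)
grep '^JWT_SECRET=' /opt/topfans/docker/.env.prod
export JWT_SECRET=$(grep '^JWT_SECRET=' /opt/topfans/docker/.env.prod | cut -d= -f2-)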
# 2. 重签 token (数据保留)
./seed --reset-tokens --jwt-secret=$JWT_SECRET
# 3. 重跑
./loadgen --cmd=run --scenarios=S1 --stage=baseline --rps=1 --duration=30s \
--target=http://localhost:8080 --monitor=off
```
### A.4 烟雾测试 FAIL: 大量 5xx
**原因**: 网关/服务挂了
**修复**: 先看 `docker ps` 确认服务在,`curl /health` 确认网关活
---
## 附录 B: Circuit Breaker 触发 (🚨)
如果出现 `🚨 circuit breaker tripped!`,**立即**:
1. **Ctrl+C** 停止当前 loadgen (会 graceful shutdown,等待当前请求完成)
2. 立即判断:
- 5xx > 10% 持续 10s → 服务有问题,见附录 C
- 仅客户端错率高 → 测试问题,可能是 step 跳太猛
3. **降低 RPS 重跑** 或 **改天再试**
---
## 附录 C: 紧急灭火 (production 被打挂了)
**判定**: 服务真实报错(不是测试客户端问题),prod 用户受影响。
**立即执行** (按顺序,每步 30s 内):
```bash
ssh root@101.132.250.62
# 1. 停 loadgen + 监控
pkill -f 'loadgen --cmd=run'   # prod 上是在 /opt/topfans/loadtest 下以 ./loadgen 启动的,'bin/loadgen' 匹配不到
pkill -f 'monitor/sample.sh'
# 2. 清测试数据 (1 秒)
cd /opt/topfans/loadtest
./seed --cleanup --full
# 3. 重启服务 (让 prod 回到 baseline)
cd /opt/topfans/docker
docker-compose -f docker-compose.prod.yml --profile prod restart
# 4. (最严重情况) 从备份还原
bash /opt/topfans/loadtest/recover/restore-from-backup.sh
# 输入 backup 文件路径,预计 5-8 分钟
```
**事后**:
- 写事故复盘
- 修压测发现的 bug
- 调整 step schedule (下一次更保守)
---
## 附录 D: 常用 cheat sheet
```bash
# 查看 loadtest 进程
ssh root@101.132.250.62 "ps aux | grep -E '(loadgen|sample)' | grep -v grep"
# 看实时日志
ssh root@101.132.250.62 "tail -f /opt/topfans/loadtest/reports/*.log"
# 看 metrics feed
ssh root@101.132.250.62 "tail -f /opt/topfans/loadtest/metrics-feed.jsonl"
# 测一下网关还活着
ssh root@101.132.250.62 "curl -sS http://localhost:8080/health"
```

View File

@ -2,13 +2,36 @@ package lib
import ( import (
"sync" "sync"
"sync/atomic"
"github.com/HdrHistogram/hdrhistogram-go" "github.com/HdrHistogram/hdrhistogram-go"
) )
// LatencyRecorder tracks latency histogram + per-stage counters.
//
// Concurrency model: a single LatencyRecorder is shared across all scenarios.
// Per-scenario isolation: callers MUST call Reset() at scenario boundaries.
// Per-stage isolation: callers MUST call BeginStage() at stage boundaries
// (which clears the histogram and zeroes the per-stage counters).
type LatencyRecorder struct { type LatencyRecorder struct {
mu sync.Mutex mu sync.Mutex
h *hdrhistogram.Histogram h *hdrhistogram.Histogram
stageTotal atomic.Int64
stageErrors atomic.Int64
stageFiveXX atomic.Int64
stages []StageSnapshot
}
// StageSnapshot is the per-stage data captured by EndStage.
type StageSnapshot struct {
StageIdx int
TargetRPS int
Histogram *hdrhistogram.Histogram
TotalRequests int64
Errors int64
FiveXX int64
} }
func NewLatencyRecorder() *LatencyRecorder { func NewLatencyRecorder() *LatencyRecorder {
@ -17,6 +40,7 @@ func NewLatencyRecorder() *LatencyRecorder {
} }
} }
// Record stores a latency sample (in microseconds).
func (r *LatencyRecorder) Record(latencyUs int64) { func (r *LatencyRecorder) Record(latencyUs int64) {
r.mu.Lock() r.mu.Lock()
defer r.mu.Unlock() defer r.mu.Unlock()
@ -26,8 +50,79 @@ func (r *LatencyRecorder) Record(latencyUs int64) {
_ = r.h.RecordValue(latencyUs) _ = r.h.RecordValue(latencyUs)
} }
// RecordResult bumps the per-stage request counters for one completed request.
// The caller classifies the outcome and passes the result in as flags; this
// method does not inspect a status code itself.
//
//   - isError: the request counts as an error (per the original note: status >= 400
//     or a transport error)
//   - is5xx: the request counts as a server error (per the original note: status >= 500)
//
// The total counter is incremented on every call, regardless of the flags.
// Only atomic counters are touched; r.mu is not taken here.
func (r *LatencyRecorder) RecordResult(isError, is5xx bool) {
if isError {
r.stageErrors.Add(1)
}
if is5xx {
r.stageFiveXX.Add(1)
}
r.stageTotal.Add(1)
}
// Snapshot returns a copy of the current histogram (for use by circuit-breaker).
// Does NOT affect per-stage counters.
func (r *LatencyRecorder) Snapshot() *hdrhistogram.Histogram { func (r *LatencyRecorder) Snapshot() *hdrhistogram.Histogram {
r.mu.Lock() r.mu.Lock()
defer r.mu.Unlock() defer r.mu.Unlock()
return hdrhistogram.Import(r.h.Export()) return hdrhistogram.Import(r.h.Export())
} }
// Reset puts the recorder back into a pristine state: a fresh histogram,
// zeroed per-stage counters, and no accumulated stage snapshots.
// Intended to be called between scenarios.
func (r *LatencyRecorder) Reset() {
	r.mu.Lock()
	defer r.mu.Unlock()

	for _, counter := range []*atomic.Int64{&r.stageTotal, &r.stageErrors, &r.stageFiveXX} {
		counter.Store(0)
	}
	r.stages = nil
	r.h = hdrhistogram.New(1, 30_000_000, 3)
}
// ClearStages discards the list of accumulated stage snapshots only.
// The live histogram and the per-stage counters are left untouched.
func (r *LatencyRecorder) ClearStages() {
	r.mu.Lock()
	r.stages = nil
	r.mu.Unlock()
}
// BeginStage opens a new stage with the given index and target RPS.
// It swaps in an empty histogram, zeroes the per-stage counters, and appends
// a placeholder StageSnapshot that EndStage fills in later.
func (r *LatencyRecorder) BeginStage(idx, targetRPS int) {
	entry := StageSnapshot{StageIdx: idx, TargetRPS: targetRPS}
	fresh := hdrhistogram.New(1, 30_000_000, 3)

	r.mu.Lock()
	defer r.mu.Unlock()

	r.h = fresh
	r.stageTotal.Store(0)
	r.stageErrors.Store(0)
	r.stageFiveXX.Store(0)
	r.stages = append(r.stages, entry)
}
// EndStage copies the current histogram and per-stage counters into the most
// recently opened stage entry. It is a no-op when no stage has been opened
// with BeginStage.
func (r *LatencyRecorder) EndStage() {
	r.mu.Lock()
	defer r.mu.Unlock()

	n := len(r.stages)
	if n < 1 {
		return
	}
	current := &r.stages[n-1]
	// Store an independent copy so later recording does not mutate the snapshot.
	current.Histogram = hdrhistogram.Import(r.h.Export())
	current.TotalRequests, current.Errors, current.FiveXX =
		r.stageTotal.Load(), r.stageErrors.Load(), r.stageFiveXX.Load()
}
// Stages returns a new slice holding the accumulated stage snapshots.
// The slice itself is a copy; the Histogram pointers inside are shared
// with the recorder's entries.
func (r *LatencyRecorder) Stages() []StageSnapshot {
	r.mu.Lock()
	defer r.mu.Unlock()

	snapshot := make([]StageSnapshot, 0, len(r.stages))
	return append(snapshot, r.stages...)
}

View File

@ -66,6 +66,31 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d
// 让 scenarios 用 --target 而不是写死的 prod IP // 让 scenarios 用 --target 而不是写死的 prod IP
scenarios.DefaultBaseURL = target scenarios.DefaultBaseURL = target
// 写 run-metadata.json (供 --cmd=report 使用)
runStart := time.Now()
defer func() {
meta := reporter.RunMetadata{
StartTime: runStart,
EndTime: time.Now(),
Target: target,
Scenarios: strings.Split(scenarioIDs, ","),
StepSchedule: stepSchedule,
StageMode: stage,
RPSOverride: rps,
MonitorMode: monitorMode,
ProdSSH: prodSSH,
}
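// Record the first 8 characters of JWT_SECRET as a hint for the report.
// NOTE(review): this persists part of the signing secret into a 0644 file and the
// generated report — consider storing a hash/fingerprint of the secret instead.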
if jwtSecret := os.Getenv("JWT_SECRET"); len(jwtSecret) >= 8 {
meta.JWTSecretHint = jwtSecret[:8]
}
if err := os.MkdirAll("reports", 0o755); err == nil {
if data, err := json.MarshalIndent(meta, "", " "); err == nil {
_ = os.WriteFile(filepath.Join("reports", "run-metadata.json"), data, 0o644)
}
}
}()
users, err := lib.LoadUsers("users.csv") users, err := lib.LoadUsers("users.csv")
if err != nil { if err != nil {
return fmt.Errorf("load users.csv: %w (先跑 `seed` 生成 users.csv)", err) return fmt.Errorf("load users.csv: %w (先跑 `seed` 生成 users.csv)", err)
@ -126,6 +151,14 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d
continue continue
} }
log.Printf("=== scenario %d/%d: %s ===", idx+1, len(ids), id) log.Printf("=== scenario %d/%d: %s ===", idx+1, len(ids), id)
// 场景开始:快照 delta 基线,清空 stage 累积
recorder.ClearStages()
recorder.Reset()
prevTotal := totalCount.Load()
prevErr := errCount.Load()
prev5xx := fiveXXCount.Load()
s, err := scenarios.Get(id, client, users, &errCount, &totalCount, &fiveXXCount, recorder, breaker, prodSSH) s, err := scenarios.Get(id, client, users, &errCount, &totalCount, &fiveXXCount, recorder, breaker, prodSSH)
if err != nil { if err != nil {
return fmt.Errorf("scenario %s: %w", id, err) return fmt.Errorf("scenario %s: %w", id, err)
@ -133,6 +166,38 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d
if err := s.Run(ctx, rps, duration, dashboard, breaker, stages); err != nil { if err := s.Run(ctx, rps, duration, dashboard, breaker, stages); err != nil {
return fmt.Errorf("run scenario %s: %w", id, err) return fmt.Errorf("run scenario %s: %w", id, err)
} }
// 场景结束:写 per-scenario JSON (含 stages)
scenarioTotal := totalCount.Load() - prevTotal
scenarioErr := errCount.Load() - prevErr
scenario5xx := fiveXXCount.Load() - prev5xx
scenarioStages := recorder.Stages()
stageReports := make([]reporter.StageReport, 0, len(scenarioStages))
for _, ss := range scenarioStages {
stageReports = append(stageReports, reporter.MakeStageReport(
ss.StageIdx, ss.TargetRPS, ss.Histogram,
ss.TotalRequests, ss.Errors, ss.FiveXX,
))
}
rr := reporter.RunReport{
Scenario: id,
TotalRequests: scenarioTotal,
Errors: scenarioErr,
FiveXX: scenario5xx,
P50Us: recorder.Snapshot().ValueAtPercentile(50),
P95Us: recorder.Snapshot().ValueAtPercentile(95),
P99Us: recorder.Snapshot().ValueAtPercentile(99),
MaxUs: recorder.Snapshot().Max(),
Stages: stageReports,
}
scenarioPath := filepath.Join("reports", id+".json")
if err := reporter.WriteJSON(scenarioPath, rr); err != nil {
return fmt.Errorf("write %s: %w", scenarioPath, err)
}
log.Printf("📊 %s: total=%d err=%d 5xx=%d p99=%dms stages=%d",
id, scenarioTotal, scenarioErr, scenario5xx, rr.P99Us/1000, len(stageReports))
if breaker.State() == lib.CircuitTripped { if breaker.State() == lib.CircuitTripped {
log.Printf("⚠️ circuit tripped, stopping") log.Printf("⚠️ circuit tripped, stopping")
break break
@ -143,11 +208,8 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d
} }
} }
// write final report
if err := reporter.WriteJSON("report.json", scenarioIDs, recorder.Snapshot(), totalCount.Load(), errCount.Load(), fiveXXCount.Load()); err != nil {
return fmt.Errorf("write report: %w", err)
}
log.Printf("✅ loadgen done. total=%d err=%d fiveXX=%d", totalCount.Load(), errCount.Load(), fiveXXCount.Load()) log.Printf("✅ loadgen done. total=%d err=%d fiveXX=%d", totalCount.Load(), errCount.Load(), fiveXXCount.Load())
log.Printf("💡 下一步: ./loadgen --cmd=report --input=./reports --output=./reports/final-report.md")
return nil return nil
} }
@ -186,20 +248,33 @@ func runReport(inputDir, output string) error {
return fmt.Errorf("--input required for cmd=report") return fmt.Errorf("--input required for cmd=report")
} }
// 1. 收集 reports/run-*/ 下的 *.json // 1. 递归收集 reports/ 下的所有 *.json (filepath.Glob 不支持 **, 用 WalkDir)
var scenarioReports []reporter.RunReport var scenarioReports []reporter.RunReport
matches, _ := filepath.Glob(filepath.Join(inputDir, "**", "*.json")) err := filepath.WalkDir(inputDir, func(path string, d os.DirEntry, walkErr error) error {
for _, m := range matches { if walkErr != nil {
data, err := os.ReadFile(m) return nil
}
if d.IsDir() || !strings.HasSuffix(path, ".json") {
return nil
}
// 跳过元数据文件 (它是 RunMetadata 不是 RunReport)
if strings.HasSuffix(path, "run-metadata.json") {
return nil
}
data, err := os.ReadFile(path)
if err != nil { if err != nil {
continue return nil
} }
var rr reporter.RunReport var rr reporter.RunReport
if err := json.Unmarshal(data, &rr); err != nil { if err := json.Unmarshal(data, &rr); err != nil {
log.Printf("skip %s: %v", m, err) log.Printf("skip %s: %v", path, err)
continue return nil
} }
scenarioReports = append(scenarioReports, rr) scenarioReports = append(scenarioReports, rr)
return nil
})
if err != nil {
return fmt.Errorf("walk %s: %w", inputDir, err)
} }
if len(scenarioReports) == 0 { if len(scenarioReports) == 0 {
return fmt.Errorf("no JSON reports found in %s", inputDir) return fmt.Errorf("no JSON reports found in %s", inputDir)
@ -213,17 +288,41 @@ func runReport(inputDir, output string) error {
} }
log.Printf("wrote %s", baselinePath) log.Printf("wrote %s", baselinePath)
// 3. 转 ScenarioReport (供 markdown 用) // 3. 生成每个 scenario 的 PNG 图表
scenarioMarkdownReports := make([]reporter.ScenarioReport, 0, len(scenarioReports))
for _, r := range scenarioReports { for _, r := range scenarioReports {
scenarioMarkdownReports = append(scenarioMarkdownReports, reporter.ScenarioReport{ if len(r.Stages) < 1 {
ID: r.Scenario, continue
KneeRPS: 0, // 拐点需要分析 raw data 算,简化版留 0 }
}) plotPath := filepath.Join(inputDir, strings.ToLower(r.Scenario)+".png")
samples := make([]reporter.Sample, 0, len(r.Stages))
for _, st := range r.Stages {
tot := st.TotalRequests
errRate := float64(0)
if tot > 0 {
errRate = float64(st.Errors) / float64(tot)
}
samples = append(samples, reporter.Sample{
RPS: float64(st.TargetRPS),
P99Ms: float64(st.P99Us) / 1000,
ErrorRate: errRate,
})
}
if err := reporter.PlotRPSLatencyError(r.Scenario, samples, plotPath); err != nil {
log.Printf("⚠️ plot %s failed: %v", r.Scenario, err)
continue
}
log.Printf("wrote %s", plotPath)
} }
// 4. markdown // 4. 读 run-metadata.json (可选,runLoadgen 写入)
if err := reporter.GenerateMarkdown(output, scenarioMarkdownReports); err != nil { var meta reporter.RunMetadata
metaPath := filepath.Join(inputDir, "run-metadata.json")
if data, err := os.ReadFile(metaPath); err == nil {
_ = json.Unmarshal(data, &meta)
}
// 5. markdown (引用生成的 PNG)
if err := reporter.GenerateMarkdown(output, meta, scenarioReports, "./"); err != nil {
return fmt.Errorf("write markdown: %w", err) return fmt.Errorf("write markdown: %w", err)
} }
log.Printf("wrote %s", output) log.Printf("wrote %s", output)

View File

@ -7,20 +7,50 @@ import (
"github.com/HdrHistogram/hdrhistogram-go" "github.com/HdrHistogram/hdrhistogram-go"
) )
type RunReport struct { type StageReport struct {
Scenario string `json:"scenario"` StageIdx int `json:"stage_idx"`
TotalRequests int64 `json:"total_requests"` TargetRPS int `json:"target_rps"`
Errors int64 `json:"errors"` TotalRequests int64 `json:"total_requests"`
FiveXX int64 `json:"five_xx"` Errors int64 `json:"errors"`
P50Us int64 `json:"p50_us"` FiveXX int64 `json:"five_xx"`
P95Us int64 `json:"p95_us"` P50Us int64 `json:"p50_us"`
P99Us int64 `json:"p99_us"` P95Us int64 `json:"p95_us"`
MaxUs int64 `json:"max_us"` P99Us int64 `json:"p99_us"`
MaxUs int64 `json:"max_us"`
} }
func WriteJSON(path string, scenario string, h *hdrhistogram.Histogram, total, errs, fiveXX int64) error { type RunReport struct {
r := RunReport{ Scenario string `json:"scenario"`
Scenario: scenario, TotalRequests int64 `json:"total_requests"`
Errors int64 `json:"errors"`
FiveXX int64 `json:"five_xx"`
P50Us int64 `json:"p50_us"`
P95Us int64 `json:"p95_us"`
P99Us int64 `json:"p99_us"`
MaxUs int64 `json:"max_us"`
Stages []StageReport `json:"stages,omitempty"`
}
// WriteJSON writes a RunReport (single scenario, optional per-stage data) to
// path as indented JSON, creating or truncating the file.
//
// It returns the first error encountered while creating, encoding, or closing
// the file. The close error is reported because a failed flush on close would
// otherwise leave a truncated report that looks like a success.
func WriteJSON(path string, r RunReport) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer func() {
		// Keep the encode error if there is one; otherwise surface the close error.
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()
	enc := json.NewEncoder(f)
	enc.SetIndent("", " ")
	return enc.Encode(r)
}
// MakeStageReport fills a StageReport from a histogram + counters.
func MakeStageReport(idx, targetRPS int, h *hdrhistogram.Histogram, total, errs, fiveXX int64) StageReport {
if h == nil {
return StageReport{StageIdx: idx, TargetRPS: targetRPS}
}
return StageReport{
StageIdx: idx,
TargetRPS: targetRPS,
TotalRequests: total, TotalRequests: total,
Errors: errs, Errors: errs,
FiveXX: fiveXX, FiveXX: fiveXX,
@ -29,25 +59,28 @@ func WriteJSON(path string, scenario string, h *hdrhistogram.Histogram, total, e
P99Us: h.ValueAtPercentile(99), P99Us: h.ValueAtPercentile(99),
MaxUs: h.Max(), MaxUs: h.Max(),
} }
f, err := os.Create(path)
if err != nil {
return err
}
defer f.Close()
return json.NewEncoder(f).Encode(r)
} }
// WriteBaselineCSV writes a CSV summary across multiple RunReports.
func WriteBaselineCSV(path string, scenarios []RunReport) error { func WriteBaselineCSV(path string, scenarios []RunReport) error {
f, err := os.Create(path) f, err := os.Create(path)
if err != nil { if err != nil {
return err return err
} }
defer f.Close() defer f.Close()
if _, err := f.WriteString("scenario,total,errors,five_xx,p50_ms,p95_ms,p99_ms,max_ms\n"); err != nil { if _, err := f.WriteString("scenario,total,errors,five_xx,p50_ms,p95_ms,p99_ms,max_ms,stages\n"); err != nil {
return err return err
} }
for _, s := range scenarios { for _, s := range scenarios {
_, err := f.WriteString(jsonLine(s) + "\n") _, err := f.WriteString(s.Scenario + "," +
itoa(s.TotalRequests) + "," +
itoa(s.Errors) + "," +
itoa(s.FiveXX) + "," +
ms(s.P50Us) + "," +
ms(s.P95Us) + "," +
ms(s.P99Us) + "," +
ms(s.MaxUs) + "," +
itoa(int64(len(s.Stages))) + "\n")
if err != nil { if err != nil {
return err return err
} }
@ -55,16 +88,6 @@ func WriteBaselineCSV(path string, scenarios []RunReport) error {
return nil return nil
} }
func jsonLine(s RunReport) string {
b, _ := json.Marshal(s)
s2 := string(b)
if len(s2) >= 2 && s2[0] == '{' {
// strip braces for CSV-friendly format
return s.Scenario + "," + itoa(s.TotalRequests) + "," + itoa(s.Errors) + "," + itoa(s.FiveXX) + "," + ms(s.P50Us) + "," + ms(s.P95Us) + "," + ms(s.P99Us) + "," + ms(s.MaxUs)
}
return s2
}
func itoa(n int64) string { func itoa(n int64) string {
if n == 0 { if n == 0 {
return "0" return "0"
@ -88,12 +111,10 @@ func itoa(n int64) string {
} }
func ms(us int64) string { func ms(us int64) string {
// us / 1000 as float
return formatFloat(float64(us) / 1000) return formatFloat(float64(us) / 1000)
} }
func formatFloat(f float64) string { func formatFloat(f float64) string {
// simple 2-decimal format
intPart := int64(f) intPart := int64(f)
frac := int64((f - float64(intPart)) * 100) frac := int64((f - float64(intPart)) * 100)
if frac < 0 { if frac < 0 {

View File

@ -0,0 +1,33 @@
package reporter
// KneeRPS locates the "knee" of a stepped run: the first stage whose p99
// latency is more than 50% above the p99 of the stage before it.
//
// A stage pair whose earlier p99 is zero is skipped (no meaningful ratio).
// When no stage qualifies, the last stage is reported with a zero delta,
// so callers detect "no knee" by checking p99Delta.
//
// Returns:
//   - kneeRPS: TargetRPS of the knee stage (or of the last stage if none)
//   - kneeIdx: StageIdx recorded on that stage
//   - p99Delta: relative p99 growth at the knee (0.5 = 50%), 0 if none
func KneeRPS(stages []StageReport) (kneeRPS, kneeIdx int, p99Delta float64) {
	switch len(stages) {
	case 0:
		return 0, 0, 0
	case 1:
		only := stages[0]
		return only.TargetRPS, only.StageIdx, 0
	}

	// stages[1:][i] is stages[i+1], so stages[i] is its predecessor.
	for i, cur := range stages[1:] {
		base := stages[i].P99Us
		if base == 0 {
			continue
		}
		if jump := float64(cur.P99Us-base) / float64(base); jump > 0.5 {
			return cur.TargetRPS, cur.StageIdx, jump
		}
	}

	// No knee found: report the highest stage tested.
	top := stages[len(stages)-1]
	return top.TargetRPS, top.StageIdx, 0
}

View File

@ -3,42 +3,482 @@ package reporter
import ( import (
"fmt" "fmt"
"os" "os"
"strings"
"time"
) )
type ScenarioReport struct { // GenerateMarkdown writes a rich markdown report.
ID string //
Stages []StageReport // Includes:
KneeRPS int // - Header (run metadata: target, scenarios, time, JWT hint)
TopBottleneck string // - Executive summary (per-scenario verdicts + key findings)
} // - Cross-scenario bottleneck analysis
// - Per-scenario detailed sections with:
type StageReport struct { // * Description + business impact + API
RPS int // * Verdict with reasoning
P50Ms float64 // * KPI table vs thresholds
P95Ms float64 // * Knee analysis
P99Ms float64 // * Stage-by-stage breakdown
ErrorRate float64 // * PNG chart
} // * Specific action items
func GenerateMarkdown(path string, meta RunMetadata, scenarios []RunReport, plotDir string) error {
func GenerateMarkdown(path string, scenarios []ScenarioReport) error {
f, err := os.Create(path) f, err := os.Create(path)
if err != nil { if err != nil {
return err return err
} }
defer f.Close() defer f.Close()
fmt.Fprintf(f, "# 压测报告\n\n") writeHeader(f, meta, scenarios)
writeExecutiveSummary(f, scenarios)
writeOverviewTable(f, scenarios)
writeCrossScenarioAnalysis(f, scenarios)
for _, s := range scenarios { for _, s := range scenarios {
fmt.Fprintf(f, "## %s\n\n", s.ID) writeScenarioDetail(f, s, plotDir)
fmt.Fprintf(f, "**拐点 RPS**: %d\n\n", s.KneeRPS) }
fmt.Fprintf(f, "**Top 瓶颈**: %s\n\n", s.TopBottleneck) writeAppendix(f, meta)
fmt.Fprintf(f, "| Stage | RPS | P50ms | P95ms | P99ms | Err%% |\n") return nil
fmt.Fprintf(f, "|-------|-----|-------|-------|-------|------|\n") }
for _, st := range s.Stages {
fmt.Fprintf(f, "| - | %d | %.1f | %.1f | %.1f | %.1f |\n", func writeHeader(f *os.File, meta RunMetadata, scenarios []RunReport) {
st.RPS, st.P50Ms, st.P95Ms, st.P99Ms, st.ErrorRate*100) fmt.Fprintf(f, "# TopFans 压测报告\n\n")
duration := meta.EndTime.Sub(meta.StartTime).Round(time.Second)
fmt.Fprintf(f, "## 📋 运行信息\n\n")
fmt.Fprintf(f, "| 项 | 值 |\n|---|---|\n")
fmt.Fprintf(f, "| **生成时间** | %s |\n", time.Now().Format("2006-01-02 15:04:05 MST"))
if !meta.StartTime.IsZero() {
fmt.Fprintf(f, "| **压测开始** | %s |\n", meta.StartTime.Format("2006-01-02 15:04:05 MST"))
fmt.Fprintf(f, "| **压测结束** | %s |\n", meta.EndTime.Format("2006-01-02 15:04:05 MST"))
fmt.Fprintf(f, "| **总耗时** | %s |\n", duration)
}
fmt.Fprintf(f, "| **目标地址** | `%s` |\n", emptyDash(meta.Target))
fmt.Fprintf(f, "| **测试场景** | %s |\n", strings.Join(meta.Scenarios, ", "))
fmt.Fprintf(f, "| **阶梯模式** | %s%s |\n", emptyDash(meta.StageMode), ifThen(meta.StepSchedule != "", " (`"+meta.StepSchedule+"`)", ""))
if meta.JWTSecretHint != "" {
fmt.Fprintf(f, "| **JWT 签名密钥** | `%s***` (前 8 位) |\n", meta.JWTSecretHint)
}
if meta.ProdSSH != "" {
fmt.Fprintf(f, "| **prod SSH** | `%s` |\n", meta.ProdSSH)
}
if meta.MonitorMode != "" {
fmt.Fprintf(f, "| **监控模式** | %s |\n", meta.MonitorMode)
}
// 总请求数
var totalReq, totalErr, total5xx int64
for _, s := range scenarios {
totalReq += s.TotalRequests
totalErr += s.Errors
total5xx += s.FiveXX
}
fmt.Fprintf(f, "| **总请求数** | %s |\n", commaInt(totalReq))
fmt.Fprintf(f, "| **总错误数** | %s (%.2f%%) |\n", commaInt(totalErr), pct(totalErr, totalReq))
fmt.Fprintf(f, "| **5xx 数** | %s (%.2f%%) |\n", commaInt(total5xx), pct(total5xx, totalReq))
fmt.Fprintf(f, "\n---\n\n")
}
func writeExecutiveSummary(f *os.File, scenarios []RunReport) {
fmt.Fprintf(f, "## 🎯 执行摘要\n\n")
// Count verdicts
counts := map[string]int{"✅": 0, "⚠️": 0, "🚨": 0}
criticalIssues := []string{}
for _, s := range scenarios {
meta, ok := AllScenarios[s.Scenario]
if !ok {
continue
}
_, _, p99Delta := KneeRPS(s.Stages)
knee := p99Delta > 0.5
v := meta.Verdict(s, knee)
counts[v]++
if v == "🚨" {
issue := fmt.Sprintf("- **%s (%s)**: ", s.Scenario, meta.Name)
if errRate := pct(s.Errors, s.TotalRequests); errRate > 1 {
issue += fmt.Sprintf("错误率 %.2f%% ", errRate)
}
if p99Ms := float64(s.P99Us) / 1000; p99Ms > meta.Thresholds.P99MsMax {
issue += fmt.Sprintf("P99 %.0fms (阈值 %.0fms) ", p99Ms, meta.Thresholds.P99MsMax)
}
if knee {
issue += fmt.Sprintf("拐点 stage %d", stagesIdx(s.Stages))
}
criticalIssues = append(criticalIssues, issue)
}
}
// Overall verdict
totalSc := len(scenarios)
fmt.Fprintf(f, "**总览**: ✅ %d 健康 / ⚠️ %d 警告 / 🚨 %d 严重 (共 %d)\n\n",
counts["✅"], counts["⚠️"], counts["🚨"], totalSc)
if len(criticalIssues) == 0 {
fmt.Fprintf(f, "🎉 **所有场景通过健康阈值,系统可承载预期负载。**\n\n")
} else {
fmt.Fprintf(f, "🚨 **关键问题** (%d 个):\n\n", len(criticalIssues))
for _, issue := range criticalIssues {
fmt.Fprintf(f, "%s\n", issue)
} }
fmt.Fprintf(f, "\n") fmt.Fprintf(f, "\n")
} }
return nil
// Per-scenario one-liner
fmt.Fprintf(f, "**场景速览**:\n\n")
for _, s := range scenarios {
meta, ok := AllScenarios[s.Scenario]
if !ok {
continue
}
_, _, p99Delta := KneeRPS(s.Stages)
knee := p99Delta > 0.5
v := meta.Verdict(s, knee)
fmt.Fprintf(f, "- %s **%s %s** — p99=%.0fms, %s", v, s.Scenario, meta.Name, float64(s.P99Us)/1000, errSummary(s))
if knee {
fmt.Fprintf(f, ", ⚠️ 拐点 stage %d", stagesIdx(s.Stages))
}
fmt.Fprintf(f, "\n")
}
fmt.Fprintf(f, "\n---\n\n")
}
// writeOverviewTable writes the one-row-per-scenario summary table to f.
// Scenarios that have no entry in AllScenarios are skipped silently.
// The knee column shows "—" unless KneeRPS reports a p99 jump above 50%.
func writeOverviewTable(f *os.File, scenarios []RunReport) {
fmt.Fprintf(f, "## 📊 总览表\n\n")
fmt.Fprintf(f, "| 场景 | 描述 | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 拐点 RPS | 状态 |\n")
fmt.Fprintf(f, "|------|------|-------|-----|-----|-------|-------|-------|-------|---------|------|\n")
for _, s := range scenarios {
meta, ok := AllScenarios[s.Scenario]
if !ok {
continue
}
kneeRPS, kneeIdx, p99Delta := KneeRPS(s.Stages)
// KneeRPS returns the last stage with delta 0 when no knee exists,
// so the delta is what distinguishes a real knee.
kneeTriggered := p99Delta > 0.5
v := meta.Verdict(s, kneeTriggered)
kneeStr := "—"
if kneeTriggered {
kneeStr = fmt.Sprintf("%d (stage %d)", kneeRPS, kneeIdx)
}
fmt.Fprintf(f, "| **%s** | %s | %s | %s (%.2f%%) | %s (%.2f%%) | %.0f | %.0f | %.0f | %.0f | %s | %s |\n",
s.Scenario, meta.Name,
commaInt(s.TotalRequests),
commaInt(s.Errors), pct(s.Errors, s.TotalRequests),
commaInt(s.FiveXX), pct(s.FiveXX, s.TotalRequests),
usToMs(s.P50Us), usToMs(s.P95Us), usToMs(s.P99Us), usToMs(s.MaxUs),
kneeStr, v)
}
fmt.Fprintf(f, "\n> 说明: Err 包含 4xx + 5xx,5xx 是子集。错误率 = Err / Total。\n\n")
}
// writeCrossScenarioAnalysis ranks scenarios by how far their P99 sits relative
// to their own P99 threshold and writes the ranking to f.
// With fewer than two scenarios it writes a short notice and returns early
// (without the trailing "---" separator). Scenarios missing from AllScenarios
// are left out of the ranking.
func writeCrossScenarioAnalysis(f *os.File, scenarios []RunReport) {
fmt.Fprintf(f, "## 🔬 跨场景瓶颈分析\n\n")
if len(scenarios) < 2 {
fmt.Fprintf(f, "只有一个场景,无需跨场景分析。\n\n")
return
}
// Find bottleneck: highest P99 relative to threshold
type scored struct {
scenario string
p99Ms float64
ratio float64 // p99 / threshold
}
var scoreds []scored
for _, s := range scenarios {
meta, ok := AllScenarios[s.Scenario]
if !ok {
continue
}
p99Ms := float64(s.P99Us) / 1000
ratio := p99Ms / meta.Thresholds.P99MsMax
scoreds = append(scoreds, scored{s.Scenario, p99Ms, ratio})
}
// Sort by ratio desc (in-place exchange sort; the list is at most a handful of entries)
for i := 0; i < len(scoreds); i++ {
for j := i + 1; j < len(scoreds); j++ {
if scoreds[j].ratio > scoreds[i].ratio {
scoreds[i], scoreds[j] = scoreds[j], scoreds[i]
}
}
}
// A ratio above 1 means the worst scenario exceeds its P99 threshold.
if len(scoreds) > 0 && scoreds[0].ratio > 1 {
fmt.Fprintf(f, "🚨 **瓶颈场景: %s** — P99 是阈值的 %.2f 倍\n\n", scoreds[0].scenario, scoreds[0].ratio)
} else if len(scoreds) > 0 {
fmt.Fprintf(f, "✅ **无明显瓶颈**,所有场景 P99 都在阈值内。\n\n")
}
fmt.Fprintf(f, "**P99 / 阈值 比率** (从高到低):\n\n")
for _, s := range scoreds {
fmt.Fprintf(f, "- %s: %.2fx (%.0fms)\n", s.scenario, s.ratio, s.p99Ms)
}
fmt.Fprintf(f, "\n---\n\n")
}
// writeScenarioDetail writes the detailed section for one scenario to f:
// description table, KPI-vs-threshold table, knee analysis, per-stage table,
// action items and (when plotDir is non-empty) a link to the PNG chart.
//
// A scenario without an AllScenarios entry gets a minimal section that dumps
// the raw report instead.
//
// The knee paragraph locates the knee stage by its position in s.Stages
// (re-applying the >50% p99 growth rule used by KneeRPS) rather than deriving
// a slice index from the recorded StageIdx. That keeps it correct, and free of
// out-of-range panics, when StageIdx is not 1-based or not contiguous. It also
// reports the p99 of the knee stage itself, not the scenario-wide p99.
func writeScenarioDetail(f *os.File, s RunReport, plotDir string) {
	meta, ok := AllScenarios[s.Scenario]
	if !ok {
		fmt.Fprintf(f, "## %s (无元数据)\n\n", s.Scenario)
		fmt.Fprintf(f, "```json\n%+v\n```\n\n", s)
		return
	}
	kneeRPS, kneeIdx, p99Delta := KneeRPS(s.Stages)
	kneeTriggered := p99Delta > 0.5
	verdict := meta.Verdict(s, kneeTriggered)

	fmt.Fprintf(f, "## %s %s %s\n\n", verdict, s.Scenario, meta.Name)
	fmt.Fprintf(f, "### 📌 测试说明\n\n")
	fmt.Fprintf(f, "| 项 | 值 |\n|---|---|\n")
	fmt.Fprintf(f, "| **API** | `%s` |\n", meta.API)
	fmt.Fprintf(f, "| **负载类型** | %s |\n", workloadLabel(meta.Workload))
	fmt.Fprintf(f, "| **业务说明** | %s |\n", meta.Description)
	fmt.Fprintf(f, "| **影响范围** | %s |\n", meta.BusinessImp)
	fmt.Fprintf(f, "\n")

	// KPI vs thresholds
	fmt.Fprintf(f, "### 📈 性能指标 vs 健康阈值\n\n")
	p50Ms := usToMs(s.P50Us)
	p95Ms := usToMs(s.P95Us)
	p99Ms := usToMs(s.P99Us)
	maxMs := usToMs(s.MaxUs)
	errRate := pct(s.Errors, s.TotalRequests)
	fiveXXRate := pct(s.FiveXX, s.TotalRequests)
	fmt.Fprintf(f, "| 指标 | 实测 | 阈值 | 判定 |\n")
	fmt.Fprintf(f, "|------|------|------|------|\n")
	fmt.Fprintf(f, "| P50ms | %.0f | ≤%.0f | %s |\n", p50Ms, meta.Thresholds.P50MsMax, thresholdMark(p50Ms, meta.Thresholds.P50MsMax))
	fmt.Fprintf(f, "| P95ms | %.0f | ≤%.0f | %s |\n", p95Ms, meta.Thresholds.P95MsMax, thresholdMark(p95Ms, meta.Thresholds.P95MsMax))
	fmt.Fprintf(f, "| P99ms | %.0f | ≤%.0f | %s |\n", p99Ms, meta.Thresholds.P99MsMax, thresholdMark(p99Ms, meta.Thresholds.P99MsMax))
	fmt.Fprintf(f, "| Maxms | %.0f | — | 参考 |\n", maxMs)
	// pct() yields percentages while the thresholds are fractions, hence the /100 and *100.
	fmt.Fprintf(f, "| 错误率 | %.2f%% | ≤%.2f%% | %s |\n", errRate, meta.Thresholds.ErrorRateMax*100, thresholdMark(errRate/100, meta.Thresholds.ErrorRateMax))
	fmt.Fprintf(f, "| 5xx 率 | %.2f%% | ≤%.2f%% | %s |\n", fiveXXRate, meta.Thresholds.FiveXXRateMax*100, thresholdMark(fiveXXRate/100, meta.Thresholds.FiveXXRateMax))
	fmt.Fprintf(f, "\n")

	// Knee: find the slice position of the knee stage with the same rule as KneeRPS.
	kneePos := -1
	for i := 1; i < len(s.Stages); i++ {
		prev := s.Stages[i-1].P99Us
		if prev == 0 {
			continue
		}
		if float64(s.Stages[i].P99Us-prev)/float64(prev) > 0.5 {
			kneePos = i
			break
		}
	}
	fmt.Fprintf(f, "### 📍 拐点分析\n\n")
	if len(s.Stages) <= 1 {
		fmt.Fprintf(f, " 仅 1 个 stage,未做阶梯测试,无法判断拐点。\n\n")
	} else if kneeTriggered {
		fmt.Fprintf(f, "🚨 **拐点**: stage %d @ %d RPS — p99 暴涨 %.0f%%\n\n",
			kneeIdx, kneeRPS, p99Delta*100)
		if kneePos >= 1 {
			before, at := s.Stages[kneePos-1], s.Stages[kneePos]
			fmt.Fprintf(f, "从 stage %d 到 stage %d,p99 延迟从 %.0fms 涨到 %.0fms (%.1fx)。\n",
				before.StageIdx, at.StageIdx, usToMs(before.P99Us), usToMs(at.P99Us), 1+p99Delta)
		}
		fmt.Fprintf(f, "\n**含义**: 系统在 %d RPS 时开始出现性能劣化。建议生产限流到 %d RPS 以下。\n\n",
			kneeRPS, kneeRPS)
	} else {
		fmt.Fprintf(f, "✅ **拐点未触发** — 全程 %d 个 stage 健康运行,最高 %d RPS p99=%.0fms。\n\n",
			len(s.Stages), kneeRPS, p99Ms)
	}

	// Stage table
	fmt.Fprintf(f, "### 🔢 阶梯结果\n\n")
	if len(s.Stages) == 0 {
		fmt.Fprintf(f, "_无 stage 数据_\n\n")
	} else {
		fmt.Fprintf(f, "| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 |\n")
		fmt.Fprintf(f, "|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------|\n")
		for i, st := range s.Stages {
			growth := ""
			if i > 0 {
				prevP99 := float64(s.Stages[i-1].P99Us) / 1000
				curP99 := float64(st.P99Us) / 1000
				if prevP99 > 0 {
					growthPct := (curP99 - prevP99) / prevP99 * 100
					growth = fmt.Sprintf("%+.0f%%", growthPct)
					if growthPct > 50 {
						growth = "🚨 " + growth
					}
				}
			}
			fmt.Fprintf(f, "| %d | %d | %s | %s | %s | %.0f | %.0f | %.0f | %.0f | %s |\n",
				st.StageIdx, st.TargetRPS,
				commaInt(st.TotalRequests), commaInt(st.Errors), commaInt(st.FiveXX),
				usToMs(st.P50Us), usToMs(st.P95Us), usToMs(st.P99Us), usToMs(st.MaxUs),
				growth)
		}
		fmt.Fprintf(f, "\n")
	}

	// Action items
	fmt.Fprintf(f, "### 🎯 行动项\n\n")
	actionItems(f, s, meta, kneeTriggered, kneeRPS)

	// Plot
	if plotDir != "" {
		plotName := strings.ToLower(s.Scenario) + ".png"
		fmt.Fprintf(f, "### 📉 图表\n\n")
		fmt.Fprintf(f, "![%s RPS / P99 / Error](%s/%s)\n\n", s.Scenario, plotDir, plotName)
	}
	fmt.Fprintf(f, "---\n\n")
}
// writeAppendix writes the closing appendix to f: a glossary of the health
// thresholds, the expected file listing of the reports directory, and a shell
// snippet that reproduces the run described by meta.
//
// The reproduce command is assembled from its parts and joined with line
// continuations, so the last line never ends in a dangling "\" when optional
// flags (target / monitor / prod-ssh) are absent.
func writeAppendix(f *os.File, meta RunMetadata) {
	fmt.Fprintf(f, "## 📎 附录\n\n")
	fmt.Fprintf(f, "### 健康阈值说明\n\n")
	fmt.Fprintln(f, "- **P50/P95/P99**: 百分位延迟 (毫秒),值越小越好")
	fmt.Fprintln(f, "- **错误率**: 4xx+5xx 请求占比,健康 < 1%")
	fmt.Fprintln(f, "- **5xx 率**: 服务端错误率,健康 < 0.1%")
	fmt.Fprintln(f, "- **拐点**: 阶梯测试中,p99 相对前一 stage 涨幅 > 50% 的第一个 stage")
	fmt.Fprintf(f, "\n")

	fmt.Fprintf(f, "### 文件清单\n\n")
	fmt.Fprintf(f, "```\n")
	fmt.Fprintf(f, "reports/\n")
	fmt.Fprintf(f, "├── final-report.md (本文件)\n")
	fmt.Fprintf(f, "├── baseline.csv (Excel 可打开的汇总)\n")
	for _, s := range []string{"S1", "S2", "S3", "S4", "S5", "S6", "S7"} {
		// Per-scenario JSON is written as <id>.json, while the chart name is
		// lower-cased — mirror both so the listing matches the files on disk.
		fmt.Fprintf(f, "├── %s.json\n", s)
		fmt.Fprintf(f, "├── %s.png\n", strings.ToLower(s))
	}
	fmt.Fprintf(f, "```\n\n")

	fmt.Fprintf(f, "### 如何复现\n\n")
	fmt.Fprintf(f, "```bash\n")
	fmt.Fprintf(f, "cd /opt/topfans/loadtest\n")
	cmd := fmt.Sprintf("./loadgen --cmd=run --scenarios=%s --stage=%s",
		strings.Join(meta.Scenarios, ","), meta.StageMode)
	if meta.StepSchedule != "" {
		cmd += fmt.Sprintf(" --step-schedule='%s'", meta.StepSchedule)
	}
	parts := []string{cmd}
	if meta.Target != "" {
		parts = append(parts, "--target="+meta.Target)
	}
	if meta.MonitorMode != "" {
		parts = append(parts, "--monitor="+meta.MonitorMode)
	}
	if meta.ProdSSH != "" {
		parts = append(parts, "--prod-ssh="+meta.ProdSSH)
	}
	// Only lines that are followed by another flag get a continuation backslash.
	fmt.Fprintf(f, "%s\n", strings.Join(parts, " \\\n "))
	fmt.Fprintf(f, "```\n")
}
// ---- helpers ----
// workloadLabel maps a workload identifier to its display label.
// Unknown identifiers are returned unchanged.
func workloadLabel(w string) string {
	labels := map[string]string{
		"read":        "📖 读",
		"write_light": "✏️ 轻写",
		"write_heavy": "🛠️ 重写",
	}
	if label, known := labels[w]; known {
		return label
	}
	return w
}
// thresholdMark grades value against threshold: a check mark when within the
// threshold, a warning when within 1.5x of it, and an alarm otherwise.
func thresholdMark(value, threshold float64) string {
	switch {
	case value <= threshold:
		return "✅"
	case value <= threshold*1.5:
		return "⚠️"
	default:
		return "🚨"
	}
}
// errSummary returns a short error-rate string for a scenario, or a
// "no requests" marker when the scenario recorded no traffic.
func errSummary(s RunReport) string {
	if s.TotalRequests != 0 {
		return fmt.Sprintf("err %.2f%%", pct(s.Errors, s.TotalRequests))
	}
	return "无请求"
}
// stagesIdx returns only the stage index component of KneeRPS(stages).
func stagesIdx(stages []StageReport) (idx int) {
	_, idx, _ = KneeRPS(stages)
	return idx
}
// pct returns num/denom expressed as a percentage (0-100 scale).
// A zero denominator yields 0 rather than a division by zero.
func pct(num, denom int64) float64 {
	if denom != 0 {
		return float64(num) / float64(denom) * 100
	}
	return 0
}
// usToMs converts a duration in microseconds to (fractional) milliseconds.
func usToMs(us int64) float64 {
	const usPerMs = 1000
	return float64(us) / usPerMs
}
// commaInt formats n in decimal with a comma between each group of three
// digits, e.g. 1234567 -> "1,234,567" and -1234 -> "-1,234".
//
// The sign is stripped from the formatted text instead of negating n, so the
// minimum int64 value (whose negation overflows) is formatted correctly too.
func commaInt(n int64) string {
	digits := fmt.Sprintf("%d", n)
	sign := ""
	if n < 0 {
		sign, digits = "-", digits[1:]
	}
	out := make([]byte, 0, len(digits)+len(digits)/3)
	for i := 0; i < len(digits); i++ {
		// A comma goes before every digit that starts a full group of three.
		if i > 0 && (len(digits)-i)%3 == 0 {
			out = append(out, ',')
		}
		out = append(out, digits[i])
	}
	return sign + string(out)
}
// emptyDash substitutes an em dash for an empty string so table cells are
// never blank; non-empty input is returned as is.
func emptyDash(s string) string {
	if len(s) > 0 {
		return s
	}
	return "—"
}
// ifThen is a string ternary: it yields a when cond holds and b otherwise.
func ifThen(cond bool, a, b string) string {
	chosen := b
	if cond {
		chosen = a
	}
	return chosen
}
// actionItems emits scenario-specific P0/P1/P2 action items.
func actionItems(f *os.File, s RunReport, meta ScenarioMeta, knee bool, _ int) {
p99Ms := usToMs(s.P99Us)
errRate := pct(s.Errors, s.TotalRequests)
fiveXXRate := pct(s.FiveXX, s.TotalRequests)
p99Over := p99Ms > meta.Thresholds.P99MsMax
anyAction := false
if knee {
kneeRPS, kneeIdx, _ := KneeRPS(s.Stages)
fmt.Fprintf(f, "- [ ] **🔴 P0**: 修复 stage %d 拐点 (%d RPS, p99=%.0fms)\n", kneeIdx, kneeRPS, p99Ms)
fmt.Fprintf(f, " - 看 PG 慢查询 (`pg_stat_statements ORDER BY mean_exec_time DESC`)\n")
fmt.Fprintf(f, " - 跑应用层 profile (`pprof http://localhost:PORT/debug/pprof/profile`)\n")
fmt.Fprintf(f, " - 临时方案: 服务端限流到 %d RPS,超限返回 429\n", kneeRPS)
anyAction = true
}
if fiveXXRate > 0.5 {
fmt.Fprintf(f, "- [ ] **🔴 P0**: 5xx 率 %.2f%% — 看 prod 服务日志,定位具体错误\n", fiveXXRate)
anyAction = true
}
if errRate > 1 {
fmt.Fprintf(f, "- [ ] **🟡 P1**: 错误率 %.2f%% — 检查 4xx 错误码,看是否 JWT 过期 / 数据缺失\n", errRate)
anyAction = true
}
if p99Over && !knee {
fmt.Fprintf(f, "- [ ] **🟡 P1**: P99 %.0fms 超过阈值 %.0fms — 检查是否有个别慢查询\n", p99Ms, meta.Thresholds.P99MsMax)
anyAction = true
}
// Workload-specific suggestions
if meta.Workload == "write_heavy" && (knee || p99Over) {
fmt.Fprintf(f, "- [ ] **🟡 P1**: 写重场景有性能问题 — 考虑把同步写改成异步(消息队列)\n")
anyAction = true
}
if meta.Workload == "read" && (knee || p99Over) {
fmt.Fprintf(f, "- [ ] **🟡 P1**: 读路径有性能问题 — 加 Redis 缓存,减少 DB 直查\n")
anyAction = true
}
if !anyAction {
fmt.Fprintf(f, "✅ 无需行动项 — 所有指标在阈值内。\n")
}
fmt.Fprintf(f, "\n")
} }

View File

@ -0,0 +1,156 @@
package reporter
import "time"
// Thresholds defines health KPIs for a scenario.
// Latency limits are in milliseconds; rate limits are fractions (not percentages),
// so report code multiplies them by 100 before printing.
type Thresholds struct {
P50MsMax float64 // P50ms should be <= this
P95MsMax float64 // P95ms should be <= this
P99MsMax float64 // P99ms should be <= this
ErrorRateMax float64 // e.g. 0.01 = 1%
FiveXXRateMax float64 // e.g. 0.001 = 0.1%
}
// ScenarioMeta describes what a scenario tests and how to evaluate it.
// Entries live in AllScenarios, keyed by ID, and are looked up by the
// report writers via RunReport.Scenario.
type ScenarioMeta struct {
ID string // "S1"
Name string // display name, e.g. "登录"
API string // "POST /api/v1/auth/login"
Description string // one-sentence business summary
BusinessImp string // impact scope (all users / write-heavy path / edge feature)
Workload string // "read" | "write_light" | "write_heavy"
Thresholds Thresholds // health limits used for verdicts and KPI tables
}
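// AllScenarios is the registry of known scenarios, keyed by scenario ID.
// Keep this in sync with the scenarios/s*.go registry: the report writers look
// scenarios up here and skip (or render without metadata) any ID that is missing.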
var AllScenarios = map[string]ScenarioMeta{
"S1": {
ID: "S1",
Name: "用户登录",
API: "POST /api/v1/auth/login",
Description: "用户身份认证,签发 JWT",
BusinessImp: "🔴 所有用户必经路径,失败 = 用户进不来",
Workload: "write_light",
Thresholds: Thresholds{
P50MsMax: 100, P95MsMax: 300, P99MsMax: 1000,
ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
},
},
"S2": {
ID: "S2",
Name: "浏览资产详情",
API: "GET /api/v1/assets/{id}",
Description: "高频读路径,典型缓存命中场景",
BusinessImp: "🟢 单用户最高频操作,影响页面加载体验",
Workload: "read",
Thresholds: Thresholds{
P50MsMax: 50, P95MsMax: 150, P99MsMax: 500,
ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
},
},
"S3": {
ID: "S3",
Name: "点赞 / 取消点赞",
API: "POST/DELETE /api/v1/social/assets/{id}/like",
Description: "轻量写,社交互动",
BusinessImp: "🟢 写多但单条小,影响点赞数显示",
Workload: "write_light",
Thresholds: Thresholds{
P50MsMax: 80, P95MsMax: 250, P99MsMax: 800,
ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
},
},
"S4": {
ID: "S4",
Name: "资产铸造 (mint)",
API: "POST /api/v1/assets/mints/precreate",
Description: "写重路径:OSS 上传 + 签名 + 事务落库",
BusinessImp: "🟡 核心交易,影响创作者产出节奏",
Workload: "write_heavy",
Thresholds: Thresholds{
P50MsMax: 300, P95MsMax: 800, P99MsMax: 2000, // 写重场景阈值更宽
ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
},
},
"S5": {
ID: "S5",
Name: "Dashboard 聚合",
API: "聚合多个用户/资产指标",
Description: "后台聚合查询,可能涉及多表 JOIN",
BusinessImp: "🟢 运营场景,非实时关键",
Workload: "read",
Thresholds: Thresholds{
P50MsMax: 200, P95MsMax: 500, P99MsMax: 1500,
ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
},
},
"S6": {
ID: "S6",
Name: "热门榜单",
API: "GET /api/v1/rankings/hot",
Description: "排序读,Redis 缓存命中率关键",
BusinessImp: "🟢 首页流量入口,影响新用户第一印象",
Workload: "read",
Thresholds: Thresholds{
P50MsMax: 30, P95MsMax: 100, P99MsMax: 300,
ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
},
},
"S7": {
ID: "S7",
Name: "摆展 (place)",
API: "展位分配 + 事务",
Description: "写重路径,涉及展位锁竞争",
BusinessImp: "🟡 创作者核心操作,涉及并发事务",
Workload: "write_heavy",
Thresholds: Thresholds{
P50MsMax: 400, P95MsMax: 1000, P99MsMax: 2500,
ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
},
},
}
// Verdict grades a run report against this scenario's thresholds and returns
// a status emoji:
//
//   - "❓" when the report contains no stages;
//   - "🚨" (critical) when the error rate exceeds 2x ErrorRateMax, the 5xx
//     rate exceeds 5x FiveXXRateMax, or P99 exceeds 2x P99MsMax;
//   - "⚠️" (warning) when any of those three limits is exceeded at all, or
//     kneeTriggered is true;
//   - "✅" (good) otherwise.
//
// Both rates are fractions of r.TotalRequests and stay 0 when no requests
// were made. r.P99Us is divided by 1000 before being compared with P99MsMax.
func (s ScenarioMeta) Verdict(r RunReport, kneeTriggered bool) string {
	if len(r.Stages) == 0 {
		return "❓"
	}

	var errorFrac, serverErrFrac float64
	if total := float64(r.TotalRequests); r.TotalRequests > 0 {
		errorFrac = float64(r.Errors) / total
		serverErrFrac = float64(r.FiveXX) / total
	}
	p99Millis := float64(r.P99Us) / 1000
	limits := s.Thresholds

	switch {
	// Critical: any single metric is far beyond its limit.
	case errorFrac > limits.ErrorRateMax*2,
		serverErrFrac > limits.FiveXXRateMax*5,
		p99Millis > limits.P99MsMax*2:
		return "🚨"
	// Warning: a limit is exceeded, or a knee point was detected.
	case errorFrac > limits.ErrorRateMax,
		serverErrFrac > limits.FiveXXRateMax,
		p99Millis > limits.P99MsMax,
		kneeTriggered:
		return "⚠️"
	default:
		return "✅"
	}
}
// RunMetadata captures run-level context for the report header.
// Every field is serialized to JSON under the key given in its struct tag;
// fields tagged omitempty are left out of the JSON when they hold a zero value.
type RunMetadata struct {
StartTime time.Time `json:"start_time"` // presumably when the run began — confirm against the writer
EndTime time.Time `json:"end_time"` // presumably when the run finished — confirm against the writer
Target string `json:"target"` // system under test; NOTE(review): looks like a base URL — confirm
Scenarios []string `json:"scenarios"` // scenarios included in the run; presumably scenario IDs such as "S1" — confirm
StepSchedule string `json:"step_schedule,omitempty"`
JWTSecretHint string `json:"jwt_secret_hint,omitempty"` // NOTE(review): should never hold the full secret — verify what the caller stores here
ProdSSH string `json:"prod_ssh,omitempty"`
MonitorMode string `json:"monitor_mode,omitempty"`
StageMode string `json:"stage_mode"` // "baseline" | "step" | ...
RPSOverride int `json:"rps_override,omitempty"`
}

View File

@ -20,17 +20,20 @@ func doRequest(client *http.Client, req *http.Request, rec *lib.LatencyRecorder,
totalCount.Add(1) totalCount.Add(1)
if err != nil { if err != nil {
errCount.Add(1) errCount.Add(1)
rec.RecordResult(true, false)
checkBreaker(client, rec, errCount, totalCount, fiveXXCount, breaker) checkBreaker(client, rec, errCount, totalCount, fiveXXCount, breaker)
return return
} }
defer resp.Body.Close() defer resp.Body.Close()
switch { is5xx := resp.StatusCode >= 500
case resp.StatusCode >= 500: isErr := resp.StatusCode >= 400
if is5xx {
fiveXXCount.Add(1) fiveXXCount.Add(1)
errCount.Add(1) errCount.Add(1)
case resp.StatusCode >= 400: } else if isErr {
errCount.Add(1) errCount.Add(1)
} }
rec.RecordResult(isErr, is5xx)
checkBreaker(client, rec, errCount, totalCount, fiveXXCount, breaker) checkBreaker(client, rec, errCount, totalCount, fiveXXCount, breaker)
} }

View File

@ -40,6 +40,10 @@ func (s *s1Login) Run(ctx context.Context, rpsOverride int, durationOverride tim
duration = 2 * time.Minute duration = 2 * time.Minute
} }
// S1 doesn't internally iterate stages, so wrap entire run as stage 1
s.rec.BeginStage(1, targetRPS)
defer s.rec.EndStage()
ticker := time.NewTicker(time.Second / time.Duration(targetRPS)) ticker := time.NewTicker(time.Second / time.Duration(targetRPS))
defer ticker.Stop() defer ticker.Stop()
timeout := time.NewTimer(duration) timeout := time.NewTimer(duration)

View File

@ -38,6 +38,10 @@ func (s *s2Read) Run(ctx context.Context, rpsOverride int, durationOverride time
duration = 2 * time.Minute duration = 2 * time.Minute
} }
// S2 doesn't internally iterate stages, wrap entire run as stage 1
s.rec.BeginStage(1, targetRPS)
defer s.rec.EndStage()
ticker := time.NewTicker(time.Second / time.Duration(targetRPS)) ticker := time.NewTicker(time.Second / time.Duration(targetRPS))
defer ticker.Stop() defer ticker.Stop()
timeout := time.NewTimer(duration) timeout := time.NewTimer(duration)

View File

@ -39,6 +39,10 @@ func (s *s3Like) Run(ctx context.Context, rpsOverride int, durationOverride time
duration = 2 * time.Minute duration = 2 * time.Minute
} }
// S3 doesn't internally iterate stages, wrap entire run as stage 1
s.rec.BeginStage(1, targetRPS)
defer s.rec.EndStage()
ticker := time.NewTicker(time.Second / time.Duration(targetRPS)) ticker := time.NewTicker(time.Second / time.Duration(targetRPS))
defer ticker.Stop() defer ticker.Stop()
timeout := time.NewTimer(duration) timeout := time.NewTimer(duration)

View File

@ -37,11 +37,18 @@ func (s *s4Mint) Run(ctx context.Context, rpsOverride int, durationOverride time
if len(stages) == 0 { if len(stages) == 0 {
stages = []int{5, 10, 20, 30, 50, 80} stages = []int{5, 10, 20, 30, 50, 80}
} }
stageDuration := 2 * time.Minute
if durationOverride > 0 && durationOverride < stageDuration {
stageDuration = durationOverride
}
for stageIdx, stageRPS := range stages { for stageIdx, stageRPS := range stages {
logf("S4 stage %d/%d: %d RPS × 2min", stageIdx+1, len(stages), stageRPS) logf("S4 stage %d/%d: %d RPS × %v", stageIdx+1, len(stages), stageRPS, stageDuration)
if err := s.runStage(ctx, stageRPS, 2*time.Minute); err != nil { s.rec.BeginStage(stageIdx+1, stageRPS)
if err := s.runStage(ctx, stageRPS, stageDuration); err != nil {
s.rec.EndStage()
return err return err
} }
s.rec.EndStage()
logf("S4 stage %d done, resetting mint data...", stageIdx+1) logf("S4 stage %d done, resetting mint data...", stageIdx+1)
if s.prodSSH != "" { if s.prodSSH != "" {
cmd := exec.Command("ssh", s.prodSSH, "bash /opt/topfans/loadtest/scripts/mint_reset.sh") cmd := exec.Command("ssh", s.prodSSH, "bash /opt/topfans/loadtest/scripts/mint_reset.sh")

View File

@ -0,0 +1,35 @@
#!/bin/bash
# ===================================================================
# One-shot prod seed runner.
# Purpose: read the DB/JWT credentials from /opt/topfans/docker/.env.prod
#          and run the seed tool with them.
# Usage:   ssh root@101.132.250.62 "bash /opt/topfans/loadtest/scripts/prod_seed.sh"
# ===================================================================
set -euo pipefail

ENV_FILE="/opt/topfans/docker/.env.prod"
LOADTEST_DIR="/opt/topfans/loadtest"

if [[ ! -f "$ENV_FILE" ]]; then
    echo "$ENV_FILE 不存在" >&2
    exit 1
fi

# read_env KEY
# Prints the value of the first `KEY=...` line in $ENV_FILE.
# Everything after the FIRST '=' is kept, so values that themselves contain
# '=' (e.g. base64-padded secrets) are not truncated.
# Returns non-zero, with a message on stderr, when the key is absent.
read_env() {
    local key="$1" line
    if ! line=$(grep -m1 "^${key}=" "$ENV_FILE"); then
        echo "$ENV_FILE 缺少 ${key}" >&2
        return 1
    fi
    printf '%s\n' "${line#*=}"
}

# Assign first, export afterwards: `export VAR=$(cmd)` would hide cmd's exit
# status from `set -e` and let the script continue with an empty credential.
DB_PASSWORD=$(read_env DB_PASSWORD)
JWT_SECRET=$(read_env JWT_SECRET)
export DB_PASSWORD JWT_SECRET

cd "$LOADTEST_DIR"

# NOTE(review): the banner prints the first 10 characters of the JWT secret
# to the terminal, and the secret is also passed on the seed command line
# (visible in the process list). Consider shortening/removing both.
echo "=========================================="
echo "prod seed - 准备 loadtest 数据"
echo "DB host: localhost (容器内)"
echo "DB name: topfans"
echo "JWT secret: ${JWT_SECRET:0:10}..."
echo "=========================================="

./seed --db-name=topfans --jwt-secret="$JWT_SECRET"

echo ""
echo "✅ seed 完成。生成的文件:"
ls -la users.csv
echo ""
echo "下一步: ./loadgen --cmd=preflight --target=http://localhost:8080"

View File

@ -1,67 +1,188 @@
# seed - 压测数据准备工具 # seed - 压测数据准备工具
## 用途 > 给 prod 凌晨压测灌 1000 个测试用户 + 资产 + JWT,数据用 `star_id=999900` 物理隔离。
在 prod 本地插入 1000 个测试用户、5000 资产、3000 booth_slots、2000 exhibitions、10000 friendships,签 1000 个 JWT,写 `users.csv` ---
## 一句话总结
`./seed`,数据库里多出 1000 个用户 + 5000 个 assets + 2000 个 exhibitions,本地多出 `users.csv` (含 JWT)。
---
## 编译 ## 编译
```bash ```bash
cd backend && go build -o seed ./scripts/loadgen/seed/ cd backend
go build -o bin/seed ./scripts/loadgen/seed/
# 或
make loadgen-build
``` ```
## 在 prod 上跑 ---
## 在 prod 上跑 (凌晨 T0 = 02:00)
```bash ```bash
# 1. 上传二进制
scp seed root@101.132.250.62:/opt/topfans/loadtest/
# 2. SSH 上去跑
ssh root@101.132.250.62 ssh root@101.132.250.62
cd /opt/topfans/loadtest cd /opt/topfans/loadtest
export DB_PASSWORD=$(cat /opt/topfans/docker/.env.prod | grep DB_PASSWORD | cut -d= -f2) bash scripts/prod_seed.sh
export JWT_SECRET=$(cat /opt/topfans/docker/.env.prod | grep JWT_SECRET | cut -d= -f2)
./seed --db-name=topfans --jwt-secret="$JWT_SECRET"
``` ```
## 清理 这个脚本会自动:
1. 读 `/opt/topfans/docker/.env.prod` 拿 DB_PASSWORD + JWT_SECRET
2. 跑 seed (插入 23k 行测试数据)
3. 自动重置 PG 序列 (CLAUDE.md 规范)
4. 写 `users.csv` (含 1000 个 JWT)
**预计耗时**:30-60 秒
---
## 在本地 docker 跑 (开发联调)
```bash ```bash
# 保留 1000 users + 资产(下次复用) cd backend/scripts/loadgen/seed
./seed --cleanup
# 全删(包括账号本身) # 1. 生成 bcrypt 哈希 (与 tokens.go 硬编码的 "Test@123" 匹配)
./seed --cleanup --full python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \
> loadtest_bcrypt.txt
# 只重签 token(第二轮压测 JWT 过期时) # 2. 跑 seed (假设本地 docker postgres 在 15432)
./seed --reset-tokens --jwt-secret="$JWT_SECRET" cd /Users/liulujian/Documents/code/TopFansByGithub/backend
DB_PASSWORD=123456 \
JWT_SECRET=topfans-secret-key-local-dev-only \
./bin/seed \
--db-name=top-fans \
--db-host=localhost \
--db-port=15432 \
--db-user=postgres
``` ```
## 本地 docker 联调(开发阶段) **注意**: `loadtest_bcrypt.txt` 必须在 seed 二进制运行的**当前目录**(代码用相对路径读)。
---
## 命令行参数
```
./bin/seed --help
Usage of ./bin/seed:
-cleanup # 跑清理 (默认保留 1000 users)
-cleanup-star-id int # 要清的 star_id (默认 999900, 防止误删)
-full # 配合 -cleanup: 也删用户和 stars
-reset # 删旧数据再 seed (隐含 --cleanup 行为)
-reset-tokens # 只重签 JWT (数据保留)
-jwt-secret string # JWT 密钥 (默认 $JWT_SECRET)
-db-host string # PG host (默认 localhost)
-db-port int # PG port (默认 5432)
-db-name string # PG 数据库 (prod=topfans, 本地=top-fans)
-db-user string # PG user (默认 postgres)
-db-password string # PG 密码 (默认 $DB_PASSWORD)
```
---
## 三种"清理"模式对比
| 命令 | 删 stars | 删 users | 删 assets/exhibits | 用途 |
|------|---------|---------|-------------------|------|
| `./seed --cleanup` | ❌ | ❌ | ✅ | 压完一轮,清理资产但保留账号 |
| `./seed --cleanup --full` | ✅ | ✅ | ✅ | 全部清,下次重新 seed |
| `./seed --reset` | ❌ | ❌ | ✅ | 等同 `--cleanup`(保留用户) |
| `./seed --reset-tokens` | ❌ | ❌ | ❌ | 只重新签 JWT,数据不动 |
**典型流程**:
```bash
# 第 1 轮压测 (02:00-03:00)
./seed # 灌数据
./loadgen --cmd=run --scenarios=S1,S2,S4 # 压测
./seed --cleanup # 压完清理资产
# 第 2 轮压测 (下周,JWT 过期了)
./seed --reset-tokens --jwt-secret=$JWT_SECRET # 只重签 JWT
./loadgen --cmd=run --scenarios=S1,S2,S4 # 复测
# 完全重来 (例如改了用户模型)
./seed --cleanup --full # 全删
./seed # 重新灌
```
---
## 数据规模
| 表 | 行数 | 备注 |
|----|------|------|
| `stars` | +1 | star_id=999900 |
| `users` | +1000 | mobile 19900000001 ~ 19900001000 |
| `fan_profiles` | +1000 | 每个 user 一个 |
| `crystal_transaction_records` | +1000+ | 初始水晶 |
| `assets` | +5000 | 每个 user ~5 个 |
| `booth_slots` | +3000 | |
| `exhibitions` | +2000 | |
| `friendships` | +10000 | |
| **TOTAL** | **~23k 行** | |
---
## 关键设计
### 1. star_id 隔离
所有测试数据用 `star_id = 999900`,**不影响**真实业务 (87, 88, 91, 93, 94, 95)。
### 2. PG max_connections = 50
prod 已将 `POSTGRES_MAX_CONNECTIONS` 从 100 调到 50,避免被测试数据耗尽连接池。
### 3. CLAUDE.md 序列重置
seed 末尾自动 `setval()` 所有相关表的 sequence,避免后续 GORM 插入报 duplicate key。
### 4. JWT 7 天过期
跨周第二轮压测前需 `--reset-tokens` 重签。
### 5. bcrypt 哈希与密码硬编码
- `tokens.go` 硬编码密码为 `"Test@123"`(写到 users.csv 的 password 列)
- `loadtest_bcrypt.txt` 是这个密码的 bcrypt(cost=10) 哈希
- 二者必须匹配,否则 login 会报 500
---
## 常见问题
### Q: 跑完 seed 但 login 报"密码错误"?
A: `loadtest_bcrypt.txt` 没匹配上 `Test@123`
```bash
python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \
> loadtest_bcrypt.txt
./seed --cleanup --full && ./seed
```
### Q: 想换密码怎么办?
A: 同时改两个地方:
1. `tokens.go` 中的 `u.Mobile, "Test@123"` → 你的密码
2. `loadtest_bcrypt.txt` 重新生成
### Q: "loadtest_bcrypt.txt: no such file or directory"?
A: seed 用相对路径读这个文件,必须在 seed 目录跑(或者把文件 cp 到当前目录)。
### Q: --reset 没生效,users 还是旧的?
A: 因为 `--reset` 等同 `--cleanup`(保留用户)。要删用户用 `--cleanup --full`
---
## 单元测试
```bash ```bash
cd backend cd backend
go build -o bin/seed ./scripts/loadgen/seed/ go test ./scripts/loadgen/seed/ -v
DB_PASSWORD=postgres123 JWT_SECRET=topfans-secret-key-local-dev-only \
./bin/seed --db-name=top-fans --db-host=localhost
```
## 关键约束
- **star_id = 999900**:所有数据用此 star_id 隔离,不影响真实业务
- **PG max_connections = 50**:Task 5 已将 `POSTGRES_MAX_CONNECTIONS` 从 100 改到 50
- **CLAUDE.md 序列重置**:ResetSequences 会在 seed 末尾自动同步所有相关表的 sequence,避免后续 GORM 插入报 duplicate key
- **JWT 7 天过期**:跨周第二轮压测前需 `--reset-tokens` 重签
## 测试
```bash
cd backend && go test ./scripts/loadgen/seed/ -v
``` ```
5 个测试: 5 个测试:
- `TestMobileNumbering`:mobile 编号正确性 - `TestMobileNumbering`: mobile 编号正确性
- `TestSequenceMapping`:loadtestSeqs 映射 - `TestSequenceMapping`: loadtestSeqs 映射
- `TestPKColumnMapping`:pkColumns 映射(关键 stars/star_id, booth_slots/slot_id) - `TestPKColumnMapping`: pkColumns 映射(关键 stars/star_id, booth_slots/slot_id)
- `TestCleanupRejectsInvalidStarID`:cleanup 拒绝非 loadtest star_id - `TestCleanupRejectsInvalidStarID`: cleanup 拒绝非 loadtest star_id
- `TestJoinInt64`:CSV 序列化辅助函数 - `TestJoinInt64`: CSV 序列化辅助函数
**测试状态**: 5/5 PASS

View File

@ -470,7 +470,7 @@ defineExpose({
.creation-grid { .creation-grid {
display: flex; display: flex;
flex-wrap: wrap; flex-wrap: wrap;
justify-content: space-between; justify-content: space-around;
padding-bottom: 120rpx; padding-bottom: 120rpx;
} }

View File

@ -456,6 +456,8 @@ onUnmounted(() => {
min-height: 0; min-height: 0;
border-radius: 12px; border-radius: 12px;
overflow: hidden; overflow: hidden;
position: relative;
z-index: 2;
} }
.ranking-tabs { .ranking-tabs {
@ -636,6 +638,7 @@ onUnmounted(() => {
/* box-shadow: 2px 2px 4.5px 0px #f04b4b40; */ /* box-shadow: 2px 2px 4.5px 0px #f04b4b40; */
box-shadow: 2px 4px 4px 0px #c92f2f5c; box-shadow: 2px 4px 4px 0px #c92f2f5c;
margin-bottom: 36.8rpx; margin-bottom: 36.8rpx;
z-index: 3;
} }
/* 单行布局:藏品图片 + 头像 + 点赞信息 + TOP 标签 */ /* 单行布局:藏品图片 + 头像 + 点赞信息 + TOP 标签 */