feat:修改压测配置
This commit is contained in:
parent
0aa02cc3a4
commit
1bc86f0447
@ -1,7 +1,7 @@
|
|||||||
# TopFans Backend Makefile
|
# TopFans Backend Makefile
|
||||||
# 用于简化开发流程
|
# 用于简化开发流程
|
||||||
|
|
||||||
.PHONY: help install-swagger gen-swagger update-swagger start-swagger start-all stop-all clean build run all
|
.PHONY: help install-swagger gen-swagger update-swagger start-swagger start-all stop-all clean build run all loadgen-build loadgen-test loadgen-vet loadgen-ci
|
||||||
|
|
||||||
# 默认目标
|
# 默认目标
|
||||||
help:
|
help:
|
||||||
@ -23,6 +23,11 @@ help:
|
|||||||
@echo " make run - 运行 Gateway"
|
@echo " make run - 运行 Gateway"
|
||||||
@echo " make all - 安装依赖 + 生成文档 + 构建"
|
@echo " make all - 安装依赖 + 生成文档 + 构建"
|
||||||
@echo ""
|
@echo ""
|
||||||
|
@echo "压测工具:"
|
||||||
|
@echo " make loadgen-build - 编译 seed + loadgen 到 bin/"
|
||||||
|
@echo " make loadgen-test - 运行 loadgen 单元测试"
|
||||||
|
@echo " make loadgen-vet - go vet 静态检查"
|
||||||
|
@echo ""
|
||||||
@echo "清理:"
|
@echo "清理:"
|
||||||
@echo " make clean - 清理生成的文件"
|
@echo " make clean - 清理生成的文件"
|
||||||
@echo ""
|
@echo ""
|
||||||
@ -37,6 +42,11 @@ help:
|
|||||||
@echo " make run - 运行 Gateway"
|
@echo " make run - 运行 Gateway"
|
||||||
@echo " make all - 安装依赖 + 生成文档 + 构建"
|
@echo " make all - 安装依赖 + 生成文档 + 构建"
|
||||||
@echo ""
|
@echo ""
|
||||||
|
@echo "压测工具:"
|
||||||
|
@echo " make loadgen-build - 编译 seed + loadgen 到 bin/"
|
||||||
|
@echo " make loadgen-test - 运行 loadgen 单元测试"
|
||||||
|
@echo " make loadgen-vet - go vet 静态检查"
|
||||||
|
@echo ""
|
||||||
@echo "清理:"
|
@echo "清理:"
|
||||||
@echo " make clean - 清理生成的文件"
|
@echo " make clean - 清理生成的文件"
|
||||||
|
|
||||||
@ -92,8 +102,32 @@ clean:
|
|||||||
@rm -rf backend/gateway/docs/*.go
|
@rm -rf backend/gateway/docs/*.go
|
||||||
@rm -rf backend/gateway/docs/*.json
|
@rm -rf backend/gateway/docs/*.json
|
||||||
@rm -rf backend/gateway/docs/*.yaml
|
@rm -rf backend/gateway/docs/*.yaml
|
||||||
|
@rm -rf backend/bin/
|
||||||
@echo "✅ 清理完成"
|
@echo "✅ 清理完成"
|
||||||
|
|
||||||
|
# ==================== Loadgen / 压测工具 ====================
|
||||||
|
|
||||||
|
# 编译 seed 和 loadgen 二进制到 bin/
|
||||||
|
loadgen-build:
|
||||||
|
@echo "编译 loadgen 工具..."
|
||||||
|
@mkdir -p bin
|
||||||
|
@go build -ldflags="-s -w" -o bin/seed ./scripts/loadgen/seed/
|
||||||
|
@go build -ldflags="-s -w" -o bin/loadgen ./scripts/loadgen/loadgen/
|
||||||
|
@echo "✅ seed + loadgen → bin/"
|
||||||
|
|
||||||
|
# 运行 loadgen 单元测试 (当前 23 个测试, 应全过)
|
||||||
|
loadgen-test:
|
||||||
|
@echo "运行 loadgen 单元测试..."
|
||||||
|
@go test -count=1 ./scripts/loadgen/...
|
||||||
|
|
||||||
|
# go vet 静态检查
|
||||||
|
loadgen-vet:
|
||||||
|
@echo "go vet loadgen..."
|
||||||
|
@go vet ./scripts/loadgen/...
|
||||||
|
|
||||||
|
# loadgen 完整 CI 入口: vet + test + build
|
||||||
|
loadgen-ci: loadgen-vet loadgen-test loadgen-build
|
||||||
|
|
||||||
# 全部:安装依赖 + 生成文档 + 构建
|
# 全部:安装依赖 + 生成文档 + 构建
|
||||||
all: install-swagger gen-swagger build
|
all: install-swagger gen-swagger build
|
||||||
@echo ""
|
@echo ""
|
||||||
|
|||||||
23
backend/reports/S1.json
Normal file
23
backend/reports/S1.json
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
{
|
||||||
|
"scenario": "S1",
|
||||||
|
"total_requests": 8,
|
||||||
|
"errors": 0,
|
||||||
|
"five_xx": 0,
|
||||||
|
"p50_us": 73919,
|
||||||
|
"p95_us": 83071,
|
||||||
|
"p99_us": 83071,
|
||||||
|
"max_us": 83071,
|
||||||
|
"stages": [
|
||||||
|
{
|
||||||
|
"stage_idx": 1,
|
||||||
|
"target_rps": 1,
|
||||||
|
"total_requests": 8,
|
||||||
|
"errors": 0,
|
||||||
|
"five_xx": 0,
|
||||||
|
"p50_us": 73919,
|
||||||
|
"p95_us": 83071,
|
||||||
|
"p99_us": 83071,
|
||||||
|
"max_us": 83071
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
23
backend/reports/S2.json
Normal file
23
backend/reports/S2.json
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
{
|
||||||
|
"scenario": "S2",
|
||||||
|
"total_requests": 8,
|
||||||
|
"errors": 8,
|
||||||
|
"five_xx": 0,
|
||||||
|
"p50_us": 1552,
|
||||||
|
"p95_us": 2909,
|
||||||
|
"p99_us": 2909,
|
||||||
|
"max_us": 2909,
|
||||||
|
"stages": [
|
||||||
|
{
|
||||||
|
"stage_idx": 1,
|
||||||
|
"target_rps": 1,
|
||||||
|
"total_requests": 8,
|
||||||
|
"errors": 8,
|
||||||
|
"five_xx": 0,
|
||||||
|
"p50_us": 1552,
|
||||||
|
"p95_us": 2909,
|
||||||
|
"p99_us": 2909,
|
||||||
|
"max_us": 2909
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
45
backend/reports/S4.json
Normal file
45
backend/reports/S4.json
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
{
|
||||||
|
"scenario": "S4",
|
||||||
|
"total_requests": 18,
|
||||||
|
"errors": 18,
|
||||||
|
"five_xx": 0,
|
||||||
|
"p50_us": 1210,
|
||||||
|
"p95_us": 2161,
|
||||||
|
"p99_us": 2161,
|
||||||
|
"max_us": 2161,
|
||||||
|
"stages": [
|
||||||
|
{
|
||||||
|
"stage_idx": 1,
|
||||||
|
"target_rps": 1,
|
||||||
|
"total_requests": 3,
|
||||||
|
"errors": 3,
|
||||||
|
"five_xx": 0,
|
||||||
|
"p50_us": 4143,
|
||||||
|
"p95_us": 8943,
|
||||||
|
"p99_us": 8943,
|
||||||
|
"max_us": 8943
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"stage_idx": 2,
|
||||||
|
"target_rps": 2,
|
||||||
|
"total_requests": 6,
|
||||||
|
"errors": 6,
|
||||||
|
"five_xx": 0,
|
||||||
|
"p50_us": 1314,
|
||||||
|
"p95_us": 2044,
|
||||||
|
"p99_us": 2044,
|
||||||
|
"max_us": 2044
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"stage_idx": 3,
|
||||||
|
"target_rps": 3,
|
||||||
|
"total_requests": 9,
|
||||||
|
"errors": 9,
|
||||||
|
"five_xx": 0,
|
||||||
|
"p50_us": 1210,
|
||||||
|
"p95_us": 2161,
|
||||||
|
"p99_us": 2161,
|
||||||
|
"max_us": 2161
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
4
backend/reports/baseline.csv
Normal file
4
backend/reports/baseline.csv
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
scenario,total,errors,five_xx,p50_ms,p95_ms,p99_ms,max_ms,stages
|
||||||
|
S1,8,0,0,73.91,83.07,83.07,83.07,1
|
||||||
|
S2,8,8,0,1.55,2.90,2.90,2.90,1
|
||||||
|
S4,18,18,0,1.20,2.16,2.16,2.16,3
|
||||||
|
227
backend/reports/final-report.md
Normal file
227
backend/reports/final-report.md
Normal file
@ -0,0 +1,227 @@
|
|||||||
|
# TopFans 压测报告
|
||||||
|
|
||||||
|
## 📋 运行信息
|
||||||
|
|
||||||
|
| 项 | 值 |
|
||||||
|
|---|---|
|
||||||
|
| **生成时间** | 2026-06-15 20:05:56 CST |
|
||||||
|
| **压测开始** | 2026-06-15 20:05:47 CST |
|
||||||
|
| **压测结束** | 2026-06-15 20:05:56 CST |
|
||||||
|
| **总耗时** | 9s |
|
||||||
|
| **目标地址** | `http://localhost:8080` |
|
||||||
|
| **测试场景** | S4 |
|
||||||
|
| **阶梯模式** | step (`1,2,3`) |
|
||||||
|
| **JWT 签名密钥** | `topfans-***` (前 8 位) |
|
||||||
|
| **监控模式** | off |
|
||||||
|
| **总请求数** | 34 |
|
||||||
|
| **总错误数** | 26 (76.47%) |
|
||||||
|
| **5xx 数** | 0 (0.00%) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🎯 执行摘要
|
||||||
|
|
||||||
|
**总览**: ✅ 1 健康 / ⚠️ 0 警告 / 🚨 2 严重 (共 3)
|
||||||
|
|
||||||
|
🚨 **关键问题** (2 个):
|
||||||
|
|
||||||
|
- **S2 (浏览资产详情)**: 错误率 100.00%
|
||||||
|
- **S4 (资产铸造 (mint))**: 错误率 100.00%
|
||||||
|
|
||||||
|
**场景速览**:
|
||||||
|
|
||||||
|
- ✅ **S1 用户登录** — p99=83ms, err 0.00%
|
||||||
|
- 🚨 **S2 浏览资产详情** — p99=3ms, err 100.00%
|
||||||
|
- 🚨 **S4 资产铸造 (mint)** — p99=2ms, err 100.00%
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📊 总览表
|
||||||
|
|
||||||
|
| 场景 | 描述 | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 拐点 RPS | 状态 |
|
||||||
|
|------|------|-------|-----|-----|-------|-------|-------|-------|---------|------|
|
||||||
|
| **S1** | 用户登录 | 8 | 0 (0.00%) | 0 (0.00%) | 74 | 83 | 83 | 83 | — | ✅ |
|
||||||
|
| **S2** | 浏览资产详情 | 8 | 8 (100.00%) | 0 (0.00%) | 2 | 3 | 3 | 3 | — | 🚨 |
|
||||||
|
| **S4** | 资产铸造 (mint) | 18 | 18 (100.00%) | 0 (0.00%) | 1 | 2 | 2 | 2 | — | 🚨 |
|
||||||
|
|
||||||
|
> 说明: Err 包含 4xx + 5xx,5xx 是子集。错误率 = Err / Total。
|
||||||
|
|
||||||
|
## 🔬 跨场景瓶颈分析
|
||||||
|
|
||||||
|
✅ **延迟维度无明显瓶颈**,所有场景 P99 都在阈值内。⚠️ 注意: S2 / S4 错误率为 100%,其延迟只反映请求被快速拒绝的耗时,不代表真实处理性能。
|
||||||
|
|
||||||
|
**P99 / 阈值 比率** (从高到低):
|
||||||
|
|
||||||
|
- S1: 0.08x (83ms)
|
||||||
|
- S2: 0.01x (3ms)
|
||||||
|
- S4: 0.00x (2ms)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ✅ S1 用户登录
|
||||||
|
|
||||||
|
### 📌 测试说明
|
||||||
|
|
||||||
|
| 项 | 值 |
|
||||||
|
|---|---|
|
||||||
|
| **API** | `POST /api/v1/auth/login` |
|
||||||
|
| **负载类型** | ✏️ 轻写 |
|
||||||
|
| **业务说明** | 用户身份认证,签发 JWT |
|
||||||
|
| **影响范围** | 🔴 所有用户必经路径,失败 = 用户进不来 |
|
||||||
|
|
||||||
|
### 📈 性能指标 vs 健康阈值
|
||||||
|
|
||||||
|
| 指标 | 实测 | 阈值 | 判定 |
|
||||||
|
|------|------|------|------|
|
||||||
|
| P50ms | 74 | ≤100 | ✅ |
|
||||||
|
| P95ms | 83 | ≤300 | ✅ |
|
||||||
|
| P99ms | 83 | ≤1000 | ✅ |
|
||||||
|
| Maxms | 83 | — | ℹ️ 参考 |
|
||||||
|
| 错误率 | 0.00% | ≤1.00% | ✅ |
|
||||||
|
| 5xx 率 | 0.00% | ≤0.10% | ✅ |
|
||||||
|
|
||||||
|
### 📍 拐点分析
|
||||||
|
|
||||||
|
ℹ️ 仅 1 个 stage,未做阶梯测试,无法判断拐点。
|
||||||
|
|
||||||
|
### 🔢 阶梯结果
|
||||||
|
|
||||||
|
| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 |
|
||||||
|
|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------|
|
||||||
|
| 1 | 1 | 8 | 0 | 0 | 74 | 83 | 83 | 83 | |
|
||||||
|
|
||||||
|
### 🎯 行动项
|
||||||
|
|
||||||
|
✅ 无需行动项 — 所有指标在阈值内。
|
||||||
|
|
||||||
|
### 📉 图表
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🚨 S2 浏览资产详情
|
||||||
|
|
||||||
|
### 📌 测试说明
|
||||||
|
|
||||||
|
| 项 | 值 |
|
||||||
|
|---|---|
|
||||||
|
| **API** | `GET /api/v1/assets/{id}` |
|
||||||
|
| **负载类型** | 📖 读 |
|
||||||
|
| **业务说明** | 高频读路径,典型缓存命中场景 |
|
||||||
|
| **影响范围** | 🟢 单用户最高频操作,影响页面加载体验 |
|
||||||
|
|
||||||
|
### 📈 性能指标 vs 健康阈值
|
||||||
|
|
||||||
|
| 指标 | 实测 | 阈值 | 判定 |
|
||||||
|
|------|------|------|------|
|
||||||
|
| P50ms | 2 | ≤50 | ✅ |
|
||||||
|
| P95ms | 3 | ≤150 | ✅ |
|
||||||
|
| P99ms | 3 | ≤500 | ✅ |
|
||||||
|
| Maxms | 3 | — | ℹ️ 参考 |
|
||||||
|
| 错误率 | 100.00% | ≤1.00% | 🚨 |
|
||||||
|
| 5xx 率 | 0.00% | ≤0.10% | ✅ |
|
||||||
|
|
||||||
|
### 📍 拐点分析
|
||||||
|
|
||||||
|
ℹ️ 仅 1 个 stage,未做阶梯测试,无法判断拐点。
|
||||||
|
|
||||||
|
### 🔢 阶梯结果
|
||||||
|
|
||||||
|
| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 |
|
||||||
|
|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------|
|
||||||
|
| 1 | 1 | 8 | 8 | 0 | 2 | 3 | 3 | 3 | |
|
||||||
|
|
||||||
|
### 🎯 行动项
|
||||||
|
|
||||||
|
- [ ] **🟡 P1**: 错误率 100.00% — 检查 4xx 错误码,看是否 JWT 过期 / 数据缺失
|
||||||
|
|
||||||
|
### 📉 图表
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🚨 S4 资产铸造 (mint)
|
||||||
|
|
||||||
|
### 📌 测试说明
|
||||||
|
|
||||||
|
| 项 | 值 |
|
||||||
|
|---|---|
|
||||||
|
| **API** | `POST /api/v1/assets/mints/precreate` |
|
||||||
|
| **负载类型** | 🛠️ 重写 |
|
||||||
|
| **业务说明** | 写重路径:OSS 上传 + 签名 + 事务落库 |
|
||||||
|
| **影响范围** | 🟡 核心交易,影响创作者产出节奏 |
|
||||||
|
|
||||||
|
### 📈 性能指标 vs 健康阈值
|
||||||
|
|
||||||
|
| 指标 | 实测 | 阈值 | 判定 |
|
||||||
|
|------|------|------|------|
|
||||||
|
| P50ms | 1 | ≤300 | ✅ |
|
||||||
|
| P95ms | 2 | ≤800 | ✅ |
|
||||||
|
| P99ms | 2 | ≤2000 | ✅ |
|
||||||
|
| Maxms | 2 | — | ℹ️ 参考 |
|
||||||
|
| 错误率 | 100.00% | ≤1.00% | 🚨 |
|
||||||
|
| 5xx 率 | 0.00% | ≤0.10% | ✅ |
|
||||||
|
|
||||||
|
### 📍 拐点分析
|
||||||
|
|
||||||
|
ℹ️ **拐点未触发** — 全程 3 个 stage 的 p99 均未较前一 stage 上涨 > 50%,最高 3 RPS p99=2ms;但全程错误率 100%,该拐点结论不具参考价值。
|
||||||
|
|
||||||
|
### 🔢 阶梯结果
|
||||||
|
|
||||||
|
| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 |
|
||||||
|
|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------|
|
||||||
|
| 1 | 1 | 3 | 3 | 0 | 4 | 9 | 9 | 9 | |
|
||||||
|
| 2 | 2 | 6 | 6 | 0 | 1 | 2 | 2 | 2 | -77% |
|
||||||
|
| 3 | 3 | 9 | 9 | 0 | 1 | 2 | 2 | 2 | +6% |
|
||||||
|
|
||||||
|
### 🎯 行动项
|
||||||
|
|
||||||
|
- [ ] **🟡 P1**: 错误率 100.00% — 检查 4xx 错误码,看是否 JWT 过期 / 数据缺失
|
||||||
|
|
||||||
|
### 📉 图表
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📎 附录
|
||||||
|
|
||||||
|
### 健康阈值说明
|
||||||
|
|
||||||
|
- **P50/P95/P99**: 百分位延迟 (毫秒),值越小越好
|
||||||
|
- **错误率**: 4xx+5xx 请求占比,健康 < 1%
|
||||||
|
- **5xx 率**: 服务端错误率,健康 < 0.1%
|
||||||
|
- **拐点**: 阶梯测试中,p99 相对前一 stage 涨幅 > 50% 的第一个 stage
|
||||||
|
|
||||||
|
### 文件清单
|
||||||
|
|
||||||
|
```
|
||||||
|
reports/
|
||||||
|
├── final-report.md (本文件)
|
||||||
|
├── baseline.csv (Excel 可打开的汇总)
|
||||||
|
├── s1.json
|
||||||
|
├── s1.png
|
||||||
|
├── s2.json
|
||||||
|
├── s2.png
|
||||||
|
├── s3.json
|
||||||
|
├── s3.png
|
||||||
|
├── s4.json
|
||||||
|
├── s4.png
|
||||||
|
├── s5.json
|
||||||
|
├── s5.png
|
||||||
|
├── s6.json
|
||||||
|
├── s6.png
|
||||||
|
├── s7.json
|
||||||
|
├── s7.png
|
||||||
|
```
|
||||||
|
|
||||||
|
### 如何复现
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /opt/topfans/loadtest
|
||||||
|
./loadgen --cmd=run --scenarios=S4 --stage=step --step-schedule='1,2,3' \
|
||||||
|
--target=http://localhost:8080 \
|
||||||
|
--monitor=off
|
||||||
|
```
|
||||||
12
backend/reports/run-metadata.json
Normal file
12
backend/reports/run-metadata.json
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
{
|
||||||
|
"start_time": "2026-06-15T20:05:47.357522+08:00",
|
||||||
|
"end_time": "2026-06-15T20:05:56.380495+08:00",
|
||||||
|
"target": "http://localhost:8080",
|
||||||
|
"scenarios": [
|
||||||
|
"S4"
|
||||||
|
],
|
||||||
|
"step_schedule": "1,2,3",
|
||||||
|
"jwt_secret_hint": "topfans-",
|
||||||
|
"monitor_mode": "off",
|
||||||
|
"stage_mode": "step"
|
||||||
|
}
|
||||||
BIN
backend/reports/s1.png
Normal file
BIN
backend/reports/s1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 13 KiB |
BIN
backend/reports/s2.png
Normal file
BIN
backend/reports/s2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 13 KiB |
BIN
backend/reports/s4.png
Normal file
BIN
backend/reports/s4.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 15 KiB |
@ -1,69 +1,129 @@
|
|||||||
# 后端服务压测工具
|
# 后端服务压测工具 (loadgen)
|
||||||
|
|
||||||
为部署在阿里云单机(4G/2C)的 TopFans 后端微服务设计。
|
> 给阿里云单机 (4G/2C) TopFans 后端微服务用的压测 + 数据准备工具集。
|
||||||
|
> 凌晨 02:00-06:00 业务低峰执行,数据物理隔离 `star_id=999900`。
|
||||||
|
|
||||||
## 目录
|
---
|
||||||
|
|
||||||
|
## 📚 文档地图
|
||||||
|
|
||||||
|
| 文档 | 用途 | 谁要看 |
|
||||||
|
|------|------|--------|
|
||||||
|
| **README.md** (本文) | 工具集概览 + 5 分钟入门 | 所有人 |
|
||||||
|
| [RUNBOOK.md](RUNBOOK.md) | 凌晨压测**一步一步**操作手册 | on-call 工程师 |
|
||||||
|
| [REPORT_GUIDE.md](REPORT_GUIDE.md) | 压测报告**怎么读** + 瓶颈定位 + 行动项模板 | 看报告的工程师 / TL |
|
||||||
|
| [seed/README.md](seed/README.md) | seed 工具细节 (数据准备) | 第一次跑压测的人 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🧰 工具集概览
|
||||||
|
|
||||||
```
|
```
|
||||||
backend/scripts/loadgen/
|
loadgen/
|
||||||
├── seed/ # 数据准备工具(CLI)
|
├── seed/ # 数据准备 CLI (生成 1000 个测试用户 + 资产 + JWT)
|
||||||
│ ├── main.go # seed CLI 入口
|
├── loadgen/ # 压测主程序 (7 个场景,6 维熔断,带 reporter)
|
||||||
│ ├── stars.go users.go profiles.go assets.go
|
├── monitor/ # 监控栈 (Prometheus + Grafana,可选)
|
||||||
│ ├── slots_and_exhibits.go friendships.go
|
├── recover/ # 紧急灭火 (一键停 + 数据库恢复)
|
||||||
│ ├── tokens.go sequences.go cleanup.go
|
├── scripts/ # 部署到 prod 的辅助脚本
|
||||||
│ ├── seed_test.go # 单元测试
|
└── reports/ # 跑测产出 (gitignore,scp 拉回本地)
|
||||||
│ └── README.md
|
|
||||||
├── loadgen/ # 压测主程序
|
|
||||||
│ ├── main.go # loadgen CLI 入口
|
|
||||||
│ ├── preflight.go verify.go # 7 项开压前检查 + 压后验证
|
|
||||||
│ ├── lib/ # 核心库(16 个测试全过)
|
|
||||||
│ │ ├── csv.go client.go hdr.go log.go ramp.go
|
|
||||||
│ │ ├── circuit.go ssh_metrics.go config.go
|
|
||||||
│ │ └── *_test.go
|
|
||||||
│ ├── scenarios/ # 7 个场景(已注册)
|
|
||||||
│ │ ├── s1_login.go s2_read.go s3_like.go s4_mint.go
|
|
||||||
│ │ ├── s5_dashboard.go s6_ranking.go s7_place.go
|
|
||||||
│ │ ├── common.go scenarios.go
|
|
||||||
│ │ └── scenarios_test.go
|
|
||||||
│ └── reporter/ # 报告生成
|
|
||||||
│ ├── json.go csv.go plot.go markdown.go
|
|
||||||
├── monitor/ # 监控栈
|
|
||||||
│ ├── sample.sh # 后台采样(写到 metrics-feed.jsonl)
|
|
||||||
│ ├── docker-compose.monitor.yml
|
|
||||||
│ ├── prometheus.yml
|
|
||||||
│ └── grafana-dashboards/ # 4 个预置面板
|
|
||||||
├── recover/ # 一键灭火 + 备份还原
|
|
||||||
│ ├── emergency-stop.sh
|
|
||||||
│ └── restore-from-backup.sh
|
|
||||||
├── scripts/ # 部署到 prod
|
|
||||||
│ └── mint_reset.sh
|
|
||||||
└── reports/ # 跑测产出(gitignore)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## 编译
|
### 核心 CLI: `bin/seed` + `bin/loadgen`
|
||||||
|
|
||||||
|
| 命令 | 作用 |
|
||||||
|
|------|------|
|
||||||
|
| `./bin/seed` | 灌测试数据 → `users.csv` + 数据库 |
|
||||||
|
| `./bin/seed --cleanup` | 清理测试数据 (保留 1000 用户) |
|
||||||
|
| `./bin/seed --cleanup --full` | 全部删掉 (账号本身) |
|
||||||
|
| `./bin/seed --reset-tokens` | 只重签 JWT (跨周压测用) |
|
||||||
|
| `./bin/loadgen --cmd=preflight` | 7 项开压前检查 |
|
||||||
|
| `./bin/loadgen --cmd=run --scenarios=S1` | 跑场景 |
|
||||||
|
| `./bin/loadgen --cmd=report` | 生成 markdown 报告 + PNG 图表 |
|
||||||
|
|
||||||
|
### 7 个场景
|
||||||
|
|
||||||
|
| ID | 场景 | 默认 RPS | 写/读 | 关键 API |
|
||||||
|
|----|------|---------|------|---------|
|
||||||
|
| S1 | Login | 15 | 写(轻) | `POST /api/v1/auth/login` |
|
||||||
|
| S2 | Read | 250 | 读 | `GET /api/v1/assets/{id}` |
|
||||||
|
| S3 | Like | 50 | 写(轻) | `POST/DELETE /api/v1/social/assets/{id}/like` |
|
||||||
|
| S4 | Mint | 1-5 | **写(重)** | `POST /api/v1/assets/mints/precreate` |
|
||||||
|
| S5 | Dashboard | — | 读聚合 | (dashboard 聚合) |
|
||||||
|
| S6 | Ranking | 300 | 读 | `GET /api/v1/rankings/hot` |
|
||||||
|
| S7 | Place | 1-5 | **写(重)** | (摆展事务) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🚀 5 分钟入门 (本地 docker)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. 编译 (本机原生构建,仅供本地运行;部署到 Linux prod 需按下文「手动编译 (Linux prod)」交叉编译)
|
||||||
|
cd backend
|
||||||
|
make loadgen-build
|
||||||
|
|
||||||
|
# 2. 准备数据 (需要本地 docker postgres)
|
||||||
|
cd scripts/loadgen/seed
|
||||||
|
# 生成 bcrypt 哈希 (与 tokens.go 硬编码的 "Test@123" 匹配)
|
||||||
|
python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \
|
||||||
|
> loadtest_bcrypt.txt
|
||||||
|
# 跑 seed (用本地 docker 的 env)
|
||||||
|
DB_PASSWORD=123456 \
|
||||||
|
JWT_SECRET=topfans-secret-key-local-dev-only \
|
||||||
|
../../../bin/seed \
|
||||||
|
--db-name=top-fans --db-host=localhost --db-port=15432 --db-user=postgres
|
||||||
|
|
||||||
|
# 3. 复制 users.csv 到 backend 目录
|
||||||
|
cp users.csv ../../../users.csv
|
||||||
|
|
||||||
|
# 4. 开压前检查
|
||||||
|
cd ../../../ # = backend
|
||||||
|
JWT_SECRET=topfans-secret-key-local-dev-only \
|
||||||
|
./bin/loadgen --cmd=preflight --target=http://localhost:8080
|
||||||
|
|
||||||
|
# 5. 烟雾测试 (30 秒,1 RPS)
|
||||||
|
JWT_SECRET=topfans-secret-key-local-dev-only \
|
||||||
|
./bin/loadgen --cmd=run --scenarios=S1 --stage=baseline --rps=1 --duration=30s \
|
||||||
|
--target=http://localhost:8080 --monitor=off
|
||||||
|
|
||||||
|
# 6. 生成报告
|
||||||
|
JWT_SECRET=topfans-secret-key-local-dev-only \
|
||||||
|
./bin/loadgen --cmd=report --input=./reports --output=./reports/final-report.md
|
||||||
|
open reports/final-report.md # macOS
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🔨 编译
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd backend
|
cd backend
|
||||||
|
make loadgen-build # 编译 seed + loadgen 到 bin/
|
||||||
|
make loadgen-test # 单元测试 (23 个)
|
||||||
|
make loadgen-vet # go vet
|
||||||
|
make loadgen-ci # vet + test + build (CI 单步)
|
||||||
|
```
|
||||||
|
|
||||||
|
手动编译 (Linux prod):
|
||||||
|
```bash
|
||||||
GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o bin/seed ./scripts/loadgen/seed/
|
GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o bin/seed ./scripts/loadgen/seed/
|
||||||
GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o bin/loadgen ./scripts/loadgen/loadgen/
|
GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o bin/loadgen ./scripts/loadgen/loadgen/
|
||||||
```
|
```
|
||||||
|
|
||||||
## 测试
|
---
|
||||||
|
|
||||||
```bash
|
## 🛡️ 安全设计
|
||||||
cd backend
|
|
||||||
go test ./scripts/loadgen/...
|
|
||||||
```
|
|
||||||
|
|
||||||
**当前测试状态** (截至 Phase 7 完结):
|
### 数据隔离
|
||||||
- `seed` 包: 5/5 PASS
|
所有测试数据用 `star_id = 999900` 物理隔离,**不影响**真实业务 star_id (87, 88, 91, 93, 94, 95)。
|
||||||
- `loadgen/lib` 包: 16/16 PASS
|
|
||||||
- `loadgen/scenarios` 包: 2/2 PASS
|
|
||||||
- 共 23 个测试全过
|
|
||||||
|
|
||||||
## 关键特性
|
### CLAUDE.md 序列重置
|
||||||
|
seed 工具末尾自动同步所有相关表的 PG 序列(避免后续 GORM 插入报 duplicate key)。
|
||||||
|
|
||||||
### 1. 6 维红线判停(自动熔断)
|
### 凌晨窗口
|
||||||
|
执行窗口:**02:00 - 06:00** 业务低峰。
|
||||||
|
紧急灭火: `recover/emergency-stop.sh` 一键停 + `restore-from-backup.sh` 5-8min 还原。
|
||||||
|
|
||||||
|
### 6 维红线熔断 (自动停)
|
||||||
|
|
||||||
| # | 红线 | 阈值 | 数据源 |
|
| # | 红线 | 阈值 | 数据源 |
|
||||||
|---|------|------|--------|
|
|---|------|------|--------|
|
||||||
@ -74,20 +134,108 @@ go test ./scripts/loadgen/...
|
|||||||
| R5 | 磁盘空闲 | < 5GB 持续 30s | metrics-feed |
|
| R5 | 磁盘空闲 | < 5GB 持续 30s | metrics-feed |
|
||||||
| R6 | OOM 事件 | 瞬时触发 | metrics-feed |
|
| R6 | OOM 事件 | 瞬时触发 | metrics-feed |
|
||||||
|
|
||||||
### 2. CLAUDE.md 序列重置
|
---
|
||||||
|
|
||||||
seed 工具自动同步所有相关表的 PG 序列(避免后续 GORM 插入报 duplicate key)。
|
## 📊 报告产出
|
||||||
|
|
||||||
### 3. 数据隔离
|
跑完 + `--cmd=report` 后,`reports/` 下:
|
||||||
|
|
||||||
所有测试数据用 `star_id = 999900` 物理隔离,不影响真实业务 star_id (87, 88, 91, 93, 94, 95)。
|
```
|
||||||
|
reports/
|
||||||
|
├── S1.json # 原始数据 (含 stages)
|
||||||
|
├── S2.json
|
||||||
|
├── S4.json
|
||||||
|
├── baseline.csv # Excel 友好的汇总
|
||||||
|
├── s1.png # RPS / P99 / Error 曲线
|
||||||
|
├── s2.png
|
||||||
|
├── s4.png
|
||||||
|
└── final-report.md # ← 主要看这个
|
||||||
|
```
|
||||||
|
|
||||||
### 4. 凌晨窗口
|
`final-report.md` 包含:
|
||||||
|
1. **总览表** (所有场景一行一个,含请求数 / 错误 / 5xx / P50-P99-Max 延迟 / 拐点 RPS / 状态)
|
||||||
|
2. **每个场景的 ⚠️ 拐点 RPS** (自动算:第一个 p99 涨 >50% 的 stage)
|
||||||
|
3. **阶梯结果表** (每 stage 的 RPS / p50 / p95 / p99 / err / 5xx)
|
||||||
|
4. **PNG 曲线图** (RPS / P99 / Error 三条线)
|
||||||
|
|
||||||
执行窗口:凌晨 02:00-06:00 业务低峰。emergency-stop 一键回滚,restore-from-backup.sh 5-8min 还原。
|
详细读法见 [REPORT_GUIDE.md](REPORT_GUIDE.md)。
|
||||||
|
|
||||||
## 详细文档
|
---
|
||||||
|
|
||||||
|
## 🧪 测试状态
|
||||||
|
|
||||||
|
```
|
||||||
|
seed: 5/5 PASS
|
||||||
|
loadgen/lib: 16/16 PASS
|
||||||
|
scenarios: 2/2 PASS
|
||||||
|
TOTAL: 23/23 PASS
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📁 完整目录
|
||||||
|
|
||||||
|
```
|
||||||
|
backend/scripts/loadgen/
|
||||||
|
├── README.md # ← 你在这里
|
||||||
|
├── RUNBOOK.md # ← 凌晨压测操作手册
|
||||||
|
├── REPORT_GUIDE.md # ← 报告怎么读
|
||||||
|
├── seed/ # 数据准备工具
|
||||||
|
│ ├── main.go # CLI 入口
|
||||||
|
│ ├── stars.go users.go profiles.go assets.go
|
||||||
|
│ ├── slots_and_exhibits.go friendships.go
|
||||||
|
│ ├── tokens.go sequences.go cleanup.go
|
||||||
|
│ ├── seed_test.go # 单元测试
|
||||||
|
│ ├── loadtest_bcrypt.txt # Test@123 哈希 (与 tokens.go 匹配)
|
||||||
|
│ └── README.md
|
||||||
|
├── loadgen/ # 压测主程序
|
||||||
|
│ ├── main.go # CLI 入口
|
||||||
|
│ ├── preflight.go verify.go # 7 项开压前检查 + 压后验证
|
||||||
|
│ ├── lib/ # 核心库
|
||||||
|
│ │ ├── csv.go # users.csv 解析
|
||||||
|
│ │ ├── client.go # HTTP client
|
||||||
|
│ │ ├── hdr.go # 延迟直方图 + per-stage 计数
|
||||||
|
│ │ ├── log.go ramp.go # 日志 + 阶梯调度
|
||||||
|
│ │ ├── circuit.go # 6 维熔断
|
||||||
|
│ │ ├── ssh_metrics.go # prod server metrics 抓取
|
||||||
|
│ │ ├── config.go
|
||||||
|
│ │ └── *_test.go # 16 个测试
|
||||||
|
│ ├── scenarios/ # 7 个场景
|
||||||
|
│ │ ├── s1_login.go
|
||||||
|
│ │ ├── s2_read.go
|
||||||
|
│ │ ├── s3_like.go
|
||||||
|
│ │ ├── s4_mint.go # 支持多 stage
|
||||||
|
│ │ ├── s5_dashboard.go
|
||||||
|
│ │ ├── s6_ranking.go
|
||||||
|
│ │ ├── s7_place.go
|
||||||
|
│ │ ├── common.go # doRequest + DefaultBaseURL
|
||||||
|
│ │ ├── scenarios.go # 注册表
|
||||||
|
│ │ ├── helpers.go
|
||||||
|
│ │ └── scenarios_test.go
|
||||||
|
│ └── reporter/ # 报告生成
|
||||||
|
│ ├── json.go # RunReport + StageReport
|
||||||
|
│ ├── csv.go # baseline.csv
|
||||||
|
│ ├── plot.go # PNG 曲线 (gonum)
|
||||||
|
│ ├── markdown.go # final-report.md
|
||||||
|
│ └── knee.go # KneeRPS 自动算
|
||||||
|
├── monitor/ # 监控栈 (可选)
|
||||||
|
│ ├── sample.sh # 后台采样到 metrics-feed.jsonl
|
||||||
|
│ ├── docker-compose.monitor.yml
|
||||||
|
│ ├── prometheus.yml
|
||||||
|
│ └── grafana-dashboards/ # 4 个预置面板
|
||||||
|
├── recover/ # 紧急灭火
|
||||||
|
│ ├── emergency-stop.sh
|
||||||
|
│ └── restore-from-backup.sh
|
||||||
|
├── scripts/ # prod 辅助
|
||||||
|
│ ├── mint_reset.sh # S4 之间的 mint 数据清理
|
||||||
|
│ └── prod_seed.sh # 一键跑 seed (读 prod env)
|
||||||
|
└── reports/ # 跑测产出 (gitignore)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 详细设计
|
||||||
|
|
||||||
- **设计文档**: `docs/superpowers/specs/2026-06-12-load-testing-design.md`
|
- **设计文档**: `docs/superpowers/specs/2026-06-12-load-testing-design.md`
|
||||||
- **实施计划**: `docs/superpowers/plans/2026-06-12-load-testing.md`
|
- **实施计划**: `docs/superpowers/plans/2026-06-12-load-testing.md`
|
||||||
- **seed 工具说明**: `seed/README.md`
|
- **seed 工具说明**: [seed/README.md](seed/README.md)
|
||||||
|
|||||||
266
backend/scripts/loadgen/REPORT_GUIDE.md
Normal file
266
backend/scripts/loadgen/REPORT_GUIDE.md
Normal file
@ -0,0 +1,266 @@
|
|||||||
|
# REPORT_GUIDE — 压测报告怎么读
|
||||||
|
|
||||||
|
> **目标读者**:看完压测报告后,需要判断"系统能扛住吗"+"哪里是瓶颈"+"下一步改什么"的工程师
|
||||||
|
> **报告路径**:`reports/final-report.md` (主) + `reports/{scenario}.json` (原始) + `reports/{scenario}.png` (图)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 报告目录结构
|
||||||
|
|
||||||
|
```
|
||||||
|
reports/
|
||||||
|
├── S1.json # 场景 1 原始数据 (程序读)
|
||||||
|
├── S2.json # 场景 2
|
||||||
|
├── S4.json # 场景 4
|
||||||
|
├── baseline.csv # Excel 可打开的汇总表
|
||||||
|
├── s1.png # 场景 1 曲线图 (RPS / P99 / Error)
|
||||||
|
├── s2.png
|
||||||
|
├── s4.png
|
||||||
|
└── final-report.md # ← 你要看的总报告
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 三步读完报告
|
||||||
|
|
||||||
|
### 第 1 步:看汇总表 (1 分钟)
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
| Scenario | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | Stages |
|
||||||
|
|----------|-------|-----|-----|-------|-------|-------|-------|--------|
|
||||||
|
| S1 | 12500 | 0 | 0 | 86.59 | 119.23 | 200.50 | 450 | 5 |
|
||||||
|
| S2 | 25000 | 5 | 0 | 12.30 | 35.00 | 88.00 | 250 | 5 |
|
||||||
|
| S4 | 600 | 12 | 2 | 200.00 | 500.00 | 850.00 | 1200 | 4 |
|
||||||
|
```
|
||||||
|
|
||||||
|
**每个字段的含义**:
|
||||||
|
|
||||||
|
| 字段 | 含义 | 健康参考 (4G/2C prod) |
|
||||||
|
|------|------|----------------------|
|
||||||
|
| `Scenario` | 场景 ID (S1=登录, S2=读, S3=点赞, S4=铸造, ...) | — |
|
||||||
|
| `Total` | 该场景总请求数 | 越大越好,代表你扛住了 |
|
||||||
|
| `Err` | 客户端+服务端错误总和 | **< 1%** |
|
||||||
|
| `5xx` | 服务端错误 (500-599) | **< 0.1%** (1‰) |
|
||||||
|
| `P50ms` | 50% 请求在这个时间内 | < 100ms |
|
||||||
|
| `P95ms` | 95% 请求在这个时间内 | < 300ms |
|
||||||
|
| `P99ms` | 99% 请求在这个时间内 | < 1000ms (S4 写重可放宽到 2000ms) |
|
||||||
|
| `Maxms` | 最慢的一次请求 | 一般 3-5x P99 |
|
||||||
|
| `Stages` | 阶梯测试的阶段数 | = step-schedule 的元素数 |
|
||||||
|
|
||||||
|
**判断模板**:
|
||||||
|
- ✅ 全绿 → 系统扛得住,准备上线
|
||||||
|
- ⚠️ 某个 S* Err > 1% → 优先看那个场景
|
||||||
|
- 🚨 某个 S* 5xx > 0.1% → 服务端有问题,看 §3 定位
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 第 2 步:看拐点 (KneeRPS) (2 分钟)
|
||||||
|
|
||||||
|
每个 scenario 标题下会出现一行:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
**⚠️ 拐点**: stage 3 @ 3 RPS (p99 暴涨 514%)
|
||||||
|
```
|
||||||
|
|
||||||
|
**含义**: 当 RPS 升到 3 时,p99 延迟比 stage 2 上涨了 514% (即变为 stage 2 的约 6.14 倍)。
|
||||||
|
|
||||||
|
**判定逻辑** (在 `reporter/knee.go`):
|
||||||
|
- 逐 stage 比 p99
|
||||||
|
- 第一次涨幅 > 50% 时,标记为拐点
|
||||||
|
- 全程没涨 > 50% → 显示 "✅ 拐点未触发"
|
||||||
|
|
||||||
|
**怎么用这个数字**:
|
||||||
|
- **S1 拐点 RPS = 15** → 你的登录服务,超过 15 QPS 就开始劣化。生产预估峰值 10 QPS,留 50% buffer
|
||||||
|
- **S4 拐点 RPS = 2** → 铸造接口很重,2 QPS 就劣化了。要么优化,要么限流
|
||||||
|
|
||||||
|
**举例**:
|
||||||
|
| 拐点 RPS | 业务含义 | 行动项 |
|
||||||
|
|---------|---------|--------|
|
||||||
|
| ≥ 期望峰值的 2x | ✅ 健康 | 上线,加监控 |
|
||||||
|
| ≈ 期望峰值 | ⚠️ 临界 | 加缓存 / 异步化 / 限流 |
|
||||||
|
| < 期望峰值 | 🚨 不达标 | 重构 + 复测 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 第 3 步:看阶梯表 + 曲线图 (5 分钟)
|
||||||
|
|
||||||
|
**阶梯表** (md 里每个场景下):
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
### 阶梯结果
|
||||||
|
| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms |
|
||||||
|
|-------|-----------|-------|-----|-----|-------|-------|-------|-------|
|
||||||
|
| 1 | 2 | 600 | 0 | 0 | 80 | 100 | 110 | 130 |
|
||||||
|
| 2 | 5 | 1500 | 0 | 0 | 82 | 105 | 115 | 140 |
|
||||||
|
| 3 | 10 | 3000 | 0 | 0 | 85 | 110 | 130 | 180 |
|
||||||
|
| 4 | 15 | 4500 | 0 | 0 | 95 | 130 | 200 | 350 |
|
||||||
|
| 5 | 20 | 6000 | 5 | 0 | 120 | 200 | 450 | 800 |
|
||||||
|
```
|
||||||
|
|
||||||
|
**怎么读**:
|
||||||
|
|
||||||
|
- **Total** 应该是 `TargetRPS × Duration` (近似,因为有误差)
|
||||||
|
- **P99ms** 应该随 TargetRPS 上升**平滑增加** (10-30% 涨幅/stage 是正常)
|
||||||
|
- **Err** 应该全程 < 1%,**5xx** 应该全程 < 0.1%
|
||||||
|
- **如果某 stage 突然 P99 翻倍** → 拐点,看上面 KneeRPS
|
||||||
|
|
||||||
|
**曲线图** (`s1.png` 等):
|
||||||
|
|
||||||
|
- **X 轴**: Stage 编号 (1, 2, 3, ...)
|
||||||
|
- **Y 轴**: 三个值 — RPS (蓝)、P99ms (绿)、Error% (红)
|
||||||
|
- **怎么看**:
|
||||||
|
- 三条线**平稳上升** = 正常
|
||||||
|
- **P99 突然陡升** = 拐点
|
||||||
|
- **Error% 突然跳起来** = 服务挂了
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 定位瓶颈 — 常见模式
|
||||||
|
|
||||||
|
### 模式 1: P99 阶梯上升,但 Error 一直 0
|
||||||
|
**含义**: 系统扛得住,但在变慢。
|
||||||
|
**原因**: GC 抖动 / DB 慢查询 / 锁竞争。
|
||||||
|
**行动**:
|
||||||
|
1. 看 PG 慢查询日志: `pg_stat_statements` ORDER BY `mean_exec_time` DESC
|
||||||
|
2. 看应用层 profile: `pprof` heap + cpu
|
||||||
|
3. 检查连接池配置: 可能太小
|
||||||
|
|
||||||
|
### 模式 2: P99 阶梯上升 + Error 也开始涨
|
||||||
|
**含义**: 系统到极限。
|
||||||
|
**原因**: 资源耗尽 (CPU 100%, 连接池满, DB 锁)。
|
||||||
|
**行动**:
|
||||||
|
1. 看 server metrics feed: `tail -f metrics-feed.jsonl`
|
||||||
|
2. `top` 看 CPU/内存,`iostat` 看 IO
|
||||||
|
3. 检查是否有连接泄漏 (`netstat | grep TIME_WAIT`)
|
||||||
|
|
||||||
|
### 模式 3: 阶梯早期就 5xx > 5%
|
||||||
|
**含义**: 系统本身有问题,不是负载问题。
|
||||||
|
**原因**: 代码 bug / 配置错误 / 依赖缺失。
|
||||||
|
**行动**:
|
||||||
|
1. 看 5xx 的具体响应体 (在 log 里)
|
||||||
|
2. 检查 error 码,对照业务错误码定义
|
||||||
|
3. 看是不是 auth/JWT 过期
|
||||||
|
|
||||||
|
### 模式 4: 第一个 stage P99 很高,后续反而低
|
||||||
|
**含义**: 热身不够 / 缓存没预热。
|
||||||
|
**原因**: Redis 冷启动 / JIT 编译 / DB 连接池启动慢。
|
||||||
|
**行动**:
|
||||||
|
1. 第一次 stage 加长 (例如先 2min 预热)
|
||||||
|
2. 或者用 `--rps=1` 先跑 1-2min 预热,再开阶梯
|
||||||
|
|
||||||
|
### 模式 5: S4 (Mint) 在很低的 RPS 就拐
|
||||||
|
**含义**: 写路径太重。
|
||||||
|
**原因**: 铸造涉及事务 / 签名 / OSS 上传,本身就是慢操作。
|
||||||
|
**行动**:
|
||||||
|
1. 检查 mint 是不是同步阻塞 (能不能异步化?)
|
||||||
|
2. 看 mint 数据是否需要落库 (能否用 append-only?)
|
||||||
|
3. 考虑限流: 服务端拒绝 > 2 QPS 的 mint 请求
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. 怎么写出行动项
|
||||||
|
|
||||||
|
读完报告,应该能回答三个问题:
|
||||||
|
|
||||||
|
### Q1: 系统能扛住业务预期峰值吗?
|
||||||
|
- 业务预期峰值 → 比对拐点 RPS
|
||||||
|
- 拐点 ≥ 2x 峰值 → ✅ 可以上线
|
||||||
|
- 拐点 ≈ 1x 峰值 → ⚠️ 加监控告警,谨慎上线
|
||||||
|
- 拐点 < 峰值 → 🚨 必须先优化
|
||||||
|
|
||||||
|
### Q2: 拐点在哪里?为什么?
|
||||||
|
看哪个 stage 拐的,然后:
|
||||||
|
- **CPU 100%** → 计算密集,优化算法或加机器
|
||||||
|
- **DB CPU 100%** → 慢查询,加索引或读写分离
|
||||||
|
- **PG 连接数满** → 连接池配置 / 服务降级
|
||||||
|
- **PG 锁等待** → 事务设计问题
|
||||||
|
- **磁盘 IO 满** → 加 SSD 或缓存
|
||||||
|
|
||||||
|
### Q3: 下一步改什么?
|
||||||
|
|
||||||
|
行动项模板:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
## [Loadtest 2026-06-15] 行动项
|
||||||
|
|
||||||
|
### P0 (上线前必修)
|
||||||
|
- [ ] **S2 Read 拐点 100 RPS < 业务预期 150 RPS**
|
||||||
|
- 根因: PG `assets` 表全表扫描,10 万行
|
||||||
|
- 修复: 加 `idx_assets_star_id_status` 索引
|
||||||
|
- Owner: @dba
|
||||||
|
|
||||||
|
### P1 (1 周内修)
|
||||||
|
- [ ] **S4 Mint 拐点 2 RPS**
|
||||||
|
- 根因: 同步写 OSS + 同步落库
|
||||||
|
- 修复: mint 流程拆成 precreate + 后台 worker
|
||||||
|
- Owner: @backend
|
||||||
|
|
||||||
|
### P2 (技术债)
|
||||||
|
- [ ] 压测期间 CPU 持续 80%,考虑扩容到 4C
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. JSON 原始数据怎么读 (高级)
|
||||||
|
|
||||||
|
`reports/S1.json` 长这样:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"scenario": "S1",
|
||||||
|
"total_requests": 12500,
|
||||||
|
"errors": 5,
|
||||||
|
"five_xx": 0,
|
||||||
|
"p50_us": 86591,
|
||||||
|
"p95_us": 119231,
|
||||||
|
"p99_us": 200502,
|
||||||
|
"max_us": 450000,
|
||||||
|
"stages": [
|
||||||
|
{
|
||||||
|
"stage_idx": 1,
|
||||||
|
"target_rps": 2,
|
||||||
|
"total_requests": 600,
|
||||||
|
"errors": 0,
|
||||||
|
"five_xx": 0,
|
||||||
|
"p50_us": 80000,
|
||||||
|
"p95_us": 100000,
|
||||||
|
"p99_us": 110000,
|
||||||
|
"max_us": 130000
|
||||||
|
},
|
||||||
|
...
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**单位说明**:
|
||||||
|
- 所有 `_us` 后缀 = microseconds (微秒,1ms = 1000us)
|
||||||
|
- 例: `p99_us: 200502` = 200.5 ms
|
||||||
|
|
||||||
|
**怎么用**:
|
||||||
|
- 画自己的图 (用 Excel/Google Sheets 打开 baseline.csv 最方便)
|
||||||
|
- 跟历史报告对比 (跨版本性能回归)
|
||||||
|
- CI 集成: 解析 JSON,断言 P99 < 某个阈值
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. 常见问题
|
||||||
|
|
||||||
|
### Q: "5xx=0 但 Err=5" 是什么意思?
|
||||||
|
A: 5xx 是服务端错,Err 是总错 (含 4xx)。Err > 5xx 表示有客户端错 (一般是 401/403/404)。看 log 里具体错误码。
|
||||||
|
|
||||||
|
### Q: 为什么 P50 很低但 P99 很高?
|
||||||
|
A: 正常 — 长尾效应。99% 都快但 1% 慢。如果 P99 太高说明有少数请求卡住,看是不是 GC / 锁 / IO 抖动。
|
||||||
|
|
||||||
|
### Q: Max 比 P99 高很多,是不是异常?
|
||||||
|
A: 可能是单个网络抖动,正常。Max / P99 < 5x 都是健康。
|
||||||
|
|
||||||
|
### Q: 同一个场景不同次跑,数据差很多?
|
||||||
|
A: 检查 prod 是否有其他流量在跑 (业务)。压测应在凌晨,业务低峰。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. 进一步
|
||||||
|
|
||||||
|
- 想优化场景,见 `seed/README.md`
|
||||||
|
- 想加新场景,在 `scenarios/` 新建 `s8_xxx.go`,模仿 s1_login.go 的 BeginStage/EndStage 模式
|
||||||
|
- 想加新的红线指标,见 `lib/circuit.go`
|
||||||
366
backend/scripts/loadgen/RUNBOOK.md
Normal file
366
backend/scripts/loadgen/RUNBOOK.md
Normal file
@ -0,0 +1,366 @@
|
|||||||
|
# RUNBOOK — 凌晨压测执行手册
|
||||||
|
|
||||||
|
> **目标读者**:负责 prod 凌晨压测的 on-call 工程师
|
||||||
|
> **执行窗口**:02:00 - 06:00 (业务低峰)
|
||||||
|
> **预计总耗时**:1.5 - 4 小时 (按场景数)
|
||||||
|
> **风险等级**:🟡 中 (会写 23k+ 测试数据,但物理隔离 star_id=999900)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0. 前置检查 (T-1 天)
|
||||||
|
|
||||||
|
### 0.1 确认 prod 状态
|
||||||
|
```bash
|
||||||
|
# SSH 到 prod
|
||||||
|
ssh root@101.132.250.62
|
||||||
|
|
||||||
|
# 确认 prod 网关正常
|
||||||
|
curl -sS http://localhost:8080/health
|
||||||
|
# 期望: {"service":"top-fans-gateway","status":"ok"}
|
||||||
|
|
||||||
|
# 确认磁盘空间 > 10GB (R5 红线需要)
|
||||||
|
df -h /opt
|
||||||
|
# 期望: Avail > 10G
|
||||||
|
```
|
||||||
|
|
||||||
|
### 0.2 确认阿里云快照 < 24h
|
||||||
|
- 登录 ECS 控制台 → 实例 → 磁盘与镜像 → 快照
|
||||||
|
- 必须有 < 24h 的快照,**否则不要开压**
|
||||||
|
- 没有的话先手动触发:实例 → 更多 → 磁盘和镜像 → 创建快照
|
||||||
|
|
||||||
|
### 0.3 备份数据库
|
||||||
|
```bash
|
||||||
|
ssh root@101.132.250.62
|
||||||
|
mkdir -p /opt/topfans/backups
|
||||||
|
pg_dump -h localhost -U postgres topfans > /opt/topfans/backups/pre-loadtest-$(date +%Y%m%d-%H%M).sql
|
||||||
|
ls -lh /opt/topfans/backups/pre-loadtest-*.sql
|
||||||
|
# 期望: 文件 > 50MB
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 上传/确认工具 (T-30min)
|
||||||
|
|
||||||
|
### 1.1 确认工具已上传到 prod
|
||||||
|
```bash
|
||||||
|
ssh root@101.132.250.62
|
||||||
|
ls -la /opt/topfans/loadtest/
|
||||||
|
# 必须看到:
|
||||||
|
# seed (二进制)
|
||||||
|
# loadgen (二进制)
|
||||||
|
# loadtest_bcrypt.txt
|
||||||
|
# scripts/prod_seed.sh
|
||||||
|
# README.md
|
||||||
|
# reports/ (空目录)
|
||||||
|
```
|
||||||
|
|
||||||
|
如果文件缺失,本地重新上传:
|
||||||
|
```bash
|
||||||
|
# 本地 (从 backend 目录)
|
||||||
|
cd /Users/liulujian/Documents/code/TopFansByGithub/backend
|
||||||
|
|
||||||
|
# 重新编译
|
||||||
|
make loadgen-build
|
||||||
|
|
||||||
|
# 上传
|
||||||
|
scp bin/seed bin/loadgen root@101.132.250.62:/opt/topfans/loadtest/
|
||||||
|
scp scripts/loadgen/seed/loadtest_bcrypt.txt root@101.132.250.62:/opt/topfans/loadtest/
|
||||||
|
scp scripts/loadgen/scripts/prod_seed.sh root@101.132.250.62:/opt/topfans/loadtest/scripts/
|
||||||
|
ssh root@101.132.250.62 "chmod +x /opt/topfans/loadtest/{seed,loadgen} /opt/topfans/loadtest/scripts/prod_seed.sh"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.2 重新生成 bcrypt 哈希 (如果你改了密码策略)
|
||||||
|
```bash
|
||||||
|
# 本地
|
||||||
|
cd backend/scripts/loadgen/seed
|
||||||
|
|
||||||
|
# 生成与 tokens.go 硬编码密码 (默认 "Test@123") 匹配的哈希
|
||||||
|
python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \
|
||||||
|
> loadtest_bcrypt.txt
|
||||||
|
|
||||||
|
# 上传覆盖
|
||||||
|
scp loadtest_bcrypt.txt root@101.132.250.62:/opt/topfans/loadtest/
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 数据准备 (T0 = 02:00)
|
||||||
|
|
||||||
|
### 2.1 SSH 到 prod
|
||||||
|
```bash
|
||||||
|
ssh root@101.132.250.62
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.2 一键跑 seed (生产数据灌入)
|
||||||
|
```bash
|
||||||
|
cd /opt/topfans/loadtest
|
||||||
|
bash scripts/prod_seed.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
**这一步骤会做什么**:
|
||||||
|
- 读 `/opt/topfans/docker/.env.prod` 拿 DB_PASSWORD + JWT_SECRET
|
||||||
|
- 插入 star_id=999900 测试明星 (1 行)
|
||||||
|
- 插入 1000 个测试用户 (mobile 19900000001 - 19900001000)
|
||||||
|
- 插入 1000 个 fan_profile + crystal
|
||||||
|
- 插入 5000 个 assets
|
||||||
|
- 插入 3000 个 booth_slots + 2000 个 exhibitions
|
||||||
|
- 插入 10000 个 friendships
|
||||||
|
- **重置所有相关表的 PG 序列** (CLAUDE.md 规范,避免后续 GORM 插入报 duplicate key)
|
||||||
|
- 签 1000 个 JWT,写到 `users.csv`
|
||||||
|
|
||||||
|
**预计耗时**:30 - 60 秒
|
||||||
|
|
||||||
|
**预期输出**:
|
||||||
|
```
|
||||||
|
✓ stars seeded
|
||||||
|
✓ 1000 users seeded
|
||||||
|
✓ 1000 fan_profiles + crystal seeded
|
||||||
|
✓ 5000 assets seeded
|
||||||
|
✓ 3000 booth_slots + 2000 exhibitions seeded
|
||||||
|
✓ 10000 friendships seeded
|
||||||
|
✓ sequences reset
|
||||||
|
✅ users.csv written: 1000 rows
|
||||||
|
✅ seed + tokens completed
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 开压前 7 项检查 (T0+1min)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /opt/topfans/loadtest
|
||||||
|
./loadgen --cmd=preflight --target=http://localhost:8080
|
||||||
|
```
|
||||||
|
|
||||||
|
**预期全部 PASS**:
|
||||||
|
```
|
||||||
|
✓ ① Gateway /health HTTP 200
|
||||||
|
✓ ② SSH to prod (省略,如不需要 server metrics)
|
||||||
|
✓ ③ pg_dump backup > 50MB (你的备份)
|
||||||
|
✓ ④ 阿里云快照 < 24h (人工确认)
|
||||||
|
✓ ⑤ prod 磁盘空闲 > 10GB free > 10G
|
||||||
|
✓ ⑥ users.csv 1000 rows rows=1000
|
||||||
|
✓ ⑦ JWT_SECRET set set
|
||||||
|
|
||||||
|
ALL CHECKS PASSED — 可以开压
|
||||||
|
```
|
||||||
|
|
||||||
|
**如果有 FAIL**:见 "附录 A: 故障排查"
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. 烟雾测试 (T0+2min) — 强烈推荐
|
||||||
|
|
||||||
|
> 这一步只花 30 秒,但能提前发现 90% 的集成问题,省后面 1 小时排错
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /opt/topfans/loadtest
|
||||||
|
JWT_SECRET=$(grep '^JWT_SECRET=' /opt/topfans/docker/.env.prod | cut -d= -f2-) \
|
||||||
|
./loadgen --cmd=run --scenarios=S1 --stage=baseline --rps=1 --duration=30s \
|
||||||
|
--target=http://localhost:8080 --monitor=off 2>&1 | tee reports/smoke-s1.log
|
||||||
|
```
|
||||||
|
|
||||||
|
**预期**:
|
||||||
|
```
|
||||||
|
📊 S1: total=30 err=0 5xx=0 p99=200ms stages=1
|
||||||
|
✅ loadgen done. total=30 err=0 fiveXX=0
|
||||||
|
```
|
||||||
|
|
||||||
|
**判定**:
|
||||||
|
- ✅ total=30, err=0 → 进入正式压测
|
||||||
|
- ❌ total < 30 → 跑挂了,查 `reports/smoke-s1.log`
|
||||||
|
- ❌ err > 0 → auth/JWT 问题,检查 `users.csv` 和 JWT_SECRET
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. 正式压测 (T0+3min)
|
||||||
|
|
||||||
|
### 5.1 选择策略
|
||||||
|
|
||||||
|
**Plan B 推荐** (S1 + S2 + S4,~1.5 小时):
|
||||||
|
```bash
|
||||||
|
cd /opt/topfans/loadtest
|
||||||
|
export JWT_SECRET=$(grep '^JWT_SECRET=' /opt/topfans/docker/.env.prod | cut -d= -f2-)
|
||||||
|
export PROD_SSH=root@101.132.250.62
|
||||||
|
|
||||||
|
# === 场景 1: Login (02:05-02:30, 25min) ===
|
||||||
|
./loadgen --cmd=run --scenarios=S1 \
|
||||||
|
--stage=step --step-schedule='2,5,10,15,20' \
|
||||||
|
--duration=5m --target=http://localhost:8080 \
|
||||||
|
--monitor=full --prod-ssh=$PROD_SSH \
|
||||||
|
--inter-scenario-pause=0s 2>&1 | tee reports/s1.log
|
||||||
|
# 预期: 5 个 stage,每 stage 5min,p99 应随 RPS 阶梯上升
|
||||||
|
|
||||||
|
# === 场景 2: Read (02:35-03:00, 25min) ===
|
||||||
|
./loadgen --cmd=run --scenarios=S2 \
|
||||||
|
--stage=step --step-schedule='10,30,60,100,150' \
|
||||||
|
--duration=5m --target=http://localhost:8080 \
|
||||||
|
--monitor=full --prod-ssh=$PROD_SSH \
|
||||||
|
--inter-scenario-pause=0s 2>&1 | tee reports/s2.log
|
||||||
|
|
||||||
|
# === 场景 4: Mint (03:05-03:25, 20min = 4 个 stage × 5min, 写重,保守) ===
|
||||||
|
./loadgen --cmd=run --scenarios=S4 \
|
||||||
|
--stage=step --step-schedule='1,2,3,5' \
|
||||||
|
--duration=5m --target=http://localhost:8080 \
|
||||||
|
--monitor=full --prod-ssh=$PROD_SSH \
|
||||||
|
--inter-scenario-pause=0s 2>&1 | tee reports/s4.log
|
||||||
|
```
|
||||||
|
|
||||||
|
**Plan A 全量** (S1-S7,~3.5 小时):
|
||||||
|
```bash
|
||||||
|
# S1-S7 全部跑,S4/S7 写重场景保守
|
||||||
|
SCENARIOS="S1,S2,S3,S4,S5,S6,S7"
|
||||||
|
SCHEDULES_BY_SCENARIO='{"S1":"2,5,10,15,20","S2":"10,30,60,100,150","S3":"5,15,30,50","S4":"1,2,3,5","S5":"5,10,20,40","S6":"20,50,100,150","S7":"1,2,3,5"}'
|
||||||
|
# (目前 loadgen 一次只支持一个 schedule,需要跑 7 次)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.2 每个场景跑完后做什么
|
||||||
|
1. 检查 `reports/{scenario}.log` 末尾的 `📊` 行
|
||||||
|
2. 记录 total / err / 5xx / p99 / stages
|
||||||
|
3. 如果 `🚨 circuit breaker tripped` 触发,**立即停**,见附录 B
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. 生成报告 (T+1min)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /opt/topfans/loadtest
|
||||||
|
./loadgen --cmd=report --input=./reports --output=./reports/final-report.md
|
||||||
|
```
|
||||||
|
|
||||||
|
**产出**:
|
||||||
|
```
|
||||||
|
reports/
|
||||||
|
├── S1.json
|
||||||
|
├── S2.json
|
||||||
|
├── S4.json
|
||||||
|
├── baseline.csv # Excel 可直接打开
|
||||||
|
├── s1.png # RPS/P99/Error 曲线图
|
||||||
|
├── s2.png
|
||||||
|
├── s4.png
|
||||||
|
└── final-report.md # 人看的报告
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. 收尾 (T+2min)
|
||||||
|
|
||||||
|
### 7.1 拉报告到本地
|
||||||
|
```bash
|
||||||
|
# 本地
|
||||||
|
mkdir -p ~/Desktop/loadtest-report-$(date +%Y%m%d)
|
||||||
|
scp -r root@101.132.250.62:/opt/topfans/loadtest/reports/* ~/Desktop/loadtest-report-$(date +%Y%m%d)/
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7.2 决定是否清理测试数据
|
||||||
|
|
||||||
|
| 情况 | 动作 |
|
||||||
|
|------|------|
|
||||||
|
| 数据分析完,后续不需要 | `./seed --cleanup --full` |
|
||||||
|
| 数据还要保留做下一轮 | `./seed --cleanup` (保留 1000 用户,清理关联数据) |
|
||||||
|
| 只是 JWT 过期 | `./seed --reset-tokens --jwt-secret=$JWT_SECRET` |
|
||||||
|
| **生产事故** | `./seed --cleanup --full` + 立即回滚,见附录 C |
|
||||||
|
|
||||||
|
### 7.3 (可选) 关闭监控后台采样
|
||||||
|
```bash
|
||||||
|
# 如果你启动了 monitor/sample.sh,杀掉
|
||||||
|
ssh root@101.132.250.62 "pkill -f 'monitor/sample.sh'"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. 报告分析 (T+30min,白天)
|
||||||
|
|
||||||
|
见 `REPORT_GUIDE.md` — 教你怎么读 `final-report.md`,定位瓶颈,写行动项。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 附录 A: 故障排查
|
||||||
|
|
||||||
|
### A.1 preflight FAIL: users.csv 不存在
|
||||||
|
**原因**: 上次 seed 没跑成功
|
||||||
|
**修复**: `cd /opt/topfans/loadtest && bash scripts/prod_seed.sh`
|
||||||
|
|
||||||
|
### A.2 preflight FAIL: 阿里云快照 < 24h
|
||||||
|
**原因**: 没备份
|
||||||
|
**修复**: 在 ECS 控制台手动建快照,等就绪后重跑 preflight
|
||||||
|
|
||||||
|
### A.3 烟雾测试 FAIL: 大量 4xx
|
||||||
|
**原因**: JWT_SECRET 不匹配 / users.csv 过期
|
||||||
|
**修复**:
|
||||||
|
```bash
|
||||||
|
# 1. 确认 JWT_SECRET
|
||||||
|
export JWT_SECRET=$(grep '^JWT_SECRET=' /opt/topfans/docker/.env.prod | cut -d= -f2-) && echo "JWT_SECRET loaded (${#JWT_SECRET} chars)"
|
||||||
|
|
||||||
|
# 2. 重签 token (数据保留)
|
||||||
|
./seed --reset-tokens --jwt-secret=$JWT_SECRET
|
||||||
|
|
||||||
|
# 3. 重跑
|
||||||
|
./loadgen --cmd=run --scenarios=S1 --stage=baseline --rps=1 --duration=30s \
|
||||||
|
--target=http://localhost:8080 --monitor=off
|
||||||
|
```
|
||||||
|
|
||||||
|
### A.4 烟雾测试 FAIL: 大量 5xx
|
||||||
|
**原因**: 网关/服务挂了
|
||||||
|
**修复**: 先看 `docker ps` 确认服务在,`curl /health` 确认网关活
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 附录 B: Circuit Breaker 触发 (🚨)
|
||||||
|
|
||||||
|
如果出现 `🚨 circuit breaker tripped!`,**立即**:
|
||||||
|
1. **Ctrl+C** 停止当前 loadgen (会 graceful shutdown,等待当前请求完成)
|
||||||
|
2. 立即判断:
|
||||||
|
- 5xx > 10% 持续 10s → 服务有问题,见附录 C
|
||||||
|
- 仅客户端错率高 → 测试问题,可能是 step 跳太猛
|
||||||
|
3. **降低 RPS 重跑** 或 **改天再试**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 附录 C: 紧急灭火 (production 被打挂了)
|
||||||
|
|
||||||
|
**判定**: 服务真实报错(不是测试客户端问题),prod 用户受影响。
|
||||||
|
|
||||||
|
**立即执行** (按顺序,每步 30s 内):
|
||||||
|
```bash
|
||||||
|
ssh root@101.132.250.62
|
||||||
|
|
||||||
|
# 1. 停 loadgen + 监控
|
||||||
|
pkill -f 'loadgen --cmd=run'
|
||||||
|
pkill -f 'monitor/sample.sh'
|
||||||
|
|
||||||
|
# 2. 清测试数据 (1 秒)
|
||||||
|
cd /opt/topfans/loadtest
|
||||||
|
./seed --cleanup --full
|
||||||
|
|
||||||
|
# 3. 重启服务 (让 prod 回到 baseline)
|
||||||
|
cd /opt/topfans/docker
|
||||||
|
docker-compose -f docker-compose.prod.yml --profile prod restart
|
||||||
|
|
||||||
|
# 4. (最严重情况) 从备份还原
|
||||||
|
bash /opt/topfans/loadtest/recover/restore-from-backup.sh
|
||||||
|
# 输入 backup 文件路径,预计 5-8 分钟
|
||||||
|
```
|
||||||
|
|
||||||
|
**事后**:
|
||||||
|
- 写事故复盘
|
||||||
|
- 修压测发现的 bug
|
||||||
|
- 调整 step schedule (下一次更保守)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 附录 D: 常用 cheat sheet
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 查看 loadtest 进程
|
||||||
|
ssh root@101.132.250.62 "ps aux | grep -E '(loadgen|sample)' | grep -v grep"
|
||||||
|
|
||||||
|
# 看实时日志
|
||||||
|
ssh root@101.132.250.62 "tail -f /opt/topfans/loadtest/reports/*.log"
|
||||||
|
|
||||||
|
# 看 metrics feed
|
||||||
|
ssh root@101.132.250.62 "tail -f /opt/topfans/loadtest/metrics-feed.jsonl"
|
||||||
|
|
||||||
|
# 测一下网关还活着
|
||||||
|
ssh root@101.132.250.62 "curl -sS http://localhost:8080/health"
|
||||||
|
```
|
||||||
@ -2,13 +2,36 @@ package lib
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"sync"
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
|
||||||
"github.com/HdrHistogram/hdrhistogram-go"
|
"github.com/HdrHistogram/hdrhistogram-go"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// LatencyRecorder tracks latency histogram + per-stage counters.
|
||||||
|
//
|
||||||
|
// Concurrency model: a single LatencyRecorder is shared across all scenarios.
|
||||||
|
// Per-scenario isolation: callers MUST call Reset() at scenario boundaries.
|
||||||
|
// Per-stage isolation: callers MUST call BeginStage() at stage boundaries
|
||||||
|
// (which clears the histogram and zeroes the per-stage counters).
|
||||||
type LatencyRecorder struct {
|
type LatencyRecorder struct {
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
h *hdrhistogram.Histogram
|
h *hdrhistogram.Histogram
|
||||||
|
|
||||||
|
stageTotal atomic.Int64
|
||||||
|
stageErrors atomic.Int64
|
||||||
|
stageFiveXX atomic.Int64
|
||||||
|
|
||||||
|
stages []StageSnapshot
|
||||||
|
}
|
||||||
|
|
||||||
|
// StageSnapshot is the per-stage data captured by EndStage.
|
||||||
|
type StageSnapshot struct {
|
||||||
|
StageIdx int
|
||||||
|
TargetRPS int
|
||||||
|
Histogram *hdrhistogram.Histogram
|
||||||
|
TotalRequests int64
|
||||||
|
Errors int64
|
||||||
|
FiveXX int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewLatencyRecorder() *LatencyRecorder {
|
func NewLatencyRecorder() *LatencyRecorder {
|
||||||
@ -17,6 +40,7 @@ func NewLatencyRecorder() *LatencyRecorder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Record stores a latency sample (in microseconds).
|
||||||
func (r *LatencyRecorder) Record(latencyUs int64) {
|
func (r *LatencyRecorder) Record(latencyUs int64) {
|
||||||
r.mu.Lock()
|
r.mu.Lock()
|
||||||
defer r.mu.Unlock()
|
defer r.mu.Unlock()
|
||||||
@ -26,8 +50,79 @@ func (r *LatencyRecorder) Record(latencyUs int64) {
|
|||||||
_ = r.h.RecordValue(latencyUs)
|
_ = r.h.RecordValue(latencyUs)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RecordResult bumps the per-stage request total and, per the caller-supplied flags, the error/5xx counters.
|
||||||
|
// isError: status >= 400 or transport error
|
||||||
|
// is5xx: status >= 500
|
||||||
|
func (r *LatencyRecorder) RecordResult(isError, is5xx bool) {
|
||||||
|
if isError {
|
||||||
|
r.stageErrors.Add(1)
|
||||||
|
}
|
||||||
|
if is5xx {
|
||||||
|
r.stageFiveXX.Add(1)
|
||||||
|
}
|
||||||
|
r.stageTotal.Add(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Snapshot returns a copy of the current histogram (for use by circuit-breaker).
|
||||||
|
// Does NOT affect per-stage counters.
|
||||||
func (r *LatencyRecorder) Snapshot() *hdrhistogram.Histogram {
|
func (r *LatencyRecorder) Snapshot() *hdrhistogram.Histogram {
|
||||||
r.mu.Lock()
|
r.mu.Lock()
|
||||||
defer r.mu.Unlock()
|
defer r.mu.Unlock()
|
||||||
return hdrhistogram.Import(r.h.Export())
|
return hdrhistogram.Import(r.h.Export())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reset clears the histogram, per-stage counters, AND accumulated stages.
|
||||||
|
// Call between scenarios.
|
||||||
|
func (r *LatencyRecorder) Reset() {
|
||||||
|
r.mu.Lock()
|
||||||
|
defer r.mu.Unlock()
|
||||||
|
r.h = hdrhistogram.New(1, 30_000_000, 3)
|
||||||
|
r.stages = nil
|
||||||
|
r.stageTotal.Store(0)
|
||||||
|
r.stageErrors.Store(0)
|
||||||
|
r.stageFiveXX.Store(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ClearStages drops accumulated stage data but keeps the current histogram and counters.
|
||||||
|
// Use when the in-progress histogram and counters must survive but the accumulated stage list should be discarded.
|
||||||
|
func (r *LatencyRecorder) ClearStages() {
|
||||||
|
r.mu.Lock()
|
||||||
|
defer r.mu.Unlock()
|
||||||
|
r.stages = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// BeginStage marks the start of a new stage at TargetRPS RPS.
|
||||||
|
// Resets histogram AND per-stage counters. Stages slice gains a new entry.
|
||||||
|
func (r *LatencyRecorder) BeginStage(idx, targetRPS int) {
|
||||||
|
r.mu.Lock()
|
||||||
|
defer r.mu.Unlock()
|
||||||
|
r.h = hdrhistogram.New(1, 30_000_000, 3)
|
||||||
|
r.stageTotal.Store(0)
|
||||||
|
r.stageErrors.Store(0)
|
||||||
|
r.stageFiveXX.Store(0)
|
||||||
|
r.stages = append(r.stages, StageSnapshot{StageIdx: idx, TargetRPS: targetRPS})
|
||||||
|
}
|
||||||
|
|
||||||
|
// EndStage freezes the histogram + per-stage counters into the latest stage entry.
|
||||||
|
// Must be called after BeginStage and after the stage has produced some traffic.
|
||||||
|
func (r *LatencyRecorder) EndStage() {
|
||||||
|
r.mu.Lock()
|
||||||
|
defer r.mu.Unlock()
|
||||||
|
if len(r.stages) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
last := &r.stages[len(r.stages)-1]
|
||||||
|
last.Histogram = hdrhistogram.Import(r.h.Export())
|
||||||
|
last.TotalRequests = r.stageTotal.Load()
|
||||||
|
last.Errors = r.stageErrors.Load()
|
||||||
|
last.FiveXX = r.stageFiveXX.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stages returns a copy of accumulated stage snapshots.
|
||||||
|
func (r *LatencyRecorder) Stages() []StageSnapshot {
|
||||||
|
r.mu.Lock()
|
||||||
|
defer r.mu.Unlock()
|
||||||
|
out := make([]StageSnapshot, len(r.stages))
|
||||||
|
copy(out, r.stages)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|||||||
@ -66,6 +66,31 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d
|
|||||||
// 让 scenarios 用 --target 而不是写死的 prod IP
|
// 让 scenarios 用 --target 而不是写死的 prod IP
|
||||||
scenarios.DefaultBaseURL = target
|
scenarios.DefaultBaseURL = target
|
||||||
|
|
||||||
|
// 写 run-metadata.json (供 --cmd=report 使用)
|
||||||
|
runStart := time.Now()
|
||||||
|
defer func() {
|
||||||
|
meta := reporter.RunMetadata{
|
||||||
|
StartTime: runStart,
|
||||||
|
EndTime: time.Now(),
|
||||||
|
Target: target,
|
||||||
|
Scenarios: strings.Split(scenarioIDs, ","),
|
||||||
|
StepSchedule: stepSchedule,
|
||||||
|
StageMode: stage,
|
||||||
|
RPSOverride: rps,
|
||||||
|
MonitorMode: monitorMode,
|
||||||
|
ProdSSH: prodSSH,
|
||||||
|
}
|
||||||
|
// 取 JWT_SECRET 前 8 位作为 hint
|
||||||
|
if jwtSecret := os.Getenv("JWT_SECRET"); len(jwtSecret) >= 8 {
|
||||||
|
meta.JWTSecretHint = jwtSecret[:8]
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll("reports", 0o755); err == nil {
|
||||||
|
if data, err := json.MarshalIndent(meta, "", " "); err == nil {
|
||||||
|
_ = os.WriteFile(filepath.Join("reports", "run-metadata.json"), data, 0o644)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
users, err := lib.LoadUsers("users.csv")
|
users, err := lib.LoadUsers("users.csv")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("load users.csv: %w (先跑 `seed` 生成 users.csv)", err)
|
return fmt.Errorf("load users.csv: %w (先跑 `seed` 生成 users.csv)", err)
|
||||||
@ -126,6 +151,14 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
log.Printf("=== scenario %d/%d: %s ===", idx+1, len(ids), id)
|
log.Printf("=== scenario %d/%d: %s ===", idx+1, len(ids), id)
|
||||||
|
|
||||||
|
// 场景开始:快照 delta 基线,清空 stage 累积
|
||||||
|
recorder.ClearStages()
|
||||||
|
recorder.Reset()
|
||||||
|
prevTotal := totalCount.Load()
|
||||||
|
prevErr := errCount.Load()
|
||||||
|
prev5xx := fiveXXCount.Load()
|
||||||
|
|
||||||
s, err := scenarios.Get(id, client, users, &errCount, &totalCount, &fiveXXCount, recorder, breaker, prodSSH)
|
s, err := scenarios.Get(id, client, users, &errCount, &totalCount, &fiveXXCount, recorder, breaker, prodSSH)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("scenario %s: %w", id, err)
|
return fmt.Errorf("scenario %s: %w", id, err)
|
||||||
@ -133,6 +166,38 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d
|
|||||||
if err := s.Run(ctx, rps, duration, dashboard, breaker, stages); err != nil {
|
if err := s.Run(ctx, rps, duration, dashboard, breaker, stages); err != nil {
|
||||||
return fmt.Errorf("run scenario %s: %w", id, err)
|
return fmt.Errorf("run scenario %s: %w", id, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 场景结束:写 per-scenario JSON (含 stages)
|
||||||
|
scenarioTotal := totalCount.Load() - prevTotal
|
||||||
|
scenarioErr := errCount.Load() - prevErr
|
||||||
|
scenario5xx := fiveXXCount.Load() - prev5xx
|
||||||
|
scenarioStages := recorder.Stages()
|
||||||
|
|
||||||
|
stageReports := make([]reporter.StageReport, 0, len(scenarioStages))
|
||||||
|
for _, ss := range scenarioStages {
|
||||||
|
stageReports = append(stageReports, reporter.MakeStageReport(
|
||||||
|
ss.StageIdx, ss.TargetRPS, ss.Histogram,
|
||||||
|
ss.TotalRequests, ss.Errors, ss.FiveXX,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
rr := reporter.RunReport{
|
||||||
|
Scenario: id,
|
||||||
|
TotalRequests: scenarioTotal,
|
||||||
|
Errors: scenarioErr,
|
||||||
|
FiveXX: scenario5xx,
|
||||||
|
P50Us: recorder.Snapshot().ValueAtPercentile(50),
|
||||||
|
P95Us: recorder.Snapshot().ValueAtPercentile(95),
|
||||||
|
P99Us: recorder.Snapshot().ValueAtPercentile(99),
|
||||||
|
MaxUs: recorder.Snapshot().Max(),
|
||||||
|
Stages: stageReports,
|
||||||
|
}
|
||||||
|
scenarioPath := filepath.Join("reports", id+".json")
|
||||||
|
if err := reporter.WriteJSON(scenarioPath, rr); err != nil {
|
||||||
|
return fmt.Errorf("write %s: %w", scenarioPath, err)
|
||||||
|
}
|
||||||
|
log.Printf("📊 %s: total=%d err=%d 5xx=%d p99=%dms stages=%d",
|
||||||
|
id, scenarioTotal, scenarioErr, scenario5xx, rr.P99Us/1000, len(stageReports))
|
||||||
|
|
||||||
if breaker.State() == lib.CircuitTripped {
|
if breaker.State() == lib.CircuitTripped {
|
||||||
log.Printf("⚠️ circuit tripped, stopping")
|
log.Printf("⚠️ circuit tripped, stopping")
|
||||||
break
|
break
|
||||||
@ -143,11 +208,8 @@ func runLoadgen(target, scenarioIDs, stage, stepSchedule string, rps, vus int, d
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// write final report
|
|
||||||
if err := reporter.WriteJSON("report.json", scenarioIDs, recorder.Snapshot(), totalCount.Load(), errCount.Load(), fiveXXCount.Load()); err != nil {
|
|
||||||
return fmt.Errorf("write report: %w", err)
|
|
||||||
}
|
|
||||||
log.Printf("✅ loadgen done. total=%d err=%d fiveXX=%d", totalCount.Load(), errCount.Load(), fiveXXCount.Load())
|
log.Printf("✅ loadgen done. total=%d err=%d fiveXX=%d", totalCount.Load(), errCount.Load(), fiveXXCount.Load())
|
||||||
|
log.Printf("💡 下一步: ./loadgen --cmd=report --input=./reports --output=./reports/final-report.md")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -186,20 +248,33 @@ func runReport(inputDir, output string) error {
|
|||||||
return fmt.Errorf("--input required for cmd=report")
|
return fmt.Errorf("--input required for cmd=report")
|
||||||
}
|
}
|
||||||
|
|
||||||
// 1. 收集 reports/run-*/ 下的 *.json
|
// 1. 递归收集 reports/ 下的所有 *.json (filepath.Glob 不支持 **, 用 WalkDir)
|
||||||
var scenarioReports []reporter.RunReport
|
var scenarioReports []reporter.RunReport
|
||||||
matches, _ := filepath.Glob(filepath.Join(inputDir, "**", "*.json"))
|
err := filepath.WalkDir(inputDir, func(path string, d os.DirEntry, walkErr error) error {
|
||||||
for _, m := range matches {
|
if walkErr != nil {
|
||||||
data, err := os.ReadFile(m)
|
return nil
|
||||||
|
}
|
||||||
|
if d.IsDir() || !strings.HasSuffix(path, ".json") {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// 跳过元数据文件 (它是 RunMetadata 不是 RunReport)
|
||||||
|
if strings.HasSuffix(path, "run-metadata.json") {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
return nil
|
||||||
}
|
}
|
||||||
var rr reporter.RunReport
|
var rr reporter.RunReport
|
||||||
if err := json.Unmarshal(data, &rr); err != nil {
|
if err := json.Unmarshal(data, &rr); err != nil {
|
||||||
log.Printf("skip %s: %v", m, err)
|
log.Printf("skip %s: %v", path, err)
|
||||||
continue
|
return nil
|
||||||
}
|
}
|
||||||
scenarioReports = append(scenarioReports, rr)
|
scenarioReports = append(scenarioReports, rr)
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("walk %s: %w", inputDir, err)
|
||||||
}
|
}
|
||||||
if len(scenarioReports) == 0 {
|
if len(scenarioReports) == 0 {
|
||||||
return fmt.Errorf("no JSON reports found in %s", inputDir)
|
return fmt.Errorf("no JSON reports found in %s", inputDir)
|
||||||
@ -213,17 +288,41 @@ func runReport(inputDir, output string) error {
|
|||||||
}
|
}
|
||||||
log.Printf("wrote %s", baselinePath)
|
log.Printf("wrote %s", baselinePath)
|
||||||
|
|
||||||
// 3. 转 ScenarioReport (供 markdown 用)
|
// 3. 生成每个 scenario 的 PNG 图表
|
||||||
scenarioMarkdownReports := make([]reporter.ScenarioReport, 0, len(scenarioReports))
|
|
||||||
for _, r := range scenarioReports {
|
for _, r := range scenarioReports {
|
||||||
scenarioMarkdownReports = append(scenarioMarkdownReports, reporter.ScenarioReport{
|
if len(r.Stages) < 1 {
|
||||||
ID: r.Scenario,
|
continue
|
||||||
KneeRPS: 0, // 拐点需要分析 raw data 算,简化版留 0
|
}
|
||||||
})
|
plotPath := filepath.Join(inputDir, strings.ToLower(r.Scenario)+".png")
|
||||||
|
samples := make([]reporter.Sample, 0, len(r.Stages))
|
||||||
|
for _, st := range r.Stages {
|
||||||
|
tot := st.TotalRequests
|
||||||
|
errRate := float64(0)
|
||||||
|
if tot > 0 {
|
||||||
|
errRate = float64(st.Errors) / float64(tot)
|
||||||
|
}
|
||||||
|
samples = append(samples, reporter.Sample{
|
||||||
|
RPS: float64(st.TargetRPS),
|
||||||
|
P99Ms: float64(st.P99Us) / 1000,
|
||||||
|
ErrorRate: errRate,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := reporter.PlotRPSLatencyError(r.Scenario, samples, plotPath); err != nil {
|
||||||
|
log.Printf("⚠️ plot %s failed: %v", r.Scenario, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
log.Printf("wrote %s", plotPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 4. markdown
|
// 4. 读 run-metadata.json (可选,runLoadgen 写入)
|
||||||
if err := reporter.GenerateMarkdown(output, scenarioMarkdownReports); err != nil {
|
var meta reporter.RunMetadata
|
||||||
|
metaPath := filepath.Join(inputDir, "run-metadata.json")
|
||||||
|
if data, err := os.ReadFile(metaPath); err == nil {
|
||||||
|
_ = json.Unmarshal(data, &meta)
|
||||||
|
}
|
||||||
|
|
||||||
|
// 5. markdown (引用生成的 PNG)
|
||||||
|
if err := reporter.GenerateMarkdown(output, meta, scenarioReports, "./"); err != nil {
|
||||||
return fmt.Errorf("write markdown: %w", err)
|
return fmt.Errorf("write markdown: %w", err)
|
||||||
}
|
}
|
||||||
log.Printf("wrote %s", output)
|
log.Printf("wrote %s", output)
|
||||||
|
|||||||
@ -7,20 +7,50 @@ import (
|
|||||||
"github.com/HdrHistogram/hdrhistogram-go"
|
"github.com/HdrHistogram/hdrhistogram-go"
|
||||||
)
|
)
|
||||||
|
|
||||||
type RunReport struct {
|
type StageReport struct {
|
||||||
Scenario string `json:"scenario"`
|
StageIdx int `json:"stage_idx"`
|
||||||
TotalRequests int64 `json:"total_requests"`
|
TargetRPS int `json:"target_rps"`
|
||||||
Errors int64 `json:"errors"`
|
TotalRequests int64 `json:"total_requests"`
|
||||||
FiveXX int64 `json:"five_xx"`
|
Errors int64 `json:"errors"`
|
||||||
P50Us int64 `json:"p50_us"`
|
FiveXX int64 `json:"five_xx"`
|
||||||
P95Us int64 `json:"p95_us"`
|
P50Us int64 `json:"p50_us"`
|
||||||
P99Us int64 `json:"p99_us"`
|
P95Us int64 `json:"p95_us"`
|
||||||
MaxUs int64 `json:"max_us"`
|
P99Us int64 `json:"p99_us"`
|
||||||
|
MaxUs int64 `json:"max_us"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func WriteJSON(path string, scenario string, h *hdrhistogram.Histogram, total, errs, fiveXX int64) error {
|
type RunReport struct {
|
||||||
r := RunReport{
|
Scenario string `json:"scenario"`
|
||||||
Scenario: scenario,
|
TotalRequests int64 `json:"total_requests"`
|
||||||
|
Errors int64 `json:"errors"`
|
||||||
|
FiveXX int64 `json:"five_xx"`
|
||||||
|
P50Us int64 `json:"p50_us"`
|
||||||
|
P95Us int64 `json:"p95_us"`
|
||||||
|
P99Us int64 `json:"p99_us"`
|
||||||
|
MaxUs int64 `json:"max_us"`
|
||||||
|
Stages []StageReport `json:"stages,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// WriteJSON writes a RunReport (single scenario, optional per-stage data) to path.
|
||||||
|
func WriteJSON(path string, r RunReport) error {
|
||||||
|
f, err := os.Create(path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
enc := json.NewEncoder(f)
|
||||||
|
enc.SetIndent("", " ")
|
||||||
|
return enc.Encode(r)
|
||||||
|
}
|
||||||
|
|
||||||
|
// MakeStageReport fills a StageReport from a histogram + counters.
|
||||||
|
func MakeStageReport(idx, targetRPS int, h *hdrhistogram.Histogram, total, errs, fiveXX int64) StageReport {
|
||||||
|
if h == nil {
|
||||||
|
return StageReport{StageIdx: idx, TargetRPS: targetRPS}
|
||||||
|
}
|
||||||
|
return StageReport{
|
||||||
|
StageIdx: idx,
|
||||||
|
TargetRPS: targetRPS,
|
||||||
TotalRequests: total,
|
TotalRequests: total,
|
||||||
Errors: errs,
|
Errors: errs,
|
||||||
FiveXX: fiveXX,
|
FiveXX: fiveXX,
|
||||||
@ -29,25 +59,28 @@ func WriteJSON(path string, scenario string, h *hdrhistogram.Histogram, total, e
|
|||||||
P99Us: h.ValueAtPercentile(99),
|
P99Us: h.ValueAtPercentile(99),
|
||||||
MaxUs: h.Max(),
|
MaxUs: h.Max(),
|
||||||
}
|
}
|
||||||
f, err := os.Create(path)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
return json.NewEncoder(f).Encode(r)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WriteBaselineCSV writes a CSV summary across multiple RunReports.
|
||||||
func WriteBaselineCSV(path string, scenarios []RunReport) error {
|
func WriteBaselineCSV(path string, scenarios []RunReport) error {
|
||||||
f, err := os.Create(path)
|
f, err := os.Create(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
if _, err := f.WriteString("scenario,total,errors,five_xx,p50_ms,p95_ms,p99_ms,max_ms\n"); err != nil {
|
if _, err := f.WriteString("scenario,total,errors,five_xx,p50_ms,p95_ms,p99_ms,max_ms,stages\n"); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
for _, s := range scenarios {
|
for _, s := range scenarios {
|
||||||
_, err := f.WriteString(jsonLine(s) + "\n")
|
_, err := f.WriteString(s.Scenario + "," +
|
||||||
|
itoa(s.TotalRequests) + "," +
|
||||||
|
itoa(s.Errors) + "," +
|
||||||
|
itoa(s.FiveXX) + "," +
|
||||||
|
ms(s.P50Us) + "," +
|
||||||
|
ms(s.P95Us) + "," +
|
||||||
|
ms(s.P99Us) + "," +
|
||||||
|
ms(s.MaxUs) + "," +
|
||||||
|
itoa(int64(len(s.Stages))) + "\n")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -55,16 +88,6 @@ func WriteBaselineCSV(path string, scenarios []RunReport) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func jsonLine(s RunReport) string {
|
|
||||||
b, _ := json.Marshal(s)
|
|
||||||
s2 := string(b)
|
|
||||||
if len(s2) >= 2 && s2[0] == '{' {
|
|
||||||
// strip braces for CSV-friendly format
|
|
||||||
return s.Scenario + "," + itoa(s.TotalRequests) + "," + itoa(s.Errors) + "," + itoa(s.FiveXX) + "," + ms(s.P50Us) + "," + ms(s.P95Us) + "," + ms(s.P99Us) + "," + ms(s.MaxUs)
|
|
||||||
}
|
|
||||||
return s2
|
|
||||||
}
|
|
||||||
|
|
||||||
func itoa(n int64) string {
|
func itoa(n int64) string {
|
||||||
if n == 0 {
|
if n == 0 {
|
||||||
return "0"
|
return "0"
|
||||||
@ -88,12 +111,10 @@ func itoa(n int64) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func ms(us int64) string {
|
func ms(us int64) string {
|
||||||
// us / 1000 as float
|
|
||||||
return formatFloat(float64(us) / 1000)
|
return formatFloat(float64(us) / 1000)
|
||||||
}
|
}
|
||||||
|
|
||||||
func formatFloat(f float64) string {
|
func formatFloat(f float64) string {
|
||||||
// simple 2-decimal format
|
|
||||||
intPart := int64(f)
|
intPart := int64(f)
|
||||||
frac := int64((f - float64(intPart)) * 100)
|
frac := int64((f - float64(intPart)) * 100)
|
||||||
if frac < 0 {
|
if frac < 0 {
|
||||||
|
|||||||
33
backend/scripts/loadgen/loadgen/reporter/knee.go
Normal file
33
backend/scripts/loadgen/loadgen/reporter/knee.go
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
package reporter
|
||||||
|
|
||||||
|
// KneeRPS finds the "knee" (turning point) of a multi-stage run.
|
||||||
|
//
|
||||||
|
// Heuristic: the first stage where p99 latency grew >50% over the previous
|
||||||
|
// stage. If no such jump exists (run was healthy throughout), returns the
|
||||||
|
// highest stage tested (i.e. we never hit the knee).
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - kneeRPS: the target_rps at the knee (or highest if no knee found)
|
||||||
|
// - kneeIdx: the stage index (1-based) where the knee was detected
|
||||||
|
// - p99Delta: the p99 jump percentage (0.5 = 50% growth)
|
||||||
|
func KneeRPS(stages []StageReport) (kneeRPS, kneeIdx int, p99Delta float64) {
|
||||||
|
if len(stages) == 0 {
|
||||||
|
return 0, 0, 0
|
||||||
|
}
|
||||||
|
if len(stages) == 1 {
|
||||||
|
return stages[0].TargetRPS, stages[0].StageIdx, 0
|
||||||
|
}
|
||||||
|
for i := 1; i < len(stages); i++ {
|
||||||
|
prev := stages[i-1].P99Us
|
||||||
|
if prev == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
growth := float64(stages[i].P99Us-prev) / float64(prev)
|
||||||
|
if growth > 0.5 {
|
||||||
|
return stages[i].TargetRPS, stages[i].StageIdx, growth
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 没找到拐点:返回最高 stage
|
||||||
|
last := stages[len(stages)-1]
|
||||||
|
return last.TargetRPS, last.StageIdx, 0
|
||||||
|
}
|
||||||
@ -3,42 +3,482 @@ package reporter
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ScenarioReport struct {
|
// GenerateMarkdown writes a rich markdown report.
|
||||||
ID string
|
//
|
||||||
Stages []StageReport
|
// Includes:
|
||||||
KneeRPS int
|
// - Header (run metadata: target, scenarios, time, JWT hint)
|
||||||
TopBottleneck string
|
// - Executive summary (per-scenario verdicts + key findings)
|
||||||
}
|
// - Cross-scenario bottleneck analysis
|
||||||
|
// - Per-scenario detailed sections with:
|
||||||
type StageReport struct {
|
// * Description + business impact + API
|
||||||
RPS int
|
// * Verdict with reasoning
|
||||||
P50Ms float64
|
// * KPI table vs thresholds
|
||||||
P95Ms float64
|
// * Knee analysis
|
||||||
P99Ms float64
|
// * Stage-by-stage breakdown
|
||||||
ErrorRate float64
|
// * PNG chart
|
||||||
}
|
// * Specific action items
|
||||||
|
func GenerateMarkdown(path string, meta RunMetadata, scenarios []RunReport, plotDir string) error {
|
||||||
func GenerateMarkdown(path string, scenarios []ScenarioReport) error {
|
|
||||||
f, err := os.Create(path)
|
f, err := os.Create(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
|
|
||||||
fmt.Fprintf(f, "# 压测报告\n\n")
|
writeHeader(f, meta, scenarios)
|
||||||
|
writeExecutiveSummary(f, scenarios)
|
||||||
|
writeOverviewTable(f, scenarios)
|
||||||
|
writeCrossScenarioAnalysis(f, scenarios)
|
||||||
for _, s := range scenarios {
|
for _, s := range scenarios {
|
||||||
fmt.Fprintf(f, "## %s\n\n", s.ID)
|
writeScenarioDetail(f, s, plotDir)
|
||||||
fmt.Fprintf(f, "**拐点 RPS**: %d\n\n", s.KneeRPS)
|
}
|
||||||
fmt.Fprintf(f, "**Top 瓶颈**: %s\n\n", s.TopBottleneck)
|
writeAppendix(f, meta)
|
||||||
fmt.Fprintf(f, "| Stage | RPS | P50ms | P95ms | P99ms | Err%% |\n")
|
return nil
|
||||||
fmt.Fprintf(f, "|-------|-----|-------|-------|-------|------|\n")
|
}
|
||||||
for _, st := range s.Stages {
|
|
||||||
fmt.Fprintf(f, "| - | %d | %.1f | %.1f | %.1f | %.1f |\n",
|
func writeHeader(f *os.File, meta RunMetadata, scenarios []RunReport) {
|
||||||
st.RPS, st.P50Ms, st.P95Ms, st.P99Ms, st.ErrorRate*100)
|
fmt.Fprintf(f, "# TopFans 压测报告\n\n")
|
||||||
|
duration := meta.EndTime.Sub(meta.StartTime).Round(time.Second)
|
||||||
|
fmt.Fprintf(f, "## 📋 运行信息\n\n")
|
||||||
|
fmt.Fprintf(f, "| 项 | 值 |\n|---|---|\n")
|
||||||
|
fmt.Fprintf(f, "| **生成时间** | %s |\n", time.Now().Format("2006-01-02 15:04:05 MST"))
|
||||||
|
if !meta.StartTime.IsZero() {
|
||||||
|
fmt.Fprintf(f, "| **压测开始** | %s |\n", meta.StartTime.Format("2006-01-02 15:04:05 MST"))
|
||||||
|
fmt.Fprintf(f, "| **压测结束** | %s |\n", meta.EndTime.Format("2006-01-02 15:04:05 MST"))
|
||||||
|
fmt.Fprintf(f, "| **总耗时** | %s |\n", duration)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "| **目标地址** | `%s` |\n", emptyDash(meta.Target))
|
||||||
|
fmt.Fprintf(f, "| **测试场景** | %s |\n", strings.Join(meta.Scenarios, ", "))
|
||||||
|
fmt.Fprintf(f, "| **阶梯模式** | %s%s |\n", emptyDash(meta.StageMode), ifThen(meta.StepSchedule != "", " (`"+meta.StepSchedule+"`)", ""))
|
||||||
|
if meta.JWTSecretHint != "" {
|
||||||
|
fmt.Fprintf(f, "| **JWT 签名密钥** | `%s***` (前 8 位) |\n", meta.JWTSecretHint)
|
||||||
|
}
|
||||||
|
if meta.ProdSSH != "" {
|
||||||
|
fmt.Fprintf(f, "| **prod SSH** | `%s` |\n", meta.ProdSSH)
|
||||||
|
}
|
||||||
|
if meta.MonitorMode != "" {
|
||||||
|
fmt.Fprintf(f, "| **监控模式** | %s |\n", meta.MonitorMode)
|
||||||
|
}
|
||||||
|
|
||||||
|
// 总请求数
|
||||||
|
var totalReq, totalErr, total5xx int64
|
||||||
|
for _, s := range scenarios {
|
||||||
|
totalReq += s.TotalRequests
|
||||||
|
totalErr += s.Errors
|
||||||
|
total5xx += s.FiveXX
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "| **总请求数** | %s |\n", commaInt(totalReq))
|
||||||
|
fmt.Fprintf(f, "| **总错误数** | %s (%.2f%%) |\n", commaInt(totalErr), pct(totalErr, totalReq))
|
||||||
|
fmt.Fprintf(f, "| **5xx 数** | %s (%.2f%%) |\n", commaInt(total5xx), pct(total5xx, totalReq))
|
||||||
|
fmt.Fprintf(f, "\n---\n\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeExecutiveSummary(f *os.File, scenarios []RunReport) {
|
||||||
|
fmt.Fprintf(f, "## 🎯 执行摘要\n\n")
|
||||||
|
|
||||||
|
// Count verdicts
|
||||||
|
counts := map[string]int{"✅": 0, "⚠️": 0, "🚨": 0}
|
||||||
|
criticalIssues := []string{}
|
||||||
|
for _, s := range scenarios {
|
||||||
|
meta, ok := AllScenarios[s.Scenario]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
_, _, p99Delta := KneeRPS(s.Stages)
|
||||||
|
knee := p99Delta > 0.5
|
||||||
|
v := meta.Verdict(s, knee)
|
||||||
|
counts[v]++
|
||||||
|
|
||||||
|
if v == "🚨" {
|
||||||
|
issue := fmt.Sprintf("- **%s (%s)**: ", s.Scenario, meta.Name)
|
||||||
|
if errRate := pct(s.Errors, s.TotalRequests); errRate > 1 {
|
||||||
|
issue += fmt.Sprintf("错误率 %.2f%% ", errRate)
|
||||||
|
}
|
||||||
|
if p99Ms := float64(s.P99Us) / 1000; p99Ms > meta.Thresholds.P99MsMax {
|
||||||
|
issue += fmt.Sprintf("P99 %.0fms (阈值 %.0fms) ", p99Ms, meta.Thresholds.P99MsMax)
|
||||||
|
}
|
||||||
|
if knee {
|
||||||
|
issue += fmt.Sprintf("拐点 stage %d", stagesIdx(s.Stages))
|
||||||
|
}
|
||||||
|
criticalIssues = append(criticalIssues, issue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Overall verdict
|
||||||
|
totalSc := len(scenarios)
|
||||||
|
fmt.Fprintf(f, "**总览**: ✅ %d 健康 / ⚠️ %d 警告 / 🚨 %d 严重 (共 %d)\n\n",
|
||||||
|
counts["✅"], counts["⚠️"], counts["🚨"], totalSc)
|
||||||
|
|
||||||
|
if len(criticalIssues) == 0 {
|
||||||
|
fmt.Fprintf(f, "🎉 **所有场景通过健康阈值,系统可承载预期负载。**\n\n")
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(f, "🚨 **关键问题** (%d 个):\n\n", len(criticalIssues))
|
||||||
|
for _, issue := range criticalIssues {
|
||||||
|
fmt.Fprintf(f, "%s\n", issue)
|
||||||
}
|
}
|
||||||
fmt.Fprintf(f, "\n")
|
fmt.Fprintf(f, "\n")
|
||||||
}
|
}
|
||||||
return nil
|
|
||||||
|
// Per-scenario one-liner
|
||||||
|
fmt.Fprintf(f, "**场景速览**:\n\n")
|
||||||
|
for _, s := range scenarios {
|
||||||
|
meta, ok := AllScenarios[s.Scenario]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
_, _, p99Delta := KneeRPS(s.Stages)
|
||||||
|
knee := p99Delta > 0.5
|
||||||
|
v := meta.Verdict(s, knee)
|
||||||
|
fmt.Fprintf(f, "- %s **%s %s** — p99=%.0fms, %s", v, s.Scenario, meta.Name, float64(s.P99Us)/1000, errSummary(s))
|
||||||
|
if knee {
|
||||||
|
fmt.Fprintf(f, ", ⚠️ 拐点 stage %d", stagesIdx(s.Stages))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "\n")
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "\n---\n\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeOverviewTable(f *os.File, scenarios []RunReport) {
|
||||||
|
fmt.Fprintf(f, "## 📊 总览表\n\n")
|
||||||
|
fmt.Fprintf(f, "| 场景 | 描述 | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 拐点 RPS | 状态 |\n")
|
||||||
|
fmt.Fprintf(f, "|------|------|-------|-----|-----|-------|-------|-------|-------|---------|------|\n")
|
||||||
|
for _, s := range scenarios {
|
||||||
|
meta, ok := AllScenarios[s.Scenario]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
kneeRPS, kneeIdx, p99Delta := KneeRPS(s.Stages)
|
||||||
|
kneeTriggered := p99Delta > 0.5
|
||||||
|
v := meta.Verdict(s, kneeTriggered)
|
||||||
|
kneeStr := "—"
|
||||||
|
if kneeTriggered {
|
||||||
|
kneeStr = fmt.Sprintf("%d (stage %d)", kneeRPS, kneeIdx)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "| **%s** | %s | %s | %s (%.2f%%) | %s (%.2f%%) | %.0f | %.0f | %.0f | %.0f | %s | %s |\n",
|
||||||
|
s.Scenario, meta.Name,
|
||||||
|
commaInt(s.TotalRequests),
|
||||||
|
commaInt(s.Errors), pct(s.Errors, s.TotalRequests),
|
||||||
|
commaInt(s.FiveXX), pct(s.FiveXX, s.TotalRequests),
|
||||||
|
usToMs(s.P50Us), usToMs(s.P95Us), usToMs(s.P99Us), usToMs(s.MaxUs),
|
||||||
|
kneeStr, v)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "\n> 说明: Err 包含 4xx + 5xx,5xx 是子集。错误率 = Err / Total。\n\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeCrossScenarioAnalysis(f *os.File, scenarios []RunReport) {
|
||||||
|
fmt.Fprintf(f, "## 🔬 跨场景瓶颈分析\n\n")
|
||||||
|
if len(scenarios) < 2 {
|
||||||
|
fmt.Fprintf(f, "只有一个场景,无需跨场景分析。\n\n")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find bottleneck: highest P99 relative to threshold
|
||||||
|
type scored struct {
|
||||||
|
scenario string
|
||||||
|
p99Ms float64
|
||||||
|
ratio float64 // p99 / threshold
|
||||||
|
}
|
||||||
|
var scoreds []scored
|
||||||
|
for _, s := range scenarios {
|
||||||
|
meta, ok := AllScenarios[s.Scenario]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
p99Ms := float64(s.P99Us) / 1000
|
||||||
|
ratio := p99Ms / meta.Thresholds.P99MsMax
|
||||||
|
scoreds = append(scoreds, scored{s.Scenario, p99Ms, ratio})
|
||||||
|
}
|
||||||
|
// Sort by ratio desc
|
||||||
|
for i := 0; i < len(scoreds); i++ {
|
||||||
|
for j := i + 1; j < len(scoreds); j++ {
|
||||||
|
if scoreds[j].ratio > scoreds[i].ratio {
|
||||||
|
scoreds[i], scoreds[j] = scoreds[j], scoreds[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(scoreds) > 0 && scoreds[0].ratio > 1 {
|
||||||
|
fmt.Fprintf(f, "🚨 **瓶颈场景: %s** — P99 是阈值的 %.2f 倍\n\n", scoreds[0].scenario, scoreds[0].ratio)
|
||||||
|
} else if len(scoreds) > 0 {
|
||||||
|
fmt.Fprintf(f, "✅ **无明显瓶颈**,所有场景 P99 都在阈值内。\n\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(f, "**P99 / 阈值 比率** (从高到低):\n\n")
|
||||||
|
for _, s := range scoreds {
|
||||||
|
fmt.Fprintf(f, "- %s: %.2fx (%.0fms)\n", s.scenario, s.ratio, s.p99Ms)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "\n---\n\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeScenarioDetail(f *os.File, s RunReport, plotDir string) {
|
||||||
|
meta, ok := AllScenarios[s.Scenario]
|
||||||
|
if !ok {
|
||||||
|
fmt.Fprintf(f, "## %s (无元数据)\n\n", s.Scenario)
|
||||||
|
fmt.Fprintf(f, "```json\n%+v\n```\n\n", s)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
kneeRPS, kneeIdx, p99Delta := KneeRPS(s.Stages)
|
||||||
|
kneeTriggered := p99Delta > 0.5
|
||||||
|
verdict := meta.Verdict(s, kneeTriggered)
|
||||||
|
|
||||||
|
fmt.Fprintf(f, "## %s %s %s\n\n", verdict, s.Scenario, meta.Name)
|
||||||
|
fmt.Fprintf(f, "### 📌 测试说明\n\n")
|
||||||
|
fmt.Fprintf(f, "| 项 | 值 |\n|---|---|\n")
|
||||||
|
fmt.Fprintf(f, "| **API** | `%s` |\n", meta.API)
|
||||||
|
fmt.Fprintf(f, "| **负载类型** | %s |\n", workloadLabel(meta.Workload))
|
||||||
|
fmt.Fprintf(f, "| **业务说明** | %s |\n", meta.Description)
|
||||||
|
fmt.Fprintf(f, "| **影响范围** | %s |\n", meta.BusinessImp)
|
||||||
|
fmt.Fprintf(f, "\n")
|
||||||
|
|
||||||
|
// KPI vs thresholds
|
||||||
|
fmt.Fprintf(f, "### 📈 性能指标 vs 健康阈值\n\n")
|
||||||
|
p50Ms := usToMs(s.P50Us)
|
||||||
|
p95Ms := usToMs(s.P95Us)
|
||||||
|
p99Ms := usToMs(s.P99Us)
|
||||||
|
maxMs := usToMs(s.MaxUs)
|
||||||
|
errRate := pct(s.Errors, s.TotalRequests)
|
||||||
|
fiveXXRate := pct(s.FiveXX, s.TotalRequests)
|
||||||
|
fmt.Fprintf(f, "| 指标 | 实测 | 阈值 | 判定 |\n")
|
||||||
|
fmt.Fprintf(f, "|------|------|------|------|\n")
|
||||||
|
fmt.Fprintf(f, "| P50ms | %.0f | ≤%.0f | %s |\n", p50Ms, meta.Thresholds.P50MsMax, thresholdMark(p50Ms, meta.Thresholds.P50MsMax))
|
||||||
|
fmt.Fprintf(f, "| P95ms | %.0f | ≤%.0f | %s |\n", p95Ms, meta.Thresholds.P95MsMax, thresholdMark(p95Ms, meta.Thresholds.P95MsMax))
|
||||||
|
fmt.Fprintf(f, "| P99ms | %.0f | ≤%.0f | %s |\n", p99Ms, meta.Thresholds.P99MsMax, thresholdMark(p99Ms, meta.Thresholds.P99MsMax))
|
||||||
|
fmt.Fprintf(f, "| Maxms | %.0f | — | ℹ️ 参考 |\n", maxMs)
|
||||||
|
fmt.Fprintf(f, "| 错误率 | %.2f%% | ≤%.2f%% | %s |\n", errRate, meta.Thresholds.ErrorRateMax*100, thresholdMark(errRate/100, meta.Thresholds.ErrorRateMax))
|
||||||
|
fmt.Fprintf(f, "| 5xx 率 | %.2f%% | ≤%.2f%% | %s |\n", fiveXXRate, meta.Thresholds.FiveXXRateMax*100, thresholdMark(fiveXXRate/100, meta.Thresholds.FiveXXRateMax))
|
||||||
|
fmt.Fprintf(f, "\n")
|
||||||
|
|
||||||
|
// Knee
|
||||||
|
fmt.Fprintf(f, "### 📍 拐点分析\n\n")
|
||||||
|
if len(s.Stages) <= 1 {
|
||||||
|
fmt.Fprintf(f, "ℹ️ 仅 1 个 stage,未做阶梯测试,无法判断拐点。\n\n")
|
||||||
|
} else if kneeTriggered {
|
||||||
|
fmt.Fprintf(f, "🚨 **拐点**: stage %d @ %d RPS — p99 暴涨 %.0f%%\n\n",
|
||||||
|
kneeIdx, kneeRPS, p99Delta*100)
|
||||||
|
fmt.Fprintf(f, "从 stage %d 到 stage %d,p99 延迟从 %.0fms 涨到 %.0fms (%.1fx)。\n",
|
||||||
|
kneeIdx-1, kneeIdx, usToMs(s.Stages[kneeIdx-2].P99Us), p99Ms, 1+p99Delta)
|
||||||
|
fmt.Fprintf(f, "\n**含义**: 系统在 %d RPS 时开始出现性能劣化。建议生产限流到 %d RPS 以下。\n\n",
|
||||||
|
kneeRPS, kneeRPS)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(f, "✅ **拐点未触发** — 全程 %d 个 stage 健康运行,最高 %d RPS p99=%.0fms。\n\n",
|
||||||
|
len(s.Stages), kneeRPS, p99Ms)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stage table
|
||||||
|
fmt.Fprintf(f, "### 🔢 阶梯结果\n\n")
|
||||||
|
if len(s.Stages) == 0 {
|
||||||
|
fmt.Fprintf(f, "_无 stage 数据_\n\n")
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(f, "| Stage | TargetRPS | Total | Err | 5xx | P50ms | P95ms | P99ms | Maxms | 涨幅 |\n")
|
||||||
|
fmt.Fprintf(f, "|-------|-----------|-------|-----|-----|-------|-------|-------|-------|------|\n")
|
||||||
|
for i, st := range s.Stages {
|
||||||
|
growth := ""
|
||||||
|
if i > 0 {
|
||||||
|
prevP99 := float64(s.Stages[i-1].P99Us) / 1000
|
||||||
|
curP99 := float64(st.P99Us) / 1000
|
||||||
|
if prevP99 > 0 {
|
||||||
|
pct := (curP99 - prevP99) / prevP99 * 100
|
||||||
|
growth = fmt.Sprintf("%+.0f%%", pct)
|
||||||
|
if pct > 50 {
|
||||||
|
growth = "🚨 " + growth
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "| %d | %d | %s | %s | %s | %.0f | %.0f | %.0f | %.0f | %s |\n",
|
||||||
|
st.StageIdx, st.TargetRPS,
|
||||||
|
commaInt(st.TotalRequests), commaInt(st.Errors), commaInt(st.FiveXX),
|
||||||
|
usToMs(st.P50Us), usToMs(st.P95Us), usToMs(st.P99Us), usToMs(st.MaxUs),
|
||||||
|
growth)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Action items
|
||||||
|
fmt.Fprintf(f, "### 🎯 行动项\n\n")
|
||||||
|
actionItems(f, s, meta, kneeTriggered, kneeRPS)
|
||||||
|
|
||||||
|
// Plot
|
||||||
|
if plotDir != "" {
|
||||||
|
plotName := strings.ToLower(s.Scenario) + ".png"
|
||||||
|
fmt.Fprintf(f, "### 📉 图表\n\n")
|
||||||
|
fmt.Fprintf(f, "\n\n", s.Scenario, plotDir, plotName)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(f, "---\n\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeAppendix(f *os.File, meta RunMetadata) {
|
||||||
|
fmt.Fprintf(f, "## 📎 附录\n\n")
|
||||||
|
fmt.Fprintf(f, "### 健康阈值说明\n\n")
|
||||||
|
fmt.Fprintln(f, "- **P50/P95/P99**: 百分位延迟 (毫秒),值越小越好")
|
||||||
|
fmt.Fprintln(f, "- **错误率**: 4xx+5xx 请求占比,健康 < 1%")
|
||||||
|
fmt.Fprintln(f, "- **5xx 率**: 服务端错误率,健康 < 0.1%")
|
||||||
|
fmt.Fprintln(f, "- **拐点**: 阶梯测试中,p99 相对前一 stage 涨幅 > 50% 的第一个 stage")
|
||||||
|
fmt.Fprintf(f, "\n")
|
||||||
|
fmt.Fprintf(f, "### 文件清单\n\n")
|
||||||
|
fmt.Fprintf(f, "```\n")
|
||||||
|
fmt.Fprintf(f, "reports/\n")
|
||||||
|
fmt.Fprintf(f, "├── final-report.md (本文件)\n")
|
||||||
|
fmt.Fprintf(f, "├── baseline.csv (Excel 可打开的汇总)\n")
|
||||||
|
for _, s := range []string{"S1", "S2", "S3", "S4", "S5", "S6", "S7"} {
|
||||||
|
fmt.Fprintf(f, "├── %s.json%s\n", strings.ToLower(s), "")
|
||||||
|
fmt.Fprintf(f, "├── %s.png%s\n", strings.ToLower(s), "")
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "```\n\n")
|
||||||
|
fmt.Fprintf(f, "### 如何复现\n\n")
|
||||||
|
fmt.Fprintf(f, "```bash\n")
|
||||||
|
fmt.Fprintf(f, "cd /opt/topfans/loadtest\n")
|
||||||
|
if meta.StepSchedule != "" {
|
||||||
|
fmt.Fprintf(f, "./loadgen --cmd=run --scenarios=%s --stage=%s --step-schedule='%s' \\\n",
|
||||||
|
strings.Join(meta.Scenarios, ","), meta.StageMode, meta.StepSchedule)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(f, "./loadgen --cmd=run --scenarios=%s --stage=%s \\\n",
|
||||||
|
strings.Join(meta.Scenarios, ","), meta.StageMode)
|
||||||
|
}
|
||||||
|
if meta.Target != "" {
|
||||||
|
fmt.Fprintf(f, " --target=%s \\\n", meta.Target)
|
||||||
|
}
|
||||||
|
if meta.MonitorMode != "" {
|
||||||
|
fmt.Fprintf(f, " --monitor=%s \\\n", meta.MonitorMode)
|
||||||
|
}
|
||||||
|
if meta.ProdSSH != "" {
|
||||||
|
fmt.Fprintf(f, " --prod-ssh=%s\n", meta.ProdSSH)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "```\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- helpers ----
|
||||||
|
|
||||||
|
// workloadLabel maps a workload identifier ("read", "write_light",
// "write_heavy") to its display label. Any other value is returned unchanged.
func workloadLabel(w string) string {
	labels := map[string]string{
		"read":        "📖 读",
		"write_light": "✏️ 轻写",
		"write_heavy": "🛠️ 重写",
	}
	if label, found := labels[w]; found {
		return label
	}
	return w
}
|
||||||
|
|
||||||
|
// thresholdMark grades a measured value against its threshold:
// ✅ when value <= threshold, ⚠️ when value is within 1.5x the threshold,
// 🚨 otherwise.
func thresholdMark(value, threshold float64) string {
	switch {
	case value <= threshold:
		return "✅"
	case value <= threshold*1.5:
		return "⚠️"
	default:
		return "🚨"
	}
}
|
||||||
|
|
||||||
|
func errSummary(s RunReport) string {
|
||||||
|
if s.TotalRequests == 0 {
|
||||||
|
return "无请求"
|
||||||
|
}
|
||||||
|
rate := pct(s.Errors, s.TotalRequests)
|
||||||
|
return fmt.Sprintf("err %.2f%%", rate)
|
||||||
|
}
|
||||||
|
|
||||||
|
func stagesIdx(stages []StageReport) int {
|
||||||
|
_, idx, _ := KneeRPS(stages)
|
||||||
|
return idx
|
||||||
|
}
|
||||||
|
|
||||||
|
// pct returns num/denom expressed as a percentage (0-100 scale).
// A zero denominator yields 0 instead of dividing.
func pct(num, denom int64) float64 {
	if denom != 0 {
		return float64(num) / float64(denom) * 100
	}
	return 0
}
|
||||||
|
|
||||||
|
// usToMs converts a duration given in microseconds to milliseconds.
func usToMs(us int64) float64 {
	micros := float64(us)
	return micros / 1000
}
|
||||||
|
|
||||||
|
// commaInt formats n in decimal with a comma between each group of three
// digits, e.g. 1234567 -> "1,234,567" and -1234 -> "-1,234".
//
// The sign is split off the formatted text rather than negating n, because
// negating math.MinInt64 overflows and would corrupt the output.
func commaInt(n int64) string {
	digits := fmt.Sprintf("%d", n)
	sign := ""
	if n < 0 {
		sign, digits = "-", digits[1:]
	}

	var b strings.Builder
	b.WriteString(sign)
	for i := 0; i < len(digits); i++ {
		// Insert a separator whenever the remaining digit count is a
		// multiple of three (but never before the first digit).
		if i > 0 && (len(digits)-i)%3 == 0 {
			b.WriteByte(',')
		}
		b.WriteByte(digits[i])
	}
	return b.String()
}
|
||||||
|
|
||||||
|
// emptyDash substitutes an em dash for an empty string so that table cells are
// never blank; non-empty input is returned as-is.
func emptyDash(s string) string {
	if len(s) > 0 {
		return s
	}
	return "—"
}
|
||||||
|
|
||||||
|
// ifThen is a string ternary: it yields a when cond is true and b otherwise.
func ifThen(cond bool, a, b string) string {
	picked := b
	if cond {
		picked = a
	}
	return picked
}
|
||||||
|
|
||||||
|
// actionItems emits scenario-specific P0/P1/P2 action items.
|
||||||
|
func actionItems(f *os.File, s RunReport, meta ScenarioMeta, knee bool, _ int) {
|
||||||
|
p99Ms := usToMs(s.P99Us)
|
||||||
|
errRate := pct(s.Errors, s.TotalRequests)
|
||||||
|
fiveXXRate := pct(s.FiveXX, s.TotalRequests)
|
||||||
|
p99Over := p99Ms > meta.Thresholds.P99MsMax
|
||||||
|
|
||||||
|
anyAction := false
|
||||||
|
|
||||||
|
if knee {
|
||||||
|
kneeRPS, kneeIdx, _ := KneeRPS(s.Stages)
|
||||||
|
fmt.Fprintf(f, "- [ ] **🔴 P0**: 修复 stage %d 拐点 (%d RPS, p99=%.0fms)\n", kneeIdx, kneeRPS, p99Ms)
|
||||||
|
fmt.Fprintf(f, " - 看 PG 慢查询 (`pg_stat_statements ORDER BY mean_exec_time DESC`)\n")
|
||||||
|
fmt.Fprintf(f, " - 跑应用层 profile (`pprof http://localhost:PORT/debug/pprof/profile`)\n")
|
||||||
|
fmt.Fprintf(f, " - 临时方案: 服务端限流到 %d RPS,超限返回 429\n", kneeRPS)
|
||||||
|
anyAction = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if fiveXXRate > 0.5 {
|
||||||
|
fmt.Fprintf(f, "- [ ] **🔴 P0**: 5xx 率 %.2f%% — 看 prod 服务日志,定位具体错误\n", fiveXXRate)
|
||||||
|
anyAction = true
|
||||||
|
}
|
||||||
|
if errRate > 1 {
|
||||||
|
fmt.Fprintf(f, "- [ ] **🟡 P1**: 错误率 %.2f%% — 检查 4xx 错误码,看是否 JWT 过期 / 数据缺失\n", errRate)
|
||||||
|
anyAction = true
|
||||||
|
}
|
||||||
|
if p99Over && !knee {
|
||||||
|
fmt.Fprintf(f, "- [ ] **🟡 P1**: P99 %.0fms 超过阈值 %.0fms — 检查是否有个别慢查询\n", p99Ms, meta.Thresholds.P99MsMax)
|
||||||
|
anyAction = true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Workload-specific suggestions
|
||||||
|
if meta.Workload == "write_heavy" && (knee || p99Over) {
|
||||||
|
fmt.Fprintf(f, "- [ ] **🟡 P1**: 写重场景有性能问题 — 考虑把同步写改成异步(消息队列)\n")
|
||||||
|
anyAction = true
|
||||||
|
}
|
||||||
|
if meta.Workload == "read" && (knee || p99Over) {
|
||||||
|
fmt.Fprintf(f, "- [ ] **🟡 P1**: 读路径有性能问题 — 加 Redis 缓存,减少 DB 直查\n")
|
||||||
|
anyAction = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if !anyAction {
|
||||||
|
fmt.Fprintf(f, "✅ 无需行动项 — 所有指标在阈值内。\n")
|
||||||
|
}
|
||||||
|
fmt.Fprintf(f, "\n")
|
||||||
}
|
}
|
||||||
|
|||||||
156
backend/scripts/loadgen/loadgen/reporter/meta.go
Normal file
156
backend/scripts/loadgen/loadgen/reporter/meta.go
Normal file
@ -0,0 +1,156 @@
|
|||||||
|
package reporter
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// Thresholds defines the health KPIs a scenario is judged against.
// Latency limits are in milliseconds; rate limits are fractions (not percent).
type Thresholds struct {
	P50MsMax      float64 // P50 latency (ms) should be <= this
	P95MsMax      float64 // P95 latency (ms) should be <= this
	P99MsMax      float64 // P99 latency (ms) should be <= this
	ErrorRateMax  float64 // max error fraction, e.g. 0.01 = 1%
	FiveXXRateMax float64 // max 5xx fraction, e.g. 0.001 = 0.1%
}
|
||||||
|
|
||||||
|
// ScenarioMeta describes what a scenario tests and how to evaluate it.
type ScenarioMeta struct {
	ID          string // scenario key, e.g. "S1"
	Name        string // short display name, e.g. "登录"
	API         string // endpoint under test, e.g. "POST /api/v1/auth/login"
	Description string // one-sentence business description
	BusinessImp string // scope of impact (all users / write-heavy / edge feature)
	Workload    string // "read" | "write_light" | "write_heavy"
	Thresholds  Thresholds
}
|
||||||
|
|
||||||
|
// AllScenarios is the registry of known scenarios, keyed by scenario ID.
// The report writers look scenarios up here and skip (or render a placeholder
// for) any ID that is missing.
// Keep this in sync with scenarios/s*.go registry.
var AllScenarios = map[string]ScenarioMeta{
	"S1": {
		ID:          "S1",
		Name:        "用户登录",
		API:         "POST /api/v1/auth/login",
		Description: "用户身份认证,签发 JWT",
		BusinessImp: "🔴 所有用户必经路径,失败 = 用户进不来",
		Workload:    "write_light",
		Thresholds: Thresholds{
			P50MsMax: 100, P95MsMax: 300, P99MsMax: 1000,
			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
		},
	},
	"S2": {
		ID:          "S2",
		Name:        "浏览资产详情",
		API:         "GET /api/v1/assets/{id}",
		Description: "高频读路径,典型缓存命中场景",
		BusinessImp: "🟢 单用户最高频操作,影响页面加载体验",
		Workload:    "read",
		Thresholds: Thresholds{
			P50MsMax: 50, P95MsMax: 150, P99MsMax: 500,
			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
		},
	},
	"S3": {
		ID:          "S3",
		Name:        "点赞 / 取消点赞",
		API:         "POST/DELETE /api/v1/social/assets/{id}/like",
		Description: "轻量写,社交互动",
		BusinessImp: "🟢 写多但单条小,影响点赞数显示",
		Workload:    "write_light",
		Thresholds: Thresholds{
			P50MsMax: 80, P95MsMax: 250, P99MsMax: 800,
			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
		},
	},
	"S4": {
		ID:          "S4",
		Name:        "资产铸造 (mint)",
		API:         "POST /api/v1/assets/mints/precreate",
		Description: "写重路径:OSS 上传 + 签名 + 事务落库",
		BusinessImp: "🟡 核心交易,影响创作者产出节奏",
		Workload:    "write_heavy",
		Thresholds: Thresholds{
			P50MsMax: 300, P95MsMax: 800, P99MsMax: 2000, // write-heavy scenario: looser latency thresholds
			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
		},
	},
	"S5": {
		ID:          "S5",
		Name:        "Dashboard 聚合",
		API:         "聚合多个用户/资产指标",
		Description: "后台聚合查询,可能涉及多表 JOIN",
		BusinessImp: "🟢 运营场景,非实时关键",
		Workload:    "read",
		Thresholds: Thresholds{
			P50MsMax: 200, P95MsMax: 500, P99MsMax: 1500,
			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
		},
	},
	"S6": {
		ID:          "S6",
		Name:        "热门榜单",
		API:         "GET /api/v1/rankings/hot",
		Description: "排序读,Redis 缓存命中率关键",
		BusinessImp: "🟢 首页流量入口,影响新用户第一印象",
		Workload:    "read",
		Thresholds: Thresholds{
			P50MsMax: 30, P95MsMax: 100, P99MsMax: 300,
			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
		},
	},
	"S7": {
		ID:          "S7",
		Name:        "摆展 (place)",
		API:         "展位分配 + 事务",
		Description: "写重路径,涉及展位锁竞争",
		BusinessImp: "🟡 创作者核心操作,涉及并发事务",
		Workload:    "write_heavy",
		Thresholds: Thresholds{
			P50MsMax: 400, P95MsMax: 1000, P99MsMax: 2500,
			ErrorRateMax: 0.01, FiveXXRateMax: 0.001,
		},
	},
}
|
||||||
|
|
||||||
|
// Verdict returns one of ✅ (good), ⚠️ (warning), 🚨 (critical).
|
||||||
|
// Based on thresholds + knee detection.
|
||||||
|
func (s ScenarioMeta) Verdict(r RunReport, kneeTriggered bool) string {
|
||||||
|
if len(r.Stages) == 0 {
|
||||||
|
return "❓"
|
||||||
|
}
|
||||||
|
errRate := float64(0)
|
||||||
|
fiveXXRate := float64(0)
|
||||||
|
if r.TotalRequests > 0 {
|
||||||
|
errRate = float64(r.Errors) / float64(r.TotalRequests)
|
||||||
|
fiveXXRate = float64(r.FiveXX) / float64(r.TotalRequests)
|
||||||
|
}
|
||||||
|
p99Ms := float64(r.P99Us) / 1000
|
||||||
|
|
||||||
|
// 红色条件:任一严重超标
|
||||||
|
if errRate > s.Thresholds.ErrorRateMax*2 ||
|
||||||
|
fiveXXRate > s.Thresholds.FiveXXRateMax*5 ||
|
||||||
|
p99Ms > s.Thresholds.P99MsMax*2 {
|
||||||
|
return "🚨"
|
||||||
|
}
|
||||||
|
// 黄色条件:接近阈值 或 触发拐点
|
||||||
|
if errRate > s.Thresholds.ErrorRateMax ||
|
||||||
|
fiveXXRate > s.Thresholds.FiveXXRateMax ||
|
||||||
|
p99Ms > s.Thresholds.P99MsMax ||
|
||||||
|
kneeTriggered {
|
||||||
|
return "⚠️"
|
||||||
|
}
|
||||||
|
return "✅"
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunMetadata captures run-level context for the report header and for the
// "how to reproduce" command in the appendix. Fields tagged omitempty are
// optional and are dropped from the JSON encoding when empty.
type RunMetadata struct {
	StartTime     time.Time `json:"start_time"`
	EndTime       time.Time `json:"end_time"`
	Target        string    `json:"target"`
	Scenarios     []string  `json:"scenarios"`
	StepSchedule  string    `json:"step_schedule,omitempty"`
	JWTSecretHint string    `json:"jwt_secret_hint,omitempty"`
	ProdSSH       string    `json:"prod_ssh,omitempty"`
	MonitorMode   string    `json:"monitor_mode,omitempty"`
	StageMode     string    `json:"stage_mode"` // "baseline" | "step" | ...
	RPSOverride   int       `json:"rps_override,omitempty"`
}
|
||||||
@ -20,17 +20,20 @@ func doRequest(client *http.Client, req *http.Request, rec *lib.LatencyRecorder,
|
|||||||
totalCount.Add(1)
|
totalCount.Add(1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
errCount.Add(1)
|
errCount.Add(1)
|
||||||
|
rec.RecordResult(true, false)
|
||||||
checkBreaker(client, rec, errCount, totalCount, fiveXXCount, breaker)
|
checkBreaker(client, rec, errCount, totalCount, fiveXXCount, breaker)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
switch {
|
is5xx := resp.StatusCode >= 500
|
||||||
case resp.StatusCode >= 500:
|
isErr := resp.StatusCode >= 400
|
||||||
|
if is5xx {
|
||||||
fiveXXCount.Add(1)
|
fiveXXCount.Add(1)
|
||||||
errCount.Add(1)
|
errCount.Add(1)
|
||||||
case resp.StatusCode >= 400:
|
} else if isErr {
|
||||||
errCount.Add(1)
|
errCount.Add(1)
|
||||||
}
|
}
|
||||||
|
rec.RecordResult(isErr, is5xx)
|
||||||
checkBreaker(client, rec, errCount, totalCount, fiveXXCount, breaker)
|
checkBreaker(client, rec, errCount, totalCount, fiveXXCount, breaker)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -40,6 +40,10 @@ func (s *s1Login) Run(ctx context.Context, rpsOverride int, durationOverride tim
|
|||||||
duration = 2 * time.Minute
|
duration = 2 * time.Minute
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// S1 doesn't internally iterate stages, so wrap entire run as stage 1
|
||||||
|
s.rec.BeginStage(1, targetRPS)
|
||||||
|
defer s.rec.EndStage()
|
||||||
|
|
||||||
ticker := time.NewTicker(time.Second / time.Duration(targetRPS))
|
ticker := time.NewTicker(time.Second / time.Duration(targetRPS))
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
timeout := time.NewTimer(duration)
|
timeout := time.NewTimer(duration)
|
||||||
|
|||||||
@ -38,6 +38,10 @@ func (s *s2Read) Run(ctx context.Context, rpsOverride int, durationOverride time
|
|||||||
duration = 2 * time.Minute
|
duration = 2 * time.Minute
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// S2 doesn't internally iterate stages, wrap entire run as stage 1
|
||||||
|
s.rec.BeginStage(1, targetRPS)
|
||||||
|
defer s.rec.EndStage()
|
||||||
|
|
||||||
ticker := time.NewTicker(time.Second / time.Duration(targetRPS))
|
ticker := time.NewTicker(time.Second / time.Duration(targetRPS))
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
timeout := time.NewTimer(duration)
|
timeout := time.NewTimer(duration)
|
||||||
|
|||||||
@ -39,6 +39,10 @@ func (s *s3Like) Run(ctx context.Context, rpsOverride int, durationOverride time
|
|||||||
duration = 2 * time.Minute
|
duration = 2 * time.Minute
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// S3 doesn't internally iterate stages, wrap entire run as stage 1
|
||||||
|
s.rec.BeginStage(1, targetRPS)
|
||||||
|
defer s.rec.EndStage()
|
||||||
|
|
||||||
ticker := time.NewTicker(time.Second / time.Duration(targetRPS))
|
ticker := time.NewTicker(time.Second / time.Duration(targetRPS))
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
timeout := time.NewTimer(duration)
|
timeout := time.NewTimer(duration)
|
||||||
|
|||||||
@ -37,11 +37,18 @@ func (s *s4Mint) Run(ctx context.Context, rpsOverride int, durationOverride time
|
|||||||
if len(stages) == 0 {
|
if len(stages) == 0 {
|
||||||
stages = []int{5, 10, 20, 30, 50, 80}
|
stages = []int{5, 10, 20, 30, 50, 80}
|
||||||
}
|
}
|
||||||
|
stageDuration := 2 * time.Minute
|
||||||
|
if durationOverride > 0 && durationOverride < stageDuration {
|
||||||
|
stageDuration = durationOverride
|
||||||
|
}
|
||||||
for stageIdx, stageRPS := range stages {
|
for stageIdx, stageRPS := range stages {
|
||||||
logf("S4 stage %d/%d: %d RPS × 2min", stageIdx+1, len(stages), stageRPS)
|
logf("S4 stage %d/%d: %d RPS × %v", stageIdx+1, len(stages), stageRPS, stageDuration)
|
||||||
if err := s.runStage(ctx, stageRPS, 2*time.Minute); err != nil {
|
s.rec.BeginStage(stageIdx+1, stageRPS)
|
||||||
|
if err := s.runStage(ctx, stageRPS, stageDuration); err != nil {
|
||||||
|
s.rec.EndStage()
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
s.rec.EndStage()
|
||||||
logf("S4 stage %d done, resetting mint data...", stageIdx+1)
|
logf("S4 stage %d done, resetting mint data...", stageIdx+1)
|
||||||
if s.prodSSH != "" {
|
if s.prodSSH != "" {
|
||||||
cmd := exec.Command("ssh", s.prodSSH, "bash /opt/topfans/loadtest/scripts/mint_reset.sh")
|
cmd := exec.Command("ssh", s.prodSSH, "bash /opt/topfans/loadtest/scripts/mint_reset.sh")
|
||||||
|
|||||||
35
backend/scripts/loadgen/scripts/prod_seed.sh
Normal file
35
backend/scripts/loadgen/scripts/prod_seed.sh
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# ===================================================================
|
||||||
|
# prod seed 一键运行脚本
|
||||||
|
# 用途:从 /opt/topfans/docker/.env.prod 读 DB/JWT 凭据,跑 seed 工具
|
||||||
|
# 使用:ssh root@101.132.250.62 "bash /opt/topfans/loadtest/scripts/prod_seed.sh"
|
||||||
|
# ===================================================================
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
ENV_FILE="/opt/topfans/docker/.env.prod"
|
||||||
|
LOADTEST_DIR="/opt/topfans/loadtest"
|
||||||
|
|
||||||
|
if [[ ! -f "$ENV_FILE" ]]; then
|
||||||
|
echo "❌ $ENV_FILE 不存在"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Read DB_PASSWORD from the env file. `-f2-` keeps everything after the first
# '=' so a value that itself contains '=' is not truncated.
export DB_PASSWORD=$(grep '^DB_PASSWORD=' "$ENV_FILE" | cut -d= -f2-)
|
||||||
|
# Read JWT_SECRET from the env file. `-f2-` keeps everything after the first
# '=' so secrets containing '=' (e.g. base64 padding) are not truncated.
export JWT_SECRET=$(grep '^JWT_SECRET=' "$ENV_FILE" | cut -d= -f2-)
|
||||||
|
|
||||||
|
cd "$LOADTEST_DIR"
|
||||||
|
|
||||||
|
echo "=========================================="
|
||||||
|
echo "prod seed - 准备 loadtest 数据"
|
||||||
|
echo "DB host: localhost (容器内)"
|
||||||
|
echo "DB name: topfans"
|
||||||
|
# Confirm the secret was loaded without printing any part of it: only its
# length is shown, so terminal/SSH logs never contain secret material.
echo "JWT secret: <hidden> (${#JWT_SECRET} chars)"
|
||||||
|
echo "=========================================="
|
||||||
|
|
||||||
|
./seed --db-name=topfans --jwt-secret="$JWT_SECRET"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "✅ seed 完成。生成的文件:"
|
||||||
|
ls -la users.csv
|
||||||
|
echo ""
|
||||||
|
echo "下一步: ./loadgen --cmd=preflight --target=http://localhost:8080"
|
||||||
@ -1,67 +1,188 @@
|
|||||||
# seed - 压测数据准备工具
|
# seed - 压测数据准备工具
|
||||||
|
|
||||||
## 用途
|
> 给 prod 凌晨压测灌 1000 个测试用户 + 资产 + JWT,数据用 `star_id=999900` 做逻辑隔离(与真实数据同库,仅靠 star_id 区分)。
|
||||||
|
|
||||||
在 prod 本地插入 1000 个测试用户、5000 资产、3000 booth_slots、2000 exhibitions、10000 friendships,签 1000 个 JWT,写 `users.csv`。
|
---
|
||||||
|
|
||||||
|
## 一句话总结
|
||||||
|
|
||||||
|
跑 `./seed`,数据库里多出 1000 个用户 + 5000 个 assets + 2000 个 exhibitions,本地多出 `users.csv` (含 JWT)。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## 编译
|
## 编译
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd backend && go build -o seed ./scripts/loadgen/seed/
|
cd backend
|
||||||
|
go build -o bin/seed ./scripts/loadgen/seed/
|
||||||
|
# 或
|
||||||
|
make loadgen-build
|
||||||
```
|
```
|
||||||
|
|
||||||
## 在 prod 上跑
|
---
|
||||||
|
|
||||||
|
## 在 prod 上跑 (凌晨 T0 = 02:00)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 1. 上传二进制
|
|
||||||
scp seed root@101.132.250.62:/opt/topfans/loadtest/
|
|
||||||
|
|
||||||
# 2. SSH 上去跑
|
|
||||||
ssh root@101.132.250.62
|
ssh root@101.132.250.62
|
||||||
cd /opt/topfans/loadtest
|
cd /opt/topfans/loadtest
|
||||||
export DB_PASSWORD=$(cat /opt/topfans/docker/.env.prod | grep DB_PASSWORD | cut -d= -f2)
|
bash scripts/prod_seed.sh
|
||||||
export JWT_SECRET=$(cat /opt/topfans/docker/.env.prod | grep JWT_SECRET | cut -d= -f2)
|
|
||||||
./seed --db-name=topfans --jwt-secret="$JWT_SECRET"
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## 清理
|
这个脚本会自动:
|
||||||
|
1. 读 `/opt/topfans/docker/.env.prod` 拿 DB_PASSWORD + JWT_SECRET
|
||||||
|
2. 跑 seed (插入 23k 行测试数据)
|
||||||
|
3. 自动重置 PG 序列 (CLAUDE.md 规范)
|
||||||
|
4. 写 `users.csv` (含 1000 个 JWT)
|
||||||
|
|
||||||
|
**预计耗时**:30-60 秒
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 在本地 docker 跑 (开发联调)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 保留 1000 users + 资产(下次复用)
|
cd backend
|
||||||
./seed --cleanup
|
|
||||||
|
|
||||||
# 全删(包括账号本身)
|
# 1. 生成 bcrypt 哈希 (与 tokens.go 硬编码的 "Test@123" 匹配)
|
||||||
./seed --cleanup --full
|
python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \
|
||||||
|
> loadtest_bcrypt.txt
|
||||||
|
|
||||||
# 只重签 token(第二轮压测 JWT 过期时)
|
# 2. 跑 seed (假设本地 docker postgres 在 15432)
|
||||||
./seed --reset-tokens --jwt-secret="$JWT_SECRET"
|
cd "$(git rev-parse --show-toplevel)/backend"
|
||||||
|
DB_PASSWORD=123456 \
|
||||||
|
JWT_SECRET=topfans-secret-key-local-dev-only \
|
||||||
|
./bin/seed \
|
||||||
|
--db-name=top-fans \
|
||||||
|
--db-host=localhost \
|
||||||
|
--db-port=15432 \
|
||||||
|
--db-user=postgres
|
||||||
```
|
```
|
||||||
|
|
||||||
## 本地 docker 联调(开发阶段)
|
**注意**: `loadtest_bcrypt.txt` 必须在 seed 二进制运行的**当前目录**(代码用相对路径读)。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 命令行参数
|
||||||
|
|
||||||
|
```
|
||||||
|
./bin/seed --help
|
||||||
|
|
||||||
|
Usage of ./bin/seed:
|
||||||
|
-cleanup # 跑清理 (默认保留 1000 users)
|
||||||
|
-cleanup-star-id int # 要清的 star_id (默认 999900, 防止误删)
|
||||||
|
-full # 配合 -cleanup: 也删用户和 stars
|
||||||
|
-reset # 删旧数据再 seed (隐含 --cleanup 行为)
|
||||||
|
-reset-tokens # 只重签 JWT (数据保留)
|
||||||
|
-jwt-secret string # JWT 密钥 (默认 $JWT_SECRET)
|
||||||
|
-db-host string # PG host (默认 localhost)
|
||||||
|
-db-port int # PG port (默认 5432)
|
||||||
|
-db-name string # PG 数据库 (prod=topfans, 本地=top-fans)
|
||||||
|
-db-user string # PG user (默认 postgres)
|
||||||
|
-db-password string # PG 密码 (默认 $DB_PASSWORD)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 三种"清理"模式对比
|
||||||
|
|
||||||
|
| 命令 | 删 stars | 删 users | 删 assets/exhibits | 用途 |
|
||||||
|
|------|---------|---------|-------------------|------|
|
||||||
|
| `./seed --cleanup` | ❌ | ❌ | ✅ | 压完一轮,清理资产但保留账号 |
|
||||||
|
| `./seed --cleanup --full` | ✅ | ✅ | ✅ | 全部清,下次重新 seed |
|
||||||
|
| `./seed --reset` | ❌ | ❌ | ✅ | 等同 `--cleanup`(保留用户) |
|
||||||
|
| `./seed --reset-tokens` | ❌ | ❌ | ❌ | 只重新签 JWT,数据不动 |
|
||||||
|
|
||||||
|
**典型流程**:
|
||||||
|
```bash
|
||||||
|
# 第 1 轮压测 (02:00-03:00)
|
||||||
|
./seed # 灌数据
|
||||||
|
./loadgen --cmd=run --scenarios=S1,S2,S4 # 压测
|
||||||
|
./seed --cleanup # 压完清理资产
|
||||||
|
|
||||||
|
# 第 2 轮压测 (下周,JWT 过期了)
|
||||||
|
./seed --reset-tokens --jwt-secret="$JWT_SECRET" # 只重签 JWT
|
||||||
|
./loadgen --cmd=run --scenarios=S1,S2,S4 # 复测
|
||||||
|
|
||||||
|
# 完全重来 (例如改了用户模型)
|
||||||
|
./seed --cleanup --full # 全删
|
||||||
|
./seed # 重新灌
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 数据规模
|
||||||
|
|
||||||
|
| 表 | 行数 | 备注 |
|
||||||
|
|----|------|------|
|
||||||
|
| `stars` | +1 | star_id=999900 |
|
||||||
|
| `users` | +1000 | mobile 19900000001 ~ 19900001000 |
|
||||||
|
| `fan_profiles` | +1000 | 每个 user 一个 |
|
||||||
|
| `crystal_transaction_records` | +1000+ | 初始水晶 |
|
||||||
|
| `assets` | +5000 | 每个 user ~5 个 |
|
||||||
|
| `booth_slots` | +3000 | |
|
||||||
|
| `exhibitions` | +2000 | |
|
||||||
|
| `friendships` | +10000 | |
|
||||||
|
| **TOTAL** | **~23k 行** | |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 关键设计
|
||||||
|
|
||||||
|
### 1. star_id 隔离
|
||||||
|
所有测试数据用 `star_id = 999900`,**不影响**真实业务 (87, 88, 91, 93, 94, 95)。
|
||||||
|
|
||||||
|
### 2. PG max_connections = 50
|
||||||
|
prod 已将 `POSTGRES_MAX_CONNECTIONS` 从 100 调到 50,避免被测试数据耗尽连接池。
|
||||||
|
|
||||||
|
### 3. CLAUDE.md 序列重置
|
||||||
|
seed 末尾自动 `setval()` 所有相关表的 sequence,避免后续 GORM 插入报 duplicate key。
|
||||||
|
|
||||||
|
### 4. JWT 7 天过期
|
||||||
|
跨周第二轮压测前需 `--reset-tokens` 重签。
|
||||||
|
|
||||||
|
### 5. bcrypt 哈希与密码硬编码
|
||||||
|
- `tokens.go` 硬编码密码为 `"Test@123"`(写到 users.csv 的 password 列)
|
||||||
|
- `loadtest_bcrypt.txt` 是这个密码的 bcrypt(cost=10) 哈希
|
||||||
|
- 二者必须匹配,否则 login 会报 500
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 常见问题
|
||||||
|
|
||||||
|
### Q: 跑完 seed 但 login 报"密码错误"?
|
||||||
|
A: `loadtest_bcrypt.txt` 没匹配上 `Test@123`。
|
||||||
|
```bash
|
||||||
|
python3 -c "import bcrypt; print(bcrypt.hashpw(b'Test@123', bcrypt.gensalt(rounds=10)).decode())" \
|
||||||
|
> loadtest_bcrypt.txt
|
||||||
|
./seed --cleanup --full && ./seed
|
||||||
|
```
|
||||||
|
|
||||||
|
### Q: 想换密码怎么办?
|
||||||
|
A: 同时改两个地方:
|
||||||
|
1. `tokens.go` 的 `u.Mobile, "Test@123"` → 你的密码
|
||||||
|
2. `loadtest_bcrypt.txt` 重新生成
|
||||||
|
|
||||||
|
### Q: "loadtest_bcrypt.txt: no such file or directory"?
|
||||||
|
A: seed 用相对路径读这个文件,必须在 seed 目录跑(或者把文件 cp 到当前目录)。
|
||||||
|
|
||||||
|
### Q: --reset 没生效,users 还是旧的?
|
||||||
|
A: 因为 `--reset` 等同 `--cleanup`(保留用户)。要删用户用 `--cleanup --full`。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 单元测试
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd backend
|
cd backend
|
||||||
go build -o bin/seed ./scripts/loadgen/seed/
|
go test ./scripts/loadgen/seed/ -v
|
||||||
DB_PASSWORD=postgres123 JWT_SECRET=topfans-secret-key-local-dev-only \
|
|
||||||
./bin/seed --db-name=top-fans --db-host=localhost
|
|
||||||
```
|
|
||||||
|
|
||||||
## 关键约束
|
|
||||||
|
|
||||||
- **star_id = 999900**:所有数据用此 star_id 隔离,不影响真实业务
|
|
||||||
- **PG max_connections = 50**:Task 5 已将 `POSTGRES_MAX_CONNECTIONS` 从 100 改到 50
|
|
||||||
- **CLAUDE.md 序列重置**:ResetSequences 会在 seed 末尾自动同步所有相关表的 sequence,避免后续 GORM 插入报 duplicate key
|
|
||||||
- **JWT 7 天过期**:跨周第二轮压测前需 `--reset-tokens` 重签
|
|
||||||
|
|
||||||
## 测试
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd backend && go test ./scripts/loadgen/seed/ -v
|
|
||||||
```
|
```
|
||||||
|
|
||||||
5 个测试:
|
5 个测试:
|
||||||
- `TestMobileNumbering`:mobile 编号正确性
|
- `TestMobileNumbering`: mobile 编号正确性
|
||||||
- `TestSequenceMapping`:loadtestSeqs 映射
|
- `TestSequenceMapping`: loadtestSeqs 映射
|
||||||
- `TestPKColumnMapping`:pkColumns 映射(关键 stars/star_id, booth_slots/slot_id)
|
- `TestPKColumnMapping`: pkColumns 映射(关键 stars/star_id, booth_slots/slot_id)
|
||||||
- `TestCleanupRejectsInvalidStarID`:cleanup 拒绝非 loadtest star_id
|
- `TestCleanupRejectsInvalidStarID`: cleanup 拒绝非 loadtest star_id
|
||||||
- `TestJoinInt64`:CSV 序列化辅助函数
|
- `TestJoinInt64`: CSV 序列化辅助函数
|
||||||
|
|
||||||
|
**测试状态**: 5/5 PASS
|
||||||
|
|||||||
@ -470,7 +470,7 @@ defineExpose({
|
|||||||
.creation-grid {
|
.creation-grid {
|
||||||
display: flex;
|
display: flex;
|
||||||
flex-wrap: wrap;
|
flex-wrap: wrap;
|
||||||
justify-content: space-between;
|
justify-content: space-around;
|
||||||
padding-bottom: 120rpx;
|
padding-bottom: 120rpx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -456,6 +456,8 @@ onUnmounted(() => {
|
|||||||
min-height: 0;
|
min-height: 0;
|
||||||
border-radius: 12px;
|
border-radius: 12px;
|
||||||
overflow: hidden;
|
overflow: hidden;
|
||||||
|
position: relative;
|
||||||
|
z-index: 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
.ranking-tabs {
|
.ranking-tabs {
|
||||||
@ -636,6 +638,7 @@ onUnmounted(() => {
|
|||||||
/* box-shadow: 2px 2px 4.5px 0px #f04b4b40; */
|
/* box-shadow: 2px 2px 4.5px 0px #f04b4b40; */
|
||||||
box-shadow: 2px 4px 4px 0px #c92f2f5c;
|
box-shadow: 2px 4px 4px 0px #c92f2f5c;
|
||||||
margin-bottom: 36.8rpx;
|
margin-bottom: 36.8rpx;
|
||||||
|
z-index: 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 单行布局:藏品图片 + 头像 + 点赞信息 + TOP 标签 */
|
/* 单行布局:藏品图片 + 头像 + 点赞信息 + TOP 标签 */
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user