# deep-risk/backend/scripts/generate_test_data.py
# 2025-12-14 20:08:27 +08:00
#
# 368 lines
# 15 KiB
# Python

#!/usr/bin/env python3
"""
测试数据生成脚本
为税务风控系统生成各种测试数据
"""
import json
import os
import random
import sys
from datetime import datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional
# Add the backend root (the parent of this script's directory) to the import
# path. It is resolved from this file's own location so the script does not
# depend on one developer's home directory.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
from loguru import logger
# Configure logging: add a file sink named "test_data_generation.log" with
# rotation set to "100 MB" and a minimum level of INFO.
logger.add("test_data_generation.log", rotation="100 MB", level="INFO")
class TestDataGenerator:
    """Generate synthetic datasets for the tax risk-control system.

    Every ``generate_*`` method returns a list of plain dictionaries built
    from the module-level ``random`` generator. ``generate_all_data`` builds
    all datasets and writes each one to a JSON file through ``save_to_json``.
    """

    # Sample taxpayer identification numbers used for streamers and as the
    # fallback for tax declarations.
    _TAX_NOS = [
        "91110000123456789A", "91110000987654321B", "91110000555666777C",
        "91110000333444555D", "91110000111222333E", "91110000777888999F",
    ]
    # Invoices and orders only draw from the first three taxpayers.
    _TRADING_TAX_NOS = _TAX_NOS[:3]
    # Bank-transaction counterparties: individuals first, then companies.
    _PERSONAL_COUNTERPARTIES = ["张三", "李四", "王五", "赵六", "钱七"]
    _CORPORATE_COUNTERPARTIES = [
        "XX科技有限公司", "YY贸易有限公司", "ZZ文化传媒有限公司", "AA网络科技有限公司",
    ]

    def __init__(self, output_dir: Optional[str] = None):
        """Create the generator.

        Args:
            output_dir: Directory that ``save_to_json`` writes into. When
                omitted, ``test_data`` under the parent of this script's
                directory is used.
        """
        # SQLite engine and session factory intended for testing. None of the
        # other methods of this class use them; they are kept as attributes.
        self.engine = create_engine(
            "sqlite:///./test_data.db",
            echo=False,
            future=True,
        )
        self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine)
        self.output_dir = output_dir or self._default_output_dir()

    @staticmethod
    def _default_output_dir() -> str:
        """Return ``test_data`` under the parent of this script's directory."""
        script_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(os.path.dirname(script_dir), "test_data")

    @staticmethod
    def _random_past_datetime(max_days: int = 365, min_days: int = 0) -> datetime:
        """Return now minus a random whole number of days in [min_days, max_days]."""
        return datetime.now() - timedelta(days=random.randint(min_days, max_days))

    def generate_streamers(self, count: int = 50) -> List[Dict]:
        """Generate ``count`` streamer records.

        Returns:
            A list of dicts with a sequential ``streamer_id``, a random tax
            number, platform and tier, and a creation time 30-365 days ago.
        """
        logger.info(f"开始生成 {count} 个主播数据...")
        streamers = []
        for i in range(1, count + 1):
            streamers.append({
                "streamer_id": f"STREAMER_{i:04d}",
                "streamer_name": f"主播_{i}",
                "tax_no": random.choice(self._TAX_NOS),
                "platform": random.choice(["抖音", "快手", "淘宝直播", "小红书"]),
                "tier": random.choice(["S", "A", "B", "C"]),
                "status": "active",
                "created_at": self._random_past_datetime(365, min_days=30),
            })
        logger.info(f"成功生成 {len(streamers)} 个主播数据")
        return streamers

    def generate_recharges(self, streamers: List[Dict], count: int = 1000) -> List[Dict]:
        """Generate ``count`` platform recharge records.

        Args:
            streamers: Streamer dicts; each needs ``streamer_id`` and
                ``platform``. Must be non-empty when ``count`` > 0.
            count: Number of records to create.

        Raises:
            IndexError: If ``streamers`` is empty and ``count`` > 0.
        """
        logger.info(f"开始生成 {count} 条充值记录...")
        recharges = []
        for i in range(1, count + 1):
            streamer = random.choice(streamers)
            amount = random.randint(1000, 100000)
            recharges.append({
                "recharge_id": f"RECHARGE_{i:06d}",
                "streamer_id": streamer["streamer_id"],
                "recharge_date": self._random_past_datetime(),
                "recharge_amount": float(amount),
                "payment_method": random.choice(["支付宝", "微信支付", "银行卡", "Apple Pay"]),
                "payment_status": "completed",
                "platform": streamer["platform"],
                "created_at": self._random_past_datetime(),
            })
        logger.info(f"成功生成 {len(recharges)} 条充值记录")
        return recharges

    def generate_tax_declarations(self, streamers: List[Dict], count: int = 500) -> List[Dict]:
        """Generate ``count`` tax declaration records.

        Each record belongs to a randomly chosen streamer and carries that
        streamer's ``tax_no`` (a random sample tax number is used only when
        the streamer dict has none). 10% of records declare nothing, a
        further 20% under-declare, and the rest declare 10000-100000.

        Args:
            streamers: Streamer dicts. Must be non-empty when ``count`` > 0.
            count: Number of records to create.

        Raises:
            IndexError: If ``streamers`` is empty and ``count`` > 0.
        """
        logger.info(f"开始生成 {count} 条税务申报记录...")
        declarations = []
        for i in range(1, count + 1):
            streamer = random.choice(streamers)
            # One draw decides the scenario so the 10% / 20% / 70% split is
            # exact; two independent draws would shrink the middle bucket.
            roll = random.random()
            if roll < 0.1:
                declared_amount = 0
            elif roll < 0.3:
                declared_amount = random.randint(100, 50000)
            else:
                declared_amount = random.randint(10000, 100000)
            declarations.append({
                "declaration_id": f"TAX_{i:06d}",
                "tax_no": streamer.get("tax_no") or random.choice(self._TAX_NOS),
                "declaration_date": self._random_past_datetime(),
                "declared_amount": float(declared_amount),
                "tax_rate": 0.13,
                # Rounded to cents to drop binary floating-point noise.
                "tax_amount": round(declared_amount * 0.13, 2),
                "declaration_period": "2024-01",
                "status": "submitted",
            })
        logger.info(f"成功生成 {len(declarations)} 条税务申报记录")
        return declarations

    def generate_bank_transactions(self, count: int = 2000) -> List[Dict]:
        """Generate ``count`` bank statement records.

        Transactions whose counterparty is an individual get a larger amount
        range (10000-100000) than those with a company (1000-50000).
        """
        logger.info(f"开始生成 {count} 条银行流水记录...")
        transactions = []
        counterparty_names = self._PERSONAL_COUNTERPARTIES + self._CORPORATE_COUNTERPARTIES
        for i in range(1, count + 1):
            counterparty = random.choice(counterparty_names)
            if counterparty in self._PERSONAL_COUNTERPARTIES:
                amount = random.randint(10000, 100000)
            else:
                amount = random.randint(1000, 50000)
            transactions.append({
                "transaction_id": f"TXN_{i:06d}",
                "account_no": f"6222{random.randint(1000000000000, 9999999999999)}",
                "transaction_date": self._random_past_datetime(),
                "transaction_type": random.choice(["转入", "转出", "消费", "收入"]),
                "amount": float(amount),
                "counterparty_name": counterparty,
                "counterparty_account": f"6222{random.randint(1000000000000, 9999999999999)}",
                "counterparty_bank": random.choice(["中国银行", "工商银行", "建设银行", "农业银行"]),
                "description": random.choice(["转账", "收入", "退款", "服务费", "货款"]),
                "balance": float(random.randint(100000, 1000000)),
            })
        logger.info(f"成功生成 {len(transactions)} 条银行流水记录")
        return transactions

    def generate_invoices(self, count: int = 800, max_order_no: int = 10000) -> List[Dict]:
        """Generate ``count`` invoice records.

        About 10% are fabricated invoices: larger amounts (50000-200000) and
        ``order_id`` set to ``None``. The others reference a random order
        number in ``1..max_order_no``.

        Args:
            count: Number of invoices to create.
            max_order_no: Highest order sequence number a genuine invoice may
                reference. Pass the number of generated orders so every
                genuine invoice points at an existing order.
        """
        logger.info(f"开始生成 {count} 张发票...")
        invoices = []
        business_types = ["销售", "服务", "加工", "修理"]
        for i in range(1, count + 1):
            is_fake = random.random() < 0.1
            if is_fake:
                total_amount = random.randint(50000, 200000)
                order_id = None
            else:
                total_amount = random.randint(5000, 50000)
                order_id = f"ORDER_{random.randint(1, max_order_no):06d}"
            tax_rate = random.choice([0.06, 0.09, 0.13])
            invoices.append({
                "invoice_id": f"INV_{i:06d}",
                "seller_tax_no": random.choice(self._TRADING_TAX_NOS),
                "seller_name": "销售方企业",
                "buyer_tax_no": random.choice(self._TRADING_TAX_NOS),
                "buyer_name": "购买方企业",
                "invoice_date": self._random_past_datetime(),
                "total_amount": float(total_amount),
                # Rounded to cents to drop binary floating-point noise.
                "tax_amount": round(total_amount * tax_rate, 2),
                "tax_rate": tax_rate,
                "invoice_type": random.choice(["special", "normal"]),
                "invoice_status": random.choice(["valid", "cancelled", "red"]),
                "business_type": random.choice(business_types),
                "order_id": order_id,
            })
        logger.info(f"成功生成 {len(invoices)} 张发票")
        return invoices

    def generate_orders(self, count: int = 1500) -> List[Dict]:
        """Generate ``count`` order records with sequential ``order_id`` values."""
        logger.info(f"开始生成 {count} 条订单...")
        orders = []
        for i in range(1, count + 1):
            amount = random.randint(5000, 50000)
            orders.append({
                "order_id": f"ORDER_{i:06d}",
                "seller_tax_no": random.choice(self._TRADING_TAX_NOS),
                "buyer_tax_no": random.choice(self._TRADING_TAX_NOS),
                "order_date": self._random_past_datetime(),
                "total_amount": float(amount),
                "payment_status": random.choice(["paid", "pending", "refunded"]),
                "fulfillment_status": random.choice(["completed", "pending", "cancelled"]),
                "settlement_id": f"SETTLE_{random.randint(1, 500):06d}",
            })
        logger.info(f"成功生成 {len(orders)} 条订单")
        return orders

    def generate_expenses(self, count: int = 600) -> List[Dict]:
        """Generate ``count`` expense voucher records.

        About 15% are anomalous vouchers drawn from a larger amount range
        (10000-200000 instead of 1000-30000). ``is_large_amount`` is true
        whenever the amount exceeds 50000.
        """
        logger.info(f"开始生成 {count} 条费用凭证...")
        expenses = []
        categories = ["办公费", "差旅费", "招待费", "交通费", "其他费用"]
        for i in range(1, count + 1):
            is_anomaly = random.random() < 0.15
            if is_anomaly:
                amount = random.randint(10000, 200000)
            else:
                amount = random.randint(1000, 30000)
            expenses.append({
                "expense_id": f"EXP_{i:06d}",
                "voucher_no": f"VOU2024{i:06d}",
                "expense_type": random.choice(["成本", "费用"]),
                "expense_category": random.choice(categories),
                "payer_name": "付款方企业",
                "payee_name": random.choice(["张三", "李四", "服务提供商"]),
                "expense_date": self._random_past_datetime(),
                "expense_amount": float(amount),
                # Rounded to cents to drop binary floating-point noise.
                "tax_amount": round(amount * 0.13, 2),
                "tax_rate": 0.13,
                "payment_method": random.choice(["银行转账", "现金", "支票"]),
                "is_large_amount": amount > 50000,
                "is_cross_border": random.random() < 0.2,
                "fiscal_year": 2024,
                "fiscal_period": random.randint(1, 12),
                "payment_status": "已支付",
            })
        logger.info(f"成功生成 {len(expenses)} 条费用凭证")
        return expenses

    def generate_settlements(self, count: int = 400) -> List[Dict]:
        """Generate ``count`` settlement records.

        Settlement ``i`` references order ``i``. ``platform_commission`` is
        derived from the record's own ``commission_rate`` so the two fields
        always agree.
        """
        logger.info(f"开始生成 {count} 条结算记录...")
        settlements = []
        for i in range(1, count + 1):
            amount = random.randint(5000, 50000)
            settlement_date = self._random_past_datetime()
            settlement_status = random.choice(["completed", "pending", "cancelled"])
            # Draw the rate once and reuse it for the commission amount.
            commission_rate = round(random.uniform(0.05, 0.3), 2)
            settlements.append({
                "settlement_id": f"SETTLE_{i:06d}",
                "order_id": f"ORDER_{i:06d}",
                "actual_amount": float(amount),
                "settlement_date": settlement_date,
                "settlement_status": settlement_status,
                "commission_rate": commission_rate,
                "platform_commission": round(amount * commission_rate, 2),
            })
        logger.info(f"成功生成 {len(settlements)} 条结算记录")
        return settlements

    def save_to_json(self, data: Any, filename: str) -> None:
        """Write ``data`` as UTF-8 JSON to ``filename`` inside ``self.output_dir``.

        Missing directories are created. Values that ``json`` cannot encode
        natively (the ``datetime`` objects in the records) are written via
        ``str``.

        Args:
            data: A list of record dicts, or the summary dict.
            filename: File name relative to the output directory.
        """
        filepath = os.path.join(self.output_dir, filename)
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2, default=str)
        logger.info(f"数据已保存到: {filepath}")

    def generate_all_data(self):
        """Generate every dataset, save each to JSON, and write a summary.

        One ``<name>.json`` file is written per dataset, followed by
        ``summary.json`` holding the generation time, per-dataset record
        counts and the grand total.
        """
        logger.info("开始生成测试数据...")
        # Invoices are limited to order numbers that will actually exist.
        order_count = 1500
        streamers = self.generate_streamers(50)
        datasets = {
            "streamers": streamers,
            "recharges": self.generate_recharges(streamers, 1000),
            "tax_declarations": self.generate_tax_declarations(streamers, 500),
            "bank_transactions": self.generate_bank_transactions(2000),
            "invoices": self.generate_invoices(800, max_order_no=order_count),
            "orders": self.generate_orders(order_count),
            "expenses": self.generate_expenses(600),
            "settlements": self.generate_settlements(400),
        }
        for name, records in datasets.items():
            self.save_to_json(records, f"{name}.json")
        data_count = {name: len(records) for name, records in datasets.items()}
        summary = {
            "generation_time": datetime.now().isoformat(),
            "data_count": data_count,
            "total_records": sum(data_count.values()),
        }
        self.save_to_json(summary, "summary.json")
        logger.info("测试数据生成完成!")
        logger.info(f"总计生成 {summary['total_records']} 条记录")
        logger.info(json.dumps(summary, ensure_ascii=False, indent=2))
def main():
    """Run the generator as a command-line script.

    Prints a banner, generates and saves every dataset, then lists the files
    that were written. On any failure the error is logged together with its
    traceback, a short message is printed, and the process exits with
    status 1.
    """
    print("=" * 60)
    print("税务风控系统 - 测试数据生成器")
    print("=" * 60)
    generator = TestDataGenerator()
    try:
        generator.generate_all_data()
        # Prefer the generator's own output_dir attribute when it exposes
        # one; otherwise report the historical fixed location.
        output_dir = getattr(
            generator,
            "output_dir",
            "/Users/liulujian/Documents/code/deeprisk-claude-1/backend/test_data/",
        )
        print("\n✅ 测试数据生成成功!")
        print(f"📂 数据文件位置: {output_dir}")
        print("\n生成的数据文件:")
        print(" - streamers.json (主播数据)")
        print(" - recharges.json (充值记录)")
        print(" - tax_declarations.json (税务申报)")
        print(" - bank_transactions.json (银行流水)")
        print(" - invoices.json (发票数据)")
        print(" - orders.json (订单数据)")
        print(" - expenses.json (费用凭证)")
        print(" - settlements.json (结算数据)")
        print(" - summary.json (汇总信息)")
    except Exception as e:
        # loguru formats the message with str.format whenever extra arguments
        # are given, so the exception text is passed as an argument rather
        # than embedded in the template (it may contain braces).
        # logger.exception also records the traceback, which the stdlib-style
        # exc_info=True keyword does not do in loguru.
        logger.exception("生成测试数据失败: {}", e)
        print(f"\n❌ 生成失败: {str(e)}")
        sys.exit(1)
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()