#!/usr/bin/env python3
"""
Test data generation script.

Generates assorted test data for the tax risk-control system.
"""

import json
import os
import random
import sys
from datetime import datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional

# Add the project path
sys.path.append('/Users/liulujian/Documents/code/deeprisk-claude-1/backend')

from loguru import logger
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

# Configure logging: in addition to loguru's default sink, write INFO and above
# to a log file that rotates at 100 MB.
logger.add("test_data_generation.log", rotation="100 MB", level="INFO")

class TestDataGenerator:
    """Generate randomized test fixtures for the tax risk-control system.

    Every ``generate_*`` method returns a list of plain dictionaries built
    from the module-level :mod:`random` state, so callers may seed ``random``
    to make ids/choices reproducible (timestamps stay relative to "now").

    Args:
        output_dir: Directory that :meth:`save_to_json` writes into. When
            omitted, the ``TEST_DATA_OUTPUT_DIR`` environment variable is used
            if set, otherwise :attr:`DEFAULT_OUTPUT_DIR`.
    """

    # The class name starts with "Test"; tell pytest it is not a test case.
    __test__ = False

    # Historical output location, kept as the default for backward compatibility.
    DEFAULT_OUTPUT_DIR = "/Users/liulujian/Documents/code/deeprisk-claude-1/backend/test_data"
    OUTPUT_DIR_ENV_VAR = "TEST_DATA_OUTPUT_DIR"
    # Class-level fallback so instances created without __init__ still work.
    output_dir = DEFAULT_OUTPUT_DIR

    # Pool of taxpayer numbers shared by streamers and tax declarations.
    TAX_NOS = (
        "91110000123456789A", "91110000987654321B", "91110000555666777C",
        "91110000333444555D", "91110000111222333E", "91110000777888999F",
    )
    # Invoices and orders only ever use the first three taxpayer numbers.
    TRADE_TAX_NOS = TAX_NOS[:3]

    # Individuals (treated as personal accounts) and companies.
    PERSONAL_COUNTERPARTIES = ("张三", "李四", "王五", "赵六", "钱七")
    COMPANY_COUNTERPARTIES = (
        "XX科技有限公司", "YY贸易有限公司", "ZZ文化传媒有限公司", "AA网络科技有限公司",
    )
    COUNTERPARTY_NAMES = PERSONAL_COUNTERPARTIES + COMPANY_COUNTERPARTIES

    # Flat rate applied to tax declarations and expense vouchers.
    FLAT_TAX_RATE = 0.13

    def __init__(self, output_dir: Optional[str] = None):
        self.output_dir = (
            output_dir
            or os.environ.get(self.OUTPUT_DIR_ENV_VAR)
            or self.DEFAULT_OUTPUT_DIR
        )
        # Database connection - SQLite is used for testing.
        self.engine = create_engine(
            "sqlite:///./test_data.db",
            echo=False,
            future=True,
        )
        self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine)

    @staticmethod
    def _days_ago(now: datetime, min_days: int = 0, max_days: int = 365) -> datetime:
        """Return *now* moved back by a random number of whole days in [min_days, max_days]."""
        return now - timedelta(days=random.randint(min_days, max_days))

    def generate_streamers(self, count: int = 50) -> List[Dict]:
        """Generate *count* streamer records with sequential ``STREAMER_nnnn`` ids.

        Each streamer gets a random taxpayer number, platform and tier, and a
        ``created_at`` between 30 and 365 days in the past.
        """
        logger.info(f"开始生成 {count} 个主播数据...")
        now = datetime.now()
        streamers = []

        for i in range(count):
            streamers.append({
                "streamer_id": f"STREAMER_{i+1:04d}",
                "streamer_name": f"主播_{i+1}",
                "tax_no": random.choice(self.TAX_NOS),
                "platform": random.choice(["抖音", "快手", "淘宝直播", "小红书"]),
                "tier": random.choice(["S", "A", "B", "C"]),
                "status": "active",
                "created_at": self._days_ago(now, 30, 365),
            })

        logger.info(f"成功生成 {len(streamers)} 个主播数据")
        return streamers

    def generate_recharges(self, streamers: List[Dict], count: int = 1000) -> List[Dict]:
        """Generate *count* platform recharge records tied to random streamers.

        Each record copies ``streamer_id`` and ``platform`` from the chosen
        streamer.

        Raises:
            IndexError: If *streamers* is empty while *count* is positive
                (raised by ``random.choice``).
        """
        logger.info(f"开始生成 {count} 条充值记录...")
        now = datetime.now()
        recharges = []

        for i in range(count):
            streamer = random.choice(streamers)
            amount = random.randint(1000, 100000)

            recharges.append({
                "recharge_id": f"RECHARGE_{i+1:06d}",
                "streamer_id": streamer["streamer_id"],
                "recharge_date": self._days_ago(now),
                "recharge_amount": float(amount),
                "payment_method": random.choice(["支付宝", "微信支付", "银行卡", "Apple Pay"]),
                "payment_status": "completed",
                "platform": streamer["platform"],
                "created_at": self._days_ago(now),
            })

        logger.info(f"成功生成 {len(recharges)} 条充值记录")
        return recharges

    def generate_tax_declarations(self, streamers: List[Dict], count: int = 500) -> List[Dict]:
        """Generate *count* tax declaration records.

        Roughly 10% of declarations are deliberate omissions (declared amount
        0) and a further 20% are deliberate under-reports (100-50000); the
        rest declare 10000-100000. The taxpayer number is taken from a randomly
        chosen streamer so declarations can be matched back to streamers; a
        streamer without a ``tax_no`` falls back to a random known number.

        Raises:
            IndexError: If *streamers* is empty while *count* is positive
                (raised by ``random.choice``).
        """
        logger.info(f"开始生成 {count} 条税务申报记录...")
        now = datetime.now()
        declarations = []

        for i in range(count):
            streamer = random.choice(streamers)
            tax_no = streamer.get("tax_no") or random.choice(self.TAX_NOS)

            # One draw decides the scenario so the bands are exactly 10% / 20% / 70%.
            roll = random.random()
            if roll < 0.1:  # 10%: income not declared at all
                declared_amount = 0
            elif roll < 0.3:  # next 20%: under-declared
                declared_amount = random.randint(100, 50000)
            else:
                declared_amount = random.randint(10000, 100000)

            declarations.append({
                "declaration_id": f"TAX_{i+1:06d}",
                "tax_no": tax_no,
                "declaration_date": self._days_ago(now),
                "declared_amount": float(declared_amount),
                "tax_rate": self.FLAT_TAX_RATE,
                # Rounded to cents to avoid binary floating-point noise.
                "tax_amount": float(round(declared_amount * self.FLAT_TAX_RATE, 2)),
                # NOTE(review): period is fixed and not derived from declaration_date.
                "declaration_period": "2024-01",
                "status": "submitted",
            })

        logger.info(f"成功生成 {len(declarations)} 条税务申报记录")
        return declarations

    def generate_bank_transactions(self, count: int = 2000) -> List[Dict]:
        """Generate *count* bank statement records.

        Transactions whose counterparty is an individual get larger amounts
        (10000-100000) than those with a company counterparty (1000-50000).
        """
        logger.info(f"开始生成 {count} 条银行流水记录...")
        now = datetime.now()
        transactions = []

        for i in range(count):
            counterparty = random.choice(self.COUNTERPARTY_NAMES)
            # Personal accounts are more likely to be flagged as private accounts.
            if counterparty in self.PERSONAL_COUNTERPARTIES:
                amount = random.randint(10000, 100000)
            else:
                amount = random.randint(1000, 50000)

            transactions.append({
                "transaction_id": f"TXN_{i+1:06d}",
                "account_no": f"6222{random.randint(1000000000000, 9999999999999)}",
                "transaction_date": self._days_ago(now),
                "transaction_type": random.choice(["转入", "转出", "消费", "收入"]),
                "amount": float(amount),
                "counterparty_name": counterparty,
                "counterparty_account": f"6222{random.randint(1000000000000, 9999999999999)}",
                "counterparty_bank": random.choice(["中国银行", "工商银行", "建设银行", "农业银行"]),
                "description": random.choice(["转账", "收入", "退款", "服务费", "货款"]),
                "balance": float(random.randint(100000, 1000000)),
            })

        logger.info(f"成功生成 {len(transactions)} 条银行流水记录")
        return transactions

    def generate_invoices(self, count: int = 800) -> List[Dict]:
        """Generate *count* invoice records.

        About 10% are "fake" invoices: they carry no ``order_id`` and a larger
        amount (50000-200000). The others reference a random ``ORDER_nnnnnn``
        id and an amount of 5000-50000.
        """
        logger.info(f"开始生成 {count} 张发票...")
        now = datetime.now()
        business_types = ["销售", "服务", "加工", "修理"]
        invoices = []

        for i in range(count):
            # 10% chance of a falsely issued invoice (no matching order).
            is_fake = random.random() < 0.1

            if is_fake:
                total_amount = random.randint(50000, 200000)  # fake invoices are larger
                order_id = None
            else:
                total_amount = random.randint(5000, 50000)
                order_id = f"ORDER_{random.randint(1, 10000):06d}"

            tax_rate = random.choice([0.06, 0.09, 0.13])

            invoices.append({
                "invoice_id": f"INV_{i+1:06d}",
                "seller_tax_no": random.choice(self.TRADE_TAX_NOS),
                "seller_name": "销售方企业",
                "buyer_tax_no": random.choice(self.TRADE_TAX_NOS),
                "buyer_name": "购买方企业",
                "invoice_date": self._days_ago(now),
                "total_amount": float(total_amount),
                # Rounded to cents to avoid binary floating-point noise.
                "tax_amount": float(round(total_amount * tax_rate, 2)),
                "tax_rate": tax_rate,
                "invoice_type": random.choice(["special", "normal"]),
                "invoice_status": random.choice(["valid", "cancelled", "red"]),
                "business_type": random.choice(business_types),
                "order_id": order_id,
            })

        logger.info(f"成功生成 {len(invoices)} 张发票")
        return invoices

    def generate_orders(self, count: int = 1500) -> List[Dict]:
        """Generate *count* order records with sequential ``ORDER_nnnnnn`` ids.

        ``settlement_id`` is a random ``SETTLE_nnnnnn`` in 1-500 and is not
        guaranteed to match a generated settlement.
        """
        logger.info(f"开始生成 {count} 条订单...")
        now = datetime.now()
        orders = []

        for i in range(count):
            amount = random.randint(5000, 50000)

            orders.append({
                "order_id": f"ORDER_{i+1:06d}",
                "seller_tax_no": random.choice(self.TRADE_TAX_NOS),
                "buyer_tax_no": random.choice(self.TRADE_TAX_NOS),
                "order_date": self._days_ago(now),
                "total_amount": float(amount),
                "payment_status": random.choice(["paid", "pending", "refunded"]),
                "fulfillment_status": random.choice(["completed", "pending", "cancelled"]),
                "settlement_id": f"SETTLE_{random.randint(1, 500):06d}",
            })

        logger.info(f"成功生成 {len(orders)} 条订单")
        return orders

    def generate_expenses(self, count: int = 600) -> List[Dict]:
        """Generate *count* expense voucher records.

        About 15% are anomalous vouchers with a larger amount (10000-200000);
        the rest fall in 1000-30000. ``is_large_amount`` is set when the
        amount exceeds 50000.
        """
        logger.info(f"开始生成 {count} 条费用凭证...")
        now = datetime.now()
        categories = ["办公费", "差旅费", "招待费", "交通费", "其他费用"]
        expenses = []

        for i in range(count):
            # 15% chance of an anomalous expense (concentrated, large, etc.).
            is_anomaly = random.random() < 0.15
            amount = random.randint(10000, 200000) if is_anomaly else random.randint(1000, 30000)

            expenses.append({
                "expense_id": f"EXP_{i+1:06d}",
                # NOTE(review): voucher year and fiscal_year are fixed at 2024
                # and fiscal_period is random; none derive from expense_date.
                "voucher_no": f"VOU2024{i+1:06d}",
                "expense_type": random.choice(["成本", "费用"]),
                "expense_category": random.choice(categories),
                "payer_name": "付款方企业",
                "payee_name": random.choice(["张三", "李四", "服务提供商"]),
                "expense_date": self._days_ago(now),
                "expense_amount": float(amount),
                # Rounded to cents to avoid binary floating-point noise.
                "tax_amount": float(round(amount * self.FLAT_TAX_RATE, 2)),
                "tax_rate": self.FLAT_TAX_RATE,
                "payment_method": random.choice(["银行转账", "现金", "支票"]),
                "is_large_amount": amount > 50000,
                "is_cross_border": random.random() < 0.2,
                "fiscal_year": 2024,
                "fiscal_period": random.randint(1, 12),
                "payment_status": "已支付",
            })

        logger.info(f"成功生成 {len(expenses)} 条费用凭证")
        return expenses

    def generate_settlements(self, count: int = 400) -> List[Dict]:
        """Generate *count* settlement records; settlement *n* points at order *n*.

        ``platform_commission`` is computed from the record's own
        ``commission_rate`` so the two fields always agree.
        """
        logger.info(f"开始生成 {count} 条结算记录...")
        now = datetime.now()
        settlements = []

        for i in range(count):
            amount = random.randint(5000, 50000)
            # Draw the rate once and derive the commission from it.
            commission_rate = round(random.uniform(0.05, 0.3), 2)

            settlements.append({
                "settlement_id": f"SETTLE_{i+1:06d}",
                "order_id": f"ORDER_{i+1:06d}",
                "actual_amount": float(amount),
                "settlement_date": self._days_ago(now),
                "settlement_status": random.choice(["completed", "pending", "cancelled"]),
                "commission_rate": commission_rate,
                "platform_commission": float(round(amount * commission_rate, 2)),
            })

        logger.info(f"成功生成 {len(settlements)} 条结算记录")
        return settlements

    def save_to_json(self, data: Any, filename: str) -> None:
        """Write *data* as pretty-printed UTF-8 JSON to ``output_dir/filename``.

        *data* may be a list of records or a single dict (the summary).
        Missing parent directories are created. Values JSON cannot encode
        natively (e.g. ``datetime``) are written via ``str()``.
        """
        filepath = os.path.join(self.output_dir, filename)
        os.makedirs(os.path.dirname(filepath), exist_ok=True)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2, default=str)

        logger.info(f"数据已保存到: {filepath}")

    def generate_all_data(self):
        """Generate every dataset, save each to ``<name>.json`` and write ``summary.json``."""
        logger.info("开始生成测试数据...")

        # 1. Streamers come first because other datasets reference them.
        streamers = self.generate_streamers(50)

        # 2. Related datasets, keyed by output file stem (dicts keep insertion order).
        datasets = {
            "streamers": streamers,
            "recharges": self.generate_recharges(streamers, 1000),
            "tax_declarations": self.generate_tax_declarations(streamers, 500),
            "bank_transactions": self.generate_bank_transactions(2000),
            "invoices": self.generate_invoices(800),
            "orders": self.generate_orders(1500),
            "expenses": self.generate_expenses(600),
            "settlements": self.generate_settlements(400),
        }

        # 3. Save each dataset to its own JSON file.
        for name, records in datasets.items():
            self.save_to_json(records, f"{name}.json")

        # 4. Build and save the summary report.
        data_count = {name: len(records) for name, records in datasets.items()}
        summary = {
            "generation_time": datetime.now().isoformat(),
            "data_count": data_count,
            "total_records": sum(data_count.values()),
        }

        self.save_to_json(summary, "summary.json")

        logger.info("测试数据生成完成!")
        logger.info(f"总计生成 {summary['total_records']} 条记录")
        logger.info(json.dumps(summary, ensure_ascii=False, indent=2))


def main():
    """Script entry point: generate all datasets and report the outcome.

    Prints a banner, runs ``TestDataGenerator.generate_all_data`` and lists
    the files written. On any failure the exception is logged with its
    traceback, a failure message is printed, and the process exits with
    status 1.
    """
    print("=" * 60)
    print("税务风控系统 - 测试数据生成器")
    print("=" * 60)

    generator = TestDataGenerator()

    try:
        generator.generate_all_data()
    except Exception as e:
        # loguru has no `exc_info` keyword (it would be consumed as a
        # str.format argument); `logger.exception` records the traceback.
        logger.exception(f"生成测试数据失败: {str(e)}")
        print(f"\n❌ 生成失败: {str(e)}")
        sys.exit(1)
    else:
        # Kept outside the try body so a console/encoding error while printing
        # is not misreported as a data generation failure.
        output_dir = getattr(
            generator,
            "output_dir",
            "/Users/liulujian/Documents/code/deeprisk-claude-1/backend/test_data",
        )
        print("\n✅ 测试数据生成成功!")
        print(f"📂 数据文件位置: {output_dir}/")
        print("\n生成的数据文件:")
        print(" - streamers.json (主播数据)")
        print(" - recharges.json (充值记录)")
        print(" - tax_declarations.json (税务申报)")
        print(" - bank_transactions.json (银行流水)")
        print(" - invoices.json (发票数据)")
        print(" - orders.json (订单数据)")
        print(" - expenses.json (费用凭证)")
        print(" - settlements.json (结算数据)")
        print(" - summary.json (汇总信息)")


if __name__ == "__main__":
    main()