#!/usr/bin/env python3
"""
Test data generation script.

Generates synthetic data sets (streamers, recharges, tax declarations, bank
transactions, invoices, orders, expenses and settlements) for the tax
risk-control system and writes each of them to a JSON file.
"""

import os
import sys
import json
import random
from datetime import datetime, timedelta
from typing import List, Dict, Any
from decimal import Decimal

# Add the project path to the import search path.
PROJECT_BACKEND_DIR = '/Users/liulujian/Documents/code/deeprisk-claude-1/backend'
sys.path.append(PROJECT_BACKEND_DIR)

from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
from loguru import logger

# Logging configuration.
logger.add("test_data_generation.log", rotation="100 MB", level="INFO")

# Directory the JSON files are written to unless a generator is given another.
DEFAULT_OUTPUT_DIR = os.path.join(PROJECT_BACKEND_DIR, "test_data")


class TestDataGenerator:
    """Generator for the synthetic test data sets.

    Every ``generate_*`` method returns a list of plain dicts and relies on
    the module-level ``random`` state, so seeding ``random`` beforehand makes
    the random choices repeatable (timestamps are still relative to "now").
    """

    # Pool of taxpayer numbers shared by streamers and tax declarations.
    TAX_NOS = (
        "91110000123456789A",
        "91110000987654321B",
        "91110000555666777C",
        "91110000333444555D",
        "91110000111222333E",
        "91110000777888999F",
    )
    # Invoices and orders only use the first three taxpayer numbers.
    TRADE_TAX_NOS = TAX_NOS[:3]

    # Counterparties of bank transactions: individuals first, then companies.
    PERSONAL_COUNTERPARTIES = ("张三", "李四", "王五", "赵六", "钱七")
    COMPANY_COUNTERPARTIES = (
        "XX科技有限公司", "YY贸易有限公司", "ZZ文化传媒有限公司", "AA网络科技有限公司"
    )

    # Class-level default so instances created without __init__ still work.
    output_dir = DEFAULT_OUTPUT_DIR

    def __init__(self, output_dir: str = DEFAULT_OUTPUT_DIR):
        """Create the generator.

        Args:
            output_dir: Directory that ``save_to_json`` writes into. Defaults
                to the ``test_data`` directory under the project backend.
        """
        self.output_dir = output_dir
        # Database connection - SQLite is used for testing.
        self.engine = create_engine(
            "sqlite:///./test_data.db",
            echo=False,
            future=True
        )
        self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine)

    @staticmethod
    def _random_past_datetime(max_days: int = 365, min_days: int = 0) -> datetime:
        """Return "now" minus a random whole number of days in [min_days, max_days]."""
        return datetime.now() - timedelta(days=random.randint(min_days, max_days))

    def generate_streamers(self, count: int = 50) -> List[Dict]:
        """Generate streamer records.

        Args:
            count: Number of streamers to generate.

        Returns:
            A list of ``count`` streamer dicts with sequential ``streamer_id``
            values (``STREAMER_0001`` ...).
        """
        logger.info(f"开始生成 {count} 个主播数据...")

        streamers = []
        for i in range(count):
            streamers.append({
                "streamer_id": f"STREAMER_{i+1:04d}",
                "streamer_name": f"主播_{i+1}",
                "tax_no": random.choice(self.TAX_NOS),
                "platform": random.choice(["抖音", "快手", "淘宝直播", "小红书"]),
                "tier": random.choice(["S", "A", "B", "C"]),
                "status": "active",
                "created_at": self._random_past_datetime(365, min_days=30),
            })

        logger.info(f"成功生成 {len(streamers)} 个主播数据")
        return streamers

    def generate_recharges(self, streamers: List[Dict], count: int = 1000) -> List[Dict]:
        """Generate platform recharge records.

        Args:
            streamers: Streamer dicts to attach recharges to; each must carry
                ``streamer_id`` and ``platform``. Must be non-empty when
                ``count`` is positive (``random.choice`` raises IndexError
                on an empty sequence).
            count: Number of recharge records to generate.

        Returns:
            A list of ``count`` recharge dicts.
        """
        logger.info(f"开始生成 {count} 条充值记录...")

        recharges = []
        for i in range(count):
            streamer = random.choice(streamers)
            amount = random.randint(1000, 100000)
            recharges.append({
                "recharge_id": f"RECHARGE_{i+1:06d}",
                "streamer_id": streamer["streamer_id"],
                "recharge_date": self._random_past_datetime(),
                "recharge_amount": float(amount),
                "payment_method": random.choice(["支付宝", "微信支付", "银行卡", "Apple Pay"]),
                "payment_status": "completed",
                "platform": streamer["platform"],
                "created_at": self._random_past_datetime(),
            })

        logger.info(f"成功生成 {len(recharges)} 条充值记录")
        return recharges

    def generate_tax_declarations(self, streamers: List[Dict], count: int = 500) -> List[Dict]:
        """Generate tax declaration records.

        Roughly 10% of the records declare nothing (amount 0) and roughly 20%
        under-declare (amount drawn from a lower range); the rest fall in the
        normal range.

        Args:
            streamers: Streamer dicts; each declaration takes the ``tax_no``
                of a randomly chosen streamer. Must be non-empty when
                ``count`` is positive.
            count: Number of declarations to generate.

        Returns:
            A list of ``count`` declaration dicts.
        """
        logger.info(f"开始生成 {count} 条税务申报记录...")

        declarations = []
        for i in range(count):
            streamer = random.choice(streamers)
            # Fall back to the shared pool if the streamer carries no tax number.
            tax_no = streamer.get("tax_no") or random.choice(self.TAX_NOS)

            # Deliberately produce some missing / under-reported declarations.
            # A single draw keeps the two probabilities at exactly 10% and 20%.
            roll = random.random()
            if roll < 0.1:  # 10%: nothing declared
                declared_amount = 0
            elif roll < 0.3:  # next 20%: under-reported
                declared_amount = random.randint(100, 50000)
            else:
                declared_amount = random.randint(10000, 100000)

            declarations.append({
                "declaration_id": f"TAX_{i+1:06d}",
                "tax_no": tax_no,
                "declaration_date": self._random_past_datetime(),
                "declared_amount": float(declared_amount),
                "tax_rate": 0.13,
                "tax_amount": float(declared_amount * 0.13),
                "declaration_period": "2024-01",
                "status": "submitted",
            })

        logger.info(f"成功生成 {len(declarations)} 条税务申报记录")
        return declarations

    def generate_bank_transactions(self, count: int = 2000) -> List[Dict]:
        """Generate bank transaction records.

        Transactions whose counterparty is an individual get larger amounts
        (10000-100000) than those with a company counterparty (1000-50000).

        Args:
            count: Number of transactions to generate.

        Returns:
            A list of ``count`` transaction dicts.
        """
        logger.info(f"开始生成 {count} 条银行流水记录...")

        counterparty_names = self.PERSONAL_COUNTERPARTIES + self.COMPANY_COUNTERPARTIES

        transactions = []
        for i in range(count):
            counterparty = random.choice(counterparty_names)

            # Personal accounts are meant to be easier to flag as private accounts.
            if counterparty in self.PERSONAL_COUNTERPARTIES:
                amount = random.randint(10000, 100000)
            else:
                amount = random.randint(1000, 50000)

            transactions.append({
                "transaction_id": f"TXN_{i+1:06d}",
                "account_no": f"6222{random.randint(1000000000000, 9999999999999)}",
                "transaction_date": self._random_past_datetime(),
                "transaction_type": random.choice(["转入", "转出", "消费", "收入"]),
                "amount": float(amount),
                "counterparty_name": counterparty,
                "counterparty_account": f"6222{random.randint(1000000000000, 9999999999999)}",
                "counterparty_bank": random.choice(["中国银行", "工商银行", "建设银行", "农业银行"]),
                "description": random.choice(["转账", "收入", "退款", "服务费", "货款"]),
                "balance": float(random.randint(100000, 1000000)),
            })

        logger.info(f"成功生成 {len(transactions)} 条银行流水记录")
        return transactions

    def generate_invoices(self, count: int = 800, max_order_no: int = 10000) -> List[Dict]:
        """Generate invoice records.

        About 10% of the invoices are "fake": they have a larger amount and
        ``order_id`` set to ``None``. All other invoices reference an order id
        in ``ORDER_000001`` .. ``ORDER_<max_order_no>``.

        Args:
            count: Number of invoices to generate.
            max_order_no: Highest order sequence number a non-fake invoice may
                reference. Pass the number of generated orders so that such
                invoices always point at an existing order; the default keeps
                the historical 1..10000 range.

        Returns:
            A list of ``count`` invoice dicts.

        Raises:
            ValueError: If ``max_order_no`` is smaller than 1.
        """
        if max_order_no < 1:
            raise ValueError("max_order_no must be >= 1")

        logger.info(f"开始生成 {count} 张发票...")

        business_types = ["销售", "服务", "加工", "修理"]

        invoices = []
        for i in range(count):
            # 10% chance of a fake invoice (no matching order).
            is_fake = random.random() < 0.1

            if is_fake:
                total_amount = random.randint(50000, 200000)  # fake invoices are larger
                order_id = None
            else:
                total_amount = random.randint(5000, 50000)
                order_id = f"ORDER_{random.randint(1, max_order_no):06d}"

            tax_rate = random.choice([0.06, 0.09, 0.13])
            tax_amount = total_amount * tax_rate

            invoices.append({
                "invoice_id": f"INV_{i+1:06d}",
                "seller_tax_no": random.choice(self.TRADE_TAX_NOS),
                "seller_name": "销售方企业",
                "buyer_tax_no": random.choice(self.TRADE_TAX_NOS),
                "buyer_name": "购买方企业",
                "invoice_date": self._random_past_datetime(),
                "total_amount": float(total_amount),
                "tax_amount": float(tax_amount),
                "tax_rate": tax_rate,
                "invoice_type": random.choice(["special", "normal"]),
                "invoice_status": random.choice(["valid", "cancelled", "red"]),
                "business_type": random.choice(business_types),
                "order_id": order_id,
            })

        logger.info(f"成功生成 {len(invoices)} 张发票")
        return invoices

    def generate_orders(self, count: int = 1500) -> List[Dict]:
        """Generate order records.

        Args:
            count: Number of orders to generate.

        Returns:
            A list of ``count`` order dicts with sequential ``order_id``
            values (``ORDER_000001`` ...).
        """
        logger.info(f"开始生成 {count} 条订单...")

        orders = []
        for i in range(count):
            amount = random.randint(5000, 50000)
            orders.append({
                "order_id": f"ORDER_{i+1:06d}",
                "seller_tax_no": random.choice(self.TRADE_TAX_NOS),
                "buyer_tax_no": random.choice(self.TRADE_TAX_NOS),
                "order_date": self._random_past_datetime(),
                "total_amount": float(amount),
                "payment_status": random.choice(["paid", "pending", "refunded"]),
                "fulfillment_status": random.choice(["completed", "pending", "cancelled"]),
                # NOTE(review): ids are drawn from 1..500 regardless of how many
                # settlements exist (generate_all_data creates 400) - confirm
                # whether dangling settlement references are intended.
                "settlement_id": f"SETTLE_{random.randint(1, 500):06d}",
            })

        logger.info(f"成功生成 {len(orders)} 条订单")
        return orders

    def generate_expenses(self, count: int = 600) -> List[Dict]:
        """Generate expense voucher records.

        About 15% of the vouchers are anomalies with a larger amount range
        (10000-200000 instead of 1000-30000).

        Args:
            count: Number of expense vouchers to generate.

        Returns:
            A list of ``count`` expense dicts.
        """
        logger.info(f"开始生成 {count} 条费用凭证...")

        categories = ["办公费", "差旅费", "招待费", "交通费", "其他费用"]

        expenses = []
        for i in range(count):
            # 15% chance of an anomalous expense (concentrated, large amount, ...).
            is_anomaly = random.random() < 0.15
            if is_anomaly:
                amount = random.randint(10000, 200000)
            else:
                amount = random.randint(1000, 30000)

            expenses.append({
                "expense_id": f"EXP_{i+1:06d}",
                "voucher_no": f"VOU2024{i+1:06d}",
                "expense_type": random.choice(["成本", "费用"]),
                "expense_category": random.choice(categories),
                "payer_name": "付款方企业",
                "payee_name": random.choice(["张三", "李四", "服务提供商"]),
                "expense_date": self._random_past_datetime(),
                "expense_amount": float(amount),
                "tax_amount": float(amount * 0.13),
                "tax_rate": 0.13,
                "payment_method": random.choice(["银行转账", "现金", "支票"]),
                "is_large_amount": amount > 50000,
                "is_cross_border": random.random() < 0.2,
                "fiscal_year": 2024,
                "fiscal_period": random.randint(1, 12),
                "payment_status": "已支付",
            })

        logger.info(f"成功生成 {len(expenses)} 条费用凭证")
        return expenses

    def generate_settlements(self, count: int = 400) -> List[Dict]:
        """Generate settlement records.

        Args:
            count: Number of settlements to generate.

        Returns:
            A list of ``count`` settlement dicts. ``platform_commission`` is
            ``actual_amount * commission_rate`` rounded to two decimals.
        """
        logger.info(f"开始生成 {count} 条结算记录...")

        settlements = []
        for i in range(count):
            amount = random.randint(5000, 50000)
            # Draw the rate once so the commission matches the recorded rate.
            commission_rate = round(random.uniform(0.05, 0.3), 2)
            settlements.append({
                "settlement_id": f"SETTLE_{i+1:06d}",
                "order_id": f"ORDER_{i+1:06d}",
                "actual_amount": float(amount),
                "settlement_date": self._random_past_datetime(),
                "settlement_status": random.choice(["completed", "pending", "cancelled"]),
                "commission_rate": commission_rate,
                "platform_commission": round(amount * commission_rate, 2),
            })

        logger.info(f"成功生成 {len(settlements)} 条结算记录")
        return settlements

    def save_to_json(self, data: Any, filename: str) -> None:
        """Write ``data`` as UTF-8 JSON to ``filename`` inside ``output_dir``.

        Missing directories are created. Values the ``json`` module cannot
        serialize natively (e.g. ``datetime``) are written via ``str()``.

        Args:
            data: JSON-serializable object (a list of records or a dict).
            filename: File name relative to the output directory.
        """
        filepath = os.path.join(self.output_dir, filename)
        os.makedirs(os.path.dirname(filepath), exist_ok=True)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2, default=str)

        logger.info(f"数据已保存到: {filepath}")

    def generate_all_data(self) -> None:
        """Generate every data set, save each to JSON and write a summary."""
        logger.info("开始生成测试数据...")

        # 1. Streamers.
        streamers = self.generate_streamers(50)

        # 2. Related data. Orders are generated before invoices so that
        #    non-fake invoices can be limited to order ids that exist.
        recharges = self.generate_recharges(streamers, 1000)
        tax_declarations = self.generate_tax_declarations(streamers, 500)
        bank_transactions = self.generate_bank_transactions(2000)
        orders = self.generate_orders(1500)
        invoices = self.generate_invoices(800, max_order_no=len(orders))
        expenses = self.generate_expenses(600)
        settlements = self.generate_settlements(400)

        datasets = {
            "streamers": streamers,
            "recharges": recharges,
            "tax_declarations": tax_declarations,
            "bank_transactions": bank_transactions,
            "invoices": invoices,
            "orders": orders,
            "expenses": expenses,
            "settlements": settlements,
        }

        # 3. Save each data set to "<name>.json".
        for name, records in datasets.items():
            self.save_to_json(records, f"{name}.json")

        # 4. Summary report.
        data_count = {name: len(records) for name, records in datasets.items()}
        summary = {
            "generation_time": datetime.now().isoformat(),
            "data_count": data_count,
            "total_records": sum(data_count.values()),
        }

        self.save_to_json(summary, "summary.json")

        logger.info("测试数据生成完成!")
        logger.info(f"总计生成 {summary['total_records']} 条记录")
        logger.info(json.dumps(summary, ensure_ascii=False, indent=2))


def main():
    """Entry point: generate all data and report the result on stdout.

    Exits with status 1 if generation fails.
    """
    print("=" * 60)
    print("税务风控系统 - 测试数据生成器")
    print("=" * 60)

    generator = TestDataGenerator()

    try:
        generator.generate_all_data()
        print("\n✅ 测试数据生成成功!")
        # os.path.join(dir, '') appends a trailing separator if it is missing.
        print(f"📂 数据文件位置: {os.path.join(generator.output_dir, '')}")
        print("\n生成的数据文件:")
        print("  - streamers.json (主播数据)")
        print("  - recharges.json (充值记录)")
        print("  - tax_declarations.json (税务申报)")
        print("  - bank_transactions.json (银行流水)")
        print("  - invoices.json (发票数据)")
        print("  - orders.json (订单数据)")
        print("  - expenses.json (费用凭证)")
        print("  - settlements.json (结算数据)")
        print("  - summary.json (汇总信息)")

    except Exception as e:
        # loguru has no ``exc_info`` option; ``logger.exception`` records the
        # traceback. The error text is passed as a format argument so braces
        # inside it are never interpreted as format fields.
        logger.exception("生成测试数据失败: {}", e)
        print(f"\n❌ 生成失败: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()