"""Clean a WeChat Pay bill export.

1. Keep only the rows of one year (2026 by default).
2. Handle refunds using the status column of each expense row:
   - fully refunded: the refund row and the expense row are both dropped;
   - partially refunded: the expense keeps the remaining amount and the
     adjustment is recorded in the remark column.
3. Re-shape the columns so they line up with the Alipay export format.
4. Infer a transaction category from the merchant name and product text.

Usage:
    python clean_wechat_data.py <input file> [output file] [--year YEAR]

Example:
    python clean_wechat_data.py 微信账单.csv output.csv --year 2026
"""

import argparse
import csv
import os
import re
from decimal import ROUND_HALF_UP, Decimal, InvalidOperation


# Header aligned with the Alipay export.
ALIGNED_HEADER = ["交易时间", "交易分类", "交易对方", "对方账号", "商品说明", "收/支", "金额", "收/付款方式", "交易状态", "交易订单号", "商家订单号", "备注"]

# Column indexes of a raw WeChat row:
# 交易时间,交易类型,交易对方,商品,收/支,金额(元),支付方式,当前状态,交易单号,商户单号,备注
_COL_TIME = 0
_COL_TYPE = 1
_COL_MERCHANT = 2
_COL_PRODUCT = 3
_COL_DIRECTION = 4
_COL_AMOUNT = 5
_COL_METHOD = 6
_COL_STATUS = 7
_COL_ORDER_NO = 8
_COL_MERCHANT_ORDER_NO = 9
_COL_REMARK = 10

# main() reads every column up to and including the status column, so
# shorter rows cannot be processed and are skipped.
_MIN_COLUMNS = _COL_STATUS + 1

# Matches "已退款(¥1.00)" / "已退款¥1.00"; accepts ASCII and full-width
# parentheses / yen signs and an optional thousands separator.
_REFUND_AMOUNT_RE = re.compile(
    r"已退款[(\uFF08]?[¥\uFFE5]?(\d[\d,]*(?:\.\d*)?|\.\d+)"
)

# Expense category rules, evaluated top to bottom; the first rule with a
# keyword contained in the lower-cased "merchant product" text wins.
# NOTE(review): because of this ordering "盒马"/"鲜生" always resolve to
# 日用百货 and "超市" always resolves to 餐饮美食; the duplicates further
# down are kept only to mirror the original keyword lists.
_EXPENSE_CATEGORY_RULES = (
    ("医疗健康", ("医院", "药", "诊所", "健康", "皮肤", "医疗", "体检")),
    ("交通出行", ("出行", "打车", "单车", "骑行", "骑安", "滴滴", "高德", "班车", "通勤", "公交", "地铁", "火车", "机票", "航空", "共享", "京庐", "哈啰", "美团单车", "青桔")),
    ("充值缴费", ("充值", "缴费", "水费", "电费", "燃气", "话费", "流量")),
    ("文化休闲", ("电影", "游戏", "娱乐", "健身", "运动", "滑雪", "冰雪", "旅游", "景区", "门票", "会员", "视频", "音乐")),
    ("日用百货", ("盒马", "鲜生")),
    ("餐饮美食", ("coffee", "咖啡", "luckin", "瑞幸", "星巴克", "starbucks", "食堂", "订餐", "餐", "饮", "茶", "奶茶", "饮品", "美食", "烧烤", "火锅", "面", "饭", "粥", "小吃", "甜品", "蛋糕", "盒马", "鲜生", "超市", "麦当劳", "肯德基", "必胜客")),
    ("日用百货", ("沃尔玛", "walmart", "京东", "京邦达", "快递", "淘宝", "天猫", "拼多多", "便利店", "超市", "商场", "购物")),
)


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Optional list of arguments; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The namespace with ``input_file``, ``output_file`` and ``year``.
    """
    parser = argparse.ArgumentParser(description="清理微信支付账单数据")
    parser.add_argument("input_file", help="输入的微信账单CSV文件")
    parser.add_argument("output_file", nargs="?", default=None, help="输出文件(默认为 输入文件名_cleaned.csv)")
    parser.add_argument("--year", type=str, default="2026", help="保留的年份(默认2026)")
    return parser.parse_args(argv)


def parse_amount(amount_str):
    """Parse an amount string such as ``"¥12.50"`` into a ``Decimal``.

    Yen signs (ASCII-range and full-width), thousands separators and spaces
    are removed before conversion.

    Returns:
        The parsed ``Decimal``, or ``Decimal("0")`` when the value is not a
        string or cannot be parsed as a number.
    """
    try:
        clean = (
            amount_str.replace("¥", "")
            .replace("\uffe5", "")
            .replace(",", "")
            .replace(" ", "")
            .strip()
        )
        return Decimal(clean)
    except (InvalidOperation, AttributeError, TypeError):
        # Best effort by design: unparseable amounts count as zero.
        return Decimal("0")


def format_amount(amount):
    """Format a ``Decimal`` with two decimals and no currency sign."""
    return str(amount.quantize(Decimal("0.01"), rounding=ROUND_HALF_UP))


def extract_refund_amount(status):
    """Extract the refunded amount from a status string.

    Returns:
        * the ``Decimal`` amount for ``"已退款(¥1.00)"`` / ``"已退款¥1.00"``;
        * ``None`` for ``"已全额退款"`` (the caller must use the expense
          amount itself);
        * ``Decimal("0")`` when no refund amount is present.
    """
    match = _REFUND_AMOUNT_RE.search(status)
    if match:
        return Decimal(match.group(1).replace(",", ""))
    if "已全额退款" in status:
        return None
    return Decimal("0")


def infer_category(merchant, product, income_expense):
    """Infer the transaction category from merchant and product text.

    Args:
        merchant: Counterparty name.
        product: Product description.
        income_expense: Value of the 收/支 column.

    Returns:
        ``"退款"`` or ``"其他收入"`` for income rows; otherwise the category
        of the first matching keyword rule, or ``"其他支出"`` as the default.
    """
    combined = merchant.lower() + " " + product.lower()

    if income_expense == "收入":
        return "退款" if "退款" in combined else "其他收入"

    for category, keywords in _EXPENSE_CATEGORY_RULES:
        if any(keyword in combined for keyword in keywords):
            return category
    return "其他支出"


def _cell(row, index, default=""):
    """Return ``row[index]``, or ``default`` when the row is too short."""
    return row[index] if len(row) > index else default


def convert_row_to_aligned_format(row, remark_override=None):
    """Convert a raw WeChat row into the Alipay-aligned column layout.

    Raw:     交易时间,交易类型,交易对方,商品,收/支,金额(元),支付方式,当前状态,交易单号,商户单号,备注
    Aligned: 交易时间,交易分类,交易对方,对方账号,商品说明,收/支,金额,收/付款方式,交易状态,交易订单号,商家订单号,备注

    Args:
        row: Raw WeChat row; missing trailing columns are treated as empty
            (the remark falls back to ``"/"``).
        remark_override: When truthy, replaces the remark of the raw row.

    Returns:
        A list of 12 strings in ``ALIGNED_HEADER`` order.
    """
    merchant = _cell(row, _COL_MERCHANT)
    product = _cell(row, _COL_PRODUCT)
    income_expense = _cell(row, _COL_DIRECTION)
    amount = parse_amount(_cell(row, _COL_AMOUNT))
    remark = remark_override if remark_override else _cell(row, _COL_REMARK, "/")

    return [
        _cell(row, _COL_TIME),
        infer_category(merchant, product, income_expense),
        merchant,
        "/",  # WeChat has no counterparty-account column
        product,
        income_expense,
        format_amount(amount),
        _cell(row, _COL_METHOD),
        _cell(row, _COL_STATUS),
        _cell(row, _COL_ORDER_NO),
        _cell(row, _COL_MERCHANT_ORDER_NO),
        remark,
    ]


def _read_data_rows(path):
    """Read the CSV at ``path`` and return every row after the first line."""
    # utf-8-sig transparently drops a BOM if the export starts with one.
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the first line; tolerate an empty file
        return list(reader)


def _partition_rows(rows):
    """Split rows into (refund rows, expense rows, other income rows).

    Rows shorter than the status column, and rows whose 收/支 value is
    neither 支出 nor 收入, are left out of all three lists.
    """
    refund_rows, expense_rows, income_rows = [], [], []
    for row in rows:
        if len(row) < _MIN_COLUMNS:
            continue
        # WeChat marks refunds with "-退款" in the transaction type column.
        if "-退款" in row[_COL_TYPE]:
            refund_rows.append(row)
        elif row[_COL_DIRECTION] == "支出":
            expense_rows.append(row)
        elif row[_COL_DIRECTION] == "收入":
            income_rows.append(row)
    return refund_rows, expense_rows, income_rows


def _apply_refunds(expense_rows):
    """Drop or adjust refunded expenses based on their status column.

    Returns:
        ``(kept, fully_refunded, partially_refunded)`` where ``kept`` is a
        list of ``(row, remark_override)`` pairs.
    """
    kept = []
    fully_refunded = 0
    partially_refunded = 0

    for row in expense_rows:
        status = row[_COL_STATUS]
        original_amount = parse_amount(row[_COL_AMOUNT])

        if "已全额退款" in status:
            fully_refunded += 1
            print(f" 全额退款删除: {row[0]} | {row[2]} | {row[3][:25]}... | {row[5]}")
            continue

        if "已退款" not in status:
            kept.append((row, None))
            continue

        refund_amt = extract_refund_amount(status)
        if refund_amt and refund_amt == original_amount:
            # A refund equal to the expense is a full refund even when the
            # status does not literally say "已全额退款".
            fully_refunded += 1
            print(f" 全额退款删除: {row[0]} | {row[2]} | {row[3][:25]}... | {row[5]}")
        elif refund_amt and refund_amt < original_amount:
            remaining = original_amount - refund_amt
            new_row = row.copy()
            new_row[_COL_AMOUNT] = f"¥{format_amount(remaining)}"
            remark = f"原金额{row[5]},退款¥{refund_amt}"
            kept.append((new_row, remark))
            partially_refunded += 1
            print(f" 部分退款: {row[0]} | {row[2]} | 原{row[5]} -> ¥{format_amount(remaining)}")
        else:
            # Refund amount missing or larger than the expense: keep as is.
            kept.append((row, None))

    return kept, fully_refunded, partially_refunded


def main(argv=None):
    """Run the clean-up: read, filter, handle refunds, write, summarise.

    Args:
        argv: Optional argument list; ``None`` uses ``sys.argv[1:]``.
    """
    args = parse_args(argv)
    input_file = args.input_file
    output_file = args.output_file
    year = args.year

    if output_file is None:
        base_name = os.path.splitext(input_file)[0]
        output_file = f"{base_name}_cleaned.csv"

    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print(f"保留年份: {year}")
    print()

    rows = _read_data_rows(input_file)
    print(f"原始数据行数: {len(rows)}")

    # Step 1: keep only rows whose timestamp starts with the year.
    rows_year = [row for row in rows if row and row[0].startswith(year)]
    print(f"{year}年数据行数: {len(rows_year)}")

    # Step 2: separate refunds, expenses and other income.
    refund_rows, expense_rows, income_rows = _partition_rows(rows_year)
    print(f"退款条目数: {len(refund_rows)}")
    print(f"支出条目数: {len(expense_rows)}")
    print(f"其他收入条目数: {len(income_rows)}")

    # Step 3: apply refunds recorded in the status column of each expense.
    final_expense_rows, fully_refunded, partially_refunded = _apply_refunds(expense_rows)

    print("\n处理结果:")
    print(f" 全额退款删除: {fully_refunded} 条")
    print(f" 部分退款调整: {partially_refunded} 条")
    print(f" 保留支出条目: {len(final_expense_rows)} 条")
    print(f" 保留收入条目: {len(income_rows)} 条")

    aligned_expense_rows = [
        convert_row_to_aligned_format(row, remark) for row, remark in final_expense_rows
    ]
    aligned_income_rows = [convert_row_to_aligned_format(row, None) for row in income_rows]

    # Newest first; timestamps are compared as strings.
    final_rows = aligned_expense_rows + aligned_income_rows
    final_rows.sort(key=lambda r: r[0], reverse=True)
    print(f" 最终保留行数: {len(final_rows)}")

    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(ALIGNED_HEADER)
        writer.writerows(final_rows)

    print(f"\n清理后的数据已保存到: {output_file}")

    total = sum((Decimal(row[6]) for row in aligned_expense_rows), Decimal("0"))
    print(f"清理后支出总额: ¥{total}")

    print("\n=== 按分类统计 ===")
    categories = {}
    for row in aligned_expense_rows:
        categories[row[1]] = categories.get(row[1], Decimal("0")) + Decimal(row[6])
    for cat, amt in sorted(categories.items(), key=lambda item: -item[1]):
        print(f" {cat}: ¥{amt}")


if __name__ == "__main__":
    main()