309 lines
11 KiB
Python
309 lines
11 KiB
Python
"""
|
||
清理微信支付账单数据
|
||
1. 仅保留指定年份的数据(默认2026年)
|
||
2. 对于退款的条目,找到对应的支出:
|
||
- 如果全额退款,两条都删除
|
||
- 如果部分退款,保留差额并备注
|
||
3. 字段格式与支付宝对齐
|
||
4. 根据商户名称自动推断交易分类
|
||
|
||
用法: python clean_wechat_data.py <输入文件> [输出文件] [--year 年份]
|
||
示例: python clean_wechat_data.py 微信账单.csv output.csv --year 2026
|
||
"""
|
||
import argparse
import csv
import re
from decimal import Decimal, InvalidOperation, ROUND_HALF_UP
|
||
|
||
|
||
def parse_args():
    """Parse the command-line arguments of the cleaner.

    Returns:
        argparse.Namespace with ``input_file`` (required positional),
        ``output_file`` (optional positional, ``None`` when omitted) and
        ``year`` (string, ``"2026"`` when ``--year`` is not given).
    """
    cli = argparse.ArgumentParser(description="清理微信支付账单数据")

    # (flags, keyword options) for each argument, in declaration order.
    argument_specs = (
        (("input_file",), {"help": "输入的微信账单CSV文件"}),
        (
            ("output_file",),
            {
                "nargs": "?",
                "default": None,
                "help": "输出文件(默认为 输入文件名_cleaned.csv)",
            },
        ),
        (
            ("--year",),
            {"type": str, "default": "2026", "help": "保留的年份(默认2026)"},
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    return cli.parse_args()
|
||
|
||
# Output header row, aligned with the Alipay export columns.
ALIGNED_HEADER = [
    "交易时间",
    "交易分类",
    "交易对方",
    "对方账号",
    "商品说明",
    "收/支",
    "金额",
    "收/付款方式",
    "交易状态",
    "交易订单号",
    "商家订单号",
    "备注",
]
|
||
|
||
|
||
def parse_amount(amount_str):
    """Parse an amount string such as ``¥1,234.50`` into a Decimal.

    Yen signs (half-width U+00A5 and full-width U+FFE5), thousands
    separators and spaces are removed before parsing.

    Args:
        amount_str: Raw text of the amount column.

    Returns:
        The parsed Decimal, or ``Decimal("0")`` when the value is not a
        string, is not a valid number, or is not finite (NaN / Infinity).
    """
    try:
        cleaned = amount_str.strip()
        for noise in ("\u00a5", "\uffe5", ",", " "):
            cleaned = cleaned.replace(noise, "")
        value = Decimal(cleaned)
    except (AttributeError, TypeError, ValueError, InvalidOperation):
        # Only conversion problems fall back to zero; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return Decimal("0")

    # "nan" / "inf" are valid Decimal literals but cannot be quantized to
    # two places later on, so treat them as unparseable as well.
    return value if value.is_finite() else Decimal("0")
|
||
|
||
|
||
def format_amount(amount):
    """Render a Decimal amount as a two-decimal string without a currency sign.

    Args:
        amount: Decimal value to format.

    Returns:
        The amount rounded half-up to two decimal places, as ``str``.
    """
    two_places = Decimal("0.01")
    rounded = amount.quantize(two_places, rounding=ROUND_HALF_UP)
    return str(rounded)
|
||
|
||
|
||
# "已退款", then an optional opening parenthesis (ASCII or full-width), an
# optional yen sign (U+00A5 or U+FFE5), then a well-formed number that may
# contain thousands separators. Requiring a well-formed number keeps text
# such as "已退款..." from reaching Decimal() and raising.
_REFUND_AMOUNT_RE = re.compile(
    r"已退款\s*[(\uff08]?\s*[\u00a5\uffe5]?\s*(\d[\d,]*(?:\.\d+)?|\.\d+)"
)


def extract_refund_amount(status):
    """Extract the refunded amount from a transaction status string.

    Recognised forms include ``已退款(¥1.00)`` and ``已退款¥1.00``, with
    either ASCII or full-width parentheses and yen signs.

    Args:
        status: Text of the status column.

    Returns:
        * the refunded amount as a Decimal when the status carries one;
        * ``None`` when the status says ``已全额退款`` (the caller must take
          the amount from the expense itself);
        * ``Decimal("0")`` when no refund information is found.
    """
    match = _REFUND_AMOUNT_RE.search(status)
    if match:
        return Decimal(match.group(1).replace(",", ""))
    if "已全额退款" in status:
        return None
    return Decimal("0")
|
||
|
||
|
||
def infer_category(merchant, product, income_expense):
    """Infer the transaction category from merchant and product text.

    Matching is a case-insensitive substring search over
    ``merchant + " " + product``.

    Args:
        merchant: Counterparty name.
        product: Product description.
        income_expense: Direction column; ``"收入"`` selects the income rules,
            any other value goes through the expense rules.

    Returns:
        For income: ``"退款"`` when the text mentions 退款, else ``"其他收入"``.
        Otherwise the category of the first matching rule below, or
        ``"其他支出"`` when nothing matches.
    """
    haystack = f"{merchant.lower()} {product.lower()}"

    if income_expense == "收入":
        return "退款" if "退款" in haystack else "其他收入"

    # Rules are evaluated top to bottom; the first rule with any keyword
    # present in the text wins.
    rules = (
        ("医疗健康", ("医院", "药", "诊所", "健康", "皮肤", "医疗", "体检")),
        (
            "交通出行",
            (
                "出行", "打车", "单车", "骑行", "骑安", "滴滴", "高德",
                "班车", "通勤", "公交", "地铁", "火车", "机票", "航空",
                "共享", "京庐", "哈啰", "美团单车", "青桔",
            ),
        ),
        ("充值缴费", ("充值", "缴费", "水费", "电费", "燃气", "话费", "流量")),
        (
            "文化休闲",
            (
                "电影", "游戏", "娱乐", "健身", "运动", "滑雪", "冰雪",
                "旅游", "景区", "门票", "会员", "视频", "音乐",
            ),
        ),
        # Hema is checked before the food keywords, so it is always
        # classified as daily goods.
        ("日用百货", ("盒马", "鲜生")),
        (
            "餐饮美食",
            (
                "coffee", "咖啡", "luckin", "瑞幸", "星巴克", "starbucks",
                "食堂", "订餐", "餐", "饮", "茶", "奶茶", "饮品", "美食",
                "烧烤", "火锅", "面", "饭", "粥", "小吃", "甜品", "蛋糕",
                "盒马", "鲜生", "超市", "麦当劳", "肯德基", "必胜客",
            ),
        ),
        (
            "日用百货",
            (
                "沃尔玛", "walmart", "京东", "京邦达", "快递", "淘宝",
                "天猫", "拼多多", "便利店", "超市", "商场", "购物",
            ),
        ),
    )

    for category, keywords in rules:
        if any(keyword in haystack for keyword in keywords):
            return category

    # Transfers and everything else unmatched share the default category.
    return "其他支出"
|
||
|
||
|
||
def convert_row_to_aligned_format(row, remark_override=None):
    """Convert a raw WeChat row into the Alipay-aligned column layout.

    WeChat input columns (by index):
        0 交易时间, 1 交易类型, 2 交易对方, 3 商品, 4 收/支, 5 金额(元),
        6 支付方式, 7 当前状态, 8 交易单号, 9 商户单号, 10 备注
    Output columns:
        交易时间, 交易分类, 交易对方, 对方账号, 商品说明, 收/支, 金额,
        收/付款方式, 交易状态, 交易订单号, 商家订单号, 备注

    Args:
        row: Sequence of column strings from the WeChat export. Missing
            trailing columns are tolerated: they become ``""`` (``"/"`` for
            the remark) instead of raising ``IndexError``.
        remark_override: When truthy, used as the remark instead of the
            row's own remark column.

    Returns:
        A 12-element list in the output column order above.
    """

    def column(index, default=""):
        # Uniform length guard for every column, not just 9 and 10.
        return row[index] if len(row) > index else default

    merchant = column(2)
    product = column(3)
    income_expense = column(4)

    # The amount is re-formatted without the yen sign.
    amount = parse_amount(column(5))
    remark = remark_override if remark_override else column(10, "/")

    return [
        column(0),                                            # 交易时间
        infer_category(merchant, product, income_expense),    # 交易分类
        merchant,                                             # 交易对方
        "/",                                                  # 对方账号: no WeChat equivalent
        product,                                              # 商品说明
        income_expense,                                       # 收/支
        format_amount(amount),                                # 金额
        column(6),                                            # 收/付款方式
        column(7),                                            # 交易状态
        column(8),                                            # 交易订单号
        column(9),                                            # 商家订单号
        remark,                                               # 备注
    ]
|
||
|
||
|
||
# Rows are indexed up to column 8 (交易单号) without a length check further
# down the pipeline, so anything shorter is skipped up front.
_MIN_COLUMNS = 9


def _read_rows(path):
    """Return all CSV rows of *path* after the first line, or [] if empty."""
    # newline="" is what the csv module requires for correct handling of
    # quoted fields containing line breaks; utf-8-sig also accepts a BOM.
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # discard the first line; tolerate an empty file
        return list(reader)


def _split_by_kind(rows):
    """Split rows into (refund_rows, expense_rows, income_rows)."""
    refund_rows, expense_rows, income_rows = [], [], []
    for row in rows:
        if len(row) < _MIN_COLUMNS:
            continue
        if "-退款" in row[1]:  # 交易类型 marks refund records
            refund_rows.append(row)
        elif row[4] == "支出":
            expense_rows.append(row)
        elif row[4] == "收入":
            # Income that is not a refund (e.g. incoming transfers) is kept.
            income_rows.append(row)
    return refund_rows, expense_rows, income_rows


def _apply_refunds(expense_rows):
    """Net refunds against expenses using the status column.

    Returns (kept, fully_refunded, partially_refunded), where *kept* is a
    list of ``(row, remark_or_None)`` pairs.
    """
    kept = []
    fully_refunded = 0
    partially_refunded = 0

    for row in expense_rows:
        status = row[7]

        if "已全额退款" in status:
            fully_refunded += 1
            print(f" 全额退款删除: {row[0]} | {row[2]} | {row[3][:25]}... | {row[5]}")
            continue

        if "已退款" not in status:
            kept.append((row, None))
            continue

        original_amount = parse_amount(row[5])
        refund_amt = extract_refund_amount(status)

        if not refund_amt:
            # Refund amount could not be parsed: keep the record untouched.
            kept.append((row, None))
        elif refund_amt < original_amount:
            remaining = original_amount - refund_amt
            new_row = row.copy()
            new_row[5] = f"¥{format_amount(remaining)}"
            remark = f"原金额{row[5]},退款¥{refund_amt}"
            kept.append((new_row, remark))
            partially_refunded += 1
            print(f" 部分退款: {row[0]} | {row[2]} | 原{row[5]} -> ¥{format_amount(remaining)}")
        elif refund_amt == original_amount:
            # The refund covers the whole payment even though the status is
            # not worded as a full refund: nothing was actually spent.
            fully_refunded += 1
            print(f" 全额退款删除: {row[0]} | {row[2]} | {row[3][:25]}... | {row[5]}")
        else:
            # Refund larger than the payment cannot be reconciled; keep as is.
            kept.append((row, None))

    return kept, fully_refunded, partially_refunded


def main():
    """Run the cleaner: read, filter by year, net refunds, convert, write.

    Reads the command line via ``parse_args``, writes the cleaned CSV with
    ``ALIGNED_HEADER`` as its header row, and prints progress, the total
    remaining expense and per-category expense totals to stdout.
    """
    import os

    args = parse_args()
    input_file = args.input_file
    output_file = args.output_file
    year = args.year

    # Derive the output name from the input name when none was given.
    if output_file is None:
        base_name = os.path.splitext(input_file)[0]
        output_file = f"{base_name}_cleaned.csv"

    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print(f"保留年份: {year}")
    print()

    rows = _read_rows(input_file)
    print(f"原始数据行数: {len(rows)}")

    # Step 1: keep only rows whose timestamp starts with the requested year.
    rows_year = [row for row in rows if row and row[0].startswith(year)]
    print(f"{year}年数据行数: {len(rows_year)}")

    # Step 2: separate refund records, expenses and other income.
    refund_rows, expense_rows, income_rows = _split_by_kind(rows_year)
    print(f"退款条目数: {len(refund_rows)}")
    print(f"支出条目数: {len(expense_rows)}")
    print(f"其他收入条目数: {len(income_rows)}")

    # Step 3: net refunds; refund records themselves are not written out.
    final_expense_rows, fully_refunded, partially_refunded = _apply_refunds(expense_rows)

    print("\n处理结果:")
    print(f" 全额退款删除: {fully_refunded} 条")
    print(f" 部分退款调整: {partially_refunded} 条")
    print(f" 保留支出条目: {len(final_expense_rows)} 条")
    print(f" 保留收入条目: {len(income_rows)} 条")

    aligned_expense_rows = [
        convert_row_to_aligned_format(r, remark) for r, remark in final_expense_rows
    ]
    aligned_income_rows = [convert_row_to_aligned_format(r, None) for r in income_rows]

    # Merge and sort newest first (string sort on the timestamp column).
    final_rows = aligned_expense_rows + aligned_income_rows
    final_rows.sort(key=lambda x: x[0], reverse=True)
    print(f" 最终保留行数: {len(final_rows)}")

    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(ALIGNED_HEADER)
        writer.writerows(final_rows)

    print(f"\n清理后的数据已保存到: {output_file}")

    total = sum((Decimal(row[6]) for row in aligned_expense_rows), Decimal("0"))
    print(f"清理后支出总额: ¥{total}")

    print("\n=== 按分类统计 ===")
    categories = {}
    for row in aligned_expense_rows:
        categories[row[1]] = categories.get(row[1], Decimal("0")) + Decimal(row[6])

    for cat, amt in sorted(categories.items(), key=lambda x: -x[1]):
        print(f" {cat}: ¥{amt}")
|
||
|
||
|
||
# Run the cleaner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||
|