Files
billai/clean_wechat_data.py
CHE LIANG ZHAO b15922a027 first commit
2026-01-07 18:14:53 +08:00

309 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
清理微信支付账单数据
1. 仅保留指定年份的数据(默认2026年)
2. 对于退款的条目,找到对应的支出:
- 如果全额退款,两条都删除
- 如果部分退款,保留差额并备注
3. 字段格式与支付宝对齐
4. 根据商户名称自动推断交易分类
用法: python clean_wechat_data.py <输入文件> [输出文件] [--year 年份]
示例: python clean_wechat_data.py 微信账单.csv output.csv --year 2026
"""
import csv
import re
import argparse
from decimal import Decimal, ROUND_HALF_UP
def parse_args():
    """Parse the command line of the WeChat bill cleaner.

    Returns:
        argparse.Namespace with three attributes:
        ``input_file`` (required positional), ``output_file`` (optional
        positional, ``None`` when omitted) and ``year`` (string taken from
        ``--year``, ``"2026"`` when the flag is absent).
    """
    cli = argparse.ArgumentParser(description="清理微信支付账单数据")
    cli.add_argument(
        "input_file",
        help="输入的微信账单CSV文件",
    )
    cli.add_argument(
        "output_file",
        nargs="?",
        default=None,
        help="输出文件(默认为 输入文件名_cleaned.csv",
    )
    cli.add_argument(
        "--year",
        type=str,
        default="2026",
        help="保留的年份默认2026",
    )
    namespace = cli.parse_args()
    return namespace
# Output column header, aligned with the Alipay export layout.
ALIGNED_HEADER = [
    "交易时间",
    "交易分类",
    "交易对方",
    "对方账号",
    "商品说明",
    "收/支",
    "金额",
    "收/付款方式",
    "交易状态",
    "交易订单号",
    "商家订单号",
    "备注",
]
def parse_amount(amount_str):
    """Parse an amount string into a ``Decimal``, ignoring the currency sign.

    Both the half-width yen sign (U+00A5) and the full-width yen sign
    (U+FFE5) are removed, together with spaces and surrounding whitespace.

    Args:
        amount_str: Amount text, e.g. a yen sign followed by ``12.50``.

    Returns:
        The parsed ``Decimal``. ``Decimal("0")`` is returned when the text is
        not a valid number or the argument is not a string; this best-effort
        fallback is deliberate so one malformed cell does not abort a run.
    """
    try:
        cleaned = (
            amount_str
            .replace("\u00a5", "")   # half-width yen sign
            .replace("\uffe5", "")   # full-width yen sign
            .replace(" ", "")
            .strip()
        )
        return Decimal(cleaned)
    except (ArithmeticError, AttributeError, TypeError, ValueError):
        # decimal.InvalidOperation derives from ArithmeticError;
        # AttributeError/TypeError cover non-string input such as None.
        return Decimal("0")
def format_amount(amount):
    """Render a ``Decimal`` amount as a plain string with two decimals.

    The value is rounded half-up to cents; no currency sign is added, so the
    result matches the amount format used for the Alipay-aligned output.
    """
    cent = Decimal("0.01")
    rounded = amount.quantize(cent, rounding=ROUND_HALF_UP)
    return str(rounded)
def extract_refund_amount(status):
    """Extract the refunded amount from a transaction status string.

    Recognised forms are the "已退款" marker followed by an optional opening
    parenthesis (half- or full-width), an optional yen sign (U+00A5 or
    U+FFE5) and a decimal number.

    Args:
        status: Text of the status column.

    Returns:
        * ``Decimal`` with the refunded amount when one can be parsed;
        * ``None`` when the status says "已全额退款" (full refund), meaning
          the caller must take the amount from the expense itself;
        * ``Decimal("0")`` when no refund information is found.
    """
    # The number is matched as digits with an optional fraction (or a bare
    # fraction), so malformed runs such as "1.2.3" or "." can no longer reach
    # Decimal() and raise.
    match = re.search(
        r"已退款[(\uff08]?[\u00a5\uffe5]?\s*(\d+(?:\.\d*)?|\.\d+)",
        status,
    )
    if match:
        return Decimal(match.group(1))
    if "已全额退款" in status:
        return None  # full refund: amount must come from the expense row
    return Decimal("0")
def infer_category(merchant, product, income_expense):
    """Infer a transaction category from the merchant and product text.

    Matching is done by substring on the lower-cased
    ``"<merchant> <product>"`` text. Income rows are classified first; for
    everything else the rules below are tried in priority order and the first
    rule with a matching keyword wins.

    Args:
        merchant: Counterparty name.
        product: Product description.
        income_expense: Direction column; ``"收入"`` marks income.

    Returns:
        One of "退款", "其他收入", "医疗健康", "交通出行", "充值缴费",
        "文化休闲", "日用百货", "餐饮美食" or "其他支出" (the default).
    """
    combined = f"{merchant.lower()} {product.lower()}"

    # Income: only distinguish refunds from everything else.
    if income_expense == "收入":
        return "退款" if "退款" in combined else "其他收入"

    # NOTE(review): the keyword lists previously contained empty-string
    # entries. An empty string is a substring of every text, so the first
    # rule always matched and every expense became "医疗健康". They are
    # removed here; they look like keywords whose characters were lost, so
    # restore the intended words if they are known.
    rules = (
        ("医疗健康", ("医院", "诊所", "健康", "皮肤", "医疗", "体检")),
        ("交通出行", ("出行", "打车", "单车", "骑行", "骑安", "滴滴", "高德",
                      "班车", "通勤", "公交", "地铁", "火车", "机票", "航空",
                      "共享", "京庐", "哈啰", "美团单车", "青桔")),
        ("充值缴费", ("充值", "缴费", "水费", "电费", "燃气", "话费", "流量")),
        ("文化休闲", ("电影", "游戏", "娱乐", "健身", "运动", "滑雪", "冰雪",
                      "旅游", "景区", "门票", "会员", "视频", "音乐")),
        # Hema / fresh-grocery stores are checked before the food rule so
        # they land in daily goods rather than dining.
        ("日用百货", ("盒马", "鲜生")),
        ("餐饮美食", ("coffee", "咖啡", "luckin", "瑞幸", "星巴克", "starbucks",
                      "食堂", "订餐", "奶茶", "饮品", "美食",
                      "烧烤", "火锅", "小吃", "甜品", "蛋糕",
                      "超市", "麦当劳", "肯德基", "必胜客")),
        # "超市" also appears in the food rule above, which takes precedence.
        ("日用百货", ("沃尔玛", "walmart", "京东", "京邦达", "快递", "淘宝",
                      "天猫", "拼多多", "便利店", "超市", "商场", "购物")),
    )
    for category, keywords in rules:
        if any(keyword in combined for keyword in keywords):
            return category

    # Transfers and anything unmatched fall into the generic bucket.
    return "其他支出"
def convert_row_to_aligned_format(row, remark_override=None):
    """Convert one raw WeChat bill row into the Alipay-aligned column layout.

    Input columns (by index): 0 transaction time, 1 transaction type,
    2 counterparty, 3 product, 4 income/expense, 5 amount, 6 payment method,
    7 current status, 8 transaction number, 9 merchant order number,
    10 remark.

    Output columns follow ``ALIGNED_HEADER``: time, category, counterparty,
    counterparty account, product description, income/expense, amount,
    payment method, status, transaction number, merchant order number,
    remark.

    Args:
        row: Raw row as a list of strings; indexes 0-8 are read
            unconditionally, 9 and 10 only when present.
        remark_override: When truthy, used as the remark instead of the
            row's own remark column.

    Returns:
        A new 12-element list in the aligned layout.
    """
    # Optional trailing columns fall back to defaults when the row is short.
    if len(row) > 9:
        merchant_order_no = row[9]
    else:
        merchant_order_no = ""

    if remark_override:
        remark = remark_override
    elif len(row) > 10:
        remark = row[10]
    else:
        remark = "/"

    counterparty = row[2]
    goods = row[3]
    direction = row[4]

    return [
        row[0],                                            # transaction time
        infer_category(counterparty, goods, direction),    # inferred category
        counterparty,
        "/",                                               # WeChat has no account column
        goods,
        direction,
        format_amount(parse_amount(row[5])),               # amount without currency sign
        row[6],                                            # payment method
        row[7],                                            # status
        row[8],                                            # transaction number
        merchant_order_no,
        remark,
    ]
def main():
    """Clean a WeChat Pay bill CSV and write it in the Alipay-aligned layout.

    Steps:
      1. Read the input CSV (the first line is discarded as a header) and
         keep only rows whose first column starts with the requested year.
      2. Split rows into refund records (transaction type contains "-退款"),
         expenses and other income. Refund records are only counted; they
         are not written to the output.
      3. Drop expenses whose status marks a full refund, reduce partially
         refunded expenses to the remaining amount (with a remark), and keep
         everything else unchanged.
      4. Convert the kept rows, sort them by time descending, write the
         output CSV and print the total and a per-category summary of the
         kept expenses.

    Command-line arguments are taken from ``parse_args()``. Progress and
    statistics are printed to stdout.
    """
    import os

    # Columns 0-8 are indexed unconditionally further down (status is column
    # 7, transaction number column 8), so shorter rows cannot be processed.
    min_columns = 9

    args = parse_args()
    input_file = args.input_file
    output_file = args.output_file
    year = args.year

    # Derive the output name from the input name when none was given.
    if output_file is None:
        base_name = os.path.splitext(input_file)[0]
        output_file = f"{base_name}_cleaned.csv"

    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print(f"保留年份: {year}")
    print()

    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(input_file, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # discard the header line; tolerate an empty file
        rows = list(reader)
    print(f"原始数据行数: {len(rows)}")

    # Step 1: keep only rows of the requested year.
    rows_year = [row for row in rows if row and row[0].startswith(year)]
    print(f"{year}年数据行数: {len(rows_year)}")

    # Step 2: split into refund / expense / income rows.
    refund_rows = []    # refund records (counted only)
    expense_rows = []   # expense records
    income_rows = []    # non-refund income, kept as is
    skipped_short = 0
    for row in rows_year:
        if len(row) < min_columns:
            skipped_short += 1
            continue
        transaction_type = row[1]
        income_expense = row[4]
        if "-退款" in transaction_type:
            refund_rows.append(row)
        elif income_expense == "支出":
            expense_rows.append(row)
        elif income_expense == "收入":
            income_rows.append(row)
    if skipped_short:
        print(f"跳过列数不足的行: {skipped_short}")
    print(f"退款条目数: {len(refund_rows)}")
    print(f"支出条目数: {len(expense_rows)}")
    print(f"其他收入条目数: {len(income_rows)}")

    # Step 3: apply refunds using the status column of each expense.
    final_expense_rows = []   # list of (row, remark_override)
    fully_refunded = 0
    partially_refunded = 0
    for row in expense_rows:
        status = row[7]
        original_amount = parse_amount(row[5])
        refund_amt = None
        if "已全额退款" in status:
            is_full_refund = True
        elif "已退款" in status:
            refund_amt = extract_refund_amount(status)
            # A parsed refund that covers the whole (valid) expense is a full
            # refund even though the status does not say so explicitly.
            is_full_refund = bool(
                refund_amt
                and original_amount > 0
                and refund_amt >= original_amount
            )
        else:
            is_full_refund = False

        if is_full_refund:
            fully_refunded += 1
            print(f" 全额退款删除: {row[0]} | {row[2]} | {row[3][:25]}... | {row[5]}")
        elif refund_amt and refund_amt < original_amount:
            # Partial refund: keep the difference and record the adjustment.
            remaining = original_amount - refund_amt
            new_row = list(row)
            new_row[5] = format_amount(remaining)
            remark = f"原金额{row[5]},退款¥{refund_amt}"
            final_expense_rows.append((new_row, remark))
            partially_refunded += 1
            print(f" 部分退款: {row[0]} | {row[2]} | 原{row[5]} -> ¥{format_amount(remaining)}")
        else:
            # Normal expense, or a refund amount that could not be used:
            # keep the original record untouched.
            final_expense_rows.append((row, None))

    print("\n处理结果:")
    print(f" 全额退款删除: {fully_refunded}")
    print(f" 部分退款调整: {partially_refunded}")
    print(f" 保留支出条目: {len(final_expense_rows)}")
    print(f" 保留收入条目: {len(income_rows)}")

    # Step 4: convert, merge and sort by time (newest first).
    aligned_expense_rows = [
        convert_row_to_aligned_format(r, remark) for r, remark in final_expense_rows
    ]
    aligned_income_rows = [
        convert_row_to_aligned_format(r, None) for r in income_rows
    ]
    final_rows = aligned_expense_rows + aligned_income_rows
    final_rows.sort(key=lambda x: x[0], reverse=True)
    print(f" 最终保留行数: {len(final_rows)}")

    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(ALIGNED_HEADER)
        writer.writerows(final_rows)
    print(f"\n清理后的数据已保存到: {output_file}")

    # Expense total and per-category totals (column 6 is the amount,
    # column 1 the inferred category).
    total = Decimal("0")
    categories = {}
    for row in aligned_expense_rows:
        amt = Decimal(row[6])
        total += amt
        categories[row[1]] = categories.get(row[1], Decimal("0")) + amt
    print(f"清理后支出总额: ¥{total}")

    print("\n=== 按分类统计 ===")
    for cat, amt in sorted(categories.items(), key=lambda x: -x[1]):
        print(f" {cat}: ¥{amt}")
# Script entry point: run the cleaning pipeline only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()