first commit

This commit is contained in:
CHE LIANG ZHAO
2026-01-07 18:14:53 +08:00
commit b15922a027
6 changed files with 831 additions and 0 deletions

308
clean_wechat_data.py Normal file
View File

@@ -0,0 +1,308 @@
"""
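Clean WeChat Pay bill data.
1. Keep only the records of the given year (default: 2026).
2. For refunded entries, find the matching expense:
   - fully refunded: drop both records
   - partially refunded: keep the difference and note it in the remark
3. Align the fields with the Alipay export format.
4. Infer the transaction category from the merchant name.
Usage: python clean_wechat_data.py <input file> [output file] [--year YEAR]
Example: python clean_wechat_data.py 微信账单.csv output.csv --year 2026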
"""
import csv
import re
import argparse
from decimal import Decimal, ROUND_HALF_UP
def parse_args(argv=None):
    """Parse the command-line arguments of the WeChat bill cleaner.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. With the default ``None`` argparse reads
            ``sys.argv[1:]``, which is the previous behavior.

    Returns:
        argparse.Namespace with ``input_file`` (str), ``output_file``
        (str, or None when omitted) and ``year`` (str, default "2026").
    """
    parser = argparse.ArgumentParser(description="清理微信支付账单数据")
    parser.add_argument("input_file", help="输入的微信账单CSV文件")
    parser.add_argument(
        "output_file",
        nargs="?",
        default=None,
        help="输出文件(默认为 输入文件名_cleaned.csv)",
    )
    # The year stays a string because it is used as a prefix match on the
    # transaction-time column.
    parser.add_argument("--year", type=str, default="2026", help="保留的年份(默认2026)")
    return parser.parse_args(argv)
# Output column names, aligned with the Alipay export layout.
ALIGNED_HEADER = [
    "交易时间",
    "交易分类",
    "交易对方",
    "对方账号",
    "商品说明",
    "收/支",
    "金额",
    "收/付款方式",
    "交易状态",
    "交易订单号",
    "商家订单号",
    "备注",
]
def parse_amount(amount_str):
    """Parse a WeChat amount string such as "¥12.50" into a Decimal.

    The currency symbol (half-width "¥" or full-width "¥"), thousands
    separators and spaces are removed before conversion.

    Args:
        amount_str: Raw text of the amount column.

    Returns:
        The amount as a Decimal. Decimal("0") is returned when the text is
        not a finite number or the argument is not a string.
    """
    try:
        cleaned = amount_str.strip()
        for junk in ("¥", "¥", ",", ",", " "):
            cleaned = cleaned.replace(junk, "")
        value = Decimal(cleaned)
    except (ArithmeticError, AttributeError, TypeError, ValueError):
        # decimal.InvalidOperation (malformed number) derives from
        # ArithmeticError; AttributeError/TypeError cover non-string input.
        return Decimal("0")
    # "NaN"/"Infinity" are valid Decimal literals but never valid amounts.
    return value if value.is_finite() else Decimal("0")
def format_amount(amount):
    """Render a Decimal amount as a two-decimal string without a currency symbol.

    Rounding is half-up; the plain-number form matches the Alipay export.
    """
    cent = Decimal("0.01")
    rounded = amount.quantize(cent, rounding=ROUND_HALF_UP)
    return str(rounded)
# Matches "已退款(¥1.00)", "已退款(¥1.00)" and "已退款¥1.00": optional ASCII or
# full-width opening parenthesis, optional half-/full-width yen sign, then a
# well-formed number (thousands separators allowed).
_REFUND_AMOUNT_RE = re.compile(r"已退款\s*[((]?\s*[¥¥]?\s*(\d[\d,]*(?:\.\d+)?)")


def extract_refund_amount(status):
    """Extract the refunded amount from a WeChat status string.

    Args:
        status: Text of the status column, e.g. "已退款(¥1.00)".

    Returns:
        Decimal with the refunded amount when the status carries one;
        None when the status says "已全额退款" (the caller must take the
        amount from the expense itself); Decimal("0") when the status
        mentions no refund.
    """
    match = _REFUND_AMOUNT_RE.search(status)
    if match:
        # Drop thousands separators so Decimal accepts the number.
        return Decimal(match.group(1).replace(",", ""))
    if "已全额退款" in status:
        return None
    return Decimal("0")
# Ordered (category, keywords) rules for expense rows. The first rule with a
# keyword contained in the lower-cased "merchant product" text wins, so the
# order below is the matching priority.
# NOTE(review): the previous keyword lists contained several empty-string
# entries (probably single-character keywords lost in transit). An empty
# string is a substring of every text, so they made every expense match the
# first rule. They are omitted here; re-add the intended keywords if known.
_EXPENSE_CATEGORY_RULES = (
    ("医疗健康", ("医院", "诊所", "健康", "皮肤", "医疗", "体检")),
    ("交通出行", ("出行", "打车", "单车", "骑行", "骑安", "滴滴", "高德",
                  "班车", "通勤", "公交", "地铁", "火车", "机票", "航空",
                  "共享", "京庐", "哈啰", "美团单车", "青桔")),
    ("充值缴费", ("充值", "缴费", "水费", "电费", "燃气", "话费", "流量")),
    ("文化休闲", ("电影", "游戏", "娱乐", "健身", "运动", "滑雪", "冰雪",
                  "旅游", "景区", "门票", "会员", "视频", "音乐")),
    # Hema / fresh-grocery merchants count as daily goods and are checked
    # ahead of the food keywords.
    ("日用百货", ("盒马", "鲜生")),
    ("餐饮美食", ("coffee", "咖啡", "luckin", "瑞幸", "星巴克", "starbucks",
                  "食堂", "订餐", "奶茶", "饮品", "美食", "烧烤", "火锅",
                  "小吃", "甜品", "蛋糕", "超市", "麦当劳", "肯德基", "必胜客")),
    # "超市" is matched by the food rule above, as before, so it is not
    # repeated here.
    ("日用百货", ("沃尔玛", "walmart", "京东", "京邦达", "快递", "淘宝",
                  "天猫", "拼多多", "便利店", "商场", "购物")),
)


def infer_category(merchant, product, income_expense):
    """Infer the transaction category from the merchant and product text.

    Args:
        merchant: Counterparty name (None is treated as empty).
        product: Product description (None is treated as empty).
        income_expense: Value of the income/expense column; "收入" marks
            an income row, anything else is classified as an expense.

    Returns:
        For income rows "退款" when the text mentions a refund, otherwise
        "其他收入". For other rows the category of the first matching
        keyword rule, or "其他支出" when nothing matches (transfers
        included).
    """
    combined = f"{(merchant or '').lower()} {(product or '').lower()}"

    if income_expense == "收入":
        return "退款" if "退款" in combined else "其他收入"

    for category, keywords in _EXPENSE_CATEGORY_RULES:
        # Skip empty keywords defensively: "" is contained in every string.
        if any(kw and kw in combined for kw in keywords):
            return category

    return "其他支出"
def convert_row_to_aligned_format(row, remark_override=None):
    """Convert one raw WeChat row into the Alipay-aligned column layout.

    WeChat input columns (by index): 0 time, 1 type, 2 counterparty,
    3 product, 4 income/expense, 5 amount, 6 payment method, 7 status,
    8 transaction no., 9 merchant order no., 10 remark.
    Output columns follow ALIGNED_HEADER: time, category, counterparty,
    counterparty account, product description, income/expense, amount,
    payment method, status, transaction no., merchant order no., remark.

    Args:
        row: Raw WeChat row; indexes 0-8 are read unconditionally.
        remark_override: When truthy, used as the remark instead of the
            row's own remark column.

    Returns:
        A 12-element list in the aligned layout.
    """
    when = row[0]
    counterparty = row[2]
    description = row[3]
    direction = row[4]
    value = parse_amount(row[5])  # amount text -> Decimal
    pay_method = row[6]
    state = row[7]
    txn_no = row[8]

    # The merchant order number and remark columns may be absent.
    shop_order_no = ""
    if len(row) > 9:
        shop_order_no = row[9]

    if remark_override:
        note = remark_override
    elif len(row) > 10:
        note = row[10]
    else:
        note = "/"

    inferred = infer_category(counterparty, description, direction)

    aligned = [when, inferred, counterparty]
    aligned.append("/")  # WeChat has no counterparty-account column
    aligned += [description, direction, format_amount(value)]
    aligned += [pay_method, state, txn_no, shop_order_no, note]
    return aligned
def main():
    """Clean a WeChat Pay bill CSV and write it in the Alipay-aligned layout.

    Steps:
      1. Read the input CSV (the first line is skipped) and keep the rows
         whose transaction time starts with the requested year.
      2. Split the rows into refund records (dropped), expenses and other
         income.
      3. Drop fully refunded expenses; for partial refunds keep the
         remaining amount and record the original amount in the remark.
      4. Convert the kept rows to the aligned layout, sort them by time in
         descending order, write the output CSV and print summary totals.

    Reads the command line through parse_args(); returns None.
    """
    import os  # local import: os is not among the module-level imports

    args = parse_args()
    input_file = args.input_file
    output_file = args.output_file
    year = args.year

    # Derive the output name from the input name when none was given.
    if output_file is None:
        base_name = os.path.splitext(input_file)[0]
        output_file = f"{base_name}_cleaned.csv"

    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print(f"保留年份: {year}")
    print()

    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(input_file, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the first line; tolerate an empty file
        rows = list(reader)
    print(f"原始数据行数: {len(rows)}")

    # Step 1: keep only rows of the requested year (prefix match on time).
    rows_year = [row for row in rows if row and row[0].startswith(year)]
    print(f"{year}年数据行数: {len(rows_year)}")

    # Step 2: separate refund records, expenses and other income.
    # Columns up to index 8 (transaction no.) are read further down, so
    # shorter rows are skipped instead of raising IndexError later.
    min_columns = 9
    refund_rows = []   # refund records (type column contains "-退款")
    expense_rows = []  # expense records
    income_rows = []   # non-refund income (e.g. incoming transfers), kept
    for row in rows_year:
        if len(row) < min_columns:
            continue
        transaction_type = row[1]
        income_expense = row[4]
        if "-退款" in transaction_type:
            refund_rows.append(row)
        elif income_expense == "支出":
            expense_rows.append(row)
        elif income_expense == "收入":
            income_rows.append(row)
    print(f"退款条目数: {len(refund_rows)}")
    print(f"支出条目数: {len(expense_rows)}")
    print(f"其他收入条目数: {len(income_rows)}")

    # Step 3: apply refunds, which WeChat records in the expense's status.
    final_expense_rows = []  # (row, remark override or None)
    fully_refunded = 0
    partially_refunded = 0
    for row in expense_rows:
        status = row[7]
        original_amount = parse_amount(row[5])
        if "已全额退款" in status:
            fully_refunded += 1
            print(f" 全额退款删除: {row[0]} | {row[2]} | {row[3][:25]}... | {row[5]}")
        elif "已退款" in status:
            refund_amt = extract_refund_amount(status)
            if refund_amt and original_amount > 0 and refund_amt >= original_amount:
                # The refund covers the whole payment: treat it as a full
                # refund, since the refund record itself is dropped too.
                fully_refunded += 1
                print(f" 全额退款删除: {row[0]} | {row[2]} | {row[3][:25]}... | {row[5]}")
            elif refund_amt and refund_amt < original_amount:
                remaining = original_amount - refund_amt
                # Work on a copy so the raw row keeps its original amount.
                new_row = row.copy()
                new_row[5] = format_amount(remaining)
                remark = f"原金额{row[5]},退款¥{refund_amt}"
                final_expense_rows.append((new_row, remark))
                partially_refunded += 1
                print(f" 部分退款: {row[0]} | {row[2]} | 原{row[5]} -> ¥{format_amount(remaining)}")
            else:
                # Refund amount could not be determined: keep the record.
                final_expense_rows.append((row, None))
        else:
            final_expense_rows.append((row, None))

    print(f"\n处理结果:")
    print(f" 全额退款删除: {fully_refunded}")
    print(f" 部分退款调整: {partially_refunded}")
    print(f" 保留支出条目: {len(final_expense_rows)}")
    print(f" 保留收入条目: {len(income_rows)}")

    # Step 4: convert to the aligned layout and sort newest first. The time
    # column is compared as text.
    aligned_expense_rows = [
        convert_row_to_aligned_format(r, remark) for r, remark in final_expense_rows
    ]
    aligned_income_rows = [convert_row_to_aligned_format(r, None) for r in income_rows]
    final_rows = aligned_expense_rows + aligned_income_rows
    final_rows.sort(key=lambda x: x[0], reverse=True)
    print(f" 最终保留行数: {len(final_rows)}")

    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(ALIGNED_HEADER)
        writer.writerows(final_rows)
    print(f"\n清理后的数据已保存到: {output_file}")

    # Expense total (column 6 holds the formatted amount).
    total = sum((Decimal(row[6]) for row in aligned_expense_rows), Decimal("0"))
    print(f"清理后支出总额: ¥{total}")

    # Per-category expense totals, largest first.
    print("\n=== 按分类统计 ===")
    categories = {}
    for row in aligned_expense_rows:
        cat = row[1]
        categories[cat] = categories.get(cat, Decimal("0")) + Decimal(row[6])
    for cat, amt in sorted(categories.items(), key=lambda x: -x[1]):
        print(f" {cat}: ¥{amt}")
# Run the cleaner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()