Files
billai/clean_alipay_data.py
CHE LIANG ZHAO b15922a027 first commit
2026-01-07 18:14:53 +08:00

191 lines
6.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
清理支付宝交易明细数据
1. 仅保留指定年份的数据默认2026年
2. 对于退款的条目,找到对应的支出:
- 如果退款金额=支出金额,两条都删除
- 如果退款金额<支出金额,保留差额并备注
用法: python clean_alipay_data.py <输入文件> [输出文件] [--year 年份]
示例: python clean_alipay_data.py 支付宝交易明细.csv output.csv --year 2026
"""
import argparse
import csv
from decimal import Decimal, InvalidOperation, ROUND_HALF_UP
def parse_args(argv=None):
    """Parse the command-line arguments of the cleaning script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, which is what a plain ``parse_args()`` call did
            before this parameter existed.

    Returns:
        An ``argparse.Namespace`` with ``input_file``, ``output_file``
        (``None`` when omitted) and ``year`` (a string, default ``"2026"``).
    """
    parser = argparse.ArgumentParser(description="清理支付宝交易明细数据")
    parser.add_argument("input_file", help="输入的支付宝账单CSV文件")
    parser.add_argument("output_file", nargs="?", default=None, help="输出文件(默认为 输入文件名_cleaned.csv")
    parser.add_argument("--year", type=str, default="2026", help="保留的年份默认2026")
    return parser.parse_args(argv)
def parse_amount(amount_str):
    """Parse an amount string into a ``Decimal``.

    Surrounding whitespace and thousands separators (``,``) are ignored.

    Args:
        amount_str: The textual amount, e.g. ``" 12.50 "`` or ``"1,234.56"``.

    Returns:
        The parsed ``Decimal``. ``Decimal("0")`` is returned for anything that
        is not a finite decimal number: a non-string value, an empty or
        malformed string, or ``NaN`` / ``Infinity``.
    """
    try:
        amount = Decimal(amount_str.strip().replace(",", ""))
    except (AttributeError, TypeError, ValueError, InvalidOperation):
        # Only conversion failures are treated as "no amount"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return Decimal("0")
    # NaN/Infinity parse successfully but raise on ordering comparisons and
    # poison sums, so they are treated like any other unusable amount.
    return amount if amount.is_finite() else Decimal("0")
def find_matching_expense(refund_row, expense_rows):
    """Locate the expense row that a refund row refers to.

    Column 9 is read as the transaction order number and column 10 as the
    merchant order number. The part of the refund's order number before the
    first ``_`` is taken as the original order number.

    Args:
        refund_row: The refund record (a list of column strings).
        expense_rows: Candidate expense records to search, in order.

    Returns:
        ``(index, row)`` of the first candidate whose order number equals the
        refund's original order number, or whose merchant order number equals
        the refund's merchant order number (empty values never match).
        ``(None, None)`` when the refund row has fewer than 11 columns or no
        candidate matches.
    """
    if len(refund_row) < 11:
        return None, None

    merchant_id = refund_row[10].strip()
    # Everything before the first "_" (the whole string when there is none).
    base_order, _, _ = refund_row[9].strip().partition("_")

    for idx, candidate in enumerate(expense_rows):
        if len(candidate) < 11:
            continue  # too short to carry order numbers
        same_order = bool(base_order) and candidate[9].strip() == base_order
        same_merchant = bool(merchant_id) and candidate[10].strip() == merchant_id
        if same_order or same_merchant:
            return idx, candidate

    return None, None
def _refund_key(refund_row):
    """Return the order key a refund row is grouped under ("" if it has none)."""
    if len(refund_row) < 11:
        return ""
    # A refund's order number usually embeds the original order number
    # followed by "_<suffix>"; fall back to the merchant order number.
    original_order = refund_row[9].strip().split("_")[0]
    return original_order or refund_row[10].strip()


def _aggregate_refunds(refund_rows):
    """Sum refund amounts per order key (one expense may be refunded several times)."""
    order_refunds = {}
    for refund_row in refund_rows:
        key = _refund_key(refund_row)
        if not key:
            continue
        refund_amount = parse_amount(refund_row[6])
        order_refunds[key] = order_refunds.get(key, Decimal("0")) + refund_amount
        print(f" 退款记录: {refund_row[0]} | {refund_row[2]} | {refund_amount}")
    return order_refunds


def _find_refund_key(row, order_refunds):
    """Return the first refund key that matches an expense row, or None."""
    order_no = row[9].strip()
    merchant_no = row[10].strip()
    for key in order_refunds:
        if order_no == key or merchant_no == key or order_no.startswith(key):
            return key
    return None


def _apply_refunds(expense_rows, order_refunds):
    """Net the aggregated refunds against the expense rows.

    Returns ``(final_rows, fully_refunded, partially_refunded, matched_keys)``
    where ``matched_keys`` is the set of refund keys that hit an expense.
    """
    final_rows = []
    matched_keys = set()
    fully_refunded = 0
    partially_refunded = 0

    for row in expense_rows:
        # Rows without all 12 columns can be neither matched nor annotated
        # (the remark lives in column 11), so they pass through unchanged.
        matched_key = _find_refund_key(row, order_refunds) if len(row) >= 12 else None
        if matched_key is None:
            final_rows.append(row)
            continue

        matched_keys.add(matched_key)
        expense_amount = parse_amount(row[6])
        refund_amount = order_refunds[matched_key]

        if refund_amount >= expense_amount:
            # Fully refunded: the expense is dropped from the output.
            fully_refunded += 1
            print(f" 全额退款删除: {row[0]} | {row[2]} | {row[4][:25]}... | 原{expense_amount}")
            continue

        # Partially refunded: keep the difference and record the original
        # figures in the remark column, ahead of any existing remark.
        remaining = expense_amount - refund_amount
        remaining_str = str(remaining.quantize(Decimal("0.01"), rounding=ROUND_HALF_UP))
        new_row = row.copy()
        new_row[6] = remaining_str
        original_remark = new_row[11]
        remark_suffix = ";" + original_remark if original_remark else ""
        new_row[11] = f"原金额{expense_amount}元,退款{refund_amount}{remark_suffix}"
        final_rows.append(new_row)
        partially_refunded += 1
        print(f" 部分退款: {row[0]} | {row[2]} | 原{expense_amount}元 -> {remaining_str}")

    return final_rows, fully_refunded, partially_refunded, matched_keys


def main():
    """Clean an Alipay transaction CSV and write the result to a new CSV.

    Steps:
      1. Read the input file; its first row is treated as the header.
      2. Keep only rows whose first column starts with the requested year.
      3. Split rows into refunds (second column equals "退款") and the rest.
      4. Sum refunds per order key and net them against the other rows:
         fully refunded rows are dropped, partially refunded rows keep the
         difference plus a remark.
      5. Write the header and the remaining non-refund rows to the output.

    Refund rows themselves are never written to the output. Those that did
    not match any row are listed on stdout so the omission is visible.

    Raises:
        SystemExit: if the input file contains no rows at all.
    """
    import os  # local import: only needed to derive the default output path

    args = parse_args()
    input_file = args.input_file
    output_file = args.output_file
    year = args.year

    # Derive "<input stem>_cleaned.csv" when no output path was given.
    if output_file is None:
        base_name = os.path.splitext(input_file)[0]
        output_file = f"{base_name}_cleaned.csv"

    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print(f"保留年份: {year}")
    print()

    # newline="" lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(input_file, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        rows = list(reader)
    if header is None:
        # Fail with a readable message instead of a bare StopIteration.
        raise SystemExit(f"输入文件为空: {input_file}")
    print(f"原始数据行数: {len(rows)}")

    # Step 1: keep only the rows of the requested year.
    rows_year = [row for row in rows if row and row[0].startswith(year)]
    print(f"{year}年数据行数: {len(rows_year)}")

    # Step 2: separate refund rows from everything else.
    refund_rows = []
    expense_rows = []
    for row in rows_year:
        if len(row) > 1 and row[1] == "退款":
            refund_rows.append(row)
        else:
            expense_rows.append(row)
    print(f"退款条目数: {len(refund_rows)}")
    print(f"非退款条目数: {len(expense_rows)}")

    # Step 3: total the refunds per order key.
    order_refunds = _aggregate_refunds(refund_rows)
    print(f"有退款的订单数: {len(order_refunds)}")

    # Step 4: net the refunds against the non-refund rows.
    final_rows, fully_refunded, partially_refunded, matched_keys = _apply_refunds(
        expense_rows, order_refunds
    )

    print(f"\n处理结果:")
    print(f" 全额退款删除: {fully_refunded}")
    print(f" 部分退款调整: {partially_refunded}")
    print(f" 最终保留行数: {len(final_rows)}")

    # Refund rows that matched nothing are still left out of the output (as
    # before), but are now reported instead of vanishing silently.
    unmatched_refunds = [r for r in refund_rows if _refund_key(r) not in matched_keys]
    if unmatched_refunds:
        print(f" 未匹配到支出的退款(未写入输出): {len(unmatched_refunds)}")
        for refund_row in unmatched_refunds:
            summary = " | ".join(refund_row[:3])
            print(f"   {summary}")

    # Write the cleaned data.
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(final_rows)
    print(f"\n清理后的数据已保存到: {output_file}")
# Run the cleaning pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()