first commit

This commit is contained in:
CHE LIANG ZHAO
2026-01-07 18:14:53 +08:00
commit b15922a027
6 changed files with 831 additions and 0 deletions

190
clean_alipay_data.py Normal file
View File

@@ -0,0 +1,190 @@
"""
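Clean up Alipay transaction detail data.

1. Keep only the rows of the given year (default: 2026).
2. For every refund entry, find the matching expense:
   - if the refund amount equals the expense amount, drop both rows;
   - if the refund amount is less than the expense amount, keep the
     difference and note it in the remark column.

Usage:   python clean_alipay_data.py <input file> [output file] [--year YEAR]
Example: python clean_alipay_data.py 支付宝交易明细.csv output.csv --year 2026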
"""
import argparse
import csv
from decimal import Decimal, InvalidOperation, ROUND_HALF_UP
def parse_args(argv=None):
    """Parse the command-line options of the cleanup script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. With the default ``None``, argparse reads
            ``sys.argv[1:]``, so existing ``parse_args()`` callers are
            unaffected.

    Returns:
        argparse.Namespace with ``input_file``, ``output_file`` (``None``
        when omitted) and ``year`` (a string, ``"2026"`` by default).
    """
    parser = argparse.ArgumentParser(description="清理支付宝交易明细数据")
    parser.add_argument("input_file", help="输入的支付宝账单CSV文件")
    parser.add_argument(
        "output_file",
        nargs="?",
        default=None,
        help="输出文件(默认为 输入文件名_cleaned.csv",
    )
    # The year stays a string because it is used as a text prefix filter.
    parser.add_argument("--year", type=str, default="2026", help="保留的年份默认2026")
    return parser.parse_args(argv)
def parse_amount(amount_str):
    """Convert an amount cell into a Decimal, falling back to zero.

    Args:
        amount_str: Text of the amount cell; surrounding whitespace is
            ignored.

    Returns:
        The parsed ``Decimal``, or ``Decimal("0")`` when the value is not a
        string, is empty, is not a valid number, or is not finite.
    """
    try:
        amount = Decimal(amount_str.strip())
    except (InvalidOperation, AttributeError, TypeError, ValueError):
        # Best-effort parsing: an unreadable cell counts as zero. Only
        # conversion failures are swallowed, so KeyboardInterrupt and
        # SystemExit still propagate (a bare ``except`` would hide them).
        return Decimal("0")
    # "NaN" and "Infinity" are accepted by Decimal() but raise later when the
    # amounts are compared, so treat them like any other unusable value.
    if not amount.is_finite():
        return Decimal("0")
    return amount
def find_matching_expense(refund_row, expense_rows):
    """Locate the expense record that a refund row refers to.

    Column 9 is read as the transaction order number and column 10 as the
    merchant order number. The refund's order number is cut at the first
    "_" to obtain the original order number.

    Args:
        refund_row: The refund record (a list of cell strings).
        expense_rows: Candidate expense records to search, in order.

    Returns:
        ``(index, expense_row)`` for the first candidate whose order number
        equals the refund's original order number or whose merchant order
        number equals the refund's; ``(None, None)`` when the refund row has
        fewer than 11 columns or nothing matches.
    """
    required_columns = 11
    if len(refund_row) < required_columns:
        return None, None

    # partition() yields the whole string when there is no "_" separator.
    wanted_order = refund_row[9].strip().partition("_")[0]
    wanted_merchant = refund_row[10].strip()

    for position, candidate in enumerate(expense_rows):
        if len(candidate) < required_columns:
            continue
        candidate_order = candidate[9].strip()
        candidate_merchant = candidate[10].strip()
        # Empty identifiers never count as a match.
        order_matches = bool(wanted_order) and candidate_order == wanted_order
        merchant_matches = bool(wanted_merchant) and candidate_merchant == wanted_merchant
        if order_matches or merchant_matches:
            return position, candidate

    return None, None
def _aggregate_refunds(refund_rows):
    """Sum refund amounts per original order and log each usable refund row.

    The key is the refund's order number (column 9) cut at the first "_",
    or the merchant order number (column 10) when that is empty. One expense
    may have several refunds, so amounts with the same key are added up.
    Refund rows with fewer than 11 columns are ignored.
    """
    order_refunds = {}
    for refund_row in refund_rows:
        if len(refund_row) < 11:
            continue
        refund_amount = parse_amount(refund_row[6])
        original_order = refund_row[9].strip().split("_")[0]
        key = original_order or refund_row[10].strip()
        if key:
            order_refunds[key] = order_refunds.get(key, Decimal("0")) + refund_amount
        print(f" 退款记录: {refund_row[0]} | {refund_row[2]} | {refund_amount}")
    return order_refunds


def _find_refund(order_no, merchant_no, order_refunds):
    """Return (key, total) of the first aggregated refund matching an expense.

    A refund key matches when it equals the expense's order number or
    merchant order number, or when the order number starts with the key.
    Returns ``(None, None)`` when no key matches.
    """
    for key, amount in order_refunds.items():
        if order_no == key or merchant_no == key or order_no.startswith(key):
            return key, amount
    return None, None


def _apply_refunds(expense_rows, order_refunds):
    """Drop fully refunded rows and reduce partially refunded ones.

    Returns ``(final_rows, fully_refunded, partially_refunded, matched_keys)``
    where ``matched_keys`` is the set of refund keys that offset some row.
    """
    final_rows = []
    fully_refunded = 0
    partially_refunded = 0
    matched_keys = set()

    for row in expense_rows:
        # Same 11-column minimum as for refund rows, so that an expense
        # without a remark column can still be offset by its refund.
        if len(row) < 11:
            final_rows.append(row)
            continue

        expense_amount = parse_amount(row[6])
        matched_key, refund_amount = _find_refund(
            row[9].strip(), row[10].strip(), order_refunds
        )
        if matched_key is None:
            # No refund for this row: keep it unchanged.
            final_rows.append(row)
            continue
        matched_keys.add(matched_key)

        if refund_amount >= expense_amount:
            # Fully refunded: the row is removed from the output.
            fully_refunded += 1
            print(f" 全额退款删除: {row[0]} | {row[2]} | {row[4][:25]}... | 原{expense_amount}")
            continue

        # Partially refunded: keep the difference and explain it in the
        # remark column (column 11).
        remaining = expense_amount - refund_amount
        remaining_str = str(remaining.quantize(Decimal("0.01"), rounding=ROUND_HALF_UP))
        new_row = row.copy()
        new_row[6] = remaining_str
        if len(new_row) < 12:
            # Create the remark column when the row does not have one.
            new_row.append("")
        original_remark = new_row[11]
        suffix = ";" + original_remark if original_remark else ""
        new_row[11] = f"原金额{expense_amount}元,退款{refund_amount}{suffix}"
        final_rows.append(new_row)
        partially_refunded += 1
        print(f" 部分退款: {row[0]} | {row[2]} | 原{expense_amount}元 -> {remaining_str}")

    return final_rows, fully_refunded, partially_refunded, matched_keys


def main():
    """Run the cleanup: filter by year, net refunds against expenses, write CSV.

    Steps:
        1. Read the input CSV; the first row is kept as the header.
        2. Keep only rows whose first column starts with the requested year.
        3. Split rows into refunds (column 1 equals "退款") and all others.
        4. Aggregate refund totals per original order.
        5. Remove rows that are fully refunded, reduce partially refunded
           rows, and keep everything else unchanged.
        6. Write the header and the remaining rows to the output file.

    Refund rows themselves are never written to the output. Refunds that
    match no remaining row are reported on stdout so they do not vanish
    silently.

    Raises:
        SystemExit: If the input file contains no rows at all.
    """
    import os

    args = parse_args()
    input_file = args.input_file
    output_file = args.output_file
    year = args.year

    # Derive the output name from the input name when none was given.
    if output_file is None:
        base_name = os.path.splitext(input_file)[0]
        output_file = f"{base_name}_cleaned.csv"

    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print(f"保留年份: {year}")
    print()

    # newline="" lets the csv module handle line endings inside quoted
    # fields itself, as the csv documentation recommends.
    with open(input_file, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        rows = list(reader)
    if header is None:
        # An empty file has no header; stop with a clear message instead of
        # an unhandled StopIteration.
        raise SystemExit(f"输入文件为空: {input_file}")
    print(f"原始数据行数: {len(rows)}")

    # Step 1: keep only the rows of the requested year.
    rows_year = [row for row in rows if row and row[0].startswith(year)]
    print(f"{year}年数据行数: {len(rows_year)}")

    # Step 2: separate refund rows from all other rows.
    refund_rows = []
    expense_rows = []
    for row in rows_year:
        if len(row) > 1 and row[1] == "退款":
            refund_rows.append(row)
        else:
            expense_rows.append(row)
    print(f"退款条目数: {len(refund_rows)}")
    print(f"非退款条目数: {len(expense_rows)}")

    # Step 3: total the refunds per original order.
    order_refunds = _aggregate_refunds(refund_rows)
    print(f"有退款的订单数: {len(order_refunds)}")

    # Step 4: net the refunds against the remaining rows.
    final_rows, fully_refunded, partially_refunded, matched_keys = _apply_refunds(
        expense_rows, order_refunds
    )

    print("\n处理结果:")
    print(f" 全额退款删除: {fully_refunded}")
    print(f" 部分退款调整: {partially_refunded}")
    print(f" 最终保留行数: {len(final_rows)}")

    # Refunds whose expense was not found (for example, the purchase falls
    # in another year) are dropped from the output; report them explicitly.
    unmatched = {
        key: amount for key, amount in order_refunds.items() if key not in matched_keys
    }
    if unmatched:
        print(f" 未匹配到支出的退款订单数: {len(unmatched)}")
        for key, amount in unmatched.items():
            print(f" 未匹配退款: {key} | {amount}")

    # Write the cleaned data.
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(final_rows)
    print(f"\n清理后的数据已保存到: {output_file}")
# Script entry point: run the cleanup only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()