371 lines
14 KiB
Python
371 lines
14 KiB
Python
"""
|
||
京东白条账单清理模块
|
||
"""
|
||
import csv
|
||
import re
|
||
from decimal import Decimal
|
||
from pathlib import Path
|
||
|
||
import yaml
|
||
|
||
from .base import (
|
||
BaseCleaner, parse_amount, format_amount,
|
||
is_in_date_range, create_arg_parser
|
||
)
|
||
from category import infer_category
|
||
|
||
|
||
# 加载京东专属分类配置
|
||
JD_CONFIG_FILE = Path(__file__).parent.parent / "config" / "category_jd.yaml"
|
||
|
||
def load_jd_config():
|
||
"""加载京东分类配置"""
|
||
with open(JD_CONFIG_FILE, "r", encoding="utf-8") as f:
|
||
return yaml.safe_load(f)
|
||
|
||
_jd_config = load_jd_config()
|
||
|
||
|
||
def infer_jd_category(merchant: str, product: str, original_category: str) -> tuple[str, bool, int]:
|
||
"""
|
||
根据京东账单的商户名称、商品说明和原分类推断统一分类
|
||
|
||
Args:
|
||
merchant: 商户名称(如"京东外卖"、"京东平台商户")
|
||
product: 交易说明/商品说明
|
||
original_category: 京东原始分类(如"食品酒饮"、"数码电器")
|
||
|
||
Returns:
|
||
(分类名称, 是否确定, 复核等级)
|
||
|
||
复核等级:
|
||
0 = 无需复核(商户映射或原分类映射成功,高置信度)
|
||
1 = 低优先级复核(通用关键词匹配成功,需确认)
|
||
2 = 高优先级复核(全部匹配失败或未知分类,需人工分类)
|
||
"""
|
||
# 1. 先检查商户名称直接映射(如"京东外卖" -> "餐饮美食")
|
||
merchant_mapping = _jd_config.get("商户映射", {})
|
||
for merchant_key, category in merchant_mapping.items():
|
||
if merchant_key in merchant:
|
||
return category, True, 0 # 商户映射,无需复核
|
||
|
||
# 2. 尝试直接映射京东原分类
|
||
category_mapping = _jd_config.get("分类映射", {})
|
||
|
||
# 处理多分类情况(如"食品酒饮 其他网购")
|
||
original_cats = original_category.split() if original_category else []
|
||
for orig_cat in original_cats:
|
||
if orig_cat in category_mapping:
|
||
mapped = category_mapping[orig_cat]
|
||
if mapped: # 非空映射 → 使用映射结果
|
||
return mapped, True, 0 # 原分类映射,无需复核
|
||
# 空映射(如"其他"→"")→ 继续检查下一个原分类或进入关键词匹配
|
||
else:
|
||
# 未知分类(不在映射表中)→ 保留原分类,HIGH 复核
|
||
return orig_cat, True, 2
|
||
|
||
# 3. 使用通用分类推断(已包含京东平台商户关键词)
|
||
category, is_certain = infer_category(merchant, product, "支出")
|
||
if is_certain:
|
||
return category, True, 1 # 关键词匹配,低优先级复核
|
||
|
||
# 4. 返回默认分类
|
||
return _jd_config.get("默认分类", "其他支出"), False, 2 # 全部失败,高优先级复核
|
||
|
||
|
||
# 与支付宝/微信对齐的表头(包含"复核等级"字段)
|
||
ALIGNED_HEADER = [
|
||
"交易时间", "交易分类", "交易对方", "对方账号", "商品说明",
|
||
"收/支", "金额", "收/付款方式", "交易状态", "交易订单号", "商家订单号", "备注", "复核等级"
|
||
]
|
||
|
||
|
||
class JDCleaner(BaseCleaner):
|
||
"""京东白条账单清理器"""
|
||
|
||
def clean(self) -> None:
|
||
"""执行清理"""
|
||
self.print_header()
|
||
|
||
# 读取数据,跳过京东导出文件的头部信息
|
||
with open(self.input_file, "r", encoding="utf-8") as f:
|
||
reader = csv.reader(f)
|
||
header = None
|
||
rows = []
|
||
|
||
for row in reader:
|
||
# 跳过空行
|
||
if not row or not row[0].strip():
|
||
continue
|
||
|
||
# 清理每个字段的 tab 字符
|
||
row = [cell.strip().replace('\t', '') for cell in row]
|
||
|
||
# 查找实际的CSV头部行(包含"交易时间"和"商户名称")
|
||
if header is None:
|
||
if len(row) >= 2 and "交易时间" in row[0] and "商户名称" in row[1]:
|
||
header = row
|
||
continue
|
||
# 跳过头部信息行
|
||
continue
|
||
|
||
# 收集数据行
|
||
rows.append(row)
|
||
|
||
# 确保找到了有效的头部
|
||
if header is None:
|
||
raise ValueError("无法找到有效的京东账单表头(需包含'交易时间'和'商户名称'列)")
|
||
|
||
self.stats["original_count"] = len(rows)
|
||
print(f"原始数据行数: {len(rows)}")
|
||
|
||
# 第一步:按日期范围筛选
|
||
rows_filtered = [
|
||
row for row in rows
|
||
if row and is_in_date_range(row[0], self.start_date, self.end_date)
|
||
]
|
||
self.stats["filtered_count"] = len(rows_filtered)
|
||
|
||
date_desc = f"{self.start_date} ~ {self.end_date}" if self.start_date or self.end_date else "全部"
|
||
print(f"筛选后数据行数: {len(rows_filtered)} ({date_desc})")
|
||
|
||
# 第二步:分离退款和支出条目(过滤掉"不计收支")
|
||
refund_rows = []
|
||
expense_rows = []
|
||
skipped_count = 0 # 不计收支(还款、冻结等)
|
||
|
||
for row in rows_filtered:
|
||
if len(row) < 7:
|
||
continue
|
||
|
||
income_expense = row[6].strip() # 收/支 列
|
||
transaction_desc = row[2].strip() # 交易说明
|
||
status = row[5].strip() # 交易状态
|
||
|
||
# 过滤掉"不计收支"记录(还款、冻结、预授权等)
|
||
if income_expense == "不计收支":
|
||
skipped_count += 1
|
||
continue
|
||
|
||
# 退款判断:交易说明以"退款-"开头 或 状态包含"退款成功"
|
||
if transaction_desc.startswith("退款-") or "退款" in status:
|
||
refund_rows.append(row)
|
||
elif income_expense == "支出":
|
||
expense_rows.append(row)
|
||
|
||
print(f"退款条目数: {len(refund_rows)}")
|
||
print(f"支出条目数: {len(expense_rows)}")
|
||
print(f"不计收支过滤: {skipped_count} 条(还款/冻结等)")
|
||
|
||
# 第三步:处理退款
|
||
# 京东账单特点:已全额退款的记录金额会显示为 "179.00(已全额退款)"
|
||
final_expense_rows = self._process_expenses(expense_rows, refund_rows)
|
||
|
||
print(f"\n处理结果:")
|
||
print(f" 全额退款删除: {self.stats['fully_refunded']} 条")
|
||
print(f" 部分退款调整: {self.stats['partially_refunded']} 条")
|
||
if self.stats.get("zero_amount", 0) > 0:
|
||
print(f" 0元记录过滤: {self.stats['zero_amount']} 条")
|
||
print(f" 最终保留行数: {len(final_expense_rows)}")
|
||
|
||
# 第四步:转换为对齐格式并重新分类
|
||
aligned_rows = [self._convert_and_reclassify(row_data) for row_data in final_expense_rows]
|
||
|
||
# 按时间排序(最新在前)
|
||
aligned_rows.sort(key=lambda x: x[0], reverse=True)
|
||
|
||
# 统计复核数量
|
||
review_high_count = sum(1 for row in aligned_rows if row[-1] == "HIGH")
|
||
|
||
self.stats["final_count"] = len(aligned_rows)
|
||
if review_high_count > 0:
|
||
print(f" 高优先级复核: {review_high_count} 条(无法判断)")
|
||
|
||
# 写入文件
|
||
self.write_output(ALIGNED_HEADER, aligned_rows)
|
||
|
||
print(f"\n清理后的数据已保存到: {self.output_file}")
|
||
|
||
# 统计支出
|
||
self._print_expense_summary(aligned_rows)
|
||
|
||
def _parse_jd_amount(self, amount_str: str) -> tuple[Decimal, bool]:
|
||
"""
|
||
解析京东账单金额
|
||
|
||
京东金额格式特点:
|
||
- 普通金额: "179.00"
|
||
- 全额退款: "179.00(已全额退款)"
|
||
|
||
Returns:
|
||
(金额, 是否已全额退款)
|
||
"""
|
||
amount_str = amount_str.strip()
|
||
|
||
# 检查是否包含"已全额退款"
|
||
if "(已全额退款)" in amount_str or "(已全额退款)" in amount_str:
|
||
# 提取金额部分
|
||
amount_part = re.sub(r'[((]已全额退款[))]', '', amount_str)
|
||
return parse_amount(amount_part), True
|
||
|
||
return parse_amount(amount_str), False
|
||
|
||
def _process_expenses(self, expense_rows: list, refund_rows: list) -> list:
|
||
"""
|
||
处理支出记录
|
||
|
||
京东账单特点:
|
||
1. 已全额退款的记录金额显示为 "金额(已全额退款)"
|
||
2. 部分退款可能有单独的退款记录
|
||
"""
|
||
# 构建退款索引(按订单号)
|
||
order_refunds = {}
|
||
for row in refund_rows:
|
||
if len(row) >= 9:
|
||
order_no = row[8].strip() # 交易订单号
|
||
amount = parse_amount(row[3]) # 金额
|
||
if order_no:
|
||
if order_no not in order_refunds:
|
||
order_refunds[order_no] = Decimal("0")
|
||
order_refunds[order_no] += amount
|
||
print(f" 退款记录: {row[0]} | {row[1]} | {amount}元")
|
||
|
||
final_rows = []
|
||
|
||
for row in expense_rows:
|
||
if len(row) < 9:
|
||
continue
|
||
|
||
order_no = row[8].strip() # 交易订单号
|
||
amount, is_fully_refunded = self._parse_jd_amount(row[3])
|
||
|
||
# 情况1:金额已标注"已全额退款"
|
||
if is_fully_refunded:
|
||
self.stats["fully_refunded"] += 1
|
||
desc = row[2][:25] if len(row[2]) > 25 else row[2]
|
||
print(f" 全额退款删除: {row[0]} | {row[1]} | {desc}... | {row[3]}")
|
||
continue
|
||
|
||
# 情况2:检查是否有对应的退款记录
|
||
refund_amount = order_refunds.get(order_no, Decimal("0"))
|
||
if refund_amount > 0:
|
||
if refund_amount >= amount:
|
||
# 全额退款
|
||
self.stats["fully_refunded"] += 1
|
||
desc = row[2][:25] if len(row[2]) > 25 else row[2]
|
||
print(f" 全额退款删除: {row[0]} | {row[1]} | {desc}... | 原{amount}元")
|
||
else:
|
||
# 部分退款
|
||
remaining = amount - refund_amount
|
||
new_row = row.copy()
|
||
new_row[3] = format_amount(remaining)
|
||
remark = f"原金额{amount}元,退款{refund_amount}元"
|
||
|
||
final_rows.append((new_row, remark))
|
||
self.stats["partially_refunded"] += 1
|
||
print(f" 部分退款: {row[0]} | {row[1]} | 原{amount}元 -> {format_amount(remaining)}元")
|
||
else:
|
||
# 无退款,正常记录
|
||
if amount > 0:
|
||
final_rows.append((row, None))
|
||
else:
|
||
self.stats["zero_amount"] = self.stats.get("zero_amount", 0) + 1
|
||
|
||
return final_rows
|
||
|
||
def _convert_and_reclassify(self, row_tuple: tuple) -> list:
|
||
"""
|
||
转换为对齐格式并重新分类
|
||
|
||
京东原始字段:
|
||
0: 交易时间, 1: 商户名称, 2: 交易说明, 3: 金额,
|
||
4: 收/付款方式, 5: 交易状态, 6: 收/支, 7: 交易分类,
|
||
8: 交易订单号, 9: 商家订单号, 10: 备注
|
||
|
||
对齐后字段:
|
||
交易时间, 交易分类, 交易对方, 对方账号, 商品说明,
|
||
收/支, 金额, 收/付款方式, 交易状态, 交易订单号, 商家订单号, 备注, 复核等级
|
||
"""
|
||
if isinstance(row_tuple, tuple):
|
||
row, remark = row_tuple
|
||
else:
|
||
row, remark = row_tuple, None
|
||
|
||
transaction_time = row[0]
|
||
merchant = row[1] # 商户名称
|
||
product = row[2] # 交易说明
|
||
amount, _ = self._parse_jd_amount(row[3])
|
||
payment_method = row[4] if len(row) > 4 else ""
|
||
status = row[5] if len(row) > 5 else ""
|
||
income_expense = row[6] if len(row) > 6 else "支出"
|
||
original_category = row[7] if len(row) > 7 else ""
|
||
order_no = row[8] if len(row) > 8 else ""
|
||
merchant_order_no = row[9] if len(row) > 9 else ""
|
||
final_remark = remark if remark else (row[10] if len(row) > 10 else "/")
|
||
|
||
# 使用京东专属分类推断
|
||
category, is_certain, review_level = infer_jd_category(merchant, product, original_category)
|
||
|
||
# 复核等级映射: 0=空, 1=LOW, 2=HIGH
|
||
review_marks = {0: "", 1: "LOW", 2: "HIGH"}
|
||
review_mark = review_marks.get(review_level, "")
|
||
|
||
return [
|
||
transaction_time,
|
||
category,
|
||
merchant,
|
||
"/", # 对方账号(京东无此字段)
|
||
product,
|
||
income_expense,
|
||
format_amount(amount),
|
||
payment_method,
|
||
status,
|
||
order_no,
|
||
merchant_order_no,
|
||
final_remark,
|
||
review_mark
|
||
]
|
||
|
||
def reclassify(self, rows: list) -> list:
|
||
"""
|
||
重新分类京东账单
|
||
|
||
京东账单在 _convert_and_reclassify 中已完成分类
|
||
此方法为接口兼容保留
|
||
"""
|
||
return rows
|
||
|
||
def _print_expense_summary(self, expense_rows: list):
|
||
"""打印支出统计"""
|
||
total = Decimal("0")
|
||
categories = {}
|
||
|
||
for row in expense_rows:
|
||
if row[5] == "支出":
|
||
amt = Decimal(row[6])
|
||
total += amt
|
||
cat = row[1]
|
||
categories[cat] = categories.get(cat, Decimal("0")) + amt
|
||
|
||
print(f"清理后支出总额: ¥{total}")
|
||
print("\n=== 按分类统计 ===")
|
||
for cat, amt in sorted(categories.items(), key=lambda x: -x[1]):
|
||
print(f" {cat}: ¥{amt}")
|
||
|
||
|
||
def main():
|
||
"""命令行入口"""
|
||
parser = create_arg_parser("清理京东白条账单数据")
|
||
args = parser.parse_args()
|
||
|
||
from .base import compute_date_range
|
||
|
||
cleaner = JDCleaner(args.input_file, args.output_file)
|
||
start_date, end_date = compute_date_range(args)
|
||
cleaner.set_date_range(start_date, end_date)
|
||
cleaner.clean()
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|