Files
billai/analyzer/cleaners/jd.py
2026-01-26 14:25:39 +08:00

371 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
京东白条账单清理模块
"""
import csv
import re
from decimal import Decimal
from pathlib import Path
import yaml
from .base import (
BaseCleaner, parse_amount, format_amount,
is_in_date_range, create_arg_parser
)
from category import infer_category
# 加载京东专属分类配置
JD_CONFIG_FILE = Path(__file__).parent.parent / "config" / "category_jd.yaml"
def load_jd_config():
"""加载京东分类配置"""
with open(JD_CONFIG_FILE, "r", encoding="utf-8") as f:
return yaml.safe_load(f)
_jd_config = load_jd_config()
def infer_jd_category(merchant: str, product: str, original_category: str) -> tuple[str, bool, int]:
"""
根据京东账单的商户名称、商品说明和原分类推断统一分类
Args:
merchant: 商户名称(如"京东外卖""京东平台商户"
product: 交易说明/商品说明
original_category: 京东原始分类(如"食品酒饮""数码电器"
Returns:
(分类名称, 是否确定, 复核等级)
复核等级:
0 = 无需复核(商户映射或原分类映射成功,高置信度)
1 = 低优先级复核(通用关键词匹配成功,需确认)
2 = 高优先级复核(全部匹配失败或未知分类,需人工分类)
"""
# 1. 先检查商户名称直接映射(如"京东外卖" -> "餐饮美食"
merchant_mapping = _jd_config.get("商户映射", {})
for merchant_key, category in merchant_mapping.items():
if merchant_key in merchant:
return category, True, 0 # 商户映射,无需复核
# 2. 尝试直接映射京东原分类
category_mapping = _jd_config.get("分类映射", {})
# 处理多分类情况(如"食品酒饮 其他网购"
original_cats = original_category.split() if original_category else []
for orig_cat in original_cats:
if orig_cat in category_mapping:
mapped = category_mapping[orig_cat]
if mapped: # 非空映射 → 使用映射结果
return mapped, True, 0 # 原分类映射,无需复核
# 空映射(如"其他"→"")→ 继续检查下一个原分类或进入关键词匹配
else:
# 未知分类(不在映射表中)→ 保留原分类HIGH 复核
return orig_cat, True, 2
# 3. 使用通用分类推断(已包含京东平台商户关键词)
category, is_certain = infer_category(merchant, product, "支出")
if is_certain:
return category, True, 1 # 关键词匹配,低优先级复核
# 4. 返回默认分类
return _jd_config.get("默认分类", "其他支出"), False, 2 # 全部失败,高优先级复核
# 与支付宝/微信对齐的表头(包含"复核等级"字段)
ALIGNED_HEADER = [
"交易时间", "交易分类", "交易对方", "对方账号", "商品说明",
"收/支", "金额", "收/付款方式", "交易状态", "交易订单号", "商家订单号", "备注", "复核等级"
]
class JDCleaner(BaseCleaner):
"""京东白条账单清理器"""
def clean(self) -> None:
"""执行清理"""
self.print_header()
# 读取数据,跳过京东导出文件的头部信息
with open(self.input_file, "r", encoding="utf-8") as f:
reader = csv.reader(f)
header = None
rows = []
for row in reader:
# 跳过空行
if not row or not row[0].strip():
continue
# 清理每个字段的 tab 字符
row = [cell.strip().replace('\t', '') for cell in row]
# 查找实际的CSV头部行包含"交易时间"和"商户名称"
if header is None:
if len(row) >= 2 and "交易时间" in row[0] and "商户名称" in row[1]:
header = row
continue
# 跳过头部信息行
continue
# 收集数据行
rows.append(row)
# 确保找到了有效的头部
if header is None:
raise ValueError("无法找到有效的京东账单表头(需包含'交易时间''商户名称'列)")
self.stats["original_count"] = len(rows)
print(f"原始数据行数: {len(rows)}")
# 第一步:按日期范围筛选
rows_filtered = [
row for row in rows
if row and is_in_date_range(row[0], self.start_date, self.end_date)
]
self.stats["filtered_count"] = len(rows_filtered)
date_desc = f"{self.start_date} ~ {self.end_date}" if self.start_date or self.end_date else "全部"
print(f"筛选后数据行数: {len(rows_filtered)} ({date_desc})")
# 第二步:分离退款和支出条目(过滤掉"不计收支"
refund_rows = []
expense_rows = []
skipped_count = 0 # 不计收支(还款、冻结等)
for row in rows_filtered:
if len(row) < 7:
continue
income_expense = row[6].strip() # 收/支 列
transaction_desc = row[2].strip() # 交易说明
status = row[5].strip() # 交易状态
# 过滤掉"不计收支"记录(还款、冻结、预授权等)
if income_expense == "不计收支":
skipped_count += 1
continue
# 退款判断:交易说明以"退款-"开头 或 状态包含"退款成功"
if transaction_desc.startswith("退款-") or "退款" in status:
refund_rows.append(row)
elif income_expense == "支出":
expense_rows.append(row)
print(f"退款条目数: {len(refund_rows)}")
print(f"支出条目数: {len(expense_rows)}")
print(f"不计收支过滤: {skipped_count} 条(还款/冻结等)")
# 第三步:处理退款
# 京东账单特点:已全额退款的记录金额会显示为 "179.00(已全额退款)"
final_expense_rows = self._process_expenses(expense_rows, refund_rows)
print(f"\n处理结果:")
print(f" 全额退款删除: {self.stats['fully_refunded']}")
print(f" 部分退款调整: {self.stats['partially_refunded']}")
if self.stats.get("zero_amount", 0) > 0:
print(f" 0元记录过滤: {self.stats['zero_amount']}")
print(f" 最终保留行数: {len(final_expense_rows)}")
# 第四步:转换为对齐格式并重新分类
aligned_rows = [self._convert_and_reclassify(row_data) for row_data in final_expense_rows]
# 按时间排序(最新在前)
aligned_rows.sort(key=lambda x: x[0], reverse=True)
# 统计复核数量
review_high_count = sum(1 for row in aligned_rows if row[-1] == "HIGH")
self.stats["final_count"] = len(aligned_rows)
if review_high_count > 0:
print(f" 高优先级复核: {review_high_count} 条(无法判断)")
# 写入文件
self.write_output(ALIGNED_HEADER, aligned_rows)
print(f"\n清理后的数据已保存到: {self.output_file}")
# 统计支出
self._print_expense_summary(aligned_rows)
def _parse_jd_amount(self, amount_str: str) -> tuple[Decimal, bool]:
"""
解析京东账单金额
京东金额格式特点:
- 普通金额: "179.00"
- 全额退款: "179.00(已全额退款)"
Returns:
(金额, 是否已全额退款)
"""
amount_str = amount_str.strip()
# 检查是否包含"已全额退款"
if "(已全额退款)" in amount_str or "(已全额退款)" in amount_str:
# 提取金额部分
amount_part = re.sub(r'[(]已全额退款[)]', '', amount_str)
return parse_amount(amount_part), True
return parse_amount(amount_str), False
def _process_expenses(self, expense_rows: list, refund_rows: list) -> list:
"""
处理支出记录
京东账单特点:
1. 已全额退款的记录金额显示为 "金额(已全额退款)"
2. 部分退款可能有单独的退款记录
"""
# 构建退款索引(按订单号)
order_refunds = {}
for row in refund_rows:
if len(row) >= 9:
order_no = row[8].strip() # 交易订单号
amount = parse_amount(row[3]) # 金额
if order_no:
if order_no not in order_refunds:
order_refunds[order_no] = Decimal("0")
order_refunds[order_no] += amount
print(f" 退款记录: {row[0]} | {row[1]} | {amount}")
final_rows = []
for row in expense_rows:
if len(row) < 9:
continue
order_no = row[8].strip() # 交易订单号
amount, is_fully_refunded = self._parse_jd_amount(row[3])
# 情况1金额已标注"已全额退款"
if is_fully_refunded:
self.stats["fully_refunded"] += 1
desc = row[2][:25] if len(row[2]) > 25 else row[2]
print(f" 全额退款删除: {row[0]} | {row[1]} | {desc}... | {row[3]}")
continue
# 情况2检查是否有对应的退款记录
refund_amount = order_refunds.get(order_no, Decimal("0"))
if refund_amount > 0:
if refund_amount >= amount:
# 全额退款
self.stats["fully_refunded"] += 1
desc = row[2][:25] if len(row[2]) > 25 else row[2]
print(f" 全额退款删除: {row[0]} | {row[1]} | {desc}... | 原{amount}")
else:
# 部分退款
remaining = amount - refund_amount
new_row = row.copy()
new_row[3] = format_amount(remaining)
remark = f"原金额{amount}元,退款{refund_amount}"
final_rows.append((new_row, remark))
self.stats["partially_refunded"] += 1
print(f" 部分退款: {row[0]} | {row[1]} | 原{amount}元 -> {format_amount(remaining)}")
else:
# 无退款,正常记录
if amount > 0:
final_rows.append((row, None))
else:
self.stats["zero_amount"] = self.stats.get("zero_amount", 0) + 1
return final_rows
def _convert_and_reclassify(self, row_tuple: tuple) -> list:
"""
转换为对齐格式并重新分类
京东原始字段:
0: 交易时间, 1: 商户名称, 2: 交易说明, 3: 金额,
4: 收/付款方式, 5: 交易状态, 6: 收/支, 7: 交易分类,
8: 交易订单号, 9: 商家订单号, 10: 备注
对齐后字段:
交易时间, 交易分类, 交易对方, 对方账号, 商品说明,
收/支, 金额, 收/付款方式, 交易状态, 交易订单号, 商家订单号, 备注, 复核等级
"""
if isinstance(row_tuple, tuple):
row, remark = row_tuple
else:
row, remark = row_tuple, None
transaction_time = row[0]
merchant = row[1] # 商户名称
product = row[2] # 交易说明
amount, _ = self._parse_jd_amount(row[3])
payment_method = row[4] if len(row) > 4 else ""
status = row[5] if len(row) > 5 else ""
income_expense = row[6] if len(row) > 6 else "支出"
original_category = row[7] if len(row) > 7 else ""
order_no = row[8] if len(row) > 8 else ""
merchant_order_no = row[9] if len(row) > 9 else ""
final_remark = remark if remark else (row[10] if len(row) > 10 else "/")
# 使用京东专属分类推断
category, is_certain, review_level = infer_jd_category(merchant, product, original_category)
# 复核等级映射: 0=空, 1=LOW, 2=HIGH
review_marks = {0: "", 1: "LOW", 2: "HIGH"}
review_mark = review_marks.get(review_level, "")
return [
transaction_time,
category,
merchant,
"/", # 对方账号(京东无此字段)
product,
income_expense,
format_amount(amount),
payment_method,
status,
order_no,
merchant_order_no,
final_remark,
review_mark
]
def reclassify(self, rows: list) -> list:
"""
重新分类京东账单
京东账单在 _convert_and_reclassify 中已完成分类
此方法为接口兼容保留
"""
return rows
def _print_expense_summary(self, expense_rows: list):
"""打印支出统计"""
total = Decimal("0")
categories = {}
for row in expense_rows:
if row[5] == "支出":
amt = Decimal(row[6])
total += amt
cat = row[1]
categories[cat] = categories.get(cat, Decimal("0")) + amt
print(f"清理后支出总额: ¥{total}")
print("\n=== 按分类统计 ===")
for cat, amt in sorted(categories.items(), key=lambda x: -x[1]):
print(f" {cat}: ¥{amt}")
def main():
"""命令行入口"""
parser = create_arg_parser("清理京东白条账单数据")
args = parser.parse_args()
from .base import compute_date_range
cleaner = JDCleaner(args.input_file, args.output_file)
start_date, end_date = compute_date_range(args)
cleaner.set_date_range(start_date, end_date)
cleaner.clean()
if __name__ == "__main__":
main()