"""Cleaning module for JD Baitiao (JD.com credit) bill exports."""

import csv
import re
from decimal import Decimal

from .base import (
    BaseCleaner,
    parse_amount,
    format_amount,
    is_in_date_range,
    create_arg_parser,
)
from category import infer_category

# Output header aligned with the Alipay/WeChat cleaners (includes the
# trailing review-level column).
ALIGNED_HEADER = [
    "交易时间", "交易分类", "交易对方", "对方账号", "商品说明",
    "收/支", "金额", "收/付款方式", "交易状态", "交易订单号",
    "商家订单号", "备注", "复核等级"
]

# Marker appended to an amount when the order was fully refunded, e.g.
# "179.00(已全额退款)". Both ASCII parentheses and fullwidth parentheses
# (U+FF08 / U+FF09) are accepted; the fullwidth forms are written as escapes
# so they cannot be silently normalised to ASCII by an editor or tool.
_FULL_REFUND_MARK = re.compile(r"[(\uff08]已全额退款[)\uff09]")


class JDCleaner(BaseCleaner):
    """Cleaner for JD Baitiao bill CSV exports."""

    def clean(self) -> None:
        """Run the full pipeline: read, filter, net refunds, convert, write.

        Raises:
            ValueError: if no header row is found in the input file.
        """
        self.print_header()

        rows = self._read_jd_rows()
        self.stats["original_count"] = len(rows)
        print(f"原始数据行数: {len(rows)}")

        # Step 1: keep only rows whose transaction time is in range.
        rows_filtered = [
            row for row in rows
            if row and is_in_date_range(row[0], self.start_date, self.end_date)
        ]
        self.stats["filtered_count"] = len(rows_filtered)
        date_desc = (
            f"{self.start_date} ~ {self.end_date}"
            if self.start_date or self.end_date
            else "全部"
        )
        print(f"筛选后数据行数: {len(rows_filtered)} ({date_desc})")

        # Step 2: separate refunds from expenses, dropping neutral rows.
        refund_rows, expense_rows, skipped_count = self._split_jd_rows(rows_filtered)
        print(f"退款条目数: {len(refund_rows)}")
        print(f"支出条目数: {len(expense_rows)}")
        print(f"不计收支过滤: {skipped_count} 条(还款/冻结等)")

        # Step 3: net refunds against expenses.
        final_expense_rows = self._process_expenses(expense_rows, refund_rows)
        print("\n处理结果:")
        print(f" 全额退款删除: {self.stats['fully_refunded']} 条")
        print(f" 部分退款调整: {self.stats['partially_refunded']} 条")
        if self.stats.get("zero_amount", 0) > 0:
            print(f" 0元记录过滤: {self.stats['zero_amount']} 条")
        print(f" 最终保留行数: {len(final_expense_rows)}")

        # Step 4: convert to the aligned layout and re-infer categories.
        aligned_rows = [
            self._convert_and_reclassify(row_data)
            for row_data in final_expense_rows
        ]

        # Newest first; relies on the time column sorting correctly as text.
        aligned_rows.sort(key=lambda x: x[0], reverse=True)

        # Rows whose category could not be determined are flagged "HIGH".
        review_high_count = sum(1 for row in aligned_rows if row[-1] == "HIGH")
        self.stats["final_count"] = len(aligned_rows)
        if review_high_count > 0:
            print(f" 高优先级复核: {review_high_count} 条(无法判断)")

        self.write_output(ALIGNED_HEADER, aligned_rows)
        print(f"\n清理后的数据已保存到: {self.output_file}")

        self._print_expense_summary(aligned_rows)

    def _read_jd_rows(self) -> list:
        """Read the input CSV and return the data rows after the header row.

        Everything before the real header (the row whose first two cells
        contain "交易时间" and "商户名称") is treated as export preamble and
        skipped. Each cell is stripped and has tab characters removed.

        Raises:
            ValueError: if no header row is found.
        """
        rows = []
        header_found = False
        # newline="" is what the csv module documents for file objects, so
        # that line breaks inside quoted fields are handled by the reader.
        with open(self.input_file, "r", encoding="utf-8", newline="") as f:
            for row in csv.reader(f):
                # Skip blank rows.
                if not row or not row[0].strip():
                    continue
                row = [cell.strip().replace("\t", "") for cell in row]
                if header_found:
                    rows.append(row)
                elif (
                    len(row) >= 2
                    and "交易时间" in row[0]
                    and "商户名称" in row[1]
                ):
                    header_found = True
                # Any other row before the header is preamble: ignore it.

        if not header_found:
            raise ValueError("无法找到有效的京东账单表头(需包含'交易时间'和'商户名称'列)")
        return rows

    def _split_jd_rows(self, rows: list) -> tuple[list, list, int]:
        """Split rows into refunds and expenses, counting neutral rows.

        A row is a refund when its description (column 2) starts with
        "退款-" or its status (column 5) contains "退款". Rows whose
        income/expense column (column 6) is "不计收支" are counted and
        dropped. Rows shorter than 7 cells, and rows that are neither a
        refund nor marked "支出", are dropped without being counted.

        Returns:
            (refund_rows, expense_rows, skipped_count)
        """
        refund_rows = []
        expense_rows = []
        skipped_count = 0
        for row in rows:
            if len(row) < 7:
                continue
            transaction_desc = row[2].strip()
            status = row[5].strip()
            income_expense = row[6].strip()

            if income_expense == "不计收支":
                skipped_count += 1
                continue

            if transaction_desc.startswith("退款-") or "退款" in status:
                refund_rows.append(row)
            elif income_expense == "支出":
                expense_rows.append(row)
        return refund_rows, expense_rows, skipped_count

    def _parse_jd_amount(self, amount_str: str) -> tuple[Decimal, bool]:
        """Parse a JD amount cell.

        JD writes a plain amount as "179.00" and a fully refunded one as
        "179.00(已全额退款)". The marker may use ASCII or fullwidth
        parentheses.

        Returns:
            (amount, fully_refunded) where amount is what ``parse_amount``
            returns for the cell with the marker removed.
        """
        amount_str = amount_str.strip()
        without_mark, hits = _FULL_REFUND_MARK.subn("", amount_str)
        if hits:
            return parse_amount(without_mark), True
        return parse_amount(amount_str), False

    def _process_expenses(self, expense_rows: list, refund_rows: list) -> list:
        """Net refunds against expenses and drop refunded/zero rows.

        Updates ``self.stats`` counters "fully_refunded",
        "partially_refunded" and "zero_amount".

        Args:
            expense_rows: raw expense rows (JD column layout).
            refund_rows: raw refund rows (JD column layout).

        Returns:
            A list of ``(row, remark)`` tuples. ``remark`` is a note about
            the partial refund, or ``None`` for untouched rows.
        """
        # Total refunded amount per order number (column 8).
        order_refunds = {}
        for row in refund_rows:
            if len(row) < 9:
                continue
            order_no = row[8].strip()
            # Same parser as for expense rows, so a refund row that carries
            # the full-refund marker is handled the same way.
            amount, _ = self._parse_jd_amount(row[3])
            if order_no:
                order_refunds[order_no] = (
                    order_refunds.get(order_no, Decimal("0")) + amount
                )
                print(f" 退款记录: {row[0]} | {row[1]} | {amount}元")

        final_rows = []
        for row in expense_rows:
            if len(row) < 9:
                continue
            order_no = row[8].strip()
            amount, is_fully_refunded = self._parse_jd_amount(row[3])

            # Case 1: the amount cell itself says "fully refunded".
            if is_fully_refunded:
                self.stats["fully_refunded"] += 1
                desc = row[2][:25]
                print(f" 全额退款删除: {row[0]} | {row[1]} | {desc}... | {row[3]}")
                continue

            # Case 2: separate refund record(s) exist for this order.
            # NOTE(review): the order's refund total is applied to every
            # expense row sharing that order number - confirm that order
            # numbers are unique per expense row.
            refund_amount = order_refunds.get(order_no, Decimal("0"))
            if refund_amount > 0:
                if refund_amount >= amount:
                    self.stats["fully_refunded"] += 1
                    desc = row[2][:25]
                    print(f" 全额退款删除: {row[0]} | {row[1]} | {desc}... | 原{amount}元")
                else:
                    remaining = amount - refund_amount
                    new_row = row.copy()
                    new_row[3] = format_amount(remaining)
                    remark = f"原金额{amount}元,退款{refund_amount}元"
                    final_rows.append((new_row, remark))
                    self.stats["partially_refunded"] += 1
                    print(f" 部分退款: {row[0]} | {row[1]} | 原{amount}元 -> {format_amount(remaining)}元")
            elif amount > 0:
                # No refund: keep the row as is.
                final_rows.append((row, None))
            else:
                # Non-positive amounts are filtered out and counted.
                self.stats["zero_amount"] = self.stats.get("zero_amount", 0) + 1

        return final_rows

    def _convert_and_reclassify(self, row_tuple: tuple) -> list:
        """Convert one JD row to the aligned layout and infer its category.

        JD source columns:
            0 transaction time, 1 merchant name, 2 description, 3 amount,
            4 payment method, 5 status, 6 income/expense, 7 category,
            8 order number, 9 merchant order number, 10 remark.

        Args:
            row_tuple: ``(row, remark)`` as produced by
                ``_process_expenses``; a bare row is also accepted.

        Returns:
            A list in ``ALIGNED_HEADER`` order. The last element is "" when
            the category is trusted and "HIGH" when it needs manual review.
        """
        if isinstance(row_tuple, tuple):
            row, remark = row_tuple
        else:
            row, remark = row_tuple, None

        transaction_time = row[0]
        merchant = row[1]
        product = row[2]
        amount, _ = self._parse_jd_amount(row[3])
        payment_method = row[4] if len(row) > 4 else ""
        status = row[5] if len(row) > 5 else ""
        income_expense = row[6] if len(row) > 6 else "支出"
        original_category = row[7] if len(row) > 7 else ""
        order_no = row[8] if len(row) > 8 else ""
        merchant_order_no = row[9] if len(row) > 9 else ""
        # A partial-refund remark takes precedence over the source remark.
        final_remark = remark if remark else (row[10] if len(row) > 10 else "/")

        category, is_certain = infer_category(merchant, product, income_expense)

        # When inference is not certain, fall back to JD's own category
        # unless it is empty or the catch-all "其他", and treat it as trusted.
        if not is_certain and original_category and original_category != "其他":
            category = original_category
            is_certain = True

        review_mark = "" if is_certain else "HIGH"

        return [
            transaction_time,
            category,
            merchant,
            "/",  # counterparty account: no such column in the JD layout
            product,
            income_expense,
            format_amount(amount),
            payment_method,
            status,
            order_no,
            merchant_order_no,
            final_remark,
            review_mark,
        ]

    def reclassify(self, rows: list) -> list:
        """Return ``rows`` unchanged.

        Classification already happens in ``_convert_and_reclassify``; this
        method is kept for interface compatibility.
        """
        return rows

    def _print_expense_summary(self, expense_rows: list):
        """Print the expense total and a per-category breakdown.

        Args:
            expense_rows: rows in ``ALIGNED_HEADER`` order; only those whose
                income/expense column equals "支出" are counted.
        """
        total = Decimal("0")
        categories = {}
        for row in expense_rows:
            if row[5] == "支出":
                amt = Decimal(row[6])
                total += amt
                cat = row[1]
                categories[cat] = categories.get(cat, Decimal("0")) + amt

        print(f"清理后支出总额: ¥{total}")
        print("\n=== 按分类统计 ===")
        # Largest category first.
        for cat, amt in sorted(categories.items(), key=lambda x: -x[1]):
            print(f" {cat}: ¥{amt}")


def main():
    """Command-line entry point."""
    parser = create_arg_parser("清理京东白条账单数据")
    args = parser.parse_args()

    from .base import compute_date_range

    cleaner = JDCleaner(args.input_file, args.output_file)
    start_date, end_date = compute_date_range(args)
    cleaner.set_date_range(start_date, end_date)
    cleaner.clean()


if __name__ == "__main__":
    main()