feat: 支持京东白条账单上传和清洗

This commit is contained in:
CHE LIANG ZHAO
2026-01-26 13:44:22 +08:00
parent 7b2d6a9fbb
commit 3cf39b4664
17 changed files with 383 additions and 27 deletions

View File

@@ -4,6 +4,7 @@
from .base import BaseCleaner
from .alipay import AlipayCleaner
from .wechat import WechatCleaner
from .jd import JDCleaner
__all__ = ['BaseCleaner', 'AlipayCleaner', 'WechatCleaner']
__all__ = ['BaseCleaner', 'AlipayCleaner', 'WechatCleaner', 'JDCleaner']

313
analyzer/cleaners/jd.py Normal file
View File

@@ -0,0 +1,313 @@
"""
京东白条账单清理模块
"""
import csv
import re
from decimal import Decimal
from .base import (
BaseCleaner, parse_amount, format_amount,
is_in_date_range, create_arg_parser
)
from category import infer_category
# 与支付宝/微信对齐的表头(包含"复核等级"字段)
ALIGNED_HEADER = [
"交易时间", "交易分类", "交易对方", "对方账号", "商品说明",
"收/支", "金额", "收/付款方式", "交易状态", "交易订单号", "商家订单号", "备注", "复核等级"
]
class JDCleaner(BaseCleaner):
"""京东白条账单清理器"""
def clean(self) -> None:
"""执行清理"""
self.print_header()
# 读取数据,跳过京东导出文件的头部信息
with open(self.input_file, "r", encoding="utf-8") as f:
reader = csv.reader(f)
header = None
rows = []
for row in reader:
# 跳过空行
if not row or not row[0].strip():
continue
# 清理每个字段的 tab 字符
row = [cell.strip().replace('\t', '') for cell in row]
# 查找实际的CSV头部行包含"交易时间"和"商户名称"
if header is None:
if len(row) >= 2 and "交易时间" in row[0] and "商户名称" in row[1]:
header = row
continue
# 跳过头部信息行
continue
# 收集数据行
rows.append(row)
# 确保找到了有效的头部
if header is None:
raise ValueError("无法找到有效的京东账单表头(需包含'交易时间''商户名称'列)")
self.stats["original_count"] = len(rows)
print(f"原始数据行数: {len(rows)}")
# 第一步:按日期范围筛选
rows_filtered = [
row for row in rows
if row and is_in_date_range(row[0], self.start_date, self.end_date)
]
self.stats["filtered_count"] = len(rows_filtered)
date_desc = f"{self.start_date} ~ {self.end_date}" if self.start_date or self.end_date else "全部"
print(f"筛选后数据行数: {len(rows_filtered)} ({date_desc})")
# 第二步:分离退款和支出条目(过滤掉"不计收支"
refund_rows = []
expense_rows = []
skipped_count = 0 # 不计收支(还款、冻结等)
for row in rows_filtered:
if len(row) < 7:
continue
income_expense = row[6].strip() # 收/支 列
transaction_desc = row[2].strip() # 交易说明
status = row[5].strip() # 交易状态
# 过滤掉"不计收支"记录(还款、冻结、预授权等)
if income_expense == "不计收支":
skipped_count += 1
continue
# 退款判断:交易说明以"退款-"开头 或 状态包含"退款成功"
if transaction_desc.startswith("退款-") or "退款" in status:
refund_rows.append(row)
elif income_expense == "支出":
expense_rows.append(row)
print(f"退款条目数: {len(refund_rows)}")
print(f"支出条目数: {len(expense_rows)}")
print(f"不计收支过滤: {skipped_count} 条(还款/冻结等)")
# 第三步:处理退款
# 京东账单特点:已全额退款的记录金额会显示为 "179.00(已全额退款)"
final_expense_rows = self._process_expenses(expense_rows, refund_rows)
print(f"\n处理结果:")
print(f" 全额退款删除: {self.stats['fully_refunded']}")
print(f" 部分退款调整: {self.stats['partially_refunded']}")
if self.stats.get("zero_amount", 0) > 0:
print(f" 0元记录过滤: {self.stats['zero_amount']}")
print(f" 最终保留行数: {len(final_expense_rows)}")
# 第四步:转换为对齐格式并重新分类
aligned_rows = [self._convert_and_reclassify(row_data) for row_data in final_expense_rows]
# 按时间排序(最新在前)
aligned_rows.sort(key=lambda x: x[0], reverse=True)
# 统计复核数量
review_high_count = sum(1 for row in aligned_rows if row[-1] == "HIGH")
self.stats["final_count"] = len(aligned_rows)
if review_high_count > 0:
print(f" 高优先级复核: {review_high_count} 条(无法判断)")
# 写入文件
self.write_output(ALIGNED_HEADER, aligned_rows)
print(f"\n清理后的数据已保存到: {self.output_file}")
# 统计支出
self._print_expense_summary(aligned_rows)
def _parse_jd_amount(self, amount_str: str) -> tuple[Decimal, bool]:
"""
解析京东账单金额
京东金额格式特点:
- 普通金额: "179.00"
- 全额退款: "179.00(已全额退款)"
Returns:
(金额, 是否已全额退款)
"""
amount_str = amount_str.strip()
# 检查是否包含"已全额退款"
if "(已全额退款)" in amount_str or "(已全额退款)" in amount_str:
# 提取金额部分
amount_part = re.sub(r'[(]已全额退款[)]', '', amount_str)
return parse_amount(amount_part), True
return parse_amount(amount_str), False
def _process_expenses(self, expense_rows: list, refund_rows: list) -> list:
"""
处理支出记录
京东账单特点:
1. 已全额退款的记录金额显示为 "金额(已全额退款)"
2. 部分退款可能有单独的退款记录
"""
# 构建退款索引(按订单号)
order_refunds = {}
for row in refund_rows:
if len(row) >= 9:
order_no = row[8].strip() # 交易订单号
amount = parse_amount(row[3]) # 金额
if order_no:
if order_no not in order_refunds:
order_refunds[order_no] = Decimal("0")
order_refunds[order_no] += amount
print(f" 退款记录: {row[0]} | {row[1]} | {amount}")
final_rows = []
for row in expense_rows:
if len(row) < 9:
continue
order_no = row[8].strip() # 交易订单号
amount, is_fully_refunded = self._parse_jd_amount(row[3])
# 情况1金额已标注"已全额退款"
if is_fully_refunded:
self.stats["fully_refunded"] += 1
desc = row[2][:25] if len(row[2]) > 25 else row[2]
print(f" 全额退款删除: {row[0]} | {row[1]} | {desc}... | {row[3]}")
continue
# 情况2检查是否有对应的退款记录
refund_amount = order_refunds.get(order_no, Decimal("0"))
if refund_amount > 0:
if refund_amount >= amount:
# 全额退款
self.stats["fully_refunded"] += 1
desc = row[2][:25] if len(row[2]) > 25 else row[2]
print(f" 全额退款删除: {row[0]} | {row[1]} | {desc}... | 原{amount}")
else:
# 部分退款
remaining = amount - refund_amount
new_row = row.copy()
new_row[3] = format_amount(remaining)
remark = f"原金额{amount}元,退款{refund_amount}"
final_rows.append((new_row, remark))
self.stats["partially_refunded"] += 1
print(f" 部分退款: {row[0]} | {row[1]} | 原{amount}元 -> {format_amount(remaining)}")
else:
# 无退款,正常记录
if amount > 0:
final_rows.append((row, None))
else:
self.stats["zero_amount"] = self.stats.get("zero_amount", 0) + 1
return final_rows
def _convert_and_reclassify(self, row_tuple: tuple) -> list:
"""
转换为对齐格式并重新分类
京东原始字段:
0: 交易时间, 1: 商户名称, 2: 交易说明, 3: 金额,
4: 收/付款方式, 5: 交易状态, 6: 收/支, 7: 交易分类,
8: 交易订单号, 9: 商家订单号, 10: 备注
对齐后字段:
交易时间, 交易分类, 交易对方, 对方账号, 商品说明,
收/支, 金额, 收/付款方式, 交易状态, 交易订单号, 商家订单号, 备注, 复核等级
"""
if isinstance(row_tuple, tuple):
row, remark = row_tuple
else:
row, remark = row_tuple, None
transaction_time = row[0]
merchant = row[1] # 商户名称
product = row[2] # 交易说明
amount, _ = self._parse_jd_amount(row[3])
payment_method = row[4] if len(row) > 4 else ""
status = row[5] if len(row) > 5 else ""
income_expense = row[6] if len(row) > 6 else "支出"
original_category = row[7] if len(row) > 7 else ""
order_no = row[8] if len(row) > 8 else ""
merchant_order_no = row[9] if len(row) > 9 else ""
final_remark = remark if remark else (row[10] if len(row) > 10 else "/")
# 使用推断分类(京东原始分类相对准确,但仍可优化)
category, is_certain = infer_category(merchant, product, income_expense)
# 如果推断失败但原分类非空,使用原分类
if not is_certain and original_category and original_category not in ["其他", ""]:
category = original_category
is_certain = True # 信任京东原分类
# 复核等级: 空=无需复核, HIGH=无法判断
review_mark = "" if is_certain else "HIGH"
return [
transaction_time,
category,
merchant,
"/", # 对方账号(京东无此字段)
product,
income_expense,
format_amount(amount),
payment_method,
status,
order_no,
merchant_order_no,
final_remark,
review_mark
]
def reclassify(self, rows: list) -> list:
"""
重新分类京东账单
京东账单在 _convert_and_reclassify 中已完成分类
此方法为接口兼容保留
"""
return rows
def _print_expense_summary(self, expense_rows: list):
"""打印支出统计"""
total = Decimal("0")
categories = {}
for row in expense_rows:
if row[5] == "支出":
amt = Decimal(row[6])
total += amt
cat = row[1]
categories[cat] = categories.get(cat, Decimal("0")) + amt
print(f"清理后支出总额: ¥{total}")
print("\n=== 按分类统计 ===")
for cat, amt in sorted(categories.items(), key=lambda x: -x[1]):
print(f" {cat}: ¥{amt}")
def main():
"""命令行入口"""
parser = create_arg_parser("清理京东白条账单数据")
args = parser.parse_args()
from .base import compute_date_range
cleaner = JDCleaner(args.input_file, args.output_file)
start_date, end_date = compute_date_range(args)
cleaner.set_date_range(start_date, end_date)
cleaner.clean()
if __name__ == "__main__":
main()

View File

@@ -49,7 +49,7 @@ def detect_bill_type_from_content(content: str, filename: str = "") -> str:
从内容和文件名检测账单类型
Returns:
'alipay', 'wechat', 或 ''
'alipay', 'wechat', 'jd',''
"""
# 从文件名检测
filename_lower = filename.lower()
@@ -57,6 +57,8 @@ def detect_bill_type_from_content(content: str, filename: str = "") -> str:
return 'alipay'
if '微信' in filename or 'wechat' in filename_lower:
return 'wechat'
if '京东' in filename or 'jd' in filename_lower:
return 'jd'
# 从内容检测
# 支付宝特征: 有 "交易分类" 和 "对方账号" 列
@@ -67,6 +69,12 @@ def detect_bill_type_from_content(content: str, filename: str = "") -> str:
if '交易类型' in content and '金额(元)' in content:
return 'wechat'
# 京东特征: 有 "商户名称" 和 "交易说明" 列,或头部包含 "京东账号名"
if '商户名称' in content and '交易说明' in content:
return 'jd'
if '京东账号名' in content:
return 'jd'
return ''

View File

@@ -22,7 +22,7 @@ if sys.stdout.encoding != 'utf-8':
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
from cleaners.base import compute_date_range_from_values
from cleaners import AlipayCleaner, WechatCleaner
from cleaners import AlipayCleaner, WechatCleaner, JDCleaner
from category import infer_category, get_all_categories, get_all_income_categories
from converter import convert_bill_file
@@ -43,7 +43,7 @@ class CleanRequest(BaseModel):
start: Optional[str] = None
end: Optional[str] = None
format: Optional[str] = "csv"
bill_type: Optional[str] = "auto" # auto, alipay, wechat
bill_type: Optional[str] = "auto" # auto, alipay, wechat, jd
class CleanResponse(BaseModel):
@@ -90,7 +90,7 @@ def detect_bill_type(filepath: str) -> str | None:
检测账单类型
Returns:
'alipay' | 'wechat' | None
'alipay' | 'wechat' | 'jd' | None
"""
try:
with open(filepath, "r", encoding="utf-8") as f:
@@ -107,6 +107,14 @@ def detect_bill_type(filepath: str) -> str | None:
if "交易类型" in line and "金额(元)" in line:
return "wechat"
# 京东特征:表头包含 "商户名称" 和 "交易说明"
if "商户名称" in line and "交易说明" in line:
return "jd"
# 京东特征:头部信息包含 "京东账号名"
if "京东账号名" in line:
return "jd"
# 数据行特征
if line.startswith("202"):
if "" in line:
@@ -155,14 +163,16 @@ def do_clean(
try:
if bill_type == "alipay":
cleaner = AlipayCleaner(input_path, output_path, output_format)
elif bill_type == "jd":
cleaner = JDCleaner(input_path, output_path, output_format)
else:
cleaner = WechatCleaner(input_path, output_path, output_format)
cleaner.set_date_range(start_date, end_date)
cleaner.clean()
type_names = {"alipay": "支付宝", "wechat": "微信"}
return True, bill_type, f"{type_names[bill_type]}账单清洗完成"
type_names = {"alipay": "支付宝", "wechat": "微信", "jd": "京东白条"}
return True, bill_type, f"{type_names.get(bill_type, bill_type)}账单清洗完成"
except Exception as e:
return False, bill_type, f"清洗失败: {str(e)}"
@@ -324,7 +334,7 @@ async def detect_bill_type_api(file: UploadFile = File(...)):
"""
检测账单类型
上传文件后自动检测是支付宝还是微信账单
上传文件后自动检测是支付宝、微信还是京东账单
"""
suffix = Path(file.filename).suffix or ".csv"
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
@@ -336,10 +346,10 @@ async def detect_bill_type_api(file: UploadFile = File(...)):
if bill_type is None:
raise HTTPException(status_code=400, detail="无法识别账单类型")
type_names = {"alipay": "支付宝", "wechat": "微信"}
type_names = {"alipay": "支付宝", "wechat": "微信", "jd": "京东白条"}
return {
"bill_type": bill_type,
"display_name": type_names[bill_type]
"display_name": type_names.get(bill_type, bill_type)
}
finally:
if os.path.exists(tmp_path):