feat: 支持京东白条账单上传和清洗
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
from .base import BaseCleaner
|
||||
from .alipay import AlipayCleaner
|
||||
from .wechat import WechatCleaner
|
||||
from .jd import JDCleaner
|
||||
|
||||
__all__ = ['BaseCleaner', 'AlipayCleaner', 'WechatCleaner']
|
||||
__all__ = ['BaseCleaner', 'AlipayCleaner', 'WechatCleaner', 'JDCleaner']
|
||||
|
||||
|
||||
313
analyzer/cleaners/jd.py
Normal file
313
analyzer/cleaners/jd.py
Normal file
@@ -0,0 +1,313 @@
|
||||
"""
|
||||
京东白条账单清理模块
|
||||
"""
|
||||
import csv
|
||||
import re
|
||||
from decimal import Decimal
|
||||
|
||||
from .base import (
|
||||
BaseCleaner, parse_amount, format_amount,
|
||||
is_in_date_range, create_arg_parser
|
||||
)
|
||||
from category import infer_category
|
||||
|
||||
|
||||
# Output header aligned with the Alipay/WeChat cleaners; the last column
# carries the review level ("复核等级").
ALIGNED_HEADER = [
    "交易时间",
    "交易分类",
    "交易对方",
    "对方账号",
    "商品说明",
    "收/支",
    "金额",
    "收/付款方式",
    "交易状态",
    "交易订单号",
    "商家订单号",
    "备注",
    "复核等级",
]
|
||||
|
||||
|
||||
class JDCleaner(BaseCleaner):
    """Cleaner for JD Baitiao bill exports.

    Reads the raw CSV export, keeps the rows inside the configured date
    range, drops or adjusts refunded purchases, converts the surviving
    expense rows to the ``ALIGNED_HEADER`` column layout and writes them
    through ``write_output``.

    Raw JD columns (by index):
        0 transaction time, 1 merchant name, 2 transaction description,
        3 amount, 4 payment method, 5 status, 6 income/expense,
        7 category, 8 order number, 9 merchant order number, 10 remark.
    """

    # Marker JD appends to the amount cell of a fully refunded purchase,
    # e.g. "179.00(已全额退款)". Both half-width and full-width parentheses
    # are accepted; the full-width ones are written as escapes so they cannot
    # be silently normalised to ASCII by an editor or viewer.
    _FULL_REFUND_MARK = re.compile("[(\uFF08]已全额退款[)\uFF09]")

    def clean(self) -> None:
        """Run the whole cleaning pipeline and write the output file.

        Raises:
            ValueError: if the input file contains no JD header row.
        """
        self.print_header()

        rows = self._read_rows()
        self.stats["original_count"] = len(rows)
        print(f"原始数据行数: {len(rows)}")

        # Step 1: keep only rows whose transaction time is in range.
        rows_filtered = [
            row for row in rows
            if is_in_date_range(row[0], self.start_date, self.end_date)
        ]
        self.stats["filtered_count"] = len(rows_filtered)

        if self.start_date or self.end_date:
            date_desc = f"{self.start_date} ~ {self.end_date}"
        else:
            date_desc = "全部"
        print(f"筛选后数据行数: {len(rows_filtered)} ({date_desc})")

        # Step 2: separate refund rows from expense rows.
        refund_rows, expense_rows, skipped_count = self._split_rows(rows_filtered)
        print(f"退款条目数: {len(refund_rows)}")
        print(f"支出条目数: {len(expense_rows)}")
        print(f"不计收支过滤: {skipped_count} 条(还款/冻结等)")

        # Step 3: apply refunds to the expense rows.
        final_expense_rows = self._process_expenses(expense_rows, refund_rows)

        print("\n处理结果:")
        print(f" 全额退款删除: {self.stats['fully_refunded']} 条")
        print(f" 部分退款调整: {self.stats['partially_refunded']} 条")
        if self.stats.get("zero_amount", 0) > 0:
            print(f" 0元记录过滤: {self.stats['zero_amount']} 条")
        print(f" 最终保留行数: {len(final_expense_rows)}")

        # Step 4: convert to the aligned layout and (re)classify.
        aligned_rows = [
            self._convert_and_reclassify(row_data)
            for row_data in final_expense_rows
        ]

        # Newest first; the transaction time is compared as a string.
        aligned_rows.sort(key=lambda x: x[0], reverse=True)

        review_high_count = sum(1 for row in aligned_rows if row[-1] == "HIGH")

        self.stats["final_count"] = len(aligned_rows)
        if review_high_count > 0:
            print(f" 高优先级复核: {review_high_count} 条(无法判断)")

        self.write_output(ALIGNED_HEADER, aligned_rows)
        print(f"\n清理后的数据已保存到: {self.output_file}")

        self._print_expense_summary(aligned_rows)

    def _read_rows(self) -> list:
        """Read the data rows that follow the real CSV header.

        JD exports start with free-form information lines; everything up to
        and including the header row (first cell contains "交易时间", second
        cell contains "商户名称") is skipped. Cells are stripped and have
        tab characters removed.

        Returns:
            The data rows as lists of cleaned strings.

        Raises:
            ValueError: if no header row is found.
        """
        header = None
        rows = []

        # newline="" is what the csv module asks for so that it can handle
        # line endings (including ones inside quoted fields) itself.
        with open(self.input_file, "r", encoding="utf-8", newline="") as f:
            for raw in csv.reader(f):
                # Skip blank lines and lines whose first cell is empty.
                if not raw or not raw[0].strip():
                    continue

                row = [cell.strip().replace('\t', '') for cell in raw]

                if header is None:
                    if len(row) >= 2 and "交易时间" in row[0] and "商户名称" in row[1]:
                        header = row
                    # Header row or preamble line: never a data row.
                    continue

                rows.append(row)

        if header is None:
            raise ValueError("无法找到有效的京东账单表头(需包含'交易时间'和'商户名称'列)")

        return rows

    def _split_rows(self, rows: list) -> tuple[list, list, int]:
        """Split rows into refunds and expenses.

        Rows with fewer than 7 columns are ignored. Rows that are neither a
        refund nor marked "支出" are dropped without being counted.

        Returns:
            ``(refund_rows, expense_rows, skipped_count)`` where
            ``skipped_count`` is the number of "不计收支" rows discarded.
        """
        refund_rows = []
        expense_rows = []
        skipped_count = 0

        for row in rows:
            if len(row) < 7:
                continue

            income_expense = row[6].strip()
            transaction_desc = row[2].strip()
            status = row[5].strip()

            # NOTE(review): this check runs before refund detection, so a
            # refund row exported with 收/支 = "不计收支" would be discarded
            # here and never applied - confirm against real JD exports.
            if income_expense == "不计收支":
                skipped_count += 1
                continue

            # A refund is recognised by a "退款-" description prefix or by
            # "退款" appearing in the status.
            if transaction_desc.startswith("退款-") or "退款" in status:
                refund_rows.append(row)
            elif income_expense == "支出":
                expense_rows.append(row)

        return refund_rows, expense_rows, skipped_count

    def _parse_jd_amount(self, amount_str: str) -> tuple[Decimal, bool]:
        """Parse a JD amount cell.

        JD writes a plain amount ("179.00") or, for a fully refunded
        purchase, the amount followed by a marker ("179.00(已全额退款)").
        The marker is recognised with half-width or full-width parentheses.

        Returns:
            ``(amount, is_fully_refunded)``.
        """
        cleaned, hits = self._FULL_REFUND_MARK.subn("", amount_str.strip())
        return parse_amount(cleaned), hits > 0

    def _process_expenses(self, expense_rows: list, refund_rows: list) -> list:
        """Apply refunds to expense rows.

        Two refund sources are handled:
          1. the amount cell itself carries the "fully refunded" marker;
          2. separate refund rows, matched to expenses by order number
             (column 8) and summed per order.

        Expense rows with fewer than 9 columns are ignored.

        Returns:
            A list of ``(row, remark)`` tuples; ``remark`` is ``None`` for
            untouched rows and describes the adjustment for partially
            refunded rows (whose amount cell is replaced by the remainder).
        """
        # Total refunded amount per order number.
        order_refunds = {}
        for row in refund_rows:
            if len(row) < 9:
                continue
            order_no = row[8].strip()
            amount = parse_amount(row[3])
            if order_no:
                order_refunds[order_no] = order_refunds.get(order_no, Decimal("0")) + amount
                print(f" 退款记录: {row[0]} | {row[1]} | {amount}元")

        final_rows = []

        for row in expense_rows:
            if len(row) < 9:
                continue

            order_no = row[8].strip()
            amount, is_fully_refunded = self._parse_jd_amount(row[3])
            desc = row[2][:25]  # shortened description for log output

            # Case 1: the amount cell says the purchase was fully refunded.
            if is_fully_refunded:
                self.stats["fully_refunded"] += 1
                print(f" 全额退款删除: {row[0]} | {row[1]} | {desc}... | {row[3]}")
                continue

            # Case 2: look for refund rows with the same order number.
            refund_amount = order_refunds.get(order_no, Decimal("0"))

            if refund_amount <= 0:
                # No refund: keep positive amounts, count zero-amount rows.
                if amount > 0:
                    final_rows.append((row, None))
                else:
                    self.stats["zero_amount"] = self.stats.get("zero_amount", 0) + 1
                continue

            if refund_amount >= amount:
                # Refunds cover the whole purchase: drop it.
                self.stats["fully_refunded"] += 1
                print(f" 全额退款删除: {row[0]} | {row[1]} | {desc}... | 原{amount}元")
                continue

            # Partial refund: keep the row with the remaining amount.
            remaining = amount - refund_amount
            new_row = row.copy()
            new_row[3] = format_amount(remaining)
            remark = f"原金额{amount}元,退款{refund_amount}元"

            final_rows.append((new_row, remark))
            self.stats["partially_refunded"] += 1
            print(f" 部分退款: {row[0]} | {row[1]} | 原{amount}元 -> {format_amount(remaining)}元")

        return final_rows

    def _convert_and_reclassify(self, row_tuple: tuple) -> list:
        """Convert one raw row to the aligned layout and classify it.

        Args:
            row_tuple: either a ``(row, remark)`` tuple as produced by
                ``_process_expenses`` or a bare row list.

        Returns:
            A 13-element list in ``ALIGNED_HEADER`` order. The last element
            is "" when the category is considered certain and "HIGH" when
            the row needs manual review.
        """
        if isinstance(row_tuple, tuple):
            row, remark = row_tuple
        else:
            row, remark = row_tuple, None

        def cell(index: int, default: str = "") -> str:
            # Optional trailing columns may be missing from short rows.
            return row[index] if len(row) > index else default

        transaction_time = row[0]
        merchant = row[1]
        product = row[2]
        amount, _ = self._parse_jd_amount(row[3])
        payment_method = cell(4)
        status = cell(5)
        income_expense = cell(6, "支出")
        original_category = cell(7)
        order_no = cell(8)
        merchant_order_no = cell(9)
        # A refund remark takes precedence over the exported remark.
        final_remark = remark if remark else cell(10, "/")

        category, is_certain = infer_category(merchant, product, income_expense)

        # When inference is unsure, fall back to JD's own category unless it
        # is empty or the catch-all "其他"; that category is then trusted.
        if not is_certain and original_category and original_category != "其他":
            category = original_category
            is_certain = True

        review_mark = "" if is_certain else "HIGH"

        return [
            transaction_time,
            category,
            merchant,
            "/",  # counterparty account: JD exports have no such column
            product,
            income_expense,
            format_amount(amount),
            payment_method,
            status,
            order_no,
            merchant_order_no,
            final_remark,
            review_mark,
        ]

    def reclassify(self, rows: list) -> list:
        """Return ``rows`` unchanged.

        Classification already happens in ``_convert_and_reclassify``; this
        method is kept for interface compatibility.
        """
        return rows

    def _print_expense_summary(self, expense_rows: list) -> None:
        """Print the expense total and a per-category breakdown.

        Args:
            expense_rows: rows in ``ALIGNED_HEADER`` order; only rows whose
                income/expense column (index 5) is "支出" are counted.
        """
        total = Decimal("0")
        categories = {}

        for row in expense_rows:
            if row[5] != "支出":
                continue
            amt = Decimal(row[6])
            total += amt
            categories[row[1]] = categories.get(row[1], Decimal("0")) + amt

        print(f"清理后支出总额: ¥{total}")
        print("\n=== 按分类统计 ===")
        # Largest category first.
        for cat, amt in sorted(categories.items(), key=lambda x: -x[1]):
            print(f" {cat}: ¥{amt}")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse arguments and clean one JD bill file."""
    cli_args = create_arg_parser("清理京东白条账单数据").parse_args()

    # Imported here, as in the original, rather than at module level.
    from .base import compute_date_range

    jd_cleaner = JDCleaner(cli_args.input_file, cli_args.output_file)
    range_start, range_end = compute_date_range(cli_args)
    jd_cleaner.set_date_range(range_start, range_end)
    jd_cleaner.clean()


if __name__ == "__main__":
    main()
|
||||
@@ -49,7 +49,7 @@ def detect_bill_type_from_content(content: str, filename: str = "") -> str:
|
||||
从内容和文件名检测账单类型
|
||||
|
||||
Returns:
|
||||
'alipay', 'wechat', 或 ''
|
||||
'alipay', 'wechat', 'jd', 或 ''
|
||||
"""
|
||||
# 从文件名检测
|
||||
filename_lower = filename.lower()
|
||||
@@ -57,6 +57,8 @@ def detect_bill_type_from_content(content: str, filename: str = "") -> str:
|
||||
return 'alipay'
|
||||
if '微信' in filename or 'wechat' in filename_lower:
|
||||
return 'wechat'
|
||||
if '京东' in filename or 'jd' in filename_lower:
|
||||
return 'jd'
|
||||
|
||||
# 从内容检测
|
||||
# 支付宝特征: 有 "交易分类" 和 "对方账号" 列
|
||||
@@ -67,6 +69,12 @@ def detect_bill_type_from_content(content: str, filename: str = "") -> str:
|
||||
if '交易类型' in content and '金额(元)' in content:
|
||||
return 'wechat'
|
||||
|
||||
# 京东特征: 有 "商户名称" 和 "交易说明" 列,或头部包含 "京东账号名"
|
||||
if '商户名称' in content and '交易说明' in content:
|
||||
return 'jd'
|
||||
if '京东账号名' in content:
|
||||
return 'jd'
|
||||
|
||||
return ''
|
||||
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ if sys.stdout.encoding != 'utf-8':
|
||||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
|
||||
|
||||
from cleaners.base import compute_date_range_from_values
|
||||
from cleaners import AlipayCleaner, WechatCleaner
|
||||
from cleaners import AlipayCleaner, WechatCleaner, JDCleaner
|
||||
from category import infer_category, get_all_categories, get_all_income_categories
|
||||
from converter import convert_bill_file
|
||||
|
||||
@@ -43,7 +43,7 @@ class CleanRequest(BaseModel):
|
||||
start: Optional[str] = None
|
||||
end: Optional[str] = None
|
||||
format: Optional[str] = "csv"
|
||||
bill_type: Optional[str] = "auto" # auto, alipay, wechat
|
||||
bill_type: Optional[str] = "auto" # auto, alipay, wechat, jd
|
||||
|
||||
|
||||
class CleanResponse(BaseModel):
|
||||
@@ -90,7 +90,7 @@ def detect_bill_type(filepath: str) -> str | None:
|
||||
检测账单类型
|
||||
|
||||
Returns:
|
||||
'alipay' | 'wechat' | None
|
||||
'alipay' | 'wechat' | 'jd' | None
|
||||
"""
|
||||
try:
|
||||
with open(filepath, "r", encoding="utf-8") as f:
|
||||
@@ -107,6 +107,14 @@ def detect_bill_type(filepath: str) -> str | None:
|
||||
if "交易类型" in line and "金额(元)" in line:
|
||||
return "wechat"
|
||||
|
||||
# 京东特征:表头包含 "商户名称" 和 "交易说明"
|
||||
if "商户名称" in line and "交易说明" in line:
|
||||
return "jd"
|
||||
|
||||
# 京东特征:头部信息包含 "京东账号名"
|
||||
if "京东账号名" in line:
|
||||
return "jd"
|
||||
|
||||
# 数据行特征
|
||||
if line.startswith("202"):
|
||||
if "¥" in line:
|
||||
@@ -155,14 +163,16 @@ def do_clean(
|
||||
try:
|
||||
if bill_type == "alipay":
|
||||
cleaner = AlipayCleaner(input_path, output_path, output_format)
|
||||
elif bill_type == "jd":
|
||||
cleaner = JDCleaner(input_path, output_path, output_format)
|
||||
else:
|
||||
cleaner = WechatCleaner(input_path, output_path, output_format)
|
||||
|
||||
cleaner.set_date_range(start_date, end_date)
|
||||
cleaner.clean()
|
||||
|
||||
type_names = {"alipay": "支付宝", "wechat": "微信"}
|
||||
return True, bill_type, f"✅ {type_names[bill_type]}账单清洗完成"
|
||||
type_names = {"alipay": "支付宝", "wechat": "微信", "jd": "京东白条"}
|
||||
return True, bill_type, f"✅ {type_names.get(bill_type, bill_type)}账单清洗完成"
|
||||
|
||||
except Exception as e:
|
||||
return False, bill_type, f"清洗失败: {str(e)}"
|
||||
@@ -324,7 +334,7 @@ async def detect_bill_type_api(file: UploadFile = File(...)):
|
||||
"""
|
||||
检测账单类型
|
||||
|
||||
上传文件后自动检测是支付宝还是微信账单
|
||||
上传文件后自动检测是支付宝、微信还是京东账单
|
||||
"""
|
||||
suffix = Path(file.filename).suffix or ".csv"
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
||||
@@ -336,10 +346,10 @@ async def detect_bill_type_api(file: UploadFile = File(...)):
|
||||
if bill_type is None:
|
||||
raise HTTPException(status_code=400, detail="无法识别账单类型")
|
||||
|
||||
type_names = {"alipay": "支付宝", "wechat": "微信"}
|
||||
type_names = {"alipay": "支付宝", "wechat": "微信", "jd": "京东白条"}
|
||||
return {
|
||||
"bill_type": bill_type,
|
||||
"display_name": type_names[bill_type]
|
||||
"display_name": type_names.get(bill_type, bill_type)
|
||||
}
|
||||
finally:
|
||||
if os.path.exists(tmp_path):
|
||||
|
||||
Reference in New Issue
Block a user