billai/analyzer/converter.py

"""
账单文件格式转换模块

支持:
- xlsx -> csv 转换
- GBK/GB2312 -> UTF-8 编码转换
- 账单类型自动检测
"""
import os
import csv
import tempfile
from pathlib import Path
from typing import Optional, Tuple

# 尝试导入 openpyxl，用于读取 xlsx 文件
try:
    from openpyxl import load_workbook
    HAS_OPENPYXL = True
except ImportError:
    HAS_OPENPYXL = False


def detect_encoding(filepath: str) -> str:
    """
    检测文件编码

    Returns:
        'utf-8', 'gbk', 或 'utf-8-sig'
    """
    # 尝试读取前几行来检测编码
    encodings = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312', 'gb18030']

    for encoding in encodings:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                # 尝试读取前 10 行
                for _ in range(10):
                    f.readline()
            return encoding
        except (UnicodeDecodeError, UnicodeError):
            continue

    # 默认使用 gbk
    return 'gbk'


def detect_bill_type_from_content(content: str, filename: str = "") -> str:
    """
    从内容和文件名检测账单类型

    Returns:
        'alipay', 'wechat', 'jd', 或 ''
    """
    # 从文件名检测
    filename_lower = filename.lower()
    if '支付宝' in filename or 'alipay' in filename_lower:
        return 'alipay'
    if '微信' in filename or 'wechat' in filename_lower:
        return 'wechat'
    if '京东' in filename or 'jd' in filename_lower:
        return 'jd'

    # 从内容检测
    # 支付宝特征: 有 "交易分类" 和 "对方账号" 列
    if '交易分类' in content and '对方账号' in content:
        return 'alipay'

    # 微信特征: 有 "交易类型" 和 "金额(元)" 列
    if '交易类型' in content and '金额(元)' in content:
        return 'wechat'

    # 京东特征: 有 "商户名称" 和 "交易说明" 列，或头部包含 "京东账号名"
    if '商户名称' in content and '交易说明' in content:
        return 'jd'
    if '京东账号名' in content:
        return 'jd'

    return ''


def convert_xlsx_to_csv(xlsx_path: str, csv_path: str) -> Tuple[bool, str]:
    """
    将 xlsx 文件转换为 csv 文件

    Returns:
        (success, message)
    """
    if not HAS_OPENPYXL:
        return False, "缺少 openpyxl 库，无法读取 xlsx 文件。请运行: pip install openpyxl"

    try:
        wb = load_workbook(xlsx_path, read_only=True, data_only=True)
        ws = wb.active

        with open(csv_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for row in ws.iter_rows(values_only=True):
                # 跳过全空行
                if all(cell is None for cell in row):
                    continue
                # 将 None 转换为空字符串
                writer.writerow(['' if cell is None else str(cell) for cell in row])

        wb.close()
        return True, "xlsx 转换成功"

    except Exception as e:
        return False, f"xlsx 转换失败: {str(e)}"


def convert_csv_encoding(input_path: str, output_path: str, source_encoding: str = 'auto') -> Tuple[bool, str]:
    """
    将 csv 文件从 GBK/其他编码转换为 UTF-8

    Returns:
        (success, message)
    """
    if source_encoding == 'auto':
        source_encoding = detect_encoding(input_path)

    # 如果已经是 UTF-8，直接复制
    if source_encoding in ('utf-8', 'utf-8-sig'):
        if input_path != output_path:
            import shutil
            shutil.copy(input_path, output_path)
        return True, "文件已是 UTF-8 编码"

    try:
        with open(input_path, 'r', encoding=source_encoding) as f_in:
            content = f_in.read()

        with open(output_path, 'w', encoding='utf-8', newline='') as f_out:
            f_out.write(content)

        return True, f"编码转换成功: {source_encoding} -> utf-8"

    except Exception as e:
        return False, f"编码转换失败: {str(e)}"


def convert_bill_file(input_path: str, output_path: Optional[str] = None) -> Tuple[bool, str, str, str]:
    """
    转换账单文件为标准 CSV 格式（UTF-8 编码）

    支持:
    - xlsx -> csv 转换
    - GBK/GB2312 -> UTF-8 编码转换

    Args:
        input_path: 输入文件路径
        output_path: 输出文件路径（可选，默认在同目录生成）

    Returns:
        (success, bill_type, output_path, message)
    """
    input_path = Path(input_path)

    if not input_path.exists():
        return False, '', '', f"文件不存在: {input_path}"

    # 确定输出路径
    if output_path is None:
        # 生成临时文件
        suffix = '.csv'
        fd, output_path = tempfile.mkstemp(suffix=suffix)
        os.close(fd)

    ext = input_path.suffix.lower()
    bill_type = ''

    if ext == '.xlsx':
        # xlsx 转换
        success, message = convert_xlsx_to_csv(str(input_path), output_path)
        if not success:
            return False, '', '', message

        # 读取内容检测账单类型
        with open(output_path, 'r', encoding='utf-8') as f:
            content = f.read(2000)  # 只读取前 2000 字符用于检测
        bill_type = detect_bill_type_from_content(content, input_path.name)

    elif ext == '.csv':
        # CSV 编码转换
        success, message = convert_csv_encoding(str(input_path), output_path)
        if not success:
            return False, '', '', message

        # 读取内容检测账单类型
        with open(output_path, 'r', encoding='utf-8') as f:
            content = f.read(2000)
        bill_type = detect_bill_type_from_content(content, input_path.name)

    else:
        return False, '', '', f"不支持的文件格式: {ext}"

    return True, bill_type, output_path, "转换成功"