feat: 支持ZIP压缩包上传（含密码保护）

2026-01-23 13:46:45 +08:00
parent 49e3176e6b
commit a97a8d6a20
22 changed files with 973 additions and 72 deletions
--- a/analyzer/cleaners/alipay.py
+++ b/analyzer/cleaners/alipay.py
@@ -18,11 +18,31 @@ class AlipayCleaner(BaseCleaner):
        """执行清理"""
        self.print_header()
        
-        # 读取数据
+        # 读取数据，跳过支付宝导出文件的头部信息
        with open(self.input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f)
-            header = next(reader)
-            rows = list(reader)
+            header = None
+            rows = []
+            
+            for row in reader:
+                # 跳过空行
+                if not row or not row[0].strip():
+                    continue
+                
+                # 查找实际的CSV头部行（包含"交易时间"和"交易分类"）
+                if header is None:
+                    if len(row) >= 2 and "交易时间" in row[0] and "交易分类" in row[1]:
+                        header = row
+                        continue
+                    # 跳过头部信息行
+                    continue
+                
+                # 收集数据行
+                rows.append(row)
+        
+        # 确保找到了有效的头部
+        if header is None:
+            raise ValueError("无法找到有效的支付宝账单表头（需包含'交易时间'和'交易分类'列）")
        
        self.stats["original_count"] = len(rows)
        print(f"原始数据行数: {len(rows)}")
--- a/analyzer/converter.py
+++ b/analyzer/converter.py
@@ -0,0 +1,188 @@
+"""
+账单文件格式转换模块
+
+支持:
+- xlsx -> csv 转换
+- GBK/GB2312 -> UTF-8 编码转换
+- 账单类型自动检测
+"""
+import os
+import csv
+import tempfile
+from pathlib import Path
+from typing import Optional, Tuple
+
+# 尝试导入 openpyxl，用于读取 xlsx 文件
+try:
+    from openpyxl import load_workbook
+    HAS_OPENPYXL = True
+except ImportError:
+    HAS_OPENPYXL = False
+
+
+def detect_encoding(filepath: str) -> str:
+    """
+    Detect the text encoding of a file.
+
+    A UTF-8 byte-order mark is checked first, because the plain 'utf-8'
+    codec also decodes BOM-prefixed files and would otherwise mask the BOM.
+    Each candidate codec must then decode the whole file (not only its first
+    lines), so non-ASCII bytes that appear late in the file are validated too.
+
+    Args:
+        filepath: Path of the file to inspect.
+
+    Returns:
+        'utf-8-sig' (only when a BOM is present), 'utf-8', 'gbk', 'gb2312'
+        or 'gb18030'. Falls back to 'gbk' when no candidate decodes the file.
+    """
+    with open(filepath, 'rb') as f:
+        has_bom = f.read(3) == b'\xef\xbb\xbf'
+
+    encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030']
+    if has_bom:
+        encodings.insert(0, 'utf-8-sig')
+
+    for encoding in encodings:
+        try:
+            with open(filepath, 'r', encoding=encoding) as f:
+                # Decode in chunks: validates the entire file without
+                # holding all of it in memory at once.
+                while f.read(65536):
+                    pass
+            return encoding
+        except (UnicodeDecodeError, UnicodeError):
+            continue
+
+    # Nothing decoded cleanly; default to gbk
+    return 'gbk'
+
+
+def detect_bill_type_from_content(content: str, filename: str = "") -> str:
+    """
+    Guess the bill type from the file name first, then from the content.
+
+    Args:
+        content: Text (typically the beginning of the CSV) to scan for
+            characteristic column names.
+        filename: Optional file name; checked before the content.
+
+    Returns:
+        'alipay', 'wechat', or '' when nothing matches.
+    """
+    lowered_name = filename.lower()
+
+    # (bill type, Chinese marker matched case-sensitively, ASCII marker matched case-insensitively)
+    name_markers = (
+        ('alipay', '支付宝', 'alipay'),
+        ('wechat', '微信', 'wechat'),
+    )
+    for bill_type, cn_marker, ascii_marker in name_markers:
+        if cn_marker in filename or ascii_marker in lowered_name:
+            return bill_type
+
+    # (bill type, column names that must all occur in the content)
+    column_markers = (
+        ('alipay', ('交易分类', '对方账号')),
+        ('wechat', ('交易类型', '金额(元)')),
+    )
+    for bill_type, columns in column_markers:
+        if all(column in content for column in columns):
+            return bill_type
+
+    return ''
+
+
+def convert_xlsx_to_csv(xlsx_path: str, csv_path: str) -> Tuple[bool, str]:
+    """
+    Convert the active sheet of an xlsx workbook into a UTF-8 csv file.
+
+    Rows whose cells are all empty are dropped; empty cells in other rows
+    are written as empty strings and every other value is stringified.
+
+    Args:
+        xlsx_path: Path of the workbook to read.
+        csv_path: Path of the csv file to write.
+
+    Returns:
+        (success, message). Failures (including a missing openpyxl
+        installation) are reported through the tuple, not raised.
+    """
+    if not HAS_OPENPYXL:
+        return False, "缺少 openpyxl 库，无法读取 xlsx 文件。请运行: pip install openpyxl"
+
+    try:
+        wb = load_workbook(xlsx_path, read_only=True, data_only=True)
+        try:
+            ws = wb.active
+
+            with open(csv_path, 'w', encoding='utf-8', newline='') as f:
+                writer = csv.writer(f)
+                for row in ws.iter_rows(values_only=True):
+                    # Skip rows that are entirely empty
+                    if all(cell is None for cell in row):
+                        continue
+                    # Turn None into an empty string
+                    writer.writerow(['' if cell is None else str(cell) for cell in row])
+        finally:
+            # A read-only workbook keeps the source file open until closed,
+            # so release it even when reading or writing fails.
+            wb.close()
+
+        return True, "xlsx 转换成功"
+
+    except Exception as e:
+        return False, f"xlsx 转换失败: {str(e)}"
+
+
+def convert_csv_encoding(input_path: str, output_path: str, source_encoding: str = 'auto') -> Tuple[bool, str]:
+    """
+    Re-encode a csv file as UTF-8 (without a byte-order mark).
+
+    Args:
+        input_path: Path of the source csv file.
+        output_path: Path of the file to write; may equal input_path.
+        source_encoding: Encoding of the source file, or 'auto' to detect it.
+
+    Returns:
+        (success, message). I/O and decoding failures are reported through
+        the tuple, not raised.
+    """
+    if source_encoding == 'auto':
+        source_encoding = detect_encoding(input_path)
+
+    try:
+        # Plain UTF-8 needs no transcoding: copy the bytes unchanged.
+        if source_encoding == 'utf-8':
+            if input_path != output_path:
+                import shutil
+                shutil.copy(input_path, output_path)
+            return True, "文件已是 UTF-8 编码"
+
+        # 'utf-8-sig' deliberately takes this path as well so the BOM is
+        # stripped; readers that open the result as plain 'utf-8' would
+        # otherwise see U+FEFF glued to the first header cell.
+        # newline='' keeps the original line endings intact.
+        with open(input_path, 'r', encoding=source_encoding, newline='') as f_in:
+            content = f_in.read()
+
+        with open(output_path, 'w', encoding='utf-8', newline='') as f_out:
+            f_out.write(content)
+
+        return True, f"编码转换成功: {source_encoding} -> utf-8"
+
+    except Exception as e:
+        return False, f"编码转换失败: {str(e)}"
+
+
+def convert_bill_file(input_path: str, output_path: Optional[str] = None) -> Tuple[bool, str, str, str]:
+    """
+    Convert a bill file into a standard UTF-8 csv file.
+
+    Supported inputs:
+    - xlsx -> csv conversion
+    - csv re-encoding (e.g. GBK/GB2312 -> UTF-8)
+
+    Args:
+        input_path: Path of the input file.
+        output_path: Path of the output file. When omitted, a temporary
+            '.csv' file is created; on success the caller owns it and is
+            responsible for deleting it.
+
+    Returns:
+        (success, bill_type, output_path, message). On failure bill_type
+        and output_path are empty strings, and a temporary output file
+        created by this function is removed again.
+    """
+    input_path = Path(input_path)
+
+    if not input_path.exists():
+        return False, '', '', f"文件不存在: {input_path}"
+
+    # Reject unsupported formats before allocating any temporary file.
+    ext = input_path.suffix.lower()
+    if ext not in ('.xlsx', '.csv'):
+        return False, '', '', f"不支持的文件格式: {ext}"
+
+    # Only a file created here may be deleted on failure; a caller-supplied
+    # output path is never removed.
+    owns_output = output_path is None
+    if owns_output:
+        fd, output_path = tempfile.mkstemp(suffix='.csv')
+        os.close(fd)
+
+    keep_output = False
+    try:
+        if ext == '.xlsx':
+            success, message = convert_xlsx_to_csv(str(input_path), output_path)
+        else:
+            success, message = convert_csv_encoding(str(input_path), output_path)
+        if not success:
+            return False, '', '', message
+
+        # Only the first 2000 characters are needed for type detection
+        with open(output_path, 'r', encoding='utf-8') as f:
+            content = f.read(2000)
+        bill_type = detect_bill_type_from_content(content, input_path.name)
+
+        keep_output = True
+        return True, bill_type, output_path, "转换成功"
+    finally:
+        # Runs on failure returns and on exceptions alike, so the temporary
+        # file never outlives an unsuccessful conversion.
+        if owns_output and not keep_output:
+            try:
+                os.unlink(output_path)
+            except OSError:
+                pass
--- a/analyzer/requirements.txt
+++ b/analyzer/requirements.txt
@@ -2,3 +2,4 @@ pyyaml>=6.0
 fastapi>=0.109.0
 uvicorn[standard]>=0.27.0
 python-multipart>=0.0.6
+openpyxl>=3.1.0
--- a/analyzer/server.py
+++ b/analyzer/server.py
@@ -24,6 +24,7 @@ if sys.stdout.encoding != 'utf-8':
 from cleaners.base import compute_date_range_from_values
 from cleaners import AlipayCleaner, WechatCleaner
 from category import infer_category, get_all_categories, get_all_income_categories
+from converter import convert_bill_file

 # 应用版本
 APP_VERSION = "0.0.1"
@@ -72,6 +73,14 @@ class HealthResponse(BaseModel):
    version: str


+class ConvertResponse(BaseModel):
+    """Response body of the file conversion endpoint."""
+    # Whether the conversion succeeded
+    success: bool
+    # Detected bill type string
+    bill_type: str
+    # Path of the converted output file
+    output_path: str
+    # Human-readable status message
+    message: str
+
+
 # =============================================================================
 # 辅助函数
 # =============================================================================
@@ -85,7 +94,7 @@ def detect_bill_type(filepath: str) -> str | None:
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
-            for _ in range(20):
+            for _ in range(50):  # 支付宝账单可能有较多的头部信息行
                line = f.readline()
                if not line:
                    break
@@ -337,6 +346,43 @@ async def detect_bill_type_api(file: UploadFile = File(...)):
            os.unlink(tmp_path)


+@app.post("/convert", response_model=ConvertResponse)
+async def convert_bill_file_api(file: UploadFile = File(...)):
+    """
+    Convert an uploaded bill file.
+
+    Supported:
+    - xlsx -> csv conversion
+    - GBK/GB2312 -> UTF-8 re-encoding
+
+    Returns the path of the converted file and the detected bill type.
+    Responds with HTTP 400 when the conversion fails.
+    """
+    # The upload may carry no filename at all; fall back to ".csv" then.
+    suffix = Path(file.filename or "").suffix or ".csv"
+
+    # Record the temp path before copying so the cleanup below also covers
+    # a failure while the upload is being written to disk.
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+    input_path = tmp.name
+
+    try:
+        with tmp:
+            shutil.copyfileobj(file.file, tmp)
+
+        # Run the conversion
+        success, bill_type, output_path, message = convert_bill_file(input_path)
+
+        if not success:
+            raise HTTPException(status_code=400, detail=message)
+
+        return ConvertResponse(
+            success=True,
+            bill_type=bill_type,
+            output_path=output_path,
+            message=message
+        )
+
+    finally:
+        # Remove the temporary input file (the converted output file is
+        # left for the caller to clean up)
+        if os.path.exists(input_path):
+            os.unlink(input_path)
+
+
 # =============================================================================
 # 启动入口
 # =============================================================================