feat: 支持ZIP压缩包上传(含密码保护)
This commit is contained in:
@@ -18,11 +18,31 @@ class AlipayCleaner(BaseCleaner):
|
||||
"""执行清理"""
|
||||
self.print_header()
|
||||
|
||||
# 读取数据
|
||||
# 读取数据,跳过支付宝导出文件的头部信息
|
||||
with open(self.input_file, "r", encoding="utf-8") as f:
|
||||
reader = csv.reader(f)
|
||||
header = next(reader)
|
||||
rows = list(reader)
|
||||
header = None
|
||||
rows = []
|
||||
|
||||
for row in reader:
|
||||
# 跳过空行
|
||||
if not row or not row[0].strip():
|
||||
continue
|
||||
|
||||
# 查找实际的CSV头部行(包含"交易时间"和"交易分类")
|
||||
if header is None:
|
||||
if len(row) >= 2 and "交易时间" in row[0] and "交易分类" in row[1]:
|
||||
header = row
|
||||
continue
|
||||
# 跳过头部信息行
|
||||
continue
|
||||
|
||||
# 收集数据行
|
||||
rows.append(row)
|
||||
|
||||
# 确保找到了有效的头部
|
||||
if header is None:
|
||||
raise ValueError("无法找到有效的支付宝账单表头(需包含'交易时间'和'交易分类'列)")
|
||||
|
||||
self.stats["original_count"] = len(rows)
|
||||
print(f"原始数据行数: {len(rows)}")
|
||||
|
||||
188
analyzer/converter.py
Normal file
188
analyzer/converter.py
Normal file
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
账单文件格式转换模块
|
||||
|
||||
支持:
|
||||
- xlsx -> csv 转换
|
||||
- GBK/GB2312 -> UTF-8 编码转换
|
||||
- 账单类型自动检测
|
||||
"""
|
||||
import os
|
||||
import csv
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
# 尝试导入 openpyxl,用于读取 xlsx 文件
|
||||
try:
|
||||
from openpyxl import load_workbook
|
||||
HAS_OPENPYXL = True
|
||||
except ImportError:
|
||||
HAS_OPENPYXL = False
|
||||
|
||||
|
||||
def detect_encoding(filepath: str) -> str:
    """
    Detect the text encoding of a file.

    The file is first checked for a UTF-8 byte-order mark; when one is
    present 'utf-8-sig' is tried before anything else, because plain 'utf-8'
    also decodes a BOM successfully and would otherwise always win. Each
    candidate is validated by decoding the first 10 lines of the file.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        The first candidate that decodes cleanly: 'utf-8-sig', 'utf-8',
        'gbk', 'gb2312' or 'gb18030'. Falls back to 'gbk' when none does.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    import codecs  # local import: only needed for the BOM constant

    with open(filepath, 'rb') as raw:
        has_bom = raw.read(len(codecs.BOM_UTF8)) == codecs.BOM_UTF8

    encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030']
    if has_bom:
        # Only meaningful when the BOM is really there; must precede 'utf-8'.
        encodings.insert(0, 'utf-8-sig')

    for encoding in encodings:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                # Decoding the first 10 lines is enough to reject a wrong guess.
                for _ in range(10):
                    f.readline()
            return encoding
        except (UnicodeDecodeError, UnicodeError):
            continue

    # Nothing decoded cleanly: default to gbk.
    return 'gbk'
|
||||
|
||||
|
||||
def detect_bill_type_from_content(content: str, filename: str = "") -> str:
    """
    Guess the bill type from the file name and, failing that, its content.

    The file name is consulted first (Chinese marker matched as-is, English
    marker matched case-insensitively). If the name gives no hint, the
    content is searched for a pair of column titles characteristic of each
    bill type.

    Args:
        content: Text sampled from the bill file.
        filename: Original file name; optional.

    Returns:
        'alipay', 'wechat', or '' when the type cannot be determined.
    """
    lowered_name = filename.lower()

    # (bill type, Chinese marker, lowercase English marker), checked in order.
    name_markers = (
        ('alipay', '支付宝', 'alipay'),
        ('wechat', '微信', 'wechat'),
    )
    for bill_type, cn_marker, en_marker in name_markers:
        if cn_marker in filename or en_marker in lowered_name:
            return bill_type

    # Column titles that must ALL appear in the content, checked in order.
    column_markers = (
        ('alipay', ('交易分类', '对方账号')),
        ('wechat', ('交易类型', '金额(元)')),
    )
    for bill_type, required_columns in column_markers:
        if all(column in content for column in required_columns):
            return bill_type

    return ''
|
||||
|
||||
|
||||
def convert_xlsx_to_csv(xlsx_path: str, csv_path: str) -> Tuple[bool, str]:
    """
    Convert the active sheet of an xlsx workbook to a UTF-8 csv file.

    Rows whose cells are all empty are dropped; empty cells in other rows
    are written as empty strings and every other value is written via
    ``str()``.

    Args:
        xlsx_path: Path of the workbook to read.
        csv_path: Path of the csv file to write.

    Returns:
        (success, message). Failures are reported through the tuple rather
        than raised.
    """
    if not HAS_OPENPYXL:
        return False, "缺少 openpyxl 库,无法读取 xlsx 文件。请运行: pip install openpyxl"

    try:
        wb = load_workbook(xlsx_path, read_only=True, data_only=True)
        try:
            ws = wb.active

            with open(csv_path, 'w', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                for row in ws.iter_rows(values_only=True):
                    # Skip rows that are completely empty.
                    if all(cell is None for cell in row):
                        continue
                    # Represent empty cells as empty strings.
                    writer.writerow(['' if cell is None else str(cell) for cell in row])
        finally:
            # A read-only workbook keeps the source file open until closed,
            # so release it even when writing the csv fails part-way.
            wb.close()

        return True, "xlsx 转换成功"

    except Exception as e:
        return False, f"xlsx 转换失败: {str(e)}"
|
||||
|
||||
|
||||
def convert_csv_encoding(input_path: str, output_path: str, source_encoding: str = 'auto') -> Tuple[bool, str]:
    """
    Re-encode a csv file as BOM-free UTF-8.

    Args:
        input_path: Path of the csv file to read.
        output_path: Path of the file to write; may equal ``input_path``.
        source_encoding: Encoding of the input, or 'auto' to detect it with
            ``detect_encoding``.

    Returns:
        (success, message). Decoding/writing failures are reported through
        the tuple rather than raised.

    Raises:
        OSError: If the input is plain UTF-8 and cannot be read or copied.
    """
    if source_encoding == 'auto':
        source_encoding = detect_encoding(input_path)

    if source_encoding == 'utf-8':
        # 'utf-8' happily decodes a byte-order mark, so check for one here:
        # copying it through would leave U+FEFF glued to the first cell.
        with open(input_path, 'rb') as raw:
            if raw.read(3) == b'\xef\xbb\xbf':
                source_encoding = 'utf-8-sig'

    # Plain UTF-8 without a BOM needs no conversion: copy as-is.
    if source_encoding == 'utf-8':
        if input_path != output_path:
            import shutil
            shutil.copy(input_path, output_path)
        return True, "文件已是 UTF-8 编码"

    try:
        # newline='' on both sides keeps the original line endings intact,
        # including line breaks embedded in quoted csv fields.
        with open(input_path, 'r', encoding=source_encoding, newline='') as f_in:
            content = f_in.read()

        with open(output_path, 'w', encoding='utf-8', newline='') as f_out:
            f_out.write(content)

    except Exception as e:
        return False, f"编码转换失败: {str(e)}"

    if source_encoding == 'utf-8-sig':
        # Only the BOM was removed; the text was already UTF-8.
        return True, "文件已是 UTF-8 编码"
    return True, f"编码转换成功: {source_encoding} -> utf-8"
|
||||
|
||||
|
||||
def convert_bill_file(input_path: str, output_path: Optional[str] = None) -> Tuple[bool, str, str, str]:
    """
    Convert a bill file to a standard UTF-8 csv file and detect its type.

    Supported inputs:
    - .xlsx, converted to csv
    - .csv, re-encoded to UTF-8 (e.g. from GBK/GB2312)

    Args:
        input_path: Path of the bill file to convert.
        output_path: Where to write the result. When omitted, a temporary
            .csv file is created and its path is returned; the caller owns
            that file after a successful call.

    Returns:
        (success, bill_type, output_path, message). On failure ``bill_type``
        and ``output_path`` are empty strings, and a temporary output file
        created by this call has already been removed.
    """
    input_path = Path(input_path)

    if not input_path.exists():
        return False, '', '', f"文件不存在: {input_path}"

    # Validate the extension before allocating any temporary file.
    ext = input_path.suffix.lower()
    if ext == '.xlsx':
        convert = convert_xlsx_to_csv
    elif ext == '.csv':
        convert = convert_csv_encoding
    else:
        return False, '', '', f"不支持的文件格式: {ext}"

    owns_output = output_path is None
    if owns_output:
        fd, output_path = tempfile.mkstemp(suffix='.csv')
        os.close(fd)

    completed = False
    try:
        success, message = convert(str(input_path), output_path)
        if not success:
            return False, '', '', message

        # Only the first 2000 characters are sampled for type detection.
        with open(output_path, 'r', encoding='utf-8') as f:
            content = f.read(2000)
        bill_type = detect_bill_type_from_content(content, input_path.name)

        completed = True
        return True, bill_type, output_path, "转换成功"
    finally:
        if owns_output and not completed:
            # The caller never learns this path on failure, so remove the
            # temporary file here; best effort only.
            try:
                os.unlink(output_path)
            except OSError:
                pass
|
||||
@@ -2,3 +2,4 @@ pyyaml>=6.0
|
||||
fastapi>=0.109.0
|
||||
uvicorn[standard]>=0.27.0
|
||||
python-multipart>=0.0.6
|
||||
openpyxl>=3.1.0
|
||||
|
||||
@@ -24,6 +24,7 @@ if sys.stdout.encoding != 'utf-8':
|
||||
from cleaners.base import compute_date_range_from_values
|
||||
from cleaners import AlipayCleaner, WechatCleaner
|
||||
from category import infer_category, get_all_categories, get_all_income_categories
|
||||
from converter import convert_bill_file
|
||||
|
||||
# 应用版本
|
||||
APP_VERSION = "0.0.1"
|
||||
@@ -72,6 +73,14 @@ class HealthResponse(BaseModel):
|
||||
version: str
|
||||
|
||||
|
||||
class ConvertResponse(BaseModel):
    """Response body describing the outcome of a bill-file conversion."""

    # True when the conversion succeeded.
    success: bool
    # Detected bill type name; presumably empty when detection found nothing.
    bill_type: str
    # Path of the converted file.
    output_path: str
    # Human-readable status message.
    message: str
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 辅助函数
|
||||
# =============================================================================
|
||||
@@ -85,7 +94,7 @@ def detect_bill_type(filepath: str) -> str | None:
|
||||
"""
|
||||
try:
|
||||
with open(filepath, "r", encoding="utf-8") as f:
|
||||
for _ in range(20):
|
||||
for _ in range(50): # 支付宝账单可能有较多的头部信息行
|
||||
line = f.readline()
|
||||
if not line:
|
||||
break
|
||||
@@ -337,6 +346,43 @@ async def detect_bill_type_api(file: UploadFile = File(...)):
|
||||
os.unlink(tmp_path)
|
||||
|
||||
|
||||
@app.post("/convert", response_model=ConvertResponse)
async def convert_bill_file_api(file: UploadFile = File(...)):
    """
    Convert an uploaded bill file to UTF-8 csv.

    Supported inputs:
    - xlsx, converted to csv
    - csv in GBK/GB2312, re-encoded to UTF-8

    The upload is spooled to a temporary file, passed to
    ``convert_bill_file``, and removed afterwards. The response carries the
    path of the converted file and the detected bill type; the converted
    file itself is left for the caller to clean up.

    Raises:
        HTTPException: 400 with the converter's message when conversion fails.
    """
    # The upload may arrive without a filename; fall back to ".csv" instead
    # of letting Path(None) raise a TypeError.
    suffix = Path(file.filename or "").suffix or ".csv"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        shutil.copyfileobj(file.file, tmp)
        input_path = tmp.name

    try:
        success, bill_type, output_path, message = convert_bill_file(input_path)

        if not success:
            raise HTTPException(status_code=400, detail=message)

        return ConvertResponse(
            success=True,
            bill_type=bill_type,
            output_path=output_path,
            message=message
        )

    finally:
        # Remove the spooled upload; the converted output is the caller's
        # responsibility.
        if os.path.exists(input_path):
            os.unlink(input_path)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 启动入口
|
||||
# =============================================================================
|
||||
|
||||
Reference in New Issue
Block a user