# jrxml_banch_chunker.py

"""
JRXML Batch Chunker
批量处理JRXML模板文件的入口脚本
"""

import os
import sys
import json
import time
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from jrxml_chunker import JRXMLSemanticChunker, save_chunks_to_json, print_chunk_summary
from config import JRXML_SOURCE_DIR, CHUNKER_OUTPUT_DIR, MAX_CHUNK_SIZE


def _find_jrxml_files(root: Path) -> list:
    """Return the regular files under *root* with a ``.jrxml`` extension, sorted.

    The extension is compared case-insensitively in a single directory walk.
    Globbing ``*.jrxml`` and ``*.JRXML`` separately lists every file twice on
    case-insensitive filesystems and misses mixed-case names such as
    ``.Jrxml``.
    """
    return sorted(
        candidate
        for candidate in root.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() == ".jrxml"
    )


def _safe_chunk_file_stem(context: str, used_stems: set) -> str:
    """Convert a chunk ``context`` into a unique, filesystem-safe file stem.

    Args:
        context: The ``context`` value shared by a group of chunks.
        used_stems: Lower-cased stems already handed out; updated in place.

    Returns:
        A stem of at most 100 characters (plus a numeric suffix when needed)
        that contains no path separators or reserved characters.
    """
    stem = str(context).replace("'", "").replace(" ", "_").replace("Report_", "")
    # Path separators and reserved characters would otherwise escape
    # per_file/ or make open() fail on some platforms.
    stem = "".join(
        "_" if (ch in '<>:"/\\|?*' or ord(ch) < 32) else ch for ch in stem
    )
    stem = stem[:100] or "unknown"

    # Two contexts may collapse to the same stem (truncation, sanitising,
    # case-insensitive filesystems); number them instead of overwriting.
    unique = stem
    suffix = 2
    while unique.lower() in used_stems:
        unique = f"{stem}_{suffix}"
        suffix += 1
    used_stems.add(unique.lower())
    return unique


def batch_chunk_with_report(input_dir: str = None, output_dir: str = None, max_chunk_size: int = None):
    """Chunk every JRXML file under a directory and write a detailed report.

    Writes ``all_chunks.json``, ``processing_stats.json`` and one JSON file
    per distinct chunk ``context`` (under ``per_file/``) into the output
    directory, and prints progress and a summary to stdout.

    Args:
        input_dir: Directory searched recursively for JRXML files. Defaults
            to ``JRXML_SOURCE_DIR`` from ``config``.
        output_dir: Directory that receives the output files; created if
            missing. Defaults to ``CHUNKER_OUTPUT_DIR`` from ``config``.
        max_chunk_size: Maximum chunk size handed to
            ``JRXMLSemanticChunker``. Defaults to ``MAX_CHUNK_SIZE`` from
            ``config``.

    Returns:
        ``None`` when the input directory is missing, is not a directory, or
        contains no JRXML files. Otherwise a dict with the keys ``chunks``
        (all chunks), ``stats`` (the statistics that were written to disk)
        and ``output_path`` (the output directory as a string).
    """
    if input_dir is None:
        input_dir = str(JRXML_SOURCE_DIR)
    input_path = Path(input_dir).resolve()

    if not input_path.exists():
        print(f"❌ 目录不存在: {input_path}")
        return None

    if not input_path.is_dir():
        print(f"❌ 不是目录: {input_path}")
        return None

    if output_dir is None:
        output_dir = str(CHUNKER_OUTPUT_DIR)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    if max_chunk_size is None:
        max_chunk_size = MAX_CHUNK_SIZE

    banner = "=" * 60
    print(f"\n{banner}")
    print("JRXML 语义分块 v3.0 - 批量处理")
    print(banner)
    print(f"输入目录: {input_path}")
    print(f"输出目录: {output_path}")
    print(f"{banner}\n")

    # Collect the input files (case-insensitive extension, no duplicates).
    jrxml_files = _find_jrxml_files(input_path)
    total_files = len(jrxml_files)
    print(f"找到 {total_files} 个JRXML文件\n")

    if total_files == 0:
        print("⚠️ 未找到JRXML文件")
        return None

    chunker = JRXMLSemanticChunker(max_chunk_size=max_chunk_size)

    # Running statistics; also serialised to processing_stats.json.
    all_chunks = []
    stats = {
        "total_files": total_files,
        "success": 0,
        "failed": 0,
        "total_chunks": 0,
        "failed_files": [],
        "chunks_per_file": defaultdict(int),
        "chunk_types": defaultdict(int),
        "started_at": datetime.now().isoformat()
    }

    start_time = time.perf_counter()

    # Process the files one by one; a failure is recorded and the batch goes on.
    for i, jrxml_file in enumerate(jrxml_files, 1):
        relative_path = jrxml_file.relative_to(input_path)

        try:
            file_start = time.perf_counter()
            chunks = chunker.chunk_file(str(jrxml_file))
            file_duration = time.perf_counter() - file_start

            # Tally into a local counter first: a malformed chunk must fail
            # the file before any shared state is touched, so a file is never
            # counted as both succeeded and failed.
            type_counts = defaultdict(int)
            for chunk in chunks:
                type_counts[chunk["chunk_type"]] += 1
        except Exception as e:
            # Deliberately broad: this is the per-file boundary of a batch
            # job, and the error is reported in the stats and on stdout.
            stats["failed"] += 1
            stats["failed_files"].append({"file": str(relative_path), "error": str(e)})
            print(f"[{i}/{total_files}] ❌ {relative_path} → {e}")
            continue

        all_chunks.extend(chunks)
        stats["success"] += 1
        stats["total_chunks"] += len(chunks)
        stats["chunks_per_file"][str(relative_path)] = len(chunks)
        for chunk_type, count in type_counts.items():
            stats["chunk_types"][chunk_type] += count

        print(f"[{i}/{total_files}] ✅ {relative_path} → {len(chunks)} chunks ({file_duration:.2f}s)")

    total_duration = time.perf_counter() - start_time
    stats["processing_time"] = round(total_duration, 2)
    stats["finished_at"] = datetime.now().isoformat()

    # Save all chunks in one file.
    all_chunks_path = output_path / "all_chunks.json"
    save_chunks_to_json(all_chunks, str(all_chunks_path))

    # Save the statistics report.
    stats_path = output_path / "processing_stats.json"
    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)

    # Save the chunks again, grouped by their "context" value.
    per_file_dir = output_path / "per_file"
    per_file_dir.mkdir(exist_ok=True)

    chunks_by_file = defaultdict(list)
    for chunk in all_chunks:
        # A missing or None context is grouped under the empty string.
        chunks_by_file[chunk.get("context") or ""].append(chunk)

    used_stems = set()
    for context, file_chunks in chunks_by_file.items():
        file_path = per_file_dir / f"{_safe_chunk_file_stem(context, used_stems)}.json"
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(file_chunks, f, ensure_ascii=False, indent=2)

    # Print the summary.
    print(f"\n{banner}")
    print("处理完成！")
    print(banner)
    print(f"✅ 成功: {stats['success']} 文件")
    print(f"❌ 失败: {stats['failed']} 文件")
    print(f"📦 总Chunks: {stats['total_chunks']}")
    print(f"⏱️ 总耗时: {total_duration:.2f}s")
    print(f"📂 输出目录: {output_path}")
    print("\n主要文件:")
    print(f"  - {all_chunks_path}")
    print(f"  - {stats_path}")
    print(f"  - {per_file_dir}/ (按文件分类的chunks)")

    print("\nChunk类型分布:")
    print_chunk_summary(all_chunks)

    if stats["failed_files"]:
        print("\n⚠️ 失败文件详情:")
        for fail in stats["failed_files"]:
            print(f"  - {fail['file']}: {fail['error']}")

    return {
        "chunks": all_chunks,
        "stats": stats,
        "output_path": str(output_path)
    }


def chunk_single_file_with_report(file_path: str, output_dir: str = None, max_chunk_size: int = 2000):
    """Chunk a single JRXML file and write a JSON dump plus a readable summary.

    Writes ``<stem>_chunks.json`` and ``<stem>_summary.txt`` into the output
    directory and prints a short report to stdout.

    Args:
        file_path: Path of the JRXML file to process.
        output_dir: Directory that receives the output files; created if
            missing. Defaults to ``<file's directory>/<stem>_chunks``.
        max_chunk_size: Maximum chunk size handed to
            ``JRXMLSemanticChunker``. Defaults to 2000, the value this
            function has always used.

    Returns:
        The list of chunks produced for the file, or ``None`` when
        *file_path* does not exist or is not a regular file.
    """
    file_path = Path(file_path).resolve()

    if not file_path.exists():
        print(f"❌ 文件不存在: {file_path}")
        return None

    # exists() is also true for directories; reject them before creating an
    # output directory or running the chunker.
    if not file_path.is_file():
        print(f"❌ 不是文件: {file_path}")
        return None

    if output_dir is None:
        output_dir = file_path.parent / f"{file_path.stem}_chunks"

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"\n处理文件: {file_path.name}")
    print(f"输出目录: {output_path}\n")

    chunker = JRXMLSemanticChunker(max_chunk_size=max_chunk_size)

    start_time = time.perf_counter()
    chunks = chunker.chunk_file(str(file_path))
    duration = time.perf_counter() - start_time

    # Save the chunks as JSON.
    chunks_path = output_path / f"{file_path.stem}_chunks.json"
    save_chunks_to_json(chunks, str(chunks_path))

    # Build the human-readable summary in memory and write it in one call.
    summary_lines = [
        f"JRXML Chunking Report: {file_path.name}\n",
        f"{'='*60}\n",
        f"Processing time: {duration:.2f}s\n",
        f"Total chunks: {len(chunks)}\n\n",
    ]
    for chunk in chunks:
        summary_lines.append(f"[Chunk {chunk['chunk_id']}] {chunk['chunk_type']}\n")
        # Only the first 200 characters of the description are kept.
        summary_lines.append(f"  Description: {chunk['human_description'][:200]}\n")
        summary_lines.append(f"  XML Length: {len(chunk['raw_xml'])} chars\n")
        summary_lines.append(f"  Context: {chunk.get('context', 'N/A')}\n\n")

    summary_path = output_path / f"{file_path.stem}_summary.txt"
    with open(summary_path, "w", encoding="utf-8") as f:
        f.writelines(summary_lines)

    print(f"✅ 生成 {len(chunks)} chunks")
    print(f"📄 Chunks JSON: {chunks_path}")
    print(f"📄 可读摘要: {summary_path}")
    print(f"⏱️ 耗时: {duration:.2f}s")

    print_chunk_summary(chunks)

    return chunks


def _print_usage():
    """Print the command-line help, including the configured default directories."""
    print("=" * 60)
    print("JRXML Semantic Chunking v3.0 - 批量处理工具")
    print("=" * 60)
    print(f"\n默认输入目录: {JRXML_SOURCE_DIR}")
    print(f"默认输出目录: {CHUNKER_OUTPUT_DIR}")
    print("\n用法:")
    print("  python jrxml_banch_chunker.py <目录路径>")
    print("  python jrxml_banch_chunker.py <文件路径>")
    print("  python jrxml_banch_chunker.py                  (使用默认配置)")
    print("\n参数:")
    print("  <路径>            JRXML文件所在目录 或 单个JRXML文件路径")
    print("  --output <目录>   指定输出目录 (可选)")
    print("  -h, --help        显示本帮助信息")
    print("\n示例:")
    print("  python jrxml_banch_chunker.py")
    print("  python jrxml_banch_chunker.py ./jasper_reports")
    print("  python jrxml_banch_chunker.py ./jasper_reports --output ./chunks")
    print("  python jrxml_banch_chunker.py report.jrxml")


def main(argv=None):
    """Run the command-line interface and return a process exit code.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``.

    Returns:
        ``0`` on success (or after printing help), ``1`` when the path is
        invalid or the processing function returned ``None``, and ``2`` when
        ``--output`` is given without a directory.
    """
    args = list(sys.argv[1:] if argv is None else argv)

    if "-h" in args or "--help" in args:
        _print_usage()
        return 0

    # Separate --output <dir> from positional arguments so the option may
    # appear before or after the path.
    output_dir = None
    positional = []
    arg_iter = iter(args)
    for arg in arg_iter:
        if arg == "--output":
            output_dir = next(arg_iter, None)
            if output_dir is None:
                print("❌ --output 需要指定目录")
                return 2
        else:
            positional.append(arg)

    if not positional:
        # The usage text documents that running without a path uses the
        # default configuration, so fall back to the configured directories.
        result = batch_chunk_with_report(None, output_dir)
        return 0 if result is not None else 1

    # As before, only the first positional argument is used as the path.
    input_path = positional[0]

    if os.path.isdir(input_path):
        result = batch_chunk_with_report(input_path, output_dir)
    elif os.path.isfile(input_path):
        result = chunk_single_file_with_report(input_path, output_dir)
    else:
        print(f"❌ 路径无效: {input_path}")
        return 1

    return 0 if result is not None else 1


if __name__ == "__main__":
    sys.exit(main())
feat: 添加Qwen3嵌入模型及JRXML报告相关文件 2026-05-11 08:34:03 +08:00			`"""`
			`JRXML Batch Chunker`
			`批量处理JRXML模板文件的入口脚本`
			`"""`

			`import os`
			`import sys`
			`import json`
			`import time`
			`from pathlib import Path`
			`from datetime import datetime`
			`from collections import defaultdict`
			`from jrxml_chunker import JRXMLSemanticChunker, save_chunks_to_json, print_chunk_summary`
refactor: 重构项目配置管理，统一使用.env配置 2026-05-12 08:29:17 +08:00			`from config import JRXML_SOURCE_DIR, CHUNKER_OUTPUT_DIR, MAX_CHUNK_SIZE`
feat: 添加Qwen3嵌入模型及JRXML报告相关文件 2026-05-11 08:34:03 +08:00

refactor: 重构项目配置管理，统一使用.env配置 2026-05-12 08:29:17 +08:00			`def batch_chunk_with_report(input_dir: str = None, output_dir: str = None, max_chunk_size: int = None):`
feat: 添加Qwen3嵌入模型及JRXML报告相关文件 2026-05-11 08:34:03 +08:00			`"""`
			`批量分块并生成详细报告`

			`Args:`
			`input_dir: JRXML文件目录`
			`output_dir: 输出目录，默认为 input_dir/../chunked_output`
			`max_chunk_size: 单个chunk最大字节数`
			`"""`
refactor: 重构项目配置管理，统一使用.env配置 2026-05-12 08:29:17 +08:00			`if input_dir is None:`
			`input_dir = str(JRXML_SOURCE_DIR)`
feat: 添加Qwen3嵌入模型及JRXML报告相关文件 2026-05-11 08:34:03 +08:00			`input_path = Path(input_dir).resolve()`

			`if not input_path.exists():`
			`print(f"❌ 目录不存在: {input_path}")`
			`return None`

			`if not input_path.is_dir():`
			`print(f"❌ 不是目录: {input_path}")`
			`return None`

			`if output_dir is None:`
refactor: 重构项目配置管理，统一使用.env配置 2026-05-12 08:29:17 +08:00			`output_dir = str(CHUNKER_OUTPUT_DIR)`
feat: 添加Qwen3嵌入模型及JRXML报告相关文件 2026-05-11 08:34:03 +08:00			`output_path = Path(output_dir)`
			`output_path.mkdir(parents=True, exist_ok=True)`
refactor: 重构项目配置管理，统一使用.env配置 2026-05-12 08:29:17 +08:00
			`if max_chunk_size is None:`
			`max_chunk_size = MAX_CHUNK_SIZE`
feat: 添加Qwen3嵌入模型及JRXML报告相关文件 2026-05-11 08:34:03 +08:00
			`print(f"\n{'='*60}")`
			`print(f"JRXML 语义分块 v3.0 - 批量处理")`
			`print(f"{'='*60}")`
			`print(f"输入目录: {input_path}")`
			`print(f"输出目录: {output_path}")`
			`print(f"{'='*60}\n")`

			`# 初始化`
			`chunker = JRXMLSemanticChunker(max_chunk_size=max_chunk_size)`

			`# 收集所有JRXML文件`
			`jrxml_files = list(input_path.rglob(".jrxml")) + list(input_path.rglob(".JRXML"))`
			`total_files = len(jrxml_files)`
			`print(f"找到 {total_files} 个JRXML文件\n")`

			`if total_files == 0:`
			`print("⚠️ 未找到JRXML文件")`
			`return None`

			`# 统计变量`
			`all_chunks = []`
			`stats = {`
			`"total_files": total_files,`
			`"success": 0,`
			`"failed": 0,`
			`"total_chunks": 0,`
			`"failed_files": [],`
			`"chunks_per_file": defaultdict(int),`
			`"chunk_types": defaultdict(int),`
			`"started_at": datetime.now().isoformat()`
			`}`

			`start_time = time.time()`

			`# 逐个处理文件`
			`for i, jrxml_file in enumerate(jrxml_files, 1):`
			`relative_path = jrxml_file.relative_to(input_path)`

			`try:`
			`file_start = time.time()`
			`chunks = chunker.chunk_file(str(jrxml_file))`
			`file_duration = time.time() - file_start`

			`all_chunks.extend(chunks)`

			`# 统计`
			`stats["success"] += 1`
			`stats["total_chunks"] += len(chunks)`
			`stats["chunks_per_file"][str(relative_path)] = len(chunks)`

			`for chunk in chunks:`
			`stats["chunk_types"][chunk["chunk_type"]] += 1`

			`print(f"[{i}/{total_files}] ✅ {relative_path} → {len(chunks)} chunks ({file_duration:.2f}s)")`

			`except Exception as e:`
			`stats["failed"] += 1`
			`error_info = {"file": str(relative_path), "error": str(e)}`
			`stats["failed_files"].append(error_info)`
			`print(f"[{i}/{total_files}] ❌ {relative_path} → {e}")`

			`total_duration = time.time() - start_time`
			`stats["processing_time"] = round(total_duration, 2)`
			`stats["finished_at"] = datetime.now().isoformat()`

			`# 保存所有chunks`
			`all_chunks_path = output_path / "all_chunks.json"`
			`save_chunks_to_json(all_chunks, str(all_chunks_path))`

			`# 保存统计报告`
			`stats_path = output_path / "processing_stats.json"`
			`with open(stats_path, "w", encoding="utf-8") as f:`
			`json.dump(stats, f, ensure_ascii=False, indent=2)`

			`# 按文件保存独立chunks`
			`per_file_dir = output_path / "per_file"`
			`per_file_dir.mkdir(exist_ok=True)`

			`chunks_by_file = defaultdict(list)`
			`for chunk in all_chunks:`
			`# 从context中提取文件名`
			`context = chunk.get("context", "")`
			`chunks_by_file[context].append(chunk)`

			`for context, file_chunks in chunks_by_file.items():`
			`# 简化文件名`
			`safe_name = context.replace("'", "").replace(" ", "_").replace("Report_", "")[:100]`
			`file_path = per_file_dir / f"{safe_name}.json"`
			`with open(file_path, "w", encoding="utf-8") as f:`
			`json.dump(file_chunks, f, ensure_ascii=False, indent=2)`

			`# 打印总结`
			`print(f"\n{'='*60}")`
			`print(f"处理完成！")`
			`print(f"{'='*60}")`
			`print(f"✅ 成功: {stats['success']} 文件")`
			`print(f"❌ 失败: {stats['failed']} 文件")`
			`print(f"📦 总Chunks: {stats['total_chunks']}")`
			`print(f"⏱️ 总耗时: {total_duration:.2f}s")`
			`print(f"📂 输出目录: {output_path}")`
			`print(f"\n主要文件:")`
			`print(f" - {all_chunks_path}")`
			`print(f" - {stats_path}")`
			`print(f" - {per_file_dir}/ (按文件分类的chunks)")`

			`print(f"\nChunk类型分布:")`
			`print_chunk_summary(all_chunks)`

			`if stats["failed_files"]:`
			`print(f"\n⚠️ 失败文件详情:")`
			`for fail in stats["failed_files"]:`
			`print(f" - {fail['file']}: {fail['error']}")`

			`return {`
			`"chunks": all_chunks,`
			`"stats": stats,`
			`"output_path": str(output_path)`
			`}`


			`def chunk_single_file_with_report(file_path: str, output_dir: str = None):`
			`"""处理单个文件并生成详细报告"""`
			`file_path = Path(file_path).resolve()`

			`if not file_path.exists():`
			`print(f"❌ 文件不存在: {file_path}")`
			`return None`

			`if output_dir is None:`
			`output_dir = file_path.parent / f"{file_path.stem}_chunks"`

			`output_path = Path(output_dir)`
			`output_path.mkdir(parents=True, exist_ok=True)`

			`print(f"\n处理文件: {file_path.name}")`
			`print(f"输出目录: {output_path}\n")`

			`chunker = JRXMLSemanticChunker(max_chunk_size=2000)`

			`start_time = time.time()`
			`chunks = chunker.chunk_file(str(file_path))`
			`duration = time.time() - start_time`

			`# 保存结果`
			`chunks_path = output_path / f"{file_path.stem}_chunks.json"`
			`save_chunks_to_json(chunks, str(chunks_path))`

			`# 生成人类可读的摘要`
			`summary_path = output_path / f"{file_path.stem}_summary.txt"`
			`with open(summary_path, "w", encoding="utf-8") as f:`
			`f.write(f"JRXML Chunking Report: {file_path.name}\n")`
			`f.write(f"{'='*60}\n")`
			`f.write(f"Processing time: {duration:.2f}s\n")`
			`f.write(f"Total chunks: {len(chunks)}\n\n")`

			`for chunk in chunks:`
			`f.write(f"[Chunk {chunk['chunk_id']}] {chunk['chunk_type']}\n")`
			`f.write(f" Description: {chunk['human_description'][:200]}\n")`
			`f.write(f" XML Length: {len(chunk['raw_xml'])} chars\n")`
			`f.write(f" Context: {chunk.get('context', 'N/A')}\n\n")`

			`print(f"✅ 生成 {len(chunks)} chunks")`
			`print(f"📄 Chunks JSON: {chunks_path}")`
			`print(f"📄 可读摘要: {summary_path}")`
			`print(f"⏱️ 耗时: {duration:.2f}s")`

			`print_chunk_summary(chunks)`

			`return chunks`


			`if __name__ == "__main__":`
			`if len(sys.argv) < 2:`
			`print("=" * 60)`
			`print("JRXML Semantic Chunking v3.0 - 批量处理工具")`
			`print("=" * 60)`
refactor: 重构项目配置管理，统一使用.env配置 2026-05-12 08:29:17 +08:00			`print(f"\n默认输入目录: {JRXML_SOURCE_DIR}")`
			`print(f"默认输出目录: {CHUNKER_OUTPUT_DIR}")`
feat: 添加Qwen3嵌入模型及JRXML报告相关文件 2026-05-11 08:34:03 +08:00			`print("\n用法:")`
refactor: 重构项目配置管理，统一使用.env配置 2026-05-12 08:29:17 +08:00			`print(" python jrxml_banch_chunker.py <目录路径>")`
			`print(" python jrxml_banch_chunker.py <文件路径>")`
			`print(" python jrxml_banch_chunker.py (使用默认配置)")`
feat: 添加Qwen3嵌入模型及JRXML报告相关文件 2026-05-11 08:34:03 +08:00			`print("\n参数:")`
refactor: 重构项目配置管理，统一使用.env配置 2026-05-12 08:29:17 +08:00			`print(" <路径> JRXML文件所在目录或单个JRXML文件路径")`
feat: 添加Qwen3嵌入模型及JRXML报告相关文件 2026-05-11 08:34:03 +08:00			`print(" --output <目录> 指定输出目录 (可选)")`
			`print("\n示例:")`
refactor: 重构项目配置管理，统一使用.env配置 2026-05-12 08:29:17 +08:00			`print(" python jrxml_banch_chunker.py")`
			`print(" python jrxml_banch_chunker.py ./jasper_reports")`
			`print(" python jrxml_banch_chunker.py ./jasper_reports --output ./chunks")`
			`print(" python jrxml_banch_chunker.py report.jrxml")`
feat: 添加Qwen3嵌入模型及JRXML报告相关文件 2026-05-11 08:34:03 +08:00			`sys.exit(0)`

			`input_path = sys.argv[1]`

			`output_dir = None`
			`if "--output" in sys.argv:`
			`idx = sys.argv.index("--output")`
			`if idx + 1 < len(sys.argv):`
			`output_dir = sys.argv[idx + 1]`

			`if os.path.isdir(input_path):`
			`batch_chunk_with_report(input_path, output_dir)`
			`elif os.path.isfile(input_path):`
			`chunk_single_file_with_report(input_path, output_dir)`
			`else:`
			`print(f"❌ 路径无效: {input_path}")`