refactor: 重构项目配置管理,统一使用.env配置

- 新增config.py统一读取.env配置,移除硬编码路径和参数
- 重构collect_jrxml.py支持命令行参数和环境变量配置源目录
- 新增.env.example示例配置文件,整理所有可配置项
- 重构down_embedding_model.py、import_to_chroma.py等所有脚本使用统一配置
- 新增Windows一键部署脚本setup.bat
- 修正jrxml_banch_chunker.py的文件名拼写错误
This commit is contained in:
2026-05-12 08:29:17 +08:00
parent bd98486de0
commit 9d78a49625
9 changed files with 396 additions and 67 deletions
+18 -12
View File
@@ -11,9 +11,10 @@ from pathlib import Path
from datetime import datetime
from collections import defaultdict
from jrxml_chunker import JRXMLSemanticChunker, save_chunks_to_json, print_chunk_summary
from config import JRXML_SOURCE_DIR, CHUNKER_OUTPUT_DIR, MAX_CHUNK_SIZE
def batch_chunk_with_report(input_dir: str, output_dir: str = None, max_chunk_size: int = 2000):
def batch_chunk_with_report(input_dir: str = None, output_dir: str = None, max_chunk_size: int = None):
"""
批量分块并生成详细报告
@@ -22,6 +23,8 @@ def batch_chunk_with_report(input_dir: str, output_dir: str = None, max_chunk_si
output_dir: 输出目录,为 None 时使用配置中的 CHUNKER_OUTPUT_DIR
max_chunk_size: 单个chunk最大字节数,为 None 时使用配置中的 MAX_CHUNK_SIZE
"""
if input_dir is None:
input_dir = str(JRXML_SOURCE_DIR)
input_path = Path(input_dir).resolve()
if not input_path.exists():
@@ -32,11 +35,13 @@ def batch_chunk_with_report(input_dir: str, output_dir: str = None, max_chunk_si
print(f"❌ 不是目录: {input_path}")
return None
# 设置输出目录
if output_dir is None:
output_dir = input_path.parent / f"{input_path.name}_chunked_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
output_dir = str(CHUNKER_OUTPUT_DIR)
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
if max_chunk_size is None:
max_chunk_size = MAX_CHUNK_SIZE
print(f"\n{'='*60}")
print(f"JRXML 语义分块 v3.0 - 批量处理")
@@ -214,21 +219,24 @@ if __name__ == "__main__":
print("=" * 60)
print("JRXML Semantic Chunking v3.0 - 批量处理工具")
print("=" * 60)
print(f"\n默认输入目录: {JRXML_SOURCE_DIR}")
print(f"默认输出目录: {CHUNKER_OUTPUT_DIR}")
print("\n用法:")
print(" python batch_chunker.py <目录路径>")
print(" python batch_chunker.py <文件路径>")
print(" python jrxml_banch_chunker.py <目录路径>")
print(" python jrxml_banch_chunker.py <文件路径>")
print(" python jrxml_banch_chunker.py (使用默认配置)")
print("\n参数:")
print(" <路径> JRXML文件所在目录 或 单个JRXML文件路径")
print(" <路径> JRXML文件所在目录 或 单个JRXML文件路径")
print(" --output <目录> 指定输出目录 (可选)")
print("\n示例:")
print(" python batch_chunker.py ./jasper_reports")
print(" python batch_chunker.py ./jasper_reports --output ./chunks")
print(" python batch_chunker.py report.jrxml")
print(" python jrxml_banch_chunker.py")
print(" python jrxml_banch_chunker.py ./jasper_reports")
print(" python jrxml_banch_chunker.py ./jasper_reports --output ./chunks")
print(" python jrxml_banch_chunker.py report.jrxml")
sys.exit(0)
input_path = sys.argv[1]
# 解析--output参数
output_dir = None
if "--output" in sys.argv:
idx = sys.argv.index("--output")
@@ -236,10 +244,8 @@ if __name__ == "__main__":
output_dir = sys.argv[idx + 1]
if os.path.isdir(input_path):
# 批量处理目录
batch_chunk_with_report(input_path, output_dir)
elif os.path.isfile(input_path):
# 处理单个文件
chunk_single_file_with_report(input_path, output_dir)
else:
print(f"❌ 路径无效: {input_path}")