refactor: 重构项目配置管理,统一使用.env配置
- 新增config.py统一读取.env配置,移除硬编码路径和参数
- 重构collect_jrxml.py支持命令行参数和环境变量配置源目录
- 新增.env.example示例配置文件,整理所有可配置项
- 重构down_embedding_model.py、import_to_chroma.py等所有脚本使用统一配置
- 新增Windows一键部署脚本setup.bat
- 修正jrxml_banch_chunker.py的文件名拼写错误
This commit is contained in:
+18
-12
@@ -11,9 +11,10 @@ from pathlib import Path
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
from jrxml_chunker import JRXMLSemanticChunker, save_chunks_to_json, print_chunk_summary
|
||||
from config import JRXML_SOURCE_DIR, CHUNKER_OUTPUT_DIR, MAX_CHUNK_SIZE
|
||||
|
||||
|
||||
def batch_chunk_with_report(input_dir: str, output_dir: str = None, max_chunk_size: int = 2000):
|
||||
def batch_chunk_with_report(input_dir: str = None, output_dir: str = None, max_chunk_size: int = None):
|
||||
"""
|
||||
批量分块并生成详细报告
|
||||
|
||||
@@ -22,6 +23,8 @@ def batch_chunk_with_report(input_dir: str, output_dir: str = None, max_chunk_si
|
||||
output_dir: 输出目录,默认为 input_dir/../chunked_output
|
||||
max_chunk_size: 单个chunk最大字节数
|
||||
"""
|
||||
if input_dir is None:
|
||||
input_dir = str(JRXML_SOURCE_DIR)
|
||||
input_path = Path(input_dir).resolve()
|
||||
|
||||
if not input_path.exists():
|
||||
@@ -32,11 +35,13 @@ def batch_chunk_with_report(input_dir: str, output_dir: str = None, max_chunk_si
|
||||
print(f"❌ 不是目录: {input_path}")
|
||||
return None
|
||||
|
||||
# 设置输出目录
|
||||
if output_dir is None:
|
||||
output_dir = input_path.parent / f"{input_path.name}_chunked_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
||||
output_dir = str(CHUNKER_OUTPUT_DIR)
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if max_chunk_size is None:
|
||||
max_chunk_size = MAX_CHUNK_SIZE
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"JRXML 语义分块 v3.0 - 批量处理")
|
||||
@@ -214,21 +219,24 @@ if __name__ == "__main__":
|
||||
print("=" * 60)
|
||||
print("JRXML Semantic Chunking v3.0 - 批量处理工具")
|
||||
print("=" * 60)
|
||||
print(f"\n默认输入目录: {JRXML_SOURCE_DIR}")
|
||||
print(f"默认输出目录: {CHUNKER_OUTPUT_DIR}")
|
||||
print("\n用法:")
|
||||
print(" python batch_chunker.py <目录路径>")
|
||||
print(" python batch_chunker.py <文件路径>")
|
||||
print(" python jrxml_banch_chunker.py <目录路径>")
|
||||
print(" python jrxml_banch_chunker.py <文件路径>")
|
||||
print(" python jrxml_banch_chunker.py (使用默认配置)")
|
||||
print("\n参数:")
|
||||
print(" <路径> JRXML文件所在目录 或 单个JRXML文件路径")
|
||||
print(" <路径> JRXML文件所在目录 或 单个JRXML文件路径")
|
||||
print(" --output <目录> 指定输出目录 (可选)")
|
||||
print("\n示例:")
|
||||
print(" python batch_chunker.py ./jasper_reports")
|
||||
print(" python batch_chunker.py ./jasper_reports --output ./chunks")
|
||||
print(" python batch_chunker.py report.jrxml")
|
||||
print(" python jrxml_banch_chunker.py")
|
||||
print(" python jrxml_banch_chunker.py ./jasper_reports")
|
||||
print(" python jrxml_banch_chunker.py ./jasper_reports --output ./chunks")
|
||||
print(" python jrxml_banch_chunker.py report.jrxml")
|
||||
sys.exit(0)
|
||||
|
||||
input_path = sys.argv[1]
|
||||
|
||||
# 解析--output参数
|
||||
output_dir = None
|
||||
if "--output" in sys.argv:
|
||||
idx = sys.argv.index("--output")
|
||||
@@ -236,10 +244,8 @@ if __name__ == "__main__":
|
||||
output_dir = sys.argv[idx + 1]
|
||||
|
||||
if os.path.isdir(input_path):
|
||||
# 批量处理目录
|
||||
batch_chunk_with_report(input_path, output_dir)
|
||||
elif os.path.isfile(input_path):
|
||||
# 处理单个文件
|
||||
chunk_single_file_with_report(input_path, output_dir)
|
||||
else:
|
||||
print(f"❌ 路径无效: {input_path}")
|
||||
Reference in New Issue
Block a user