refactor: 重构项目配置管理,统一使用 .env 配置

- 新增 config.py 统一读取 .env 配置,移除硬编码路径和参数
- 重构collect_jrxml.py支持命令行参数和环境变量配置源目录
- 新增 .env.example 示例配置文件,整理所有可配置项
- 重构down_embedding_model.py、import_to_chroma.py等所有脚本使用统一配置
- 新增Windows一键部署脚本setup.bat
- 修正jrxml_banch_chunker.py的文件名拼写错误
This commit is contained in:
2026-05-12 08:29:17 +08:00
parent bd98486de0
commit 9d78a49625
9 changed files with 396 additions and 67 deletions
+11 -7
View File
@@ -11,11 +11,12 @@ from pathlib import Path
import numpy as np
import chromadb
from tqdm import tqdm
from config import EMBEDDINGS_DIR, CHROMA_DB_PATH, CHROMA_COLLECTION_NAME
def main(embeddings_dir: str = None,
chroma_path: str = None,
collection_name: str = "jrxml_chunks"):
collection_name: str = None):
"""
从 embeddings 目录读取向量和 chunks,导入 Chroma 持久化数据库
@@ -27,15 +28,18 @@ def main(embeddings_dir: str = None,
project_root = Path(__file__).resolve().parent
if embeddings_dir is None:
embeddings_dir = project_root / "embeddings"
embeddings_dir = EMBEDDINGS_DIR
else:
embeddings_dir = Path(embeddings_dir)
if chroma_path is None:
chroma_path = project_root / "chroma_db"
chroma_path = CHROMA_DB_PATH
else:
chroma_path = Path(chroma_path)
if collection_name is None:
collection_name = CHROMA_COLLECTION_NAME
embeddings_file = embeddings_dir / "embeddings.npy"
chunks_file = embeddings_dir / "chunks.json"
@@ -164,11 +168,11 @@ if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="JRXML Chunks 导入 Chroma 工具")
parser.add_argument("--embeddings_dir", "-e", default=None,
help="向量文件目录 (默认: embeddings)")
help=f"向量文件目录 (默认: {EMBEDDINGS_DIR})")
parser.add_argument("--chroma_path", "-c", default=None,
help="Chroma 数据库路径 (默认: chroma_db)")
parser.add_argument("--collection_name", "-n", default="jrxml_chunks",
help="集合名称 (默认: jrxml_chunks)")
help=f"Chroma 数据库路径 (默认: {CHROMA_DB_PATH})")
parser.add_argument("--collection_name", "-n", default=CHROMA_COLLECTION_NAME,
help=f"集合名称 (默认: {CHROMA_COLLECTION_NAME})")
args = parser.parse_args()