135 lines
4.3 KiB
Python
135 lines
4.3 KiB
Python
|
|
"""初始化默认用户和预置知识库。
|
|||
|
|
|
|||
|
|
解析 rag/jrxml_source/ 下的全部 JRXML 模板 + 接口文档,
|
|||
|
|
创建默认用户 "默认用户" 和知识库 "F6-汽车维修打印默认知识库",
|
|||
|
|
执行完整的 parse -> chunk -> embed 管线。
|
|||
|
|
|
|||
|
|
用法:
|
|||
|
|
python scripts/init_default_kb.py
|
|||
|
|
python scripts/init_default_kb.py --force # 强制重建
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import argparse
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
# Repository root: two directory levels above this file's resolved path.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Put the repository root first on sys.path; presumably this is what lets the
# `backend` package imported below resolve when the script is run directly.
sys.path.insert(0, str(PROJECT_ROOT))

from dotenv import load_dotenv
# NOTE(review): called before the backend import, presumably so values from a
# .env file are already in the environment when backend modules load — confirm.
load_dotenv()

from backend.logger import get_logger

# Module-level logger shared by the functions in this script.
_log = get_logger("init_kb")
|
|||
|
|
|
|||
|
|
# Name used to look up (and, if absent, create) the default user.
DEFAULT_USER_NAME = "默认用户"
# Name used to look up (and, if absent, create) the default knowledge base.
DEFAULT_KB_NAME = "F6-汽车维修打印默认知识库"
# Description passed to create_kb when the default knowledge base is created.
DEFAULT_KB_DESC = "预置的汽车维修打印单模板与接口文档知识库"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def find_source_files(source_dir: Path) -> list[str]:
    """Collect every ``.jrxml`` and ``.md`` file under *source_dir*.

    Files directly inside *source_dir* come first (``.jrxml`` before ``.md``),
    followed by files found in nested directories. Within each pattern group
    the paths are sorted, so the result does not depend on the directory
    enumeration order of the underlying filesystem.

    Args:
        source_dir: Directory to scan.

    Returns:
        De-duplicated file paths as strings; an empty list when *source_dir*
        is missing or is not a directory.
    """
    # is_dir() also rejects a path that exists but is a regular file.
    if not source_dir.is_dir():
        _log.warning("源目录不存在: %s", source_dir)
        return []

    # The recursive patterns match the top-level files a second time;
    # dict.fromkeys drops those repeats while keeping first-seen positions.
    patterns = ("*.jrxml", "*.md", "**/*.jrxml", "**/*.md")
    matches = (
        str(path)
        for pattern in patterns
        for path in sorted(source_dir.glob(pattern))
        if path.is_file()
    )
    return list(dict.fromkeys(matches))
|
|||
|
|
|
|||
|
|
|
|||
|
|
def init_default_kb(force: bool = False) -> dict:
    """Ensure the default user and knowledge base exist, then build the KB.

    Args:
        force: When true and the default KB already exists, delete it and
            create a fresh one before building.

    Returns:
        A summary dict. It always carries ``kb_id`` and ``user_id``. When the
        existing KB is already marked ready the status is ``"already_ready"``;
        when no source files are found the status is ``"no_files"``; otherwise
        it is the result of ``build_kb_from_files`` extended with ``kb_id``,
        ``user_id`` and ``file_count``.
    """
    from backend.kb_manager import create_user, list_users, create_kb, list_kbs
    from backend.kb_parser import build_kb_from_files

    # Step 1: reuse the first user with the default name, or create one.
    owner = next(
        (rec for rec in list_users() if rec.get("name") == DEFAULT_USER_NAME),
        None,
    )
    if owner is not None:
        _log.info("默认用户已存在: %s", owner["user_id"])
    else:
        owner = create_user(DEFAULT_USER_NAME)
        _log.info("创建默认用户: %s", owner["user_id"])

    user_id = owner["user_id"]

    # Step 2: reuse the first KB of that user with the default name, if any.
    target_kb = next(
        (rec for rec in list_kbs(user_id) if rec.get("name") == DEFAULT_KB_NAME),
        None,
    )

    if force and target_kb is not None:
        # Imported lazily: only the forced-rebuild path needs delete_kb.
        from backend.kb_manager import delete_kb
        delete_kb(target_kb["kb_id"])
        target_kb = None
        _log.info("强制重建: 已删除旧 KB")

    if target_kb is None:
        target_kb = create_kb(user_id, DEFAULT_KB_NAME, DEFAULT_KB_DESC)
        _log.info("创建默认知识库: %s", target_kb["kb_id"])
    elif target_kb.get("parse_status") == "ready":
        # An existing KB already marked ready is left untouched.
        ready_chunks = target_kb.get("chunk_count", 0)
        _log.info("默认知识库已就绪: chunks=%s", ready_chunks)
        return {
            "status": "already_ready",
            "kb_id": target_kb["kb_id"],
            "user_id": user_id,
            "chunk_count": ready_chunks,
        }

    kb_id = target_kb["kb_id"]

    # Step 3: gather the template and document files to ingest.
    files = find_source_files(PROJECT_ROOT / "rag" / "jrxml_source")
    if not files:
        _log.warning("未找到源文件,跳过构建")
        return {"status": "no_files", "kb_id": kb_id, "user_id": user_id}

    _log.info("找到 %d 个源文件", len(files))

    # Step 4: build the KB from the collected files.
    result = build_kb_from_files(kb_id, files)
    _log.info("KB 构建完成: %s", result)

    # The identifying keys are merged last, so they override same-named keys
    # in the build result.
    extras = {"kb_id": kb_id, "user_id": user_id, "file_count": len(files)}
    return {**result, **extras}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Parse the command line, run the initialization and print a summary.

    Accepts a single optional ``--force`` flag, which is forwarded to
    ``init_default_kb``. The summary printed to stdout covers the status, the
    field/template/chunk counts (0 when a key is absent) and any per-file
    errors reported in the result.
    """
    cli = argparse.ArgumentParser(description="初始化默认知识库")
    cli.add_argument("--force", action="store_true", help="强制重建(删除已有 KB)")
    options = cli.parse_args()

    rule = "=" * 60
    print(rule)
    print("JRXML Agent - 默认知识库初始化")
    print(rule)

    summary = init_default_kb(force=options.force)

    print(f"\n用户: {DEFAULT_USER_NAME}")
    print(f"知识库: {DEFAULT_KB_NAME}")
    print(f"状态: {summary.get('status', 'unknown')}")
    print(f"字段数: {summary.get('field_count', 0)}")
    print(f"模板数: {summary.get('template_count', 0)}")
    print(f"Chunk数: {summary.get('chunk_count', 0)}")

    # The error section is printed only when the result has a non-empty list.
    failures = summary.get("errors")
    if failures:
        print(f"错误: {len(failures)} 条")
        for item in failures:
            print(f" - {item.get('file', '')}: {item.get('error', '')}")

    print("\n初始化完成!")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Script entry point: run the initializer only on direct execution, not on import.
if __name__ == "__main__":
    main()
|