scripts/archive/import_knowledge_base.py

"""
从 IT支持知识库.docx 提取结构化内容，导入到 quick_reply_templates 表。

映射规则：
- Heading 1 → 文档一级分类（用于确定 category 字段）
- Heading 2 → 文档二级子分类（合并到 title 前缀）
- Heading 3 → 快速回复模板标题（title 字段）
- Normal → 模板内容（content 字段，多段合并）

Category 映射：
  办公电脑 → 硬件
  软件工具 → 软件
  办公设备 → 硬件
  办公网络 → 网络
  终端安全 → 安全
  资产管理 → 通用
  其他业务 → 通用
"""
import uuid
import sqlite3
from datetime import datetime, timezone
from docx import Document

# =========================================================================
# Configuration
# =========================================================================
# NOTE(review): both paths are absolute and machine-specific (they point into
# one user's Windows profile) — adjust them before running elsewhere.
DOCX_PATH = r"C:\Users\simon\Downloads\IT支持知识库2026-4-24.docx"
DB_PATH = r"C:\Users\simon\wecom_it_smart_desk\backend\it_smart_desk.db"

# Heading 1 text → quick_reply category mapping.
# map_category() matches keys by substring against the Heading 1 text.
CATEGORY_MAP = {
    "办公电脑": "硬件",
    "软件工具": "软件",
    "办公设备": "硬件",
    "办公网络": "网络",
    "终端安全": "安全",  # endpoint security covers accounts/passwords/security policy, hence "安全"
    "资产管理": "通用",
    "其他业务": "通用",
}

def extract_items(doc):
    """Extract every Heading 3 entry from the document with its heading context.

    Walks ``doc.paragraphs`` in order:

    1. A "Heading 1" or "Heading 2" paragraph updates the hierarchy context
       and closes the entry currently being collected. A new Heading 1 also
       clears the Heading 2 context, so an entry placed directly under the
       new Heading 1 does not inherit the previous section's sub-category.
    2. A "Heading 3" paragraph closes the previous entry and opens a new one.
    3. Any other non-empty paragraph is appended to the open entry; such
       paragraphs are ignored when no entry is open.

    Entries that collected no content paragraphs are dropped.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph must provide
            ``text`` and ``style`` (``style.name`` is compared against the
            heading style names).

    Returns:
        list[dict]: One dict per entry with keys ``h1``, ``h2``, ``h3`` and
        ``content`` (produced by ``finalize_item``).
    """
    items = []
    current_h1 = None
    current_h2 = None
    current_item = None  # entry being collected: {h1, h2, h3, content_lines}

    def _flush(entry):
        # Store the entry only if it actually collected body paragraphs.
        if entry and entry["content_lines"]:
            items.append(finalize_item(entry))

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        style = para.style.name if para.style else ""

        if style == "Heading 1":
            _flush(current_item)
            current_item = None
            current_h1 = text
            # A new top-level section invalidates the old sub-category.
            current_h2 = None
        elif style == "Heading 2":
            _flush(current_item)
            current_item = None
            current_h2 = text
        elif style == "Heading 3":
            _flush(current_item)
            current_item = {
                "h1": current_h1,
                "h2": current_h2,
                "h3": text,
                "content_lines": [],
            }
        elif current_item:
            # Any non-heading style (Normal, Normal (Web), ...) is body text.
            current_item["content_lines"].append(text)

    # The last entry has no following heading to close it.
    _flush(current_item)

    return items


def finalize_item(item):
    """Collapse an entry's ``content_lines`` into a single ``content`` string.

    The lines are joined with newlines and the result is stripped of leading
    and trailing whitespace. The dict is modified in place: ``content`` is
    added, ``content_lines`` is removed, and the same dict is returned.
    """
    merged = "\n".join(item["content_lines"]).strip()
    item.pop("content_lines")
    item["content"] = merged
    return item


def map_category(h1, category_map=None, default="通用"):
    """Map a document Heading 1 text to a quick-reply category.

    Keys of the mapping are matched as substrings of ``h1``; the first
    matching key in the mapping's iteration order wins.

    Args:
        h1: Heading 1 text of the entry. ``None`` or an empty string yields
            ``default``.
        category_map: Optional mapping of heading keyword → category. When
            omitted, the module-level ``CATEGORY_MAP`` is used.
        default: Category returned when ``h1`` is empty or no key matches.

    Returns:
        The mapped category, or ``default``.
    """
    # Guard once up front instead of re-testing h1 for every key.
    if not h1:
        return default
    mapping = CATEGORY_MAP if category_map is None else category_map
    return next((cat for key, cat in mapping.items() if key in h1), default)


def to_title(item):
    """Build the template title from the entry's Heading 3 text.

    A heading longer than 128 characters is cut to its first 125 characters
    followed by "...", so the result is at most 128 characters long.
    """
    heading = item["h3"]
    return heading if len(heading) <= 128 else heading[:125] + "..."


def check_existing(conn):
    """Return the number of rows currently in ``quick_reply_templates``.

    Callers use the count to tell whether an earlier import already
    populated the table.
    """
    row = conn.execute("SELECT COUNT(*) FROM quick_reply_templates").fetchone()
    return row[0]


def import_items(conn, items):
    """Insert extracted entries into the ``quick_reply_templates`` table.

    An entry is skipped when its content is shorter than 10 characters
    (probably a heading picked up by mistake) or when a row with the same
    title and category already exists. Rows inserted earlier in the same
    call count as existing, so duplicates inside ``items`` are skipped too.

    The whole batch runs in one transaction: it is committed when every
    entry has been processed and rolled back if any statement raises, so a
    failed run does not leave a half-written batch pending on ``conn``.

    Args:
        conn: Open ``sqlite3`` connection to the target database.
        items: Entries as returned by ``extract_items`` (keys ``h1``, ``h3``
            and ``content`` are read).

    Returns:
        tuple[int, int]: ``(inserted, skipped)`` counts.
    """
    now = datetime.now(timezone.utc).isoformat()
    inserted = 0
    skipped = 0

    # The connection context manager commits on success and rolls back on error.
    with conn:
        for i, item in enumerate(items):
            category = map_category(item["h1"])
            title = to_title(item)
            content = item["content"]

            # Skip entries whose body is too short to be a real template.
            if len(content) < 10:
                skipped += 1
                continue

            # Skip duplicates (same title + same category).
            existing = conn.execute(
                "SELECT id FROM quick_reply_templates WHERE title = ? AND category = ?",
                (title, category),
            ).fetchone()
            if existing:
                skipped += 1
                continue

            conn.execute(
                """INSERT INTO quick_reply_templates 
               (id, category, title, content, variables, sort_order, created_at, updated_at)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    str(uuid.uuid4()),
                    category,
                    title,
                    content,
                    "[]",  # variables: empty list serialized as JSON
                    i,  # sort_order: position in the document, keeps original order
                    now,
                    now,
                ),
            )
            inserted += 1

    return inserted, skipped


# =========================================================================
# Main flow
# =========================================================================
def main():
    """Run the import: read the .docx, extract entries, insert them, report.

    Reads ``DOCX_PATH``, writes to the SQLite database at ``DB_PATH`` and
    prints progress and summary statistics to stdout. The database
    connection is closed even when the import or a statistics query fails.
    """
    print("=" * 60)
    print("   IT 支持知识库 → 快速回复模板 导入工具")
    print("=" * 60)

    # 1. Read the document
    print(f"\n[1/4] 读取文档: {DOCX_PATH}")
    doc = Document(DOCX_PATH)

    # 2. Extract entries
    print("[2/4] 提取结构化条目...")
    items = extract_items(doc)
    print(f"  → 共提取 {len(items)} 个 Heading 3 条目")

    # Category distribution of the extracted entries
    cat_counts = {}
    for item in items:
        cat = map_category(item["h1"])
        cat_counts[cat] = cat_counts.get(cat, 0) + 1
    print(f"  → 分类分布: {dict(sorted(cat_counts.items()))}")

    # 3. Connect to the database
    print(f"\n[3/4] 连接数据库: {DB_PATH}")
    conn = sqlite3.connect(DB_PATH)
    try:
        existing = check_existing(conn)
        if existing > 0:
            print(f"  ⚠ 数据库中已有 {existing} 条记录，将跳过重复标题。")

        # 4. Bulk import
        print("[4/4] 导入数据...")
        inserted, skipped = import_items(conn, items)

        # Summary statistics over the whole table (not just this run)
        total = conn.execute("SELECT COUNT(*) FROM quick_reply_templates").fetchone()[0]
        by_cat = conn.execute(
            "SELECT category, COUNT(*) FROM quick_reply_templates GROUP BY category ORDER BY category"
        ).fetchall()

        print(f"\n{'=' * 60}")
        print("  导入完成！")
        print(f"  → 新增: {inserted} 条")
        print(f"  → 跳过(重复/内容过短): {skipped} 条")
        print(f"  → 数据库总计: {total} 条")
        print("\n  按分类统计:")
        for cat, cnt in by_cat:
            print(f"    {cat}: {cnt}")
        print(f"{'=' * 60}")

        # Show the first 5 rows by sort_order as a sample
        print("\n  【导入样例】（前 5 条）")
        samples = conn.execute(
            "SELECT category, title, substr(content, 1, 80) FROM quick_reply_templates ORDER BY sort_order LIMIT 5"
        ).fetchall()
        for cat, title, snippet in samples:
            print(f"  [{cat}] {title}")
            print(f"       {snippet}...")
            print()
    finally:
        # Release the database handle on both the success and the failure path.
        conn.close()


if __name__ == "__main__":
    main()
chore: initial baseline with P0-safety .gitignore 2026-06-14 16:49:18 +08:00			`"""`
			`从 IT支持知识库.docx 提取结构化内容，导入到 quick_reply_templates 表。`

			`映射规则：`
			`- Heading 1 → 文档一级分类（用于确定 category 字段）`
			`- Heading 2 → 文档二级子分类（合并到 title 前缀）`
			`- Heading 3 → 快速回复模板标题（title 字段）`
			`- Normal → 模板内容（content 字段，多段合并）`

			`Category 映射：`
			`办公电脑 → 硬件`
			`软件工具 → 软件`
			`办公设备 → 硬件`
			`办公网络 → 网络`
			`终端安全 → 安全`
			`资产管理 → 通用`
			`其他业务 → 通用`
			`"""`
			`import uuid`
			`import sqlite3`
			`from datetime import datetime, timezone`
			`from docx import Document`

			`# =========================================================================`
			`# 配置`
			`# =========================================================================`
			`DOCX_PATH = r"C:\Users\simon\Downloads\IT支持知识库2026-4-24.docx"`
			`DB_PATH = r"C:\Users\simon\wecom_it_smart_desk\backend\it_smart_desk.db"`

			`# Heading 1 → quick_reply category 映射`
			`CATEGORY_MAP = {`
			`"办公电脑": "硬件",`
			`"软件工具": "软件",`
			`"办公设备": "硬件",`
			`"办公网络": "网络",`
			`"终端安全": "安全", # 终端安全涉及账号/密码/安全策略，用"安全"`
			`"资产管理": "通用",`
			`"其他业务": "通用",`
			`}`

			`def extract_items(doc):`
			`"""从文档中提取所有 Heading 3 条目，包含完整的层级上下文。`

			`遍历流程：`
			`1. 记录当前的 Heading 1、Heading 2（建立层级上下文）`
			`2. 遇到 Heading 3 → 开始收集该条目下的所有 Normal 段落`
			`3. 遇到下个 Heading 3 或 Heading 2/Heading 1 → 条目结束，存储`

			`Returns:`
			`List[dict]: 每个条目含 h1/h2/h3/content 字段`
			`"""`
			`items = []`
			`current_h1 = None`
			`current_h2 = None`
			`current_item = None # 当前正在收集的条目 {h1, h2, h3, content_lines}`

			`for para in doc.paragraphs:`
			`text = para.text.strip()`
			`if not text:`
			`continue`
			`style = para.style.name if para.style else ""`

			`# Heading 1 → 更新一级分类，结束当前条目`
			`if style == "Heading 1":`
			`current_h1 = text`
			`if current_item and current_item["content_lines"]:`
			`items.append(finalize_item(current_item))`
			`current_item = None`
			`continue`

			`# Heading 2 → 更新二级分类，结束当前条目`
			`if style == "Heading 2":`
			`current_h2 = text`
			`if current_item and current_item["content_lines"]:`
			`items.append(finalize_item(current_item))`
			`current_item = None`
			`continue`

			`# Heading 3 → 新条目开始，保存上一个，创建新的`
			`if style == "Heading 3":`
			`if current_item and current_item["content_lines"]:`
			`items.append(finalize_item(current_item))`
			`current_item = {`
			`"h1": current_h1,`
			`"h2": current_h2,`
			`"h3": text,`
			`"content_lines": [],`
			`}`
			`continue`

			`# Normal / Normal (Web) 等 → 条目内容`
			`if current_item:`
			`current_item["content_lines"].append(text)`

			`# 最后一个条目`
			`if current_item and current_item["content_lines"]:`
			`items.append(finalize_item(current_item))`

			`return items`


			`def finalize_item(item):`
			`"""将 content_lines 合并为单个 content 字符串，并做格式化处理。"""`
			`# 合并内容，用换行分隔多段`
			`content = "\n".join(item["content_lines"])`
			`# 清理多余空白`
			`content = content.strip()`
			`item["content"] = content`
			`del item["content_lines"]`
			`return item`


			`def map_category(h1):`
			`"""将文档一级分类映射到快速回复的 category 字段。"""`
			`for key, cat in CATEGORY_MAP.items():`
			`if key in h1 if h1 else False:`
			`return cat`
			`return "通用"`


			`def to_title(item):`
			`"""生成模板标题：Heading 3 本身作为标题。`

			`如果 Heading 3 文字太长（>128 字符），截断。`
			`"""`
			`title = item["h3"]`
			`if len(title) > 128:`
			`title = title[:125] + "..."`
			`return title`


			`def check_existing(conn):`
			`"""检查是否已有数据，避免重复导入。"""`
			`count = conn.execute("SELECT COUNT(*) FROM quick_reply_templates").fetchone()[0]`
			`return count`


			`def import_items(conn, items):`
			`"""将条目批量插入 quick_reply_templates 表。"""`
			`now = datetime.now(timezone.utc).isoformat()`
			`inserted = 0`
			`skipped = 0`

			`for i, item in enumerate(items):`
			`category = map_category(item["h1"])`
			`title = to_title(item)`
			`content = item["content"]`

			`# 跳过内容过短的条目（可能是误抓的标题）`
			`if len(content) < 10:`
			`skipped += 1`
			`continue`

			`# 检查是否重复（同标题+同分类）`
			`existing = conn.execute(`
			`"SELECT id FROM quick_reply_templates WHERE title = ? AND category = ?",`
			`(title, category)`
			`).fetchone()`
			`if existing:`
			`skipped += 1`
			`continue`

			`template_id = str(uuid.uuid4())`
			`sort_order = i # 保持文档原始顺序`

			`conn.execute(`
			`"""INSERT INTO quick_reply_templates`
			`(id, category, title, content, variables, sort_order, created_at, updated_at)`
			`VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",`
			`(`
			`template_id,`
			`category,`
			`title,`
			`content,`
			`"[]", # variables: 空列表（JSON 字符串）`
			`sort_order,`
			`now,`
			`now,`
			`)`
			`)`
			`inserted += 1`

			`conn.commit()`
			`return inserted, skipped`


			`# =========================================================================`
			`# 主流程`
			`# =========================================================================`
			`if __name__ == "__main__":`
			`print("=" * 60)`
			`print(" IT 支持知识库 → 快速回复模板导入工具")`
			`print("=" * 60)`

			`# 1. 读取文档`
			`print(f"\n[1/4] 读取文档: {DOCX_PATH}")`
			`doc = Document(DOCX_PATH)`

			`# 2. 提取条目`
			`print("[2/4] 提取结构化条目...")`
			`items = extract_items(doc)`
			`print(f" → 共提取 {len(items)} 个 Heading 3 条目")`

			`# 统计分类分布`
			`cat_counts = {}`
			`for item in items:`
			`cat = map_category(item["h1"])`
			`cat_counts[cat] = cat_counts.get(cat, 0) + 1`
			`print(f" → 分类分布: {dict(sorted(cat_counts.items()))}")`

			`# 3. 连接数据库`
			`print(f"\n[3/4] 连接数据库: {DB_PATH}")`
			`conn = sqlite3.connect(DB_PATH)`
			`existing = check_existing(conn)`
			`if existing > 0:`
			`print(f" ⚠ 数据库中已有 {existing} 条记录，将跳过重复标题。")`

			`# 4. 批量导入`
			`print("[4/4] 导入数据...")`
			`inserted, skipped = import_items(conn, items)`

			`# 统计`
			`total = conn.execute("SELECT COUNT(*) FROM quick_reply_templates").fetchone()[0]`
			`by_cat = conn.execute(`
			`"SELECT category, COUNT(*) FROM quick_reply_templates GROUP BY category ORDER BY category"`
			`).fetchall()`

			`print(f"\n{'=' * 60}")`
			`print(f" 导入完成！")`
			`print(f" → 新增: {inserted} 条")`
			`print(f" → 跳过(重复/内容过短): {skipped} 条")`
			`print(f" → 数据库总计: {total} 条")`
			`print(f"\n 按分类统计:")`
			`for cat, cnt in by_cat:`
			`print(f" {cat}: {cnt}")`
			`print(f"{'=' * 60}")`

			`# 展示前 5 条样例`
			`print("\n 【导入样例】（前 5 条）")`
			`samples = conn.execute(`
			`"SELECT category, title, substr(content, 1, 80) FROM quick_reply_templates ORDER BY sort_order LIMIT 5"`
			`).fetchall()`
			`for cat, title, snippet in samples:`
			`print(f" [{cat}] {title}")`
			`print(f" {snippet}...")`
			`print()`

			`conn.close()`