chore: initial baseline with P0-safety .gitignore

2026-06-14 16:49:18 +08:00
commit 63262292d7
510 changed files with 146008 additions and 0 deletions
@@ -0,0 +1,35 @@
+# =============================================================================
+# RAGFlow 集成模块
+# =============================================================================
+
+from .client import RagflowClient
+from .config import get_ragflow_client
+from .exceptions import (
+    RagflowApiError,
+    RagflowAuthError,
+    RagflowConfigError,
+    RagflowConnectionError,
+    RagflowError,
+)
+from .models import (
+    DatasetInfo,
+    DocAggregate,
+    DocumentInfo,
+    RetrievalChunk,
+    RetrievalResult,
+)
+
+# Public API of the RAGFlow integration package: the client class, the
+# database-backed client factory, the exception hierarchy, and the
+# response models re-exported from the submodules imported above.
+__all__ = [
+    "RagflowClient",
+    "get_ragflow_client",
+    "RagflowError",
+    "RagflowConfigError",
+    "RagflowAuthError",
+    "RagflowApiError",
+    "RagflowConnectionError",
+    "RetrievalChunk",
+    "DocAggregate",
+    "RetrievalResult",
+    "DatasetInfo",
+    "DocumentInfo",
+]
@@ -0,0 +1,449 @@
+# =============================================================================
+# RAGFlow API 客户端
+# =============================================================================
+# 说明：封装 RAGFlow 知识检索引擎的 API 调用
+# 核心功能：
+#   1. 知识检索 — POST /api/v1/retrieval（核心接口）
+#   2. 数据集管理 — 列出/创建/删除知识库
+#   3. 文档管理 — 上传/列出/删除文档
+#   4. 测试连接 — 验证 API Key 是否有效
+# 认证方式：Authorization: Bearer <API_KEY>
+# 参考文档：https://ragflow.io/docs/http_api_reference
+# =============================================================================
+
+import logging
+from typing import Any, Dict, List, Optional
+
+import httpx
+
+from .exceptions import (
+    RagflowApiError,
+    RagflowAuthError,
+    RagflowConfigError,
+    RagflowConnectionError,
+    RagflowError,
+)
+from .models import (
+    DatasetInfo,
+    DocAggregate,
+    DocumentInfo,
+    RetrievalChunk,
+    RetrievalResult,
+)
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Default request timeout, in seconds.
+DEFAULT_TIMEOUT = 30.0
+
+# Default page size for paginated listings.
+DEFAULT_PAGE_SIZE = 20
+
+
+class RagflowClient:
+    """Asynchronous client for the RAGFlow HTTP API.
+
+    Wraps the RAGFlow knowledge-retrieval engine and supports:
+    - knowledge retrieval (the core feature)
+    - dataset (knowledge base) management
+    - document management
+    - connection testing
+
+    Every call authenticates with ``Authorization: Bearer <API_KEY>`` and
+    opens a short-lived ``httpx.AsyncClient`` for the single request.
+
+    Usage:
+        client = RagflowClient(
+            api_key="sk-xxx",
+            base_url="http://10.80.0.85:9380"
+        )
+        result = await client.retrieval("VPN怎么连？", dataset_ids=["xxx"])
+    """
+
+    def __init__(
+        self,
+        api_key: str,
+        base_url: str = "http://10.80.0.85:9380",
+        timeout: float = DEFAULT_TIMEOUT,
+    ):
+        """Initialise the client.
+
+        Args:
+            api_key: RAGFlow API key, sent as a Bearer token.
+            base_url: Base address of the RAGFlow API; trailing slashes are
+                stripped.
+            timeout: Default request timeout in seconds.
+
+        Raises:
+            RagflowConfigError: If ``api_key`` or ``base_url`` is empty.
+        """
+        if not api_key:
+            raise RagflowConfigError("RAGFlow API Key 不能为空")
+        # An empty base URL would only fail later with an opaque transport
+        # error, so reject it up front as a configuration problem.
+        if not base_url:
+            raise RagflowConfigError("RAGFlow Base URL 不能为空")
+
+        self.api_key = api_key
+        self.base_url = base_url.rstrip("/")
+        self.timeout = timeout
+
+    def _headers(self) -> Dict[str, str]:
+        """Build the headers for JSON requests.
+
+        Returns:
+            Dict: The ``Authorization`` and ``Content-Type`` headers.
+        """
+        return {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+    @staticmethod
+    def _parse_response(
+        response: httpx.Response,
+        default_message: str = "未知错误",
+    ) -> Dict[str, Any]:
+        """Validate an HTTP response and return its decoded JSON envelope.
+
+        Shared by the JSON request path and the multipart upload path so
+        both apply the same status-code and ``code`` checks.
+
+        Args:
+            response: The completed httpx response.
+            default_message: Message used when the envelope reports a
+                non-zero ``code`` without a ``message``.
+
+        Returns:
+            Dict: The decoded envelope ``{code: 0, data: ..., ...}``.
+
+        Raises:
+            RagflowAuthError: On HTTP 401.
+            RagflowApiError: On any other HTTP status >= 400, or when the
+                envelope's ``code`` is not 0.
+            RagflowError: When the body decodes to something other than a
+                JSON object.
+        """
+        status = response.status_code
+        if status == 401:
+            raise RagflowAuthError("RAGFlow API Key 无效或已过期")
+
+        if status >= 400:
+            try:
+                err_msg = response.json().get("message", response.text)
+            except Exception:
+                # Body is not JSON (or not an object): use the raw text.
+                err_msg = response.text
+            raise RagflowApiError(
+                code=status,
+                message=f"RAGFlow API 错误 ({status}): {err_msg}",
+            )
+
+        result = response.json()
+        if not isinstance(result, dict):
+            raise RagflowError("RAGFlow 响应格式异常: 期望 JSON 对象")
+
+        # RAGFlow wraps every reply as {code: 0, data: ..., message: ...}.
+        if result.get("code") != 0:
+            raise RagflowApiError(
+                code=result.get("code", -1),
+                message=result.get("message", default_message),
+            )
+        return result
+
+    @staticmethod
+    def _collect_listing(result: Dict[str, Any], item_keys: Any) -> Dict[str, Any]:
+        """Extract the raw item list and total count from a listing envelope.
+
+        Accepts both shapes of ``data``: a bare list of items, or an object
+        holding the list under one of ``item_keys``. Per the RAGFlow HTTP
+        API reference, dataset listings and uploads return a bare list and
+        document listings use ``data.docs`` -- NOTE(review): confirm against
+        the deployed RAGFlow version.
+
+        Args:
+            result: The decoded response envelope.
+            item_keys: Candidate keys, in priority order, under which an
+                object-shaped ``data`` may hold the item list.
+
+        Returns:
+            Dict: ``{"items": list of raw dicts, "total": int}``. ``total``
+            is taken from ``data.total``, then the envelope's
+            ``total_datasets`` or ``total``, and finally falls back to the
+            number of items in this page.
+        """
+        data = result.get("data")
+        total = None
+        if isinstance(data, list):
+            raw_items = data
+        elif isinstance(data, dict):
+            raw_items = next(
+                (data[key] for key in item_keys if isinstance(data.get(key), list)),
+                [],
+            )
+            total = data.get("total")
+        else:
+            raw_items = []
+
+        if total is None:
+            total = result.get("total_datasets")
+        if total is None:
+            total = result.get("total")
+        if total is None:
+            total = len(raw_items)
+        return {"items": raw_items, "total": total}
+
+    async def _request(
+        self,
+        method: str,
+        path: str,
+        json_data: Optional[Dict] = None,
+        params: Optional[Dict] = None,
+        timeout: Optional[float] = None,
+    ) -> Dict[str, Any]:
+        """Send one JSON request and return the validated envelope.
+
+        Args:
+            method: HTTP method (GET/POST/PUT/DELETE).
+            path: API path, e.g. ``/api/v1/retrieval``.
+            json_data: JSON request body.
+            params: Query-string parameters.
+            timeout: Overrides the client's default timeout when truthy.
+
+        Returns:
+            Dict: The decoded JSON envelope.
+
+        Raises:
+            RagflowAuthError: Authentication failed (HTTP 401).
+            RagflowApiError: The API reported an error.
+            RagflowConnectionError: The request timed out or the connection
+                could not be established.
+            RagflowError: Any other failure while sending or decoding.
+        """
+        url = f"{self.base_url}{path}"
+        req_timeout = timeout or self.timeout
+
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.request(
+                    method=method,
+                    url=url,
+                    headers=self._headers(),
+                    json=json_data,
+                    params=params,
+                    timeout=req_timeout,
+                )
+                return self._parse_response(response)
+        except httpx.TimeoutException as exc:
+            raise RagflowConnectionError(
+                f"RAGFlow 请求超时 ({req_timeout}s): {path}"
+            ) from exc
+        except httpx.ConnectError as exc:
+            raise RagflowConnectionError(
+                f"RAGFlow 连接失败: {self.base_url}"
+            ) from exc
+        except RagflowError:
+            # Already one of our own errors: pass through unchanged.
+            raise
+        except Exception as exc:
+            raise RagflowError(f"RAGFlow 请求异常: {str(exc)}") from exc
+
+    # ==========================================================================
+    # Connection test
+    # ==========================================================================
+
+    async def test_connection(self) -> Dict[str, Any]:
+        """Check that the RAGFlow API is reachable and the key is accepted.
+
+        Lists datasets with ``page_size=1`` as a cheap authenticated call.
+        Never raises: every failure is reported in the returned dict.
+
+        Returns:
+            Dict: ``{success: bool, message: str}``.
+        """
+        try:
+            result = await self.list_datasets(page=1, page_size=1)
+        except RagflowAuthError:
+            return {"success": False, "message": "API Key 无效或已过期"}
+        except RagflowConnectionError as e:
+            return {"success": False, "message": f"连接失败: {e.message}"}
+        except RagflowError as e:
+            return {"success": False, "message": e.message}
+        except Exception as e:
+            # E.g. a response that fails model validation; the contract of
+            # this method is to report a status, not to raise.
+            logger.exception("Unexpected error during RAGFlow connection test")
+            return {"success": False, "message": f"连接测试异常: {e}"}
+
+        return {
+            "success": True,
+            "message": f"连接成功，共 {result.get('total', 0)} 个知识库",
+        }
+
+    # ==========================================================================
+    # Knowledge retrieval (core endpoint)
+    # ==========================================================================
+
+    async def retrieval(
+        self,
+        question: str,
+        dataset_ids: Optional[List[str]] = None,
+        document_ids: Optional[List[str]] = None,
+        similarity_threshold: float = 0.2,
+        vector_similarity_weight: float = 0.3,
+        top_k: int = 1024,
+        keyword: bool = False,
+        highlight: bool = False,
+    ) -> RetrievalResult:
+        """Search the knowledge base for chunks relevant to a question.
+
+        Issues ``POST /api/v1/retrieval``.
+
+        Args:
+            question: The user's query.
+            dataset_ids: Dataset IDs to search; sent only when non-empty.
+            document_ids: Document IDs to search; sent only when non-empty.
+            similarity_threshold: Minimum similarity (0-1), default 0.2.
+            vector_similarity_weight: Weight of vector similarity (0-1),
+                default 0.3.
+            top_k: Number of chunks taking part in the computation,
+                default 1024.
+            keyword: Whether to enable keyword matching.
+            highlight: Whether to highlight matched terms.
+
+        Returns:
+            RetrievalResult: Matched chunks, per-document aggregates and
+            the total hit count.
+
+        Raises:
+            RagflowError: If the request fails.
+        """
+        body: Dict[str, Any] = {
+            "question": question,
+            "similarity_threshold": similarity_threshold,
+            "vector_similarity_weight": vector_similarity_weight,
+            "top_k": top_k,
+            "keyword": keyword,
+            "highlight": highlight,
+        }
+
+        if dataset_ids:
+            body["dataset_ids"] = dataset_ids
+        if document_ids:
+            body["document_ids"] = document_ids
+
+        result = await self._request("POST", "/api/v1/retrieval", json_data=body)
+
+        # ``data`` may be missing or null; treat both as "no hits".
+        data = result.get("data") or {}
+
+        chunks = [
+            RetrievalChunk.model_validate(chunk)
+            for chunk in data.get("chunks") or []
+        ]
+        doc_aggs = [
+            DocAggregate.model_validate(agg)
+            for agg in data.get("doc_aggs") or []
+        ]
+
+        return RetrievalResult(
+            chunks=chunks,
+            doc_aggs=doc_aggs,
+            total=data.get("total", 0),
+        )
+
+    # ==========================================================================
+    # Dataset (knowledge base) management
+    # ==========================================================================
+
+    async def list_datasets(
+        self,
+        page: int = 1,
+        page_size: int = DEFAULT_PAGE_SIZE,
+    ) -> Dict[str, Any]:
+        """List datasets (knowledge bases).
+
+        Issues ``GET /api/v1/datasets``. Accepts ``data`` either as a bare
+        list of datasets or as an object with a ``datasets`` list.
+
+        Args:
+            page: Page number.
+            page_size: Items per page.
+
+        Returns:
+            Dict: ``{items: List[DatasetInfo], total: int}``.
+        """
+        result = await self._request(
+            "GET",
+            "/api/v1/datasets",
+            params={"page": page, "page_size": page_size},
+        )
+
+        listing = self._collect_listing(result, ("datasets",))
+        items = [DatasetInfo.model_validate(ds) for ds in listing["items"]]
+        return {"items": items, "total": listing["total"]}
+
+    async def create_dataset(
+        self,
+        name: str,
+        embedding_model: str = "BAAI/bge-m3@BAAI",
+        chunk_method: str = "naive",
+        permission: str = "me",
+    ) -> DatasetInfo:
+        """Create a dataset (knowledge base).
+
+        Issues ``POST /api/v1/datasets``.
+
+        Args:
+            name: Dataset name.
+            embedding_model: Embedding model identifier.
+            chunk_method: Chunking method (naive/qa/book/laws, ...).
+            permission: Access permission (me/team).
+
+        Returns:
+            DatasetInfo: The created dataset.
+        """
+        body = {
+            "name": name,
+            "embedding_model": embedding_model,
+            "chunk_method": chunk_method,
+            "permission": permission,
+        }
+
+        result = await self._request("POST", "/api/v1/datasets", json_data=body)
+        return DatasetInfo.model_validate(result.get("data") or {})
+
+    async def delete_dataset(self, dataset_ids: List[str]) -> bool:
+        """Delete datasets.
+
+        Issues ``DELETE /api/v1/datasets`` with ``{"ids": dataset_ids}``.
+
+        Args:
+            dataset_ids: IDs of the datasets to delete.
+
+        Returns:
+            bool: ``True`` when the request succeeds (failures raise).
+        """
+        await self._request(
+            "DELETE",
+            "/api/v1/datasets",
+            json_data={"ids": dataset_ids},
+        )
+        return True
+
+    # ==========================================================================
+    # Document management
+    # ==========================================================================
+
+    async def list_documents(
+        self,
+        dataset_id: str,
+        page: int = 1,
+        page_size: int = DEFAULT_PAGE_SIZE,
+    ) -> Dict[str, Any]:
+        """List the documents of a dataset.
+
+        Issues ``GET /api/v1/datasets/{dataset_id}/documents``. The item
+        list is read from ``data.docs`` (the key used by the RAGFlow HTTP
+        API reference) with ``data.documents`` kept as a fallback.
+
+        Args:
+            dataset_id: Dataset ID.
+            page: Page number.
+            page_size: Items per page.
+
+        Returns:
+            Dict: ``{items: List[DocumentInfo], total: int}``.
+        """
+        result = await self._request(
+            "GET",
+            f"/api/v1/datasets/{dataset_id}/documents",
+            params={"page": page, "page_size": page_size},
+        )
+
+        listing = self._collect_listing(result, ("docs", "documents"))
+        items = [DocumentInfo.model_validate(doc) for doc in listing["items"]]
+        return {"items": items, "total": listing["total"]}
+
+    async def upload_document(
+        self,
+        dataset_id: str,
+        file_path: str,
+        file_name: Optional[str] = None,
+        timeout: float = 60.0,
+    ) -> DocumentInfo:
+        """Upload a local file into a dataset.
+
+        Issues a multipart ``POST /api/v1/datasets/{dataset_id}/documents``.
+        Only the ``Authorization`` header is set explicitly so that httpx
+        can generate the multipart ``Content-Type`` itself.
+
+        Args:
+            dataset_id: Dataset ID.
+            file_path: Path of the local file.
+            file_name: Name to upload under; defaults to the base name of
+                ``file_path``.
+            timeout: Request timeout in seconds, default 60.
+
+        Returns:
+            DocumentInfo: The first uploaded document reported by the API,
+            or a placeholder carrying only the file name when the response
+            lists none.
+
+        Raises:
+            RagflowAuthError: Authentication failed (HTTP 401).
+            RagflowApiError: The API reported an error.
+            RagflowConnectionError: The request timed out or the connection
+                could not be established.
+            RagflowError: The file does not exist, or any other failure.
+        """
+        import os
+
+        if not os.path.exists(file_path):
+            raise RagflowError(f"文件不存在: {file_path}")
+
+        fname = file_name or os.path.basename(file_path)
+        path = f"/api/v1/datasets/{dataset_id}/documents"
+        url = f"{self.base_url}{path}"
+
+        try:
+            async with httpx.AsyncClient() as client:
+                with open(file_path, "rb") as f:
+                    response = await client.post(
+                        url=url,
+                        headers={"Authorization": f"Bearer {self.api_key}"},
+                        files={"file": (fname, f)},
+                        timeout=timeout,
+                    )
+            result = self._parse_response(response, default_message="上传失败")
+        except httpx.TimeoutException as exc:
+            raise RagflowConnectionError(
+                f"RAGFlow 请求超时 ({timeout}s): {path}"
+            ) from exc
+        except httpx.ConnectError as exc:
+            raise RagflowConnectionError(
+                f"RAGFlow 连接失败: {self.base_url}"
+            ) from exc
+        except RagflowError:
+            raise
+        except Exception as exc:
+            raise RagflowError(f"文档上传失败: {str(exc)}") from exc
+
+        # ``data`` is a bare list of documents per the RAGFlow reference;
+        # an object with a ``documents``/``docs`` list is also accepted.
+        docs = self._collect_listing(result, ("documents", "docs"))["items"]
+        if docs:
+            return DocumentInfo.model_validate(docs[0])
+        return DocumentInfo(name=fname)
+
+    async def delete_documents(
+        self,
+        dataset_id: str,
+        document_ids: List[str],
+    ) -> bool:
+        """Delete documents from a dataset.
+
+        Issues ``DELETE /api/v1/datasets/{dataset_id}/documents`` with
+        ``{"ids": document_ids}``.
+
+        Args:
+            dataset_id: Dataset ID.
+            document_ids: IDs of the documents to delete.
+
+        Returns:
+            bool: ``True`` when the request succeeds (failures raise).
+        """
+        await self._request(
+            "DELETE",
+            f"/api/v1/datasets/{dataset_id}/documents",
+            json_data={"ids": document_ids},
+        )
+        return True
@@ -0,0 +1,61 @@
+# =============================================================================
+# RAGFlow 配置加载器
+# =============================================================================
+# 说明：从数据库 system_configs 表加载 RAGFlow 配置，创建客户端实例
+# 配置项：integration_ragflow_api_url + integration_ragflow_api_key
+
+import logging
+from typing import Optional
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.models.system_config import SystemConfig
+
+from .client import RagflowClient
+from .exceptions import RagflowConfigError
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Default RAGFlow API base URL (production environment); used when the
+# database holds no integration_ragflow_api_url value.
+DEFAULT_RAGFLOW_BASE_URL = "http://10.80.0.85:9380"
+
+
+async def _get_config(db: AsyncSession, key: str) -> str:
+    """Read one configuration value from the system_configs table.
+
+    Args:
+        db: Database session used to run the query.
+        key: The ``config_key`` to look up.
+
+    Returns:
+        str: The stored ``config_value``, or an empty string when the key
+        is absent or its value is falsy.
+    """
+    stmt = select(SystemConfig.config_value).where(SystemConfig.config_key == key)
+    value = (await db.execute(stmt)).scalar()
+    return value or ""
+
+
+async def get_ragflow_client(db: AsyncSession) -> RagflowClient:
+    """Build a RAGFlow client from the settings stored in the database.
+
+    Reads two keys from the system_configs table:
+    - integration_ragflow_api_url: RAGFlow API address
+    - integration_ragflow_api_key: RAGFlow API key
+
+    Args:
+        db: Database session.
+
+    Returns:
+        RagflowClient: A client configured with the stored key and the
+        stored address, or the default address when none is stored.
+
+    Raises:
+        RagflowConfigError: If no API key is configured.
+    """
+    stored_url = await _get_config(db, "integration_ragflow_api_url")
+    stored_key = await _get_config(db, "integration_ragflow_api_key")
+
+    # The key is mandatory; only the address has a fallback.
+    if not stored_key:
+        raise RagflowConfigError(
+            "RAGFlow API Key 未配置，请在管理后台 → 集成管理 → RAGFlow 中设置"
+        )
+
+    return RagflowClient(
+        api_key=stored_key,
+        base_url=stored_url or DEFAULT_RAGFLOW_BASE_URL,
+    )
@@ -0,0 +1,35 @@
+# =============================================================================
+# RAGFlow API 异常定义
+# =============================================================================
+
+
+class RagflowError(Exception):
+    """Base class for every RAGFlow integration error.
+
+    The message is stored on ``message`` and also passed to ``Exception``,
+    so ``str(err)`` and ``err.message`` agree.
+    """
+
+    def __init__(self, message: str = "RAGFlow 错误"):
+        super().__init__(message)
+        self.message = message
+
+
+class RagflowConfigError(RagflowError):
+    """Configuration error (missing API key or base URL)."""
+
+    def __init__(self, message: str = "RAGFlow 配置缺失"):
+        # Only the default message differs from the base class.
+        super().__init__(message=message)
+
+
+class RagflowAuthError(RagflowError):
+    """Authentication failure (the API key is invalid)."""
+
+    def __init__(self, message: str = "RAGFlow 认证失败"):
+        # Only the default message differs from the base class.
+        super().__init__(message=message)
+
+
+class RagflowApiError(RagflowError):
+    """An API call failed.
+
+    Attributes:
+        code: Error code supplied by the raiser (for example an HTTP
+            status or the ``code`` field of a response envelope).
+        message: Human-readable description, inherited from the base class.
+    """
+
+    def __init__(self, code: int = 0, message: str = "RAGFlow API 错误"):
+        super().__init__(message=message)
+        self.code = code
+
+
+class RagflowConnectionError(RagflowError):
+    """Network connection failure."""
+
+    def __init__(self, message: str = "RAGFlow 连接失败"):
+        # Only the default message differs from the base class.
+        super().__init__(message=message)
@@ -0,0 +1,110 @@
+# =============================================================================
+# RAGFlow API 数据模型
+# =============================================================================
+# 说明：定义 RAGFlow API 请求/响应的 Pydantic 数据模型
+# 参考：https://ragflow.io/docs/http_api_reference
+
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field
+
+
+class RetrievalChunk(BaseModel):
+    """A single text chunk returned by a retrieval query.
+
+    Attributes:
+        id: Unique chunk ID.
+        content: Text content of the chunk.
+        document_id: ID of the document the chunk belongs to.
+        document_keyword: Name of the document the chunk belongs to.
+        similarity: Combined similarity score.
+        term_similarity: Keyword (term) similarity.
+        vector_similarity: Vector similarity.
+        highlight: Content with highlight markup (optional).
+    """
+
+    model_config = dict(from_attributes=True)
+
+    id: str = Field("", description="块唯一ID")
+    content: str = Field("", description="块内容文本")
+    document_id: str = Field("", description="所属文档ID")
+    document_keyword: str = Field("", description="所属文档名称")
+    similarity: float = Field(0.0, description="综合相似度分数")
+    term_similarity: float = Field(0.0, description="关键词相似度")
+    vector_similarity: float = Field(0.0, description="向量相似度")
+    highlight: Optional[str] = Field(None, description="高亮标记的内容")
+
+
+class DocAggregate(BaseModel):
+    """Per-document aggregate of retrieval hits.
+
+    Attributes:
+        doc_id: Document ID.
+        doc_name: Document name.
+        count: Number of chunks that matched in this document.
+    """
+
+    model_config = dict(from_attributes=True)
+
+    doc_id: str = Field("", description="文档ID")
+    doc_name: str = Field("", description="文档名称")
+    count: int = Field(0, description="命中块数量")
+
+
+class RetrievalResult(BaseModel):
+    """Result of a retrieval query.
+
+    Attributes:
+        chunks: Matched text chunks.
+        doc_aggs: Hit statistics aggregated per document.
+        total: Total number of hits.
+    """
+
+    model_config = dict(from_attributes=True)
+
+    chunks: List[RetrievalChunk] = Field(
+        default_factory=list,
+        description="命中文本块列表",
+    )
+    doc_aggs: List[DocAggregate] = Field(
+        default_factory=list,
+        description="文档聚合统计",
+    )
+    total: int = Field(0, description="命中总数")
+
+
+class DatasetInfo(BaseModel):
+    """Dataset (knowledge base) information.
+
+    Attributes:
+        id: Dataset ID.
+        name: Dataset name.
+        chunk_method: Chunking method.
+        permission: Access permission.
+        document_count: Number of documents.
+        embedding_model: Embedding model.
+        create_time: Creation time, kept as a string.
+        update_time: Last update time, kept as a string.
+    """
+    id: str = Field(default="", description="数据集ID")
+    name: str = Field(default="", description="数据集名称")
+    chunk_method: str = Field(default="naive", description="分块方法")
+    permission: str = Field(default="me", description="权限")
+    document_count: int = Field(default=0, description="文档数量")
+    embedding_model: str = Field(default="", description="向量模型")
+    create_time: Optional[str] = Field(default=None, description="创建时间")
+    update_time: Optional[str] = Field(default=None, description="更新时间")
+
+    # The RAGFlow API reference shows create_time/update_time as integer
+    # epoch milliseconds. Pydantic v2 does not coerce numbers to str by
+    # default, so validation of a real response would fail without
+    # coerce_numbers_to_str. NOTE(review): confirm the installed pydantic
+    # version supports this config key.
+    model_config = {"from_attributes": True, "coerce_numbers_to_str": True}
+
+
+class DocumentInfo(BaseModel):
+    """Document information.
+
+    Attributes:
+        id: Document ID.
+        name: Document name.
+        chunk_method: Chunking method.
+        chunk_count: Number of chunks.
+        create_time: Creation time, kept as a string.
+        update_time: Last update time, kept as a string.
+    """
+    id: str = Field(default="", description="文档ID")
+    name: str = Field(default="", description="文档名称")
+    chunk_method: str = Field(default="naive", description="分块方法")
+    chunk_count: int = Field(default=0, description="块数量")
+    create_time: Optional[str] = Field(default=None, description="创建时间")
+    update_time: Optional[str] = Field(default=None, description="更新时间")
+
+    # The RAGFlow API reference shows create_time/update_time as integer
+    # epoch milliseconds. Pydantic v2 does not coerce numbers to str by
+    # default, so validation of a real response would fail without
+    # coerce_numbers_to_str. NOTE(review): confirm the installed pydantic
+    # version supports this config key.
+    model_config = {"from_attributes": True, "coerce_numbers_to_str": True}