chore: initial baseline with P0-safety .gitignore

This commit is contained in:
Simon
2026-06-14 16:49:18 +08:00
commit 63262292d7
510 changed files with 146008 additions and 0 deletions
@@ -0,0 +1,35 @@
# =============================================================================
# RAGFlow 集成模块
# =============================================================================
from .client import RagflowClient
from .config import get_ragflow_client
from .exceptions import (
RagflowApiError,
RagflowAuthError,
RagflowConfigError,
RagflowConnectionError,
RagflowError,
)
from .models import (
DatasetInfo,
DocAggregate,
DocumentInfo,
RetrievalChunk,
RetrievalResult,
)
__all__ = [
"RagflowClient",
"get_ragflow_client",
"RagflowError",
"RagflowConfigError",
"RagflowAuthError",
"RagflowApiError",
"RagflowConnectionError",
"RetrievalChunk",
"DocAggregate",
"RetrievalResult",
"DatasetInfo",
"DocumentInfo",
]
+449
View File
@@ -0,0 +1,449 @@
# =============================================================================
# RAGFlow API 客户端
# =============================================================================
# 说明:封装 RAGFlow 知识检索引擎的 API 调用
# 核心功能:
# 1. 知识检索 — POST /api/v1/retrieval(核心接口)
# 2. 数据集管理 — 列出/创建/删除知识库
# 3. 文档管理 — 上传/列出/删除文档
# 4. 测试连接 — 验证 API Key 是否有效
# 认证方式:Authorization: Bearer <API_KEY>
# 参考文档:https://ragflow.io/docs/http_api_reference
# =============================================================================
import logging
from typing import Any, Dict, List, Optional
import httpx
from .exceptions import (
RagflowApiError,
RagflowAuthError,
RagflowConfigError,
RagflowConnectionError,
RagflowError,
)
from .models import (
DatasetInfo,
DocAggregate,
DocumentInfo,
RetrievalChunk,
RetrievalResult,
)
logger = logging.getLogger(__name__)
# 默认请求超时(秒)
DEFAULT_TIMEOUT = 30.0
# 默认分页大小
DEFAULT_PAGE_SIZE = 20
class RagflowClient:
"""RAGFlow API 客户端。
封装 RAGFlow 知识检索引擎的 API 调用,支持:
- 知识检索(核心功能)
- 数据集(知识库)管理
- 文档管理
- 连接测试
使用方式:
client = RagflowClient(
api_key="sk-xxx",
base_url="http://10.80.0.85:9380"
)
result = await client.retrieval("VPN怎么连?", dataset_ids=["xxx"])
"""
def __init__(
self,
api_key: str,
base_url: str = "http://10.80.0.85:9380",
timeout: float = DEFAULT_TIMEOUT,
):
"""初始化 RAGFlow 客户端。
Args:
api_key: RAGFlow API KeyBearer Token
base_url: RAGFlow API 基础地址(不含尾部斜杠)
timeout: 默认请求超时(秒)
Raises:
RagflowConfigError: API Key 为空
"""
if not api_key:
raise RagflowConfigError("RAGFlow API Key 不能为空")
self.api_key = api_key
self.base_url = base_url.rstrip("/")
self.timeout = timeout
def _headers(self) -> Dict[str, str]:
"""构建请求头。
Returns:
Dict: 包含 Authorization 和 Content-Type 的请求头
"""
return {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
async def _request(
self,
method: str,
path: str,
json_data: Optional[Dict] = None,
params: Optional[Dict] = None,
timeout: Optional[float] = None,
) -> Dict[str, Any]:
"""统一请求封装。
Args:
method: HTTP 方法(GET/POST/PUT/DELETE
path: API 路径(如 /api/v1/retrieval
json_data: JSON 请求体
params: 查询参数
timeout: 覆盖默认超时
Returns:
Dict: API 响应的 JSON 数据
Raises:
RagflowAuthError: 认证失败(401
RagflowApiError: API 返回错误
RagflowConnectionError: 网络连接失败
"""
url = f"{self.base_url}{path}"
req_timeout = timeout or self.timeout
try:
async with httpx.AsyncClient() as client:
response = await client.request(
method=method,
url=url,
headers=self._headers(),
json=json_data,
params=params,
timeout=req_timeout,
)
# 处理 HTTP 错误
if response.status_code == 401:
raise RagflowAuthError("RAGFlow API Key 无效或已过期")
if response.status_code >= 400:
try:
err_body = response.json()
err_msg = err_body.get("message", response.text)
except Exception:
err_msg = response.text
raise RagflowApiError(
code=response.status_code,
message=f"RAGFlow API 错误 ({response.status_code}): {err_msg}",
)
# 解析响应
result = response.json()
# RAGFlow 统一响应格式:{code: 0, data: ..., message: ...}
if result.get("code") != 0:
raise RagflowApiError(
code=result.get("code", -1),
message=result.get("message", "未知错误"),
)
return result
except httpx.TimeoutException:
raise RagflowConnectionError(f"RAGFlow 请求超时 ({req_timeout}s): {path}")
except httpx.ConnectError:
raise RagflowConnectionError(f"RAGFlow 连接失败: {self.base_url}")
except (RagflowAuthError, RagflowApiError, RagflowConnectionError):
raise
except Exception as e:
raise RagflowError(f"RAGFlow 请求异常: {str(e)}")
# ==========================================================================
# 测试连接
# ==========================================================================
async def test_connection(self) -> Dict[str, Any]:
"""测试 RAGFlow API 连接。
通过列出数据集(limit=1)验证 API Key 是否有效。
Returns:
Dict: {success: bool, message: str}
"""
try:
result = await self.list_datasets(page=1, page_size=1)
return {
"success": True,
"message": f"连接成功,共 {result.get('total', 0)} 个知识库",
}
except RagflowAuthError:
return {"success": False, "message": "API Key 无效或已过期"}
except RagflowConnectionError as e:
return {"success": False, "message": f"连接失败: {e.message}"}
except RagflowError as e:
return {"success": False, "message": e.message}
# ==========================================================================
# 知识检索(核心接口)
# ==========================================================================
async def retrieval(
self,
question: str,
dataset_ids: Optional[List[str]] = None,
document_ids: Optional[List[str]] = None,
similarity_threshold: float = 0.2,
vector_similarity_weight: float = 0.3,
top_k: int = 1024,
keyword: bool = False,
highlight: bool = False,
) -> RetrievalResult:
"""知识检索 — 从知识库中搜索相关文档片段。
这是 RAGFlow 的核心接口,用于根据用户问题检索最相关的文本块。
Args:
question: 用户查询问题
dataset_ids: 要搜索的数据集ID列表(与 document_ids 二选一)
document_ids: 要搜索的文档ID列表
similarity_threshold: 最小相似度阈值(0-1),默认 0.2
vector_similarity_weight: 向量相似度权重(0-1),默认 0.3
top_k: 参与计算的块数量,默认 1024
keyword: 是否启用关键词匹配,默认 False
highlight: 是否高亮匹配术语,默认 False
Returns:
RetrievalResult: 检索结果(含文本块、文档聚合、总数)
Raises:
RagflowError: 检索失败
"""
body: Dict[str, Any] = {
"question": question,
"similarity_threshold": similarity_threshold,
"vector_similarity_weight": vector_similarity_weight,
"top_k": top_k,
"keyword": keyword,
"highlight": highlight,
}
if dataset_ids:
body["dataset_ids"] = dataset_ids
if document_ids:
body["document_ids"] = document_ids
result = await self._request("POST", "/api/v1/retrieval", json_data=body)
data = result.get("data", {})
# 解析文本块
chunks = [
RetrievalChunk.model_validate(chunk)
for chunk in data.get("chunks", [])
]
# 解析文档聚合
doc_aggs = [
DocAggregate.model_validate(agg)
for agg in data.get("doc_aggs", [])
]
return RetrievalResult(
chunks=chunks,
doc_aggs=doc_aggs,
total=data.get("total", 0),
)
# ==========================================================================
# 数据集(知识库)管理
# ==========================================================================
async def list_datasets(
self,
page: int = 1,
page_size: int = DEFAULT_PAGE_SIZE,
) -> Dict[str, Any]:
"""列出所有数据集(知识库)。
Args:
page: 页码
page_size: 每页条数
Returns:
Dict: {items: List[DatasetInfo], total: int}
"""
result = await self._request(
"GET",
"/api/v1/datasets",
params={"page": page, "page_size": page_size},
)
data = result.get("data", {})
items = [
DatasetInfo.model_validate(ds)
for ds in data.get("datasets", [])
]
return {"items": items, "total": data.get("total", 0)}
async def create_dataset(
self,
name: str,
embedding_model: str = "BAAI/bge-m3@BAAI",
chunk_method: str = "naive",
permission: str = "me",
) -> DatasetInfo:
"""创建数据集(知识库)。
Args:
name: 数据集名称
embedding_model: 向量模型
chunk_method: 分块方法(naive/qa/book/laws 等)
permission: 权限(me/team
Returns:
DatasetInfo: 创建的数据集信息
"""
body = {
"name": name,
"embedding_model": embedding_model,
"chunk_method": chunk_method,
"permission": permission,
}
result = await self._request("POST", "/api/v1/datasets", json_data=body)
return DatasetInfo.model_validate(result.get("data", {}))
async def delete_dataset(self, dataset_ids: List[str]) -> bool:
"""删除数据集。
Args:
dataset_ids: 要删除的数据集ID列表
Returns:
bool: 是否成功
"""
await self._request(
"DELETE",
"/api/v1/datasets",
json_data={"ids": dataset_ids},
)
return True
# ==========================================================================
# 文档管理
# ==========================================================================
async def list_documents(
self,
dataset_id: str,
page: int = 1,
page_size: int = DEFAULT_PAGE_SIZE,
) -> Dict[str, Any]:
"""列出数据集中的文档。
Args:
dataset_id: 数据集ID
page: 页码
page_size: 每页条数
Returns:
Dict: {items: List[DocumentInfo], total: int}
"""
result = await self._request(
"GET",
f"/api/v1/datasets/{dataset_id}/documents",
params={"page": page, "page_size": page_size},
)
data = result.get("data", {})
items = [
DocumentInfo.model_validate(doc)
for doc in data.get("documents", [])
]
return {"items": items, "total": data.get("total", 0)}
async def upload_document(
self,
dataset_id: str,
file_path: str,
file_name: Optional[str] = None,
) -> DocumentInfo:
"""上传文档到数据集。
Args:
dataset_id: 数据集ID
file_path: 本地文件路径
file_name: 文件名(可选,默认取 file_path 的文件名)
Returns:
DocumentInfo: 上传的文档信息
"""
import os
if not os.path.exists(file_path):
raise RagflowError(f"文件不存在: {file_path}")
fname = file_name or os.path.basename(file_path)
url = f"{self.base_url}/api/v1/datasets/{dataset_id}/documents"
try:
async with httpx.AsyncClient() as client:
with open(file_path, "rb") as f:
response = await client.post(
url=url,
headers={"Authorization": f"Bearer {self.api_key}"},
files={"file": (fname, f)},
timeout=60.0,
)
if response.status_code == 401:
raise RagflowAuthError()
result = response.json()
if result.get("code") != 0:
raise RagflowApiError(
code=result.get("code", -1),
message=result.get("message", "上传失败"),
)
docs = result.get("data", {}).get("documents", [])
if docs:
return DocumentInfo.model_validate(docs[0])
return DocumentInfo(name=fname)
except (RagflowAuthError, RagflowApiError):
raise
except Exception as e:
raise RagflowError(f"文档上传失败: {str(e)}")
async def delete_documents(
self,
dataset_id: str,
document_ids: List[str],
) -> bool:
"""删除文档。
Args:
dataset_id: 数据集ID
document_ids: 要删除的文档ID列表
Returns:
bool: 是否成功
"""
await self._request(
"DELETE",
f"/api/v1/datasets/{dataset_id}/documents",
json_data={"ids": document_ids},
)
return True
@@ -0,0 +1,61 @@
# =============================================================================
# RAGFlow 配置加载器
# =============================================================================
# 说明:从数据库 system_configs 表加载 RAGFlow 配置,创建客户端实例
# 配置项:integration_ragflow_api_url + integration_ragflow_api_key
import logging
from typing import Optional
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.system_config import SystemConfig
from .client import RagflowClient
from .exceptions import RagflowConfigError
logger = logging.getLogger(__name__)
# 默认 RAGFlow API 地址(生产环境)
DEFAULT_RAGFLOW_BASE_URL = "http://10.80.0.85:9380"
async def _get_config(db: AsyncSession, key: str) -> str:
"""从数据库读取单个配置值。"""
result = await db.execute(
select(SystemConfig.config_value).where(SystemConfig.config_key == key)
)
row = result.scalar()
return row if row else ""
async def get_ragflow_client(db: AsyncSession) -> RagflowClient:
"""从数据库配置创建 RAGFlow 客户端实例。
读取 system_configs 表中的:
- integration_ragflow_api_url: RAGFlow API 地址
- integration_ragflow_api_key: RAGFlow API Key
Args:
db: 数据库会话
Returns:
RagflowClient: 客户端实例
Raises:
RagflowConfigError: 配置缺失
"""
api_url = await _get_config(db, "integration_ragflow_api_url")
api_key = await _get_config(db, "integration_ragflow_api_key")
# 如果数据库没有配置,使用默认地址
if not api_url:
api_url = DEFAULT_RAGFLOW_BASE_URL
if not api_key:
raise RagflowConfigError(
"RAGFlow API Key 未配置,请在管理后台 → 集成管理 → RAGFlow 中设置"
)
return RagflowClient(api_key=api_key, base_url=api_url)
@@ -0,0 +1,35 @@
# =============================================================================
# RAGFlow API 异常定义
# =============================================================================
class RagflowError(Exception):
"""RAGFlow 基础异常。"""
def __init__(self, message: str = "RAGFlow 错误"):
self.message = message
super().__init__(self.message)
class RagflowConfigError(RagflowError):
"""配置错误(缺少 API Key 或 Base URL)。"""
def __init__(self, message: str = "RAGFlow 配置缺失"):
super().__init__(message)
class RagflowAuthError(RagflowError):
"""认证失败(API Key 无效)。"""
def __init__(self, message: str = "RAGFlow 认证失败"):
super().__init__(message)
class RagflowApiError(RagflowError):
"""API 调用失败(非 200 响应)。"""
def __init__(self, code: int = 0, message: str = "RAGFlow API 错误"):
self.code = code
super().__init__(message)
class RagflowConnectionError(RagflowError):
"""网络连接失败。"""
def __init__(self, message: str = "RAGFlow 连接失败"):
super().__init__(message)
+110
View File
@@ -0,0 +1,110 @@
# =============================================================================
# RAGFlow API 数据模型
# =============================================================================
# 说明:定义 RAGFlow API 请求/响应的 Pydantic 数据模型
# 参考:https://ragflow.io/docs/http_api_reference
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field
class RetrievalChunk(BaseModel):
"""检索返回的单个文本块。
Attributes:
id: 块唯一ID
content: 块内容文本
document_id: 所属文档ID
document_keyword: 所属文档名称
similarity: 综合相似度分数
term_similarity: 关键词相似度
vector_similarity: 向量相似度
highlight: 高亮标记的内容(可选)
"""
id: str = Field(default="", description="块唯一ID")
content: str = Field(default="", description="块内容文本")
document_id: str = Field(default="", description="所属文档ID")
document_keyword: str = Field(default="", description="所属文档名称")
similarity: float = Field(default=0.0, description="综合相似度分数")
term_similarity: float = Field(default=0.0, description="关键词相似度")
vector_similarity: float = Field(default=0.0, description="向量相似度")
highlight: Optional[str] = Field(default=None, description="高亮标记的内容")
model_config = {"from_attributes": True}
class DocAggregate(BaseModel):
"""文档聚合统计。
Attributes:
doc_id: 文档ID
doc_name: 文档名称
count: 命中的块数量
"""
doc_id: str = Field(default="", description="文档ID")
doc_name: str = Field(default="", description="文档名称")
count: int = Field(default=0, description="命中块数量")
model_config = {"from_attributes": True}
class RetrievalResult(BaseModel):
"""检索结果。
Attributes:
chunks: 命中的文本块列表
doc_aggs: 按文档聚合统计
total: 命中总数
"""
chunks: List[RetrievalChunk] = Field(default_factory=list, description="命中文本块列表")
doc_aggs: List[DocAggregate] = Field(default_factory=list, description="文档聚合统计")
total: int = Field(default=0, description="命中总数")
model_config = {"from_attributes": True}
class DatasetInfo(BaseModel):
"""数据集(知识库)信息。
Attributes:
id: 数据集ID
name: 数据集名称
chunk_method: 分块方法
permission: 权限
document_count: 文档数量
embedding_model: 向量模型
create_time: 创建时间
update_time: 更新时间
"""
id: str = Field(default="", description="数据集ID")
name: str = Field(default="", description="数据集名称")
chunk_method: str = Field(default="naive", description="分块方法")
permission: str = Field(default="me", description="权限")
document_count: int = Field(default=0, description="文档数量")
embedding_model: str = Field(default="", description="向量模型")
create_time: Optional[str] = Field(default=None, description="创建时间")
update_time: Optional[str] = Field(default=None, description="更新时间")
model_config = {"from_attributes": True}
class DocumentInfo(BaseModel):
"""文档信息。
Attributes:
id: 文档ID
name: 文档名称
chunk_method: 分块方法
chunk_count: 块数量
create_time: 创建时间
update_time: 更新时间
"""
id: str = Field(default="", description="文档ID")
name: str = Field(default="", description="文档名称")
chunk_method: str = Field(default="naive", description="分块方法")
chunk_count: int = Field(default=0, description="块数量")
create_time: Optional[str] = Field(default=None, description="创建时间")
update_time: Optional[str] = Field(default=None, description="更新时间")
model_config = {"from_attributes": True}