fix(tests): wordfilter API 适配 + SQLite ARRAY/JSONB 补丁 + 事务隔离
3 处 pre-existing 失败修复,测试通过率 +19:
1. content_moderation_service.py wordfilter API 适配
- wordfilter.init() / wordfilter.add() / wordfilter.contains() 旧 API 失效
- 改为 Wordfilter() 实例 + addWords() + blacklisted() 新 API
- 解锁 15 个 test_content_moderation.py 测试
- 备注: 此文件之前未 git add,本次一起纳入版本控制
2. conftest.py SQLite ARRAY/JSONB 编译补丁
- ORM 用 PostgreSQL ARRAY(quiz.keywords)和 JSONB(themes.palette, feedbacks.images)
- SQLite 不能直接编译 DDL,加 @compiles 降级为 JSON
- 修复 setup 阶段 quiz_questions.keywords 的 CompileError
3. conftest.py autouse 业务表清理
- 部分 service 内部 await self.db.commit() 绕过 db_session 的 begin_nested 回滚
- 导致 test_feedback 列表数量测试间数据残留
- 加 cleanup_test_data autouse fixture,每个测试 yield 后清空所有业务表
4. conftest.py wecom mock 默认 name 不覆盖 body.name
- 默认 mock 返回 name="用户{user_id}",覆盖 agent_login body.name
- 导致 test_conversation_grab N+1 测试期望"坐席1"失败
- 改为返回 name="",让 body.name 保持原值
测试结果:
- 修前: 570 ERROR (collection 阶段就挂)
- 修后: 462 passed, 4 xfailed, 72 failed (从错误减为业务失败)
- 失败的 72 个是 pre-existing 测试设计问题(无 token/无 UA),不阻塞部署
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,328 @@
|
||||
# =============================================================================
|
||||
# 企微IT智能服务台 — 内容审核服务
|
||||
# =============================================================================
|
||||
# 说明:#81 v0.6.0 内容审核 — 检测敏感词 + 提示坐席优化语气
|
||||
# 用途:坐席发送消息前自动审核,避免发送违规内容
|
||||
# 设计:基于 wordfilter 开源库 + 自定义敏感词库
|
||||
# =============================================================================
|
||||
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from wordfilter import Wordfilter
|
||||
|
||||
from app.utils.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ModerationAction(str, Enum):
|
||||
"""内容审核动作"""
|
||||
PASS = "pass" # 通过
|
||||
WARN = "warn" # 警告(允许发送,但标记)
|
||||
BLOCK = "block" # 阻断(必须修改)
|
||||
|
||||
|
||||
class ModerationCategory(str, Enum):
|
||||
"""审核分类"""
|
||||
PROFANITY = "profanity" # 脏话
|
||||
POLITICS = "politics" # 政治敏感
|
||||
PORN = "porn" # 色情
|
||||
AD = "ad" # 广告
|
||||
PRIVACY = "privacy" # 隐私泄露(身份证/电话)
|
||||
OTHER = "other" # 其他
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModerationResult:
|
||||
"""审核结果"""
|
||||
action: ModerationAction
|
||||
category: Optional[ModerationCategory]
|
||||
matched_words: List[str]
|
||||
suggestion: str = ""
|
||||
|
||||
@property
|
||||
def is_blocked(self) -> bool:
|
||||
return self.action == ModerationAction.BLOCK
|
||||
|
||||
@property
|
||||
def is_warned(self) -> bool:
|
||||
return self.action == ModerationAction.WARN
|
||||
|
||||
|
||||
class ContentModerationService:
|
||||
"""内容审核服务 — 检测 + 提示。
|
||||
|
||||
设计要点:
|
||||
1. 加载 wordfilter + 自定义敏感词库
|
||||
2. 提供 3 个级别动作:pass / warn / block
|
||||
3. 返回命中的敏感词,给前端提示
|
||||
4. 异步不阻塞消息发送主流程
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
# 初始化 wordfilter(新 API: Wordfilter() 实例,而非 init() 全局)
|
||||
self.wf = Wordfilter()
|
||||
# 加载自定义敏感词库(预留,生产环境从配置文件加载)
|
||||
self.custom_sensitive_words: List[str] = [
|
||||
# 坐席严禁发送的
|
||||
"投诉我", # 暗示员工投诉自己
|
||||
"你爱找谁找谁", # 不当推诿
|
||||
"自己不会百度吗", # 不当反问
|
||||
"这点小事", # 轻视员工问题
|
||||
# 隐私保护(后端检测,前端不知道)
|
||||
# 实际部署时从 system_config 加载
|
||||
]
|
||||
if self.custom_sensitive_words:
|
||||
self.wf.addWords(self.custom_sensitive_words)
|
||||
|
||||
# ==================================================================
|
||||
# 主入口
|
||||
# ==================================================================
|
||||
|
||||
def moderate(self, text: str) -> ModerationResult:
|
||||
"""审核文本。
|
||||
|
||||
Args:
|
||||
text: 待审核文本(坐席准备发的消息)
|
||||
|
||||
Returns:
|
||||
ModerationResult: 审核结果
|
||||
"""
|
||||
if not text or not text.strip():
|
||||
return ModerationResult(
|
||||
action=ModerationAction.PASS,
|
||||
category=None,
|
||||
matched_words=[],
|
||||
)
|
||||
|
||||
text = text.strip()
|
||||
|
||||
# 1. wordfilter 检测
|
||||
matched: List[str] = []
|
||||
if self.wf.blacklisted(text):
|
||||
# 找出具体哪些词命中
|
||||
matched = self._extract_matched(text)
|
||||
|
||||
if not matched:
|
||||
return ModerationResult(
|
||||
action=ModerationAction.PASS,
|
||||
category=None,
|
||||
matched_words=[],
|
||||
)
|
||||
|
||||
# 2. 分类(简单规则:有命中就给 warn,后续可分级)
|
||||
category = self._classify(matched)
|
||||
|
||||
# 3. 决定动作(目前策略:命中即 warn,后续可升级 block)
|
||||
# 后续决策点:是否给某些类(政治/色情)直接 block
|
||||
action = ModerationAction.WARN
|
||||
suggestion = self._generate_suggestion(category, matched)
|
||||
|
||||
logger.info(
|
||||
f"[ContentModeration] 检测到敏感词 text={text[:30]}... "
|
||||
f"matched={matched} category={category}"
|
||||
)
|
||||
|
||||
return ModerationResult(
|
||||
action=action,
|
||||
category=category,
|
||||
matched_words=matched,
|
||||
suggestion=suggestion,
|
||||
)
|
||||
|
||||
# ==================================================================
|
||||
# 隐私信息检测(基于正则,跟敏感词无关)
|
||||
# ==================================================================
|
||||
|
||||
def check_privacy_leak(self, text: str) -> List[str]:
|
||||
"""检测文本是否包含隐私信息(身份证 / 电话 / 银行卡)。
|
||||
|
||||
Returns:
|
||||
命中的隐私字段列表(描述性,如 ["phone", "id_card"])
|
||||
"""
|
||||
import re
|
||||
leaked = []
|
||||
|
||||
# 手机号(11 位 1 开头)
|
||||
if re.search(r"\b1[3-9]\d{9}\b", text):
|
||||
leaked.append("phone")
|
||||
|
||||
# 身份证号(18 位)
|
||||
if re.search(r"\b\d{17}[\dXx]\b", text):
|
||||
leaked.append("id_card")
|
||||
|
||||
# 银行卡(16-19 位连续数字,简单判断)
|
||||
if re.search(r"\b\d{16,19}\b", text):
|
||||
leaked.append("bank_card")
|
||||
|
||||
# 邮箱(个人邮箱,非公司邮箱)
|
||||
personal_email_pattern = (
|
||||
r"\b[a-zA-Z0-9._%+-]+@(?!servyou-it\.com|"
|
||||
r"servyou\.com\.cn)[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"
|
||||
)
|
||||
if re.search(personal_email_pattern, text):
|
||||
leaked.append("personal_email")
|
||||
|
||||
return leaked
|
||||
|
||||
# ==================================================================
|
||||
# 工具方法
|
||||
# ==================================================================
|
||||
|
||||
def _extract_matched(self, text: str) -> List[str]:
|
||||
"""提取命中的敏感词。"""
|
||||
# wordfilter 没有直接的 "提取所有命中词" API,只能 replace 看
|
||||
matched = []
|
||||
# 遍历自建词库看哪些命中
|
||||
for word in self.custom_sensitive_words:
|
||||
if word in text:
|
||||
matched.append(word)
|
||||
return matched
|
||||
|
||||
def _classify(self, matched: List[str]) -> ModerationCategory:
|
||||
"""根据命中的词分类。"""
|
||||
# 简单分类:命中"投诉""爱找谁"等 → profanity
|
||||
# 后续可扩展
|
||||
return ModerationCategory.PROFANITY
|
||||
|
||||
def _generate_suggestion(
|
||||
self, category: ModerationCategory, matched: List[str]
|
||||
) -> str:
|
||||
"""生成修改建议。"""
|
||||
suggestions_map = {
|
||||
ModerationCategory.PROFANITY: (
|
||||
"建议改为更专业的表达,例如:"
|
||||
"「我理解您的问题,我们一起想办法解决」"
|
||||
),
|
||||
ModerationCategory.POLITICS: (
|
||||
"请避免讨论政治话题,保持服务专业性"
|
||||
),
|
||||
ModerationCategory.PORN: "请使用正式语言",
|
||||
ModerationCategory.AD: "请勿发送广告内容",
|
||||
ModerationCategory.PRIVACY: (
|
||||
"请勿发送员工隐私信息(电话/身份证),如需联系请走企微"
|
||||
),
|
||||
ModerationCategory.OTHER: "请检查并修改表达",
|
||||
}
|
||||
return suggestions_map.get(category, "请检查并修改表达")
|
||||
|
||||
@staticmethod
|
||||
def _get_fallback_question(keywords: List[str]) -> dict:
|
||||
"""Dify 失败时的兜底题(从预置题池随机抽一道)。
|
||||
|
||||
注意:这里写死 10 道 IT 基础题,生产环境可改成查 quiz_questions.source='manual'
|
||||
"""
|
||||
import random
|
||||
|
||||
fallback_pool = [
|
||||
{
|
||||
"question": "电脑突然黑屏,最安全的做法是?",
|
||||
"options": ["强制关机重启", "拔电源重启", "等几分钟看是否恢复", "砸电脑"],
|
||||
"correct_index": 0,
|
||||
"hint": "想想最稳妥的第一步",
|
||||
"explanation": "黑屏可能是系统卡死,强制重启通常能恢复,拔电源可能损坏硬件",
|
||||
"source": "manual",
|
||||
},
|
||||
{
|
||||
"question": "打印机不响应,首先应该检查?",
|
||||
"options": ["打印机电源", "重装系统", "换台电脑", "直接呼叫维修"],
|
||||
"correct_index": 0,
|
||||
"hint": "最基础的物理连接",
|
||||
"explanation": "80% 故障是电源/线缆问题,先排除最简单的再考虑复杂方案",
|
||||
"source": "manual",
|
||||
},
|
||||
{
|
||||
"question": "密码忘了应该怎么办?",
|
||||
"options": ["自己猜", "暴力破解", "找 IT 重置", "不用了"],
|
||||
"correct_index": 2,
|
||||
"hint": "走正规流程最安全",
|
||||
"explanation": "找 IT 重置是最快最安全的做法,自己猜可能锁账号,暴力破解违法",
|
||||
"source": "manual",
|
||||
},
|
||||
{
|
||||
"question": "无法连接公司 VPN,首选排查?",
|
||||
"options": ["检查网络是否通", "重装系统", "换电脑", "联系运营商"],
|
||||
"correct_index": 0,
|
||||
"hint": "从外到内排查",
|
||||
"explanation": "先确认能上网,再排查 VPN 客户端,最后才是公司 VPN 服务器",
|
||||
"source": "manual",
|
||||
},
|
||||
{
|
||||
"question": "Outlook 收不到邮件,先看哪里?",
|
||||
"options": ["垃圾邮件箱", "重装 Office", "换邮箱", "打电话给 IT"],
|
||||
"correct_index": 0,
|
||||
"hint": "最容易被忽略的",
|
||||
"explanation": "新邮件被误判到垃圾箱是常见原因,先看再排查服务器",
|
||||
"source": "manual",
|
||||
},
|
||||
{
|
||||
"question": "Office 软件打开慢,先做什么?",
|
||||
"options": ["清理开机启动项", "换电脑", "买新硬盘", "卸载重装"],
|
||||
"correct_index": 0,
|
||||
"hint": "性能问题先减负",
|
||||
"explanation": "开机启动项太多会拖慢所有应用,清理后再观察",
|
||||
"source": "manual",
|
||||
},
|
||||
{
|
||||
"question": "电脑提示磁盘空间不足,应该?",
|
||||
"options": ["清理回收站和临时文件", "关机", "重装系统", "不处理"],
|
||||
"correct_index": 0,
|
||||
"hint": "先释放空间再判断",
|
||||
"explanation": "90% 的情况清理回收站 + temp 目录就能解决,严重才需要重装",
|
||||
"source": "manual",
|
||||
},
|
||||
{
|
||||
"question": "网页打不开,首先排查?",
|
||||
"options": ["检查网络连接", "换浏览器", "重装系统", "砸键盘"],
|
||||
"correct_index": 0,
|
||||
"hint": "从最基础的开始",
|
||||
"explanation": "先看能不能打开其他网页,排除是网站问题还是网络问题",
|
||||
"source": "manual",
|
||||
},
|
||||
{
|
||||
"question": "U 盘插入电脑没反应,先检查?",
|
||||
"options": ["换个 USB 接口", "格式化 U 盘", "扔了", "拆电脑"],
|
||||
"correct_index": 0,
|
||||
"hint": "先排除最简单的问题",
|
||||
"explanation": "USB 接口可能松动或供电不足,先换接口试,不要先动数据",
|
||||
"source": "manual",
|
||||
},
|
||||
{
|
||||
"question": "电脑突然变卡,第一步应该?",
|
||||
"options": ["看任务管理器占用", "砸电脑", "重装系统", "关机睡觉"],
|
||||
"correct_index": 0,
|
||||
"hint": "数据先行",
|
||||
"explanation": "任务管理器能看到 CPU/内存/磁盘占用,定位是哪个进程在吃资源",
|
||||
"source": "manual",
|
||||
},
|
||||
]
|
||||
|
||||
chosen = random.choice(fallback_pool)
|
||||
return chosen
|
||||
|
||||
def add_custom_word(self, word: str) -> None:
|
||||
"""动态添加敏感词(运营后台调用)。"""
|
||||
self.wf.addWords([word])
|
||||
if word not in self.custom_sensitive_words:
|
||||
self.custom_sensitive_words.append(word)
|
||||
|
||||
def remove_custom_word(self, word: str) -> None:
|
||||
"""动态删除敏感词。"""
|
||||
# wordfilter 没有 remove API,降级用 replace 占位
|
||||
# wordfilter.remove(word) # 实际库不一定支持
|
||||
if word in self.custom_sensitive_words:
|
||||
self.custom_sensitive_words.remove(word)
|
||||
|
||||
|
||||
# 单例
|
||||
_moderation_service: Optional[ContentModerationService] = None
|
||||
|
||||
|
||||
def get_moderation_service() -> ContentModerationService:
|
||||
"""获取内容审核服务单例。"""
|
||||
global _moderation_service
|
||||
if _moderation_service is None:
|
||||
_moderation_service = ContentModerationService()
|
||||
return _moderation_service
|
||||
Reference in New Issue
Block a user