Files
wecom_it_smart_desk/backend/app/services/content_moderation_service.py
T

328 lines
13 KiB
Python
Raw Normal View History

# =============================================================================
# 企微IT智能服务台 — 内容审核服务
# =============================================================================
# 说明:#81 v0.6.0 内容审核 — 检测敏感词 + 提示坐席优化语气
# 用途:坐席发送消息前自动审核,避免发送违规内容
# 设计:基于 wordfilter 开源库 + 自定义敏感词库
# =============================================================================
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Tuple
from wordfilter import Wordfilter
from app.utils.logger import get_logger
# Module-level logger named after this module, obtained from the project's
# get_logger helper; ContentModerationService.moderate logs hits through it.
logger = get_logger(__name__)
class ModerationAction(str, Enum):
    """Decision the moderation step attaches to an outgoing agent message.

    Members are also plain strings, so they compare equal to their values.
    """

    # Nothing sensitive was found; the message goes out unchanged.
    PASS = "pass"
    # Sensitive wording was found; sending is still allowed but flagged.
    WARN = "warn"
    # The message must be edited before it may be sent.
    BLOCK = "block"
class ModerationCategory(str, Enum):
    """Kind of problem a moderation hit is filed under.

    Members are also plain strings, so they compare equal to their values.
    """

    # Rude or abusive wording.
    PROFANITY = "profanity"
    # Politically sensitive content.
    POLITICS = "politics"
    # Pornographic content.
    PORN = "porn"
    # Advertising.
    AD = "ad"
    # Leaked personal data (ID card number / phone number).
    PRIVACY = "privacy"
    # Anything that fits none of the categories above.
    OTHER = "other"
@dataclass
class ModerationResult:
    """Outcome of moderating one piece of text.

    Attributes:
        action: What should happen to the message (pass / warn / block).
        category: Category of the hit, or None when nothing was matched.
        matched_words: The sensitive words found in the text.
        suggestion: Optional rewrite hint for the agent; empty by default.
    """

    action: ModerationAction
    category: Optional[ModerationCategory]
    matched_words: List[str]
    suggestion: str = ""

    @property
    def is_blocked(self) -> bool:
        """True when the message must not be sent as written."""
        return self.action == ModerationAction.BLOCK

    @property
    def is_warned(self) -> bool:
        """True when the message may be sent but has been flagged."""
        return self.action == ModerationAction.WARN
class ContentModerationService:
    """Content moderation for agent messages: detect sensitive wording and hint.

    Design notes:
    1. Combines a ``Wordfilter`` instance with a custom sensitive-word list.
    2. Results carry one of three actions: pass / warn / block (only pass and
       warn are produced at the moment).
    3. The matched words are returned so the front end can show them.
    4. All methods here are synchronous; keeping moderation off the message
       send path is left to the caller.
    """

    def __init__(self):
        """Create the word filter and register the built-in custom phrases."""
        # New-style wordfilter API: a Wordfilter() instance, not a global init().
        self.wf = Wordfilter()
        # Custom sensitive phrases. Hard-coded placeholder for now; a real
        # deployment is expected to load these from system_config.
        self.custom_sensitive_words: List[str] = [
            # Phrases an agent must never send.
            "投诉我",  # hints that the employee should file a complaint
            "你爱找谁找谁",  # brushing the employee off
            "自己不会百度吗",  # dismissive rhetorical question
            "这点小事",  # belittles the employee's problem
            # Privacy protection is detected server-side (see
            # check_privacy_leak), not through this list.
        ]
        if self.custom_sensitive_words:
            self.wf.addWords(self.custom_sensitive_words)

    # ==================================================================
    # Main entry point
    # ==================================================================
    def moderate(self, text: str) -> "ModerationResult":
        """Moderate a message an agent is about to send.

        Args:
            text: The message text to check.

        Returns:
            A ModerationResult. Empty or whitespace-only text, and text with
            no matched word, yields PASS; any match currently yields WARN
            together with the matched words and a rewrite suggestion.
        """
        if not text or not text.strip():
            return ModerationResult(
                action=ModerationAction.PASS,
                category=None,
                matched_words=[],
            )
        text = text.strip()

        # 1. Look for sensitive words. The scan covers the custom list AND the
        #    filter's own word list, so a hit on the built-in list is no
        #    longer silently turned into PASS.
        matched = self._extract_matched(text)
        if not matched:
            return ModerationResult(
                action=ModerationAction.PASS,
                category=None,
                matched_words=[],
            )

        # 2. Classify (single category for now, see _classify).
        category = self._classify(matched)

        # 3. Decide the action. Current policy: every hit is a WARN. Open
        #    decision: whether some categories (politics / porn) should BLOCK.
        action = ModerationAction.WARN
        suggestion = self._generate_suggestion(category, matched)
        logger.info(
            f"[ContentModeration] 检测到敏感词 text={text[:30]}... "
            f"matched={matched} category={category}"
        )
        return ModerationResult(
            action=action,
            category=category,
            matched_words=matched,
            suggestion=suggestion,
        )

    # ==================================================================
    # Privacy detection (regex based, independent of the word lists)
    # ==================================================================
    def check_privacy_leak(self, text: str) -> List[str]:
        """Detect personal data in ``text``.

        Args:
            text: The text to scan.

        Returns:
            Descriptive labels of what was found, in the fixed order
            "phone", "id_card", "bank_card", "personal_email". Empty when
            nothing is found or ``text`` is empty.
        """
        import re

        if not text:
            return []

        # ``\b`` is a Unicode word boundary for str patterns, and CJK
        # characters count as word characters, so a number written directly
        # next to Chinese text never matched. These lookarounds only forbid an
        # adjacent digit / ASCII letter / underscore, which gives the same
        # answers as ``\b`` for ASCII text and also works beside CJK text.
        left = r"(?<![\dA-Za-z_])"
        right = r"(?![\dA-Za-z_])"
        detectors = (
            # Mobile number: 11 digits starting with 1, second digit 3-9.
            ("phone", left + r"1[3-9]\d{9}" + right),
            # ID card number: 17 digits followed by a digit or X.
            ("id_card", left + r"\d{17}[\dXx]" + right),
            # Bank card: 16-19 consecutive digits (rough check; an all-digit
            # ID card number is reported here as well).
            ("bank_card", left + r"\d{16,19}" + right),
            # E-mail address whose domain does not start with one of the
            # company domains.
            (
                "personal_email",
                left
                + r"[a-zA-Z0-9._%+-]+@(?!servyou-it\.com|servyou\.com\.cn)"
                + r"[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
                + right,
            ),
        )
        return [label for label, pattern in detectors if re.search(pattern, text)]

    # ==================================================================
    # Helpers
    # ==================================================================
    def _vocabulary(self) -> List[str]:
        """Return the custom words followed by the filter's own word list."""
        words = list(self.custom_sensitive_words)
        # NOTE(review): relies on Wordfilter keeping its words in a
        # ``blacklist`` attribute - confirm against the installed version.
        # When the attribute is missing only the custom list is scanned.
        builtin = getattr(self.wf, "blacklist", None)
        if isinstance(builtin, (list, tuple, set, frozenset)):
            words.extend(builtin)
        return words

    def _extract_matched(self, text: str) -> List[str]:
        """Return every known sensitive word contained in ``text``.

        Custom words come first, then words from the filter's list; each word
        is reported once. A word matches when it occurs in the text as written
        or in its lower-cased form (the filter itself tests lower-cased text).
        """
        lowered = text.lower()
        matched: List[str] = []
        seen = set()
        for word in self._vocabulary():
            # Skip junk entries: an empty string is "in" every text.
            if not isinstance(word, str) or not word or word in seen:
                continue
            seen.add(word)
            if word in text or word in lowered:
                matched.append(word)
        return matched

    def _classify(self, matched: List[str]) -> "ModerationCategory":
        """Map matched words to a category.

        Every hit is currently filed as PROFANITY; ``matched`` is accepted so
        finer-grained rules can be added later.
        """
        return ModerationCategory.PROFANITY

    def _generate_suggestion(
        self, category: "ModerationCategory", matched: List[str]
    ) -> str:
        """Return the rewrite hint for ``category``.

        ``matched`` is not used yet. Unknown categories get the generic hint.
        """
        suggestions_map = {
            ModerationCategory.PROFANITY: (
                "建议改为更专业的表达,例如:"
                "「我理解您的问题,我们一起想办法解决」"
            ),
            ModerationCategory.POLITICS: (
                "请避免讨论政治话题,保持服务专业性"
            ),
            ModerationCategory.PORN: "请使用正式语言",
            ModerationCategory.AD: "请勿发送广告内容",
            ModerationCategory.PRIVACY: (
                "请勿发送员工隐私信息(电话/身份证),如需联系请走企微"
            ),
            ModerationCategory.OTHER: "请检查并修改表达",
        }
        return suggestions_map.get(category, "请检查并修改表达")

    @staticmethod
    def _get_fallback_question(keywords: List[str]) -> dict:
        """Return a random question from a fixed pool of ten IT basics.

        Meant as the fallback when Dify fails. The pool is hard-coded; a
        production setup could query quiz_questions.source='manual' instead.

        NOTE(review): ``keywords`` is not used, and this helper has nothing to
        do with content moderation - it looks like it belongs to a quiz
        service; confirm before relying on it here.

        Returns:
            A dict with the keys question, options, correct_index, hint,
            explanation and source.
        """
        import random

        fallback_pool = [
            {
                "question": "电脑突然黑屏,最安全的做法是?",
                "options": ["强制关机重启", "拔电源重启", "等几分钟看是否恢复", "砸电脑"],
                "correct_index": 0,
                "hint": "想想最稳妥的第一步",
                "explanation": "黑屏可能是系统卡死,强制重启通常能恢复,拔电源可能损坏硬件",
                "source": "manual",
            },
            {
                "question": "打印机不响应,首先应该检查?",
                "options": ["打印机电源", "重装系统", "换台电脑", "直接呼叫维修"],
                "correct_index": 0,
                "hint": "最基础的物理连接",
                "explanation": "80% 故障是电源/线缆问题,先排除最简单的再考虑复杂方案",
                "source": "manual",
            },
            {
                "question": "密码忘了应该怎么办?",
                "options": ["自己猜", "暴力破解", "找 IT 重置", "不用了"],
                "correct_index": 2,
                "hint": "走正规流程最安全",
                "explanation": "找 IT 重置是最快最安全的做法,自己猜可能锁账号,暴力破解违法",
                "source": "manual",
            },
            {
                "question": "无法连接公司 VPN,首选排查?",
                "options": ["检查网络是否通", "重装系统", "换电脑", "联系运营商"],
                "correct_index": 0,
                "hint": "从外到内排查",
                "explanation": "先确认能上网,再排查 VPN 客户端,最后才是公司 VPN 服务器",
                "source": "manual",
            },
            {
                "question": "Outlook 收不到邮件,先看哪里?",
                "options": ["垃圾邮件箱", "重装 Office", "换邮箱", "打电话给 IT"],
                "correct_index": 0,
                "hint": "最容易被忽略的",
                "explanation": "新邮件被误判到垃圾箱是常见原因,先看再排查服务器",
                "source": "manual",
            },
            {
                "question": "Office 软件打开慢,先做什么?",
                "options": ["清理开机启动项", "换电脑", "买新硬盘", "卸载重装"],
                "correct_index": 0,
                "hint": "性能问题先减负",
                "explanation": "开机启动项太多会拖慢所有应用,清理后再观察",
                "source": "manual",
            },
            {
                "question": "电脑提示磁盘空间不足,应该?",
                "options": ["清理回收站和临时文件", "关机", "重装系统", "不处理"],
                "correct_index": 0,
                "hint": "先释放空间再判断",
                "explanation": "90% 的情况清理回收站 + temp 目录就能解决,严重才需要重装",
                "source": "manual",
            },
            {
                "question": "网页打不开,首先排查?",
                "options": ["检查网络连接", "换浏览器", "重装系统", "砸键盘"],
                "correct_index": 0,
                "hint": "从最基础的开始",
                "explanation": "先看能不能打开其他网页,排除是网站问题还是网络问题",
                "source": "manual",
            },
            {
                "question": "U 盘插入电脑没反应,先检查?",
                "options": ["换个 USB 接口", "格式化 U 盘", "扔了", "拆电脑"],
                "correct_index": 0,
                "hint": "先排除最简单的问题",
                "explanation": "USB 接口可能松动或供电不足,先换接口试,不要先动数据",
                "source": "manual",
            },
            {
                "question": "电脑突然变卡,第一步应该?",
                "options": ["看任务管理器占用", "砸电脑", "重装系统", "关机睡觉"],
                "correct_index": 0,
                "hint": "数据先行",
                "explanation": "任务管理器能看到 CPU/内存/磁盘占用,定位是哪个进程在吃资源",
                "source": "manual",
            },
        ]
        return random.choice(fallback_pool)

    def add_custom_word(self, word: str) -> None:
        """Add a sensitive word at runtime (called from the admin back end).

        Blank words are ignored because an empty word would match every
        message. A word already known is not registered a second time.
        """
        if not word or not word.strip():
            return
        if word not in self.custom_sensitive_words:
            self.custom_sensitive_words.append(word)
        # Only push to the filter when it does not hold the word yet, so
        # repeated calls do not pile up duplicates in its list.
        known = getattr(self.wf, "blacklist", None)
        if known is None or word not in known:
            self.wf.addWords([word])

    def remove_custom_word(self, word: str) -> None:
        """Remove a sensitive word at runtime; unknown words are ignored.

        The word is dropped from the custom list and from the filter's own
        list, so it stops being reported by moderate().
        """
        if word in self.custom_sensitive_words:
            self.custom_sensitive_words.remove(word)
        # NOTE(review): edits the filter's ``blacklist`` list in place because
        # a removal method is not known to exist in every wordfilter version.
        known = getattr(self.wf, "blacklist", None)
        if isinstance(known, list) and word in known:
            known[:] = [entry for entry in known if entry != word]
# Module-level singleton instance, created lazily on first request.
_moderation_service: Optional[ContentModerationService] = None


def get_moderation_service() -> ContentModerationService:
    """Return the shared ContentModerationService, creating it on first use."""
    global _moderation_service
    if _moderation_service is not None:
        return _moderation_service
    _moderation_service = ContentModerationService()
    return _moderation_service