328 lines
13 KiB
Python
328 lines
13 KiB
Python
|
|
# =============================================================================
|
||
|
|
# 企微IT智能服务台 — 内容审核服务
|
||
|
|
# =============================================================================
|
||
|
|
# 说明:#81 v0.6.0 内容审核 — 检测敏感词 + 提示坐席优化语气
|
||
|
|
# 用途:坐席发送消息前自动审核,避免发送违规内容
|
||
|
|
# 设计:基于 wordfilter 开源库 + 自定义敏感词库
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from enum import Enum
|
||
|
|
from typing import List, Optional, Tuple
|
||
|
|
|
||
|
|
from wordfilter import Wordfilter
|
||
|
|
|
||
|
|
from app.utils.logger import get_logger
|
||
|
|
|
||
|
|
logger = get_logger(__name__)
|
||
|
|
|
||
|
|
|
||
|
|
class ModerationAction(str, Enum):
    """Action the moderation step asks the caller to take for a message."""

    # Nothing objectionable was found; the message may be sent unchanged.
    PASS = "pass"
    # Sending is still allowed, but the message is flagged.
    WARN = "warn"
    # The message must be edited before it can be sent.
    BLOCK = "block"
|
||
|
|
|
||
|
|
|
||
|
|
class ModerationCategory(str, Enum):
    """Kind of problem a moderation hit is filed under."""

    # Swearing / abusive wording.
    PROFANITY = "profanity"
    # Politically sensitive content.
    POLITICS = "politics"
    # Pornographic content.
    PORN = "porn"
    # Advertising.
    AD = "ad"
    # Privacy leak (ID card number / phone number).
    PRIVACY = "privacy"
    # Anything that fits none of the categories above.
    OTHER = "other"
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class ModerationResult:
    """Outcome of moderating one piece of text.

    Fields:
        action: What the caller should do with the message.
        category: Category of the hit, or ``None`` when there is none.
        matched_words: The sensitive words/phrases that were found.
        suggestion: Rewrite hint for the agent; empty string by default.
    """

    action: ModerationAction
    category: Optional[ModerationCategory]
    matched_words: List[str]
    suggestion: str = ""

    @property
    def is_blocked(self) -> bool:
        """Whether ``action`` equals ``ModerationAction.BLOCK``."""
        blocked = self.action == ModerationAction.BLOCK
        return blocked

    @property
    def is_warned(self) -> bool:
        """Whether ``action`` equals ``ModerationAction.WARN``."""
        warned = self.action == ModerationAction.WARN
        return warned
|
||
|
|
|
||
|
|
|
||
|
|
class ContentModerationService:
    """Content moderation service: detect sensitive wording and suggest fixes.

    Design notes:
        1. Combines the ``wordfilter`` blacklist with a custom list of
           phrases that agents must not send.
        2. Three actions exist (pass / warn / block); ``moderate`` currently
           only ever returns pass or warn.
        3. The phrases that were hit are returned so the front end can show
           them to the agent.
        4. Every method here is synchronous. The original design note says
           moderation must not block the message-sending path, so callers
           presumably run it off that path -- TODO confirm.
    """

    def __init__(self):
        # Custom phrase list. This is a placeholder; production is expected
        # to load it from configuration (system_config).
        self.custom_sensitive_words: List[str] = [
            # Phrases an agent must never send.
            "投诉我",  # hints that the employee should file a complaint
            "你爱找谁找谁",  # brushing the employee off
            "自己不会百度吗",  # rude rhetorical question
            "这点小事",  # belittles the employee's problem
            # Privacy protection is detected on the back end only (see
            # check_privacy_leak); the front end is not told the rules.
        ]
        # New-style API: a Wordfilter() instance rather than a global init().
        self.wf = self._build_filter()

    def _build_filter(self) -> "Wordfilter":
        """Create a fresh ``Wordfilter`` that also knows the custom phrases.

        Custom phrases are lowercased before being registered because
        ``blacklisted`` is used here as a case-insensitive gate; lowercasing
        is a no-op for the Chinese phrases shipped by default.
        """
        wf = Wordfilter()
        extra = [word.lower() for word in self.custom_sensitive_words if word]
        if extra:
            wf.addWords(extra)
        return wf

    # ==================================================================
    # Main entry point
    # ==================================================================

    def moderate(self, text: str) -> "ModerationResult":
        """Moderate a message an agent is about to send.

        Args:
            text: The draft message. ``None``, empty and whitespace-only
                input is treated as clean.

        Returns:
            A ``ModerationResult``. The action is PASS when the filter does
            not flag the text, otherwise WARN together with the matched
            phrases and a rewrite suggestion. BLOCK is never returned yet.
        """
        if not text or not text.strip():
            return ModerationResult(
                action=ModerationAction.PASS,
                category=None,
                matched_words=[],
            )

        text = text.strip()

        # 1. Gate on wordfilter (built-in list + custom phrases).
        if not self.wf.blacklisted(text):
            return ModerationResult(
                action=ModerationAction.PASS,
                category=None,
                matched_words=[],
            )

        # The filter flagged the text, so this is a hit even if the concrete
        # phrase cannot be named. Previously a hit on wordfilter's built-in
        # list (no custom phrase involved) fell through to PASS.
        matched = self._extract_matched(text)

        # 2. Classify (currently a single bucket; can be refined later).
        category = self._classify(matched)

        # 3. Decide the action. Current policy: any hit is a WARN.
        # Open decision: whether some categories (politics / porn) should
        # be escalated straight to BLOCK.
        action = ModerationAction.WARN
        suggestion = self._generate_suggestion(category, matched)

        logger.info(
            f"[ContentModeration] 检测到敏感词 text={text[:30]}... "
            f"matched={matched} category={category}"
        )

        return ModerationResult(
            action=action,
            category=category,
            matched_words=matched,
            suggestion=suggestion,
        )

    # ==================================================================
    # Privacy detection (regex based, independent of the word lists)
    # ==================================================================

    def check_privacy_leak(self, text: str) -> List[str]:
        """Detect private data (phone / ID card / bank card / personal e-mail).

        Args:
            text: Text to scan. ``None`` or an empty string yields ``[]``.

        Returns:
            Descriptive labels of what was found, in the fixed order
            ``"phone"``, ``"id_card"``, ``"bank_card"``, ``"personal_email"``.
            An all-digit 18-character ID number also satisfies the simple
            16-19 digit bank-card rule and is reported under both labels.
        """
        import re

        if not text:
            return []

        # Python's ``\b`` is Unicode-aware and counts CJK characters as word
        # characters, so a number written directly next to Chinese text
        # never sits on a boundary and would be missed. Explicit lookarounds
        # keep the old rule (no digit, ASCII letter or underscore adjacent)
        # while allowing CJK neighbours.
        left = r"(?<![\dA-Za-z_])"
        right = r"(?![\dA-Za-z_])"
        numeric_rules = (
            # Mobile number: 11 digits starting with 1.
            ("phone", left + r"1[3-9]\d{9}" + right),
            # ID card number: 18 characters, the last may be X.
            ("id_card", left + r"\d{17}[\dXx]" + right),
            # Bank card: 16-19 consecutive digits (deliberately simple).
            ("bank_card", left + r"\d{16,19}" + right),
        )
        leaked = [
            label for label, pattern in numeric_rules if re.search(pattern, text)
        ]

        # E-mail: anything that is not exactly one of the company domains.
        # The domain is captured and compared as a whole (case-insensitive),
        # so "servyou-it.com.evil.org" is no longer exempted by a prefix
        # match.
        company_domains = {"servyou-it.com", "servyou.com.cn"}
        email_pattern = (
            r"(?<![a-zA-Z0-9._%+-])[a-zA-Z0-9._%+-]+@"
            r"([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?![a-zA-Z0-9])"
        )
        if any(
            found.group(1).lower() not in company_domains
            for found in re.finditer(email_pattern, text)
        ):
            leaked.append("personal_email")

        return leaked

    # ==================================================================
    # Helpers
    # ==================================================================

    def _extract_matched(self, text: str) -> List[str]:
        """Return the blacklist entries contained in ``text``.

        Matching is by case-insensitive substring, mirroring how the
        ``blacklisted`` gate is used. Custom phrases come first and keep
        their original spelling; entries of the filter's own list follow.
        Duplicates (compared case-insensitively) are reported once.
        """
        lowered = text.lower()
        # wordfilter has no "list all hits" call. Its entries are read from
        # the ``blacklist`` attribute when the instance exposes one;
        # otherwise only the custom phrases can be named.
        builtin = getattr(self.wf, "blacklist", None) or ()
        hits: List[str] = []
        seen = set()
        for word in (*self.custom_sensitive_words, *builtin):
            key = word.lower()
            if key and key not in seen and key in lowered:
                seen.add(key)
                hits.append(word)
        return hits

    def _classify(self, matched: List[str]) -> "ModerationCategory":
        """Map matched phrases to a category.

        Every hit is currently filed under PROFANITY regardless of
        ``matched``; finer-grained rules can be added later.
        """
        return ModerationCategory.PROFANITY

    def _generate_suggestion(
        self, category: "ModerationCategory", matched: List[str]
    ) -> str:
        """Return the rewrite hint for ``category``.

        ``matched`` is accepted but not used yet. Categories without an
        entry fall back to the generic hint.
        """
        suggestions_map = {
            ModerationCategory.PROFANITY: (
                "建议改为更专业的表达,例如:"
                "「我理解您的问题,我们一起想办法解决」"
            ),
            ModerationCategory.POLITICS: (
                "请避免讨论政治话题,保持服务专业性"
            ),
            ModerationCategory.PORN: "请使用正式语言",
            ModerationCategory.AD: "请勿发送广告内容",
            ModerationCategory.PRIVACY: (
                "请勿发送员工隐私信息(电话/身份证),如需联系请走企微"
            ),
            ModerationCategory.OTHER: "请检查并修改表达",
        }
        return suggestions_map.get(category, "请检查并修改表达")

    @staticmethod
    def _get_fallback_question(keywords: List[str]) -> dict:
        """Return one random question from a built-in pool of ten.

        Used as a fallback when Dify fails. The ten basic IT questions are
        hard-coded; production could query quiz_questions with
        source='manual' instead.

        NOTE(review): this looks unrelated to content moderation (possibly
        carried over from a quiz service), and ``keywords`` is not used --
        confirm whether it belongs here.
        """
        import random

        fallback_pool = [
            {
                "question": "电脑突然黑屏,最安全的做法是?",
                "options": ["强制关机重启", "拔电源重启", "等几分钟看是否恢复", "砸电脑"],
                "correct_index": 0,
                "hint": "想想最稳妥的第一步",
                "explanation": "黑屏可能是系统卡死,强制重启通常能恢复,拔电源可能损坏硬件",
                "source": "manual",
            },
            {
                "question": "打印机不响应,首先应该检查?",
                "options": ["打印机电源", "重装系统", "换台电脑", "直接呼叫维修"],
                "correct_index": 0,
                "hint": "最基础的物理连接",
                "explanation": "80% 故障是电源/线缆问题,先排除最简单的再考虑复杂方案",
                "source": "manual",
            },
            {
                "question": "密码忘了应该怎么办?",
                "options": ["自己猜", "暴力破解", "找 IT 重置", "不用了"],
                "correct_index": 2,
                "hint": "走正规流程最安全",
                "explanation": "找 IT 重置是最快最安全的做法,自己猜可能锁账号,暴力破解违法",
                "source": "manual",
            },
            {
                "question": "无法连接公司 VPN,首选排查?",
                "options": ["检查网络是否通", "重装系统", "换电脑", "联系运营商"],
                "correct_index": 0,
                "hint": "从外到内排查",
                "explanation": "先确认能上网,再排查 VPN 客户端,最后才是公司 VPN 服务器",
                "source": "manual",
            },
            {
                "question": "Outlook 收不到邮件,先看哪里?",
                "options": ["垃圾邮件箱", "重装 Office", "换邮箱", "打电话给 IT"],
                "correct_index": 0,
                "hint": "最容易被忽略的",
                "explanation": "新邮件被误判到垃圾箱是常见原因,先看再排查服务器",
                "source": "manual",
            },
            {
                "question": "Office 软件打开慢,先做什么?",
                "options": ["清理开机启动项", "换电脑", "买新硬盘", "卸载重装"],
                "correct_index": 0,
                "hint": "性能问题先减负",
                "explanation": "开机启动项太多会拖慢所有应用,清理后再观察",
                "source": "manual",
            },
            {
                "question": "电脑提示磁盘空间不足,应该?",
                "options": ["清理回收站和临时文件", "关机", "重装系统", "不处理"],
                "correct_index": 0,
                "hint": "先释放空间再判断",
                "explanation": "90% 的情况清理回收站 + temp 目录就能解决,严重才需要重装",
                "source": "manual",
            },
            {
                "question": "网页打不开,首先排查?",
                "options": ["检查网络连接", "换浏览器", "重装系统", "砸键盘"],
                "correct_index": 0,
                "hint": "从最基础的开始",
                "explanation": "先看能不能打开其他网页,排除是网站问题还是网络问题",
                "source": "manual",
            },
            {
                "question": "U 盘插入电脑没反应,先检查?",
                "options": ["换个 USB 接口", "格式化 U 盘", "扔了", "拆电脑"],
                "correct_index": 0,
                "hint": "先排除最简单的问题",
                "explanation": "USB 接口可能松动或供电不足,先换接口试,不要先动数据",
                "source": "manual",
            },
            {
                "question": "电脑突然变卡,第一步应该?",
                "options": ["看任务管理器占用", "砸电脑", "重装系统", "关机睡觉"],
                "correct_index": 0,
                "hint": "数据先行",
                "explanation": "任务管理器能看到 CPU/内存/磁盘占用,定位是哪个进程在吃资源",
                "source": "manual",
            },
        ]

        chosen = random.choice(fallback_pool)
        return chosen

    def add_custom_word(self, word: str) -> None:
        """Register a sensitive phrase at runtime (called by the admin console).

        Empty or whitespace-only input is ignored: an empty string is a
        substring of every message and would flag everything. A phrase that
        is already registered is not added to the filter a second time.
        """
        if not isinstance(word, str) or not word.strip():
            return
        if word in self.custom_sensitive_words:
            return
        self.custom_sensitive_words.append(word)
        self.wf.addWords([word.lower()])

    def remove_custom_word(self, word: str) -> None:
        """Remove a custom phrase at runtime; unknown phrases are a no-op.

        The filter is rebuilt from scratch using only the constructor and
        ``addWords``, so the phrase is really gone from the underlying
        blacklist instead of lingering there and still tripping
        ``blacklisted``.
        """
        if word not in self.custom_sensitive_words:
            return
        self.custom_sensitive_words.remove(word)
        self.wf = self._build_filter()
|
||
|
|
|
||
|
|
|
||
|
|
# Module-wide singleton, created lazily on first request.
_moderation_service: Optional[ContentModerationService] = None


def get_moderation_service() -> ContentModerationService:
    """Return the shared ``ContentModerationService``, creating it on first use."""
    global _moderation_service
    service = _moderation_service
    if service is None:
        service = ContentModerationService()
        _moderation_service = service
    return service
|