# =============================================================================
# WeCom IT Smart Service Desk — Content Moderation Service
# =============================================================================
# Note:    #81 v0.6.0 content moderation — detect sensitive words and prompt
#          agents to improve their tone
# Purpose: review an agent's message automatically before it is sent, so that
#          non-compliant content does not go out
# Design:  the open-source `wordfilter` library plus a custom sensitive-word
#          list
# =============================================================================

import random
import re
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Tuple

from wordfilter import Wordfilter

from app.utils.logger import get_logger

logger = get_logger(__name__)

# ---------------------------------------------------------------------------
# Privacy patterns
# ---------------------------------------------------------------------------
# re.ASCII matters here: in Python's default (Unicode) mode CJK characters
# count as \w, so \b never fires between a Chinese character and a digit or
# Latin letter, and "电话13812345678" would go undetected.  With re.ASCII the
# patterns behave exactly as before on pure-ASCII text and additionally match
# values embedded directly in Chinese text.
_PHONE_RE = re.compile(r"\b1[3-9]\d{9}\b", re.ASCII)
_ID_CARD_RE = re.compile(r"\b\d{17}[\dXx]\b", re.ASCII)
_BANK_CARD_RE = re.compile(r"\b\d{16,19}\b", re.ASCII)
# The negative lookahead skips company addresses.  It is anchored so that only
# the exact company domains are exempt: "servyou-it.com.evil.net" and
# "servyou-it.community" are still reported, while a trailing sentence period
# after a company address is tolerated.  IGNORECASE makes the domain
# comparison case-insensitive.
_PERSONAL_EMAIL_RE = re.compile(
    r"\b[a-zA-Z0-9._%+-]+@"
    r"(?!(?:servyou-it\.com|servyou\.com\.cn)(?![a-zA-Z0-9-]|\.[a-zA-Z0-9]))"
    r"[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b",
    re.ASCII | re.IGNORECASE,
)

# (label, pattern) pairs, in the order labels are reported.
_PRIVACY_PATTERNS = (
    ("phone", _PHONE_RE),
    ("id_card", _ID_CARD_RE),
    ("bank_card", _BANK_CARD_RE),
    ("personal_email", _PERSONAL_EMAIL_RE),
)


class ModerationAction(str, Enum):
    """Action the moderation step asks the caller to take."""

    PASS = "pass"  # message is fine
    WARN = "warn"  # sending is allowed, but the message is flagged
    BLOCK = "block"  # message must be edited before it can be sent


class ModerationCategory(str, Enum):
    """Category assigned to a moderation hit."""

    PROFANITY = "profanity"  # abusive / rude language
    POLITICS = "politics"  # politically sensitive content
    PORN = "porn"  # sexual content
    AD = "ad"  # advertising
    PRIVACY = "privacy"  # privacy leak (ID card / phone number)
    OTHER = "other"  # anything else


@dataclass
class ModerationResult:
    """Outcome of moderating one message."""

    action: ModerationAction
    category: Optional[ModerationCategory]
    matched_words: List[str]
    suggestion: str = ""

    @property
    def is_blocked(self) -> bool:
        """Return True when the action is BLOCK."""
        return self.action == ModerationAction.BLOCK

    @property
    def is_warned(self) -> bool:
        """Return True when the action is WARN."""
        return self.action == ModerationAction.WARN


class ContentModerationService:
    """Content moderation service: detect sensitive wording and suggest fixes.

    Design points:
      1. Loads ``wordfilter`` and registers a custom sensitive-word list.
      2. Defines three actions: pass / warn / block (only pass and warn are
         produced by the current policy).
      3. Returns the matched words so the front end can show a hint.
      4. Is meant not to block the main message-sending flow.
         NOTE(review): every method here is synchronous, so any off-loading
         has to be done by the caller — confirm.
    """

    # Suggestion shown to the agent for each category.
    _SUGGESTIONS = {
        ModerationCategory.PROFANITY: (
            "建议改为更专业的表达,例如:"
            "「我理解您的问题,我们一起想办法解决」"
        ),
        ModerationCategory.POLITICS: "请避免讨论政治话题,保持服务专业性",
        ModerationCategory.PORN: "请使用正式语言",
        ModerationCategory.AD: "请勿发送广告内容",
        ModerationCategory.PRIVACY: (
            "请勿发送员工隐私信息(电话/身份证),如需联系请走企微"
        ),
        ModerationCategory.OTHER: "请检查并修改表达",
    }
    _DEFAULT_SUGGESTION = "请检查并修改表达"

    def __init__(self):
        # Instance-based wordfilter API (Wordfilter()), not a global init().
        self.wf = Wordfilter()
        # Built-in custom list; a placeholder until production loads it from
        # configuration (system_config).
        self.custom_sensitive_words: List[str] = [
            # Phrases agents must never send
            "投诉我",  # daring the employee to file a complaint
            "你爱找谁找谁",  # brushing the employee off
            "自己不会百度吗",  # condescending rhetorical question
            "这点小事",  # belittling the employee's problem
        ]
        if self.custom_sensitive_words:
            # Registered lower-cased so they line up with the case-insensitive
            # comparison in _extract_matched().
            # NOTE(review): assumes Wordfilter.blacklisted() lower-cases the
            # text before matching — confirm against the installed version.
            self.wf.addWords(
                [word.lower() for word in self.custom_sensitive_words]
            )

    # ------------------------------------------------------------------
    # Main entry point
    # ------------------------------------------------------------------
    def moderate(self, text: str) -> ModerationResult:
        """Moderate a message an agent is about to send.

        Args:
            text: The message text to review.

        Returns:
            A ModerationResult.  Empty/blank text, text the wordfilter does
            not flag, and flagged text in which no custom word can be
            identified all yield PASS.  Otherwise the result is WARN with the
            matched custom words, a category and a suggestion.
        """
        if not text or not text.strip():
            return self._pass_result()

        text = text.strip()

        # 1. wordfilter acts as the gate; the concrete words are then looked
        #    up in the custom list.
        #    NOTE(review): a hit that comes only from wordfilter's built-in
        #    list therefore still ends up as PASS, because no custom word is
        #    found for it.  Kept as-is to avoid new false-positive warnings.
        matched: List[str] = []
        if self.wf.blacklisted(text):
            matched = self._extract_matched(text)

        if not matched:
            return self._pass_result()

        # 2. Classify the hit.
        category = self._classify(matched)

        # 3. Current policy: any hit is a WARN.  Escalating some categories
        #    (e.g. politics / porn) to BLOCK is a future decision point.
        action = ModerationAction.WARN
        suggestion = self._generate_suggestion(category, matched)

        logger.info(
            f"[ContentModeration] 检测到敏感词 text={text[:30]}... "
            f"matched={matched} category={category}"
        )

        return ModerationResult(
            action=action,
            category=category,
            matched_words=matched,
            suggestion=suggestion,
        )

    # ------------------------------------------------------------------
    # Privacy detection (regex based, independent of the word list)
    # ------------------------------------------------------------------
    def check_privacy_leak(self, text: str) -> List[str]:
        """Detect private data (phone / ID card / bank card / personal email).

        Args:
            text: The text to scan.  ``None`` or an empty string yields ``[]``.

        Returns:
            Labels of the kinds of data found, in the fixed order
            ``"phone"``, ``"id_card"``, ``"bank_card"``, ``"personal_email"``.
            An all-digit 18-character ID number also satisfies the 16-19 digit
            bank-card pattern, so both labels are returned for it.
        """
        if not text:
            return []
        return [
            label
            for label, pattern in _PRIVACY_PATTERNS
            if pattern.search(text)
        ]

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _pass_result() -> ModerationResult:
        """Build the result returned when nothing was detected."""
        return ModerationResult(
            action=ModerationAction.PASS,
            category=None,
            matched_words=[],
        )

    def _extract_matched(self, text: str) -> List[str]:
        """Return the custom sensitive words that occur in ``text``.

        Only ``custom_sensitive_words`` is scanned.  The comparison is
        case-insensitive; empty entries are skipped because an empty string is
        a substring of every text.  Words are returned as stored, in list
        order.
        """
        lowered = text.lower()
        return [
            word
            for word in self.custom_sensitive_words
            if word and word.lower() in lowered
        ]

    def _classify(self, matched: List[str]) -> ModerationCategory:
        """Classify the matched words.

        Placeholder: every hit is currently classified as PROFANITY and
        ``matched`` is not inspected.
        """
        return ModerationCategory.PROFANITY

    def _generate_suggestion(
        self, category: ModerationCategory, matched: List[str]
    ) -> str:
        """Return the rewrite suggestion for ``category``.

        ``matched`` is accepted for interface stability but not used.  Unknown
        categories fall back to a generic suggestion.
        """
        return self._SUGGESTIONS.get(category, self._DEFAULT_SUGGESTION)

    @staticmethod
    def _get_fallback_question(keywords: List[str]) -> dict:
        """Return a random fallback quiz question for when Dify fails.

        The pool is ten hard-coded basic IT questions; production could query
        quiz_questions with source='manual' instead.  ``keywords`` is
        currently unused.  A freshly built dict is returned on every call, so
        callers may mutate it.
        """
        fallback_pool = [
            {
                "question": "电脑突然黑屏,最安全的做法是?",
                "options": ["强制关机重启", "拔电源重启", "等几分钟看是否恢复", "砸电脑"],
                "correct_index": 0,
                "hint": "想想最稳妥的第一步",
                "explanation": "黑屏可能是系统卡死,强制重启通常能恢复,拔电源可能损坏硬件",
                "source": "manual",
            },
            {
                "question": "打印机不响应,首先应该检查?",
                "options": ["打印机电源", "重装系统", "换台电脑", "直接呼叫维修"],
                "correct_index": 0,
                "hint": "最基础的物理连接",
                "explanation": "80% 故障是电源/线缆问题,先排除最简单的再考虑复杂方案",
                "source": "manual",
            },
            {
                "question": "密码忘了应该怎么办?",
                "options": ["自己猜", "暴力破解", "找 IT 重置", "不用了"],
                "correct_index": 2,
                "hint": "走正规流程最安全",
                "explanation": "找 IT 重置是最快最安全的做法,自己猜可能锁账号,暴力破解违法",
                "source": "manual",
            },
            {
                "question": "无法连接公司 VPN,首选排查?",
                "options": ["检查网络是否通", "重装系统", "换电脑", "联系运营商"],
                "correct_index": 0,
                "hint": "从外到内排查",
                "explanation": "先确认能上网,再排查 VPN 客户端,最后才是公司 VPN 服务器",
                "source": "manual",
            },
            {
                "question": "Outlook 收不到邮件,先看哪里?",
                "options": ["垃圾邮件箱", "重装 Office", "换邮箱", "打电话给 IT"],
                "correct_index": 0,
                "hint": "最容易被忽略的",
                "explanation": "新邮件被误判到垃圾箱是常见原因,先看再排查服务器",
                "source": "manual",
            },
            {
                "question": "Office 软件打开慢,先做什么?",
                "options": ["清理开机启动项", "换电脑", "买新硬盘", "卸载重装"],
                "correct_index": 0,
                "hint": "性能问题先减负",
                "explanation": "开机启动项太多会拖慢所有应用,清理后再观察",
                "source": "manual",
            },
            {
                "question": "电脑提示磁盘空间不足,应该?",
                "options": ["清理回收站和临时文件", "关机", "重装系统", "不处理"],
                "correct_index": 0,
                "hint": "先释放空间再判断",
                "explanation": "90% 的情况清理回收站 + temp 目录就能解决,严重才需要重装",
                "source": "manual",
            },
            {
                "question": "网页打不开,首先排查?",
                "options": ["检查网络连接", "换浏览器", "重装系统", "砸键盘"],
                "correct_index": 0,
                "hint": "从最基础的开始",
                "explanation": "先看能不能打开其他网页,排除是网站问题还是网络问题",
                "source": "manual",
            },
            {
                "question": "U 盘插入电脑没反应,先检查?",
                "options": ["换个 USB 接口", "格式化 U 盘", "扔了", "拆电脑"],
                "correct_index": 0,
                "hint": "先排除最简单的问题",
                "explanation": "USB 接口可能松动或供电不足,先换接口试,不要先动数据",
                "source": "manual",
            },
            {
                "question": "电脑突然变卡,第一步应该?",
                "options": ["看任务管理器占用", "砸电脑", "重装系统", "关机睡觉"],
                "correct_index": 0,
                "hint": "数据先行",
                "explanation": "任务管理器能看到 CPU/内存/磁盘占用,定位是哪个进程在吃资源",
                "source": "manual",
            },
        ]
        return random.choice(fallback_pool)

    def add_custom_word(self, word: str) -> None:
        """Add a sensitive word at runtime (called by the operations console).

        Surrounding whitespace is stripped.  Empty or whitespace-only input is
        ignored with a warning, because an empty word would match every
        message.  A word already in the list is not registered again.
        """
        normalized = word.strip() if word else ""
        if not normalized:
            logger.warning("[ContentModeration] ignored empty sensitive word")
            return
        if normalized in self.custom_sensitive_words:
            return
        self.wf.addWords([normalized.lower()])
        self.custom_sensitive_words.append(normalized)

    def remove_custom_word(self, word: str) -> None:
        """Remove a sensitive word at runtime; unknown words are ignored.

        The word is removed only from ``custom_sensitive_words``; it is left
        in the wordfilter instance.  That is sufficient because moderate()
        reports only words found in ``custom_sensitive_words``.
        """
        if not word:
            return
        # Try the exact string first, then its stripped form (the form
        # add_custom_word() stores).
        for candidate in (word, word.strip()):
            if candidate in self.custom_sensitive_words:
                self.custom_sensitive_words.remove(candidate)
                return


# Module-level singleton
_moderation_service: Optional[ContentModerationService] = None


def get_moderation_service() -> ContentModerationService:
    """Return the shared ContentModerationService, creating it on first use."""
    global _moderation_service
    if _moderation_service is None:
        _moderation_service = ContentModerationService()
    return _moderation_service