|
一、什么是敏感词过滤?
敏感词过滤是一种处理网络内容的技术,可以检测和过滤出网络中的敏感/违禁词汇。它通过给定的关键字或字符串,判断网络内容是否包含某些敏感信息,从而防止违反法律法规的信息流通。
通常,可以使用两种方法来过滤敏感词:
- 黑名单过滤:即定义一个黑名单,将所有敏感词记录在其中,然后对输入的文本进行对比,如果发现有敏感词,就将其过滤掉。
- 白名单过滤:即定义一个白名单,将所有不敏感的词汇记录在其中,然后对输入的文本进行对比,如果发现有不在白名单中的词汇,就将其过滤掉。
二、ToolGood.Words是什么?
ToolGood.Words是一款高性能非法词(敏感词)检测组件,附带繁体简体互换,支持全角半角互换,获取拼音首字母,获取拼音字母,拼音模糊搜索等功能。
ToolGood.Words的源码网站:https://github.com/toolgood/ToolGood.Words
三、在Visual Studio中安装ToolGood.Words
3.1、右键项目解决方案,选择“管理NuGet程序包”,如下图所示:
3.2、切换到“浏览”选项卡,搜索“ToolGood.Words”并安装:
安装完之后最好重新编译生成项目
四、创建“subContentCheck”类
敏感/违禁词汇因特殊内容不便上传,可自行在网站上查找。
- using Microsoft.AspNetCore.DataProtection.KeyManagement;
- using Microsoft.AspNetCore.Http;
- using Microsoft.CodeAnalysis.Text;
- using Newtonsoft.Json;
- using System.Collections;
- using System.Text;
- using ToolGood.Words;
- using static System.Net.Mime.MediaTypeNames;
- using IHostingEnvironment = Microsoft.AspNetCore.Hosting.IHostingEnvironment;
- namespace WebApplication1 //放在自己项目中时,需要更换为自己的命名空间
- {
/// <summary>
/// Serializable container for a list of illegal keywords.
/// </summary>
public class keywords
{
    /// <summary>
    /// The illegal keywords. Not initialized by this class, so it is
    /// <c>null</c> until a caller (or a deserializer) assigns it.
    /// </summary>
    public List<string> IllegalKeywords { get; set; }
}
/// <summary>
/// Serializable container for a list of illegal URLs.
/// </summary>
public class urlwords
{
    /// <summary>
    /// The illegal URLs. Not initialized by this class, so it is
    /// <c>null</c> until a caller (or a deserializer) assigns it.
    /// </summary>
    public List<string> IllegalUrls { get; set; }
}
/// <summary>
/// Checks submitted content for sensitive/forbidden words. The word list is
/// loaded from two text files under the web root and matched with
/// ToolGood.Words' <c>WordsSearch</c>.
/// </summary>
public class subContentCheck
{
    /// <summary>
    /// Hosting environment; only its <c>WebRootPath</c> is read by this class.
    /// </summary>
    private readonly IHostingEnvironment _hostingEnv;

    /// <summary>
    /// Location of the sensitive word file, appended to the web root path.
    /// </summary>
    private readonly string dictionaryPath = "/sensitiveWords/sensitiveWords.txt";

    /// <summary>
    /// Location of the sensitive link/site/URL file, appended to the web root path.
    /// </summary>
    private readonly string urlsPath = "/sensitiveWords/IllegalUrls.txt";

    /// <summary>
    /// The loaded sensitive entries (words and URLs combined).
    /// </summary>
    public string[] Words { get; set; }

    /// <summary>
    /// Creates the checker and immediately loads the word lists from disk.
    /// </summary>
    /// <param name="hostingEnv">Hosting environment that supplies the web root path.</param>
    public subContentCheck(IHostingEnvironment hostingEnv)
    {
        _hostingEnv = hostingEnv;
        InitDictionary();
    }

    /// <summary>
    /// (Re)loads <see cref="Words"/> from the word file and the URL file.
    /// Each line may hold several entries separated by '|'; empty entries are skipped.
    /// Exceptions thrown while reading the files are not caught here.
    /// </summary>
    public void InitDictionary()
    {
        // Reset first so a failed read leaves an empty list rather than a stale one.
        Words = Array.Empty<string>();

        // Plain concatenation is deliberate: both relative paths start with '/',
        // and Path.Combine would treat them as rooted and drop the web root.
        string wordsPath = _hostingEnv.WebRootPath + dictionaryPath;
        string urlPath = _hostingEnv.WebRootPath + urlsPath;

        string[] wordLines = System.IO.File.ReadAllLines(wordsPath, System.Text.Encoding.UTF8);
        string[] urlLines = System.IO.File.ReadAllLines(urlPath, System.Text.Encoding.UTF8);

        var collected = new List<string>();
        AppendEntries(collected, wordLines);
        AppendEntries(collected, urlLines);
        Words = collected.ToArray();
    }

    /// <summary>
    /// Splits every line on '|' and appends the non-empty pieces to <paramref name="target"/>.
    /// </summary>
    private static void AppendEntries(List<string> target, string[] lines)
    {
        foreach (string line in lines)
        {
            target.AddRange(line.Split('|', StringSplitOptions.RemoveEmptyEntries));
        }
    }

    /// <summary>
    /// Creates a <c>WordsSearch</c> and loads <see cref="Words"/> into it.
    /// </summary>
    /// <returns><c>true</c> when the keywords were set; <c>false</c> when <c>SetKeywords</c> threw.</returns>
    private bool TryCreateSearch(out WordsSearch search)
    {
        search = new WordsSearch();
        try
        {
            search.SetKeywords(Words);
            return true;
        }
        catch (Exception)
        {
            return false;
        }
    }

    /// <summary>
    /// Replaces the sensitive words found in <paramref name="sourceText"/> with
    /// <paramref name="replaceChar"/>.
    /// </summary>
    /// <param name="sourceText">Text to filter.</param>
    /// <param name="replaceChar">Replacement character; defaults to '*'.</param>
    /// <returns>
    /// "2" when <paramref name="sourceText"/> is null or empty; "0" when setting the
    /// keywords failed; "1" when the replacement itself failed; otherwise the filtered text.
    /// NOTE(review): status codes and filtered text share the same return value, so a
    /// filtered result that is literally "0", "1" or "2" cannot be told apart from a
    /// status code. Fixing that needs an interface change.
    /// </returns>
    public string FilterWithChar(string sourceText, char replaceChar = '*')
    {
        if (string.IsNullOrEmpty(sourceText))
        {
            return "2";
        }

        if (!TryCreateSearch(out WordsSearch wordsSearch))
        {
            return "0";
        }

        try
        {
            return wordsSearch.Replace(sourceText, replaceChar);
        }
        catch (Exception)
        {
            return "1";
        }
    }

    /// <summary>
    /// Reports whether <paramref name="sourceText"/> contains any sensitive word.
    /// </summary>
    /// <param name="sourceText">Text to inspect.</param>
    /// <returns>
    /// "2" when <paramref name="sourceText"/> is null or empty; "0" when setting the
    /// keywords failed; "1" when the search itself failed; "3" when a sensitive word
    /// is present; "4" when none is present.
    /// </returns>
    public string FindSensitiveKey(string sourceText)
    {
        if (string.IsNullOrEmpty(sourceText))
        {
            return "2";
        }

        if (!TryCreateSearch(out WordsSearch wordsSearch))
        {
            return "0";
        }

        try
        {
            return wordsSearch.ContainsAny(sourceText) ? "3" : "4";
        }
        catch (Exception)
        {
            return "1";
        }
    }

    /// <summary>
    /// Serializes one of the two lists to JSON, echoes it to the console and writes it
    /// to "&lt;current directory&gt;/&lt;filename&gt;.json" using the GB2312 encoding.
    /// </summary>
    /// <param name="jsonData">Keyword data; serialized when it is not <c>null</c>.</param>
    /// <param name="urlJsonData">URL data; serialized only when <paramref name="jsonData"/> is <c>null</c>.</param>
    /// <param name="filename">Target file name without the ".json" extension.</param>
    public static void Write(List<keywords> jsonData, List<urlwords> urlJsonData, string filename)
    {
        // Path.Combine picks the separator of the current platform.
        string strFileName = System.IO.Path.Combine(Directory.GetCurrentDirectory(), filename + ".json");

        string listJson = jsonData != null
            ? JsonConvert.SerializeObject(jsonData)
            : JsonConvert.SerializeObject(urlJsonData);
        Console.WriteLine(listJson);

        // GB2312 is not available by default; the code-pages provider has to be
        // registered before Encoding.GetEncoding can resolve it.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        Encoding gb2312 = Encoding.GetEncoding("GB2312");

        // FileMode.Create truncates an existing file, so a shorter payload never
        // leaves bytes of the previous content behind.
        using (FileStream fs = new FileStream(strFileName, FileMode.Create, System.IO.FileAccess.ReadWrite, FileShare.ReadWrite))
        using (StreamWriter sw = new StreamWriter(fs, gb2312))
        {
            sw.WriteLine(listJson);
        }
    }
}
- }
五、写API接口
/// <summary>
/// Masks the sensitive words in the posted text.
/// </summary>
/// <param name="sourctText">Text to be masked.</param>
/// <returns>
/// JSON with <c>code</c>, <c>msg</c> and <c>resultStr</c>. Codes: 200 success
/// (<c>resultStr</c> holds the masked text); 210 keyword set-up failed; 220 unexpected
/// exception; 230 empty input; 240 replacement failed; 260 the checker reported empty input.
/// </returns>
[HttpPost]
public IActionResult sensitive_words_replace2(string sourctText)
{
    if (string.IsNullOrEmpty(sourctText))
    {
        return Json(new { code = 230, msg = "需要替换的文本内容为空!", resultStr = "" });
    }

    try
    {
        // Building the checker reads the word files from disk, so it lives inside
        // the try: a missing or unreadable file becomes the 220 response instead of
        // an unhandled exception.
        subContentCheck strCheck = new subContentCheck(_hostingEnv);
        string resultStr = strCheck.FilterWithChar(sourctText);

        int resCode;
        string resMsg;
        switch (resultStr)
        {
            case "0":
                resCode = 210;
                resMsg = "设置违禁词时发生错误,请联系管理员!";
                resultStr = "";
                break;
            case "1":
                resCode = 240;
                resMsg = "敏感内容替换时发生错误!";
                resultStr = "";
                break;
            case "2":
                resCode = 260;
                resMsg = "需要替换的文本内容为空!";
                resultStr = "";
                break;
            default:
                // Anything that is not a status code is the masked text itself.
                resCode = 200;
                resMsg = "敏感词替换请求成功!";
                break;
        }

        return Json(new { code = resCode, msg = resMsg, resultStr = resultStr });
    }
    catch (Exception)
    {
        return Json(new { code = 220, msg = "敏感内容替换时发生错误!", resultStr = "" });
    }
}
/// <summary>
/// Reports whether the posted text contains sensitive/forbidden words.
/// </summary>
/// <param name="sourctText">Text to be checked.</param>
/// <returns>
/// JSON with <c>code</c>, <c>msg</c> and <c>resultStr</c>. Codes: 200 no sensitive word
/// found; 270 a sensitive word was found; 210 keyword set-up failed; 220 unexpected
/// exception; 230 empty input; 240 the search failed; 260 the checker reported empty input.
/// </returns>
[HttpPost]
public IActionResult whether_sensitive_words(string sourctText)
{
    if (string.IsNullOrEmpty(sourctText))
    {
        return Json(new { code = 230, msg = "需要判断的文本内容为空!", resultStr = "" });
    }

    try
    {
        // Building the checker reads the word files from disk, so it lives inside
        // the try: a missing or unreadable file becomes the 220 response instead of
        // an unhandled exception.
        subContentCheck strCheck = new subContentCheck(_hostingEnv);
        string resultStr = strCheck.FindSensitiveKey(sourctText);

        int resCode;
        string resMsg;
        switch (resultStr)
        {
            case "0":
                resCode = 210;
                resMsg = "设置违禁词时发生错误,请联系管理员!";
                resultStr = "";
                break;
            case "1":
                resCode = 240;
                resMsg = "敏感内容匹配时发生错误!";
                resultStr = "";
                break;
            case "2":
                resCode = 260;
                resMsg = "需要判断的文本内容为空!";
                resultStr = "";
                break;
            case "3":
                resCode = 270;
                resMsg = "内容中含有敏感/违禁词!";
                resultStr = "";
                break;
            default:
                // Any other status is reported as "clean"; resultStr is passed through as-is.
                resCode = 200;
                resMsg = "内容中不含敏感/违禁词!";
                break;
        }

        return Json(new { code = resCode, msg = resMsg, resultStr = resultStr });
    }
    catch (Exception)
    {
        return Json(new { code = 220, msg = "敏感内容匹配时发生错误!", resultStr = "" });
    }
}
六、前端封装JS方法
/**
 * Sends text to the backend masking endpoint and writes the masked text into a form field.
 *
 * @param {string} sourctText Text whose sensitive words should be masked.
 * @param {string} boxid id attribute of the element that receives the masked text (set via jQuery .val()).
 * @param {object} layui Layui instance; its layer.alert is used to show messages.
 * @returns {string} The resultStr field of the response, or "" when the request fails
 *                   or the response carries an unknown code.
 */
function sensitive_words_replace(sourctText, boxid, layui) {
    // Response codes on which the backend reports a failure; its msg is shown to the user.
    const failureCodes = [210, 220, 230, 240, 260];
    let resultStr = "";
    $.ajax({
        url: "/Home/sensitive_words_replace2", // backend endpoint
        dataType: "JSON",
        type: "POST",
        // Synchronous on purpose: the function returns resultStr right after $.ajax,
        // so an asynchronous request would always yield "".
        async: false,
        data: {
            "sourctText": sourctText
        },
        success: function (res) {
            // The code may arrive as a number or as a numeric string.
            const resCode = Number(res.code);
            if (failureCodes.indexOf(resCode) !== -1) {
                // close the loading layer once data is back
                layer.closeAll();
                resultStr = res.resultStr;
                layui.layer.alert(res.msg, { icon: 5, title: "温馨提示", closeBtn: 0 });
            } else if (resCode === 200) {
                resultStr = res.resultStr;
                $("#" + boxid).val(resultStr);
                // close the loading layer once data is back
                layer.closeAll();
            }
        },
        error: function (xhr, textStatus, errorThrown) {
            // close the loading layer once the request has failed
            layer.closeAll();
            // The first argument is the jqXHR object; show readable text instead of it.
            layui.layer.alert(errorThrown || textStatus || "请求失败", { icon: 5, title: "温馨提示", closeBtn: 0 });
        }
    });
    return resultStr;
}
/**
 * Asks the backend whether the text contains sensitive/forbidden words.
 *
 * @param {string} sourctText Text to be checked.
 * @param {string} boxid Not used by this function; kept so the signature matches sensitive_words_replace.
 * @param {object} layui Layui instance; its layer.alert is used to show messages.
 * @returns {boolean} true when the backend answers with code 270 (sensitive word found);
 *                    false for every other code and when the request fails.
 */
function whether_sensitive_words(sourctText, boxid, layui) {
    // Response codes on which the backend reports a failure; its msg is shown to the user.
    const failureCodes = [210, 220, 230, 240, 260];
    let resultBool = false;
    $.ajax({
        url: "/Home/whether_sensitive_words", // backend endpoint
        dataType: "JSON",
        type: "POST",
        // Must be synchronous, otherwise resultBool is returned before the response arrives.
        async: false,
        data: {
            "sourctText": sourctText
        },
        success: function (res) {
            // The code may arrive as a number or as a numeric string.
            const resCode = Number(res.code);
            if (failureCodes.indexOf(resCode) !== -1) {
                resultBool = false;
                layui.layer.alert(res.msg, { icon: 5, title: "温馨提示", closeBtn: 0 });
            } else if (resCode === 270) {
                resultBool = true;
            } else if (resCode === 200) {
                resultBool = false;
                // close the loading layer once data is back
                layer.closeAll();
            }
        },
        error: function (xhr, textStatus, errorThrown) {
            // The first argument is the jqXHR object; show readable text instead of it.
            layui.layer.alert(errorThrown || textStatus || "请求失败", { icon: 5, title: "温馨提示", closeBtn: 0 });
        }
    });
    return resultBool;
}
来源:https://www.cnblogs.com/lucasDC/archive/2023/03/25/17255906.html
免责声明:由于采集信息均来自互联网,如果侵犯了您的权益,请联系我们【E-Mail:cb@itdo.tech】 我们会及时删除侵权内容,谢谢合作! |
本帖子中包含更多资源
您需要 登录 才可以下载或查看,没有账号?立即注册
x
|