SensitiveHelper.php 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
  1. <?php
  2. /**
  3. * 敏感词类库.
  4. * User: Lustre
  5. * Date: 17/3/9
  6. * Time: 上午9:11
  7. * Url: https://github.com/FireLustre/php-dfa-sensitive
  8. */
  9. namespace addons\cms\library;
  10. class SensitiveHelper
  11. {
  12. /**
  13. * 待检测语句长度
  14. *
  15. * @var int
  16. */
  17. protected $contentLength = 0;
  18. /**
  19. * 敏感词单例
  20. *
  21. * @var object|null
  22. */
  23. private static $_instance = null;
  24. /**
  25. * 铭感词库树
  26. *
  27. * @var HashMap|null
  28. */
  29. protected $wordTree = null;
  30. /**
  31. * 存放待检测语句铭感词
  32. *
  33. * @var array|null
  34. */
  35. protected static $badWordList = null;
  36. /**
  37. * 获取单例
  38. *
  39. * @return self
  40. */
  41. public static function init()
  42. {
  43. if (!self::$_instance instanceof self) {
  44. self::$_instance = new self();
  45. }
  46. return self::$_instance;
  47. }
  48. /**
  49. * 构建铭感词树【文件模式】
  50. *
  51. * @param string $filepath
  52. * @return $this
  53. * @throws \Exception
  54. */
  55. public function setTreeByFile($filepath = '')
  56. {
  57. if (!file_exists($filepath)) {
  58. throw new \Exception('词库文件不存在');
  59. }
  60. // 词库树初始化
  61. $this->wordTree = new HashMap();
  62. foreach ($this->yieldToReadFile($filepath) as $word) {
  63. $this->buildWordToTree(trim($word));
  64. }
  65. return $this;
  66. }
  67. /**
  68. * 构建铭感词树【数组模式】
  69. *
  70. * @param null $sensitiveWords
  71. * @return $this
  72. * @throws \Exception
  73. */
  74. public function setTree($sensitiveWords = null)
  75. {
  76. if (empty($sensitiveWords)) {
  77. throw new \Exception('词库不能为空');
  78. }
  79. $this->wordTree = new HashMap();
  80. foreach ($sensitiveWords as $word) {
  81. $this->buildWordToTree($word);
  82. }
  83. return $this;
  84. }
  85. /**
  86. * 检测文字中的敏感词
  87. *
  88. * @param string $content 待检测内容
  89. * @param int $matchType 匹配类型 [默认为最小匹配规则]
  90. * @param int $wordNum 需要获取的敏感词数量 [默认获取全部]
  91. * @return array
  92. */
  93. public function getBadWord($content, $matchType = 1, $wordNum = 0)
  94. {
  95. $this->contentLength = mb_strlen($content, 'utf-8');
  96. $badWordList = array();
  97. for ($length = 0; $length < $this->contentLength; $length++) {
  98. $matchFlag = 0;
  99. $flag = false;
  100. $tempMap = $this->wordTree;
  101. for ($i = $length; $i < $this->contentLength; $i++) {
  102. $keyChar = mb_substr($content, $i, 1, 'utf-8');
  103. // 获取指定节点树
  104. $nowMap = $tempMap->get($keyChar);
  105. // 不存在节点树,直接返回
  106. if (empty($nowMap)) {
  107. break;
  108. }
  109. // 存在,则判断是否为最后一个
  110. $tempMap = $nowMap;
  111. // 找到相应key,偏移量+1
  112. $matchFlag++;
  113. // 如果为最后一个匹配规则,结束循环,返回匹配标识数
  114. if (false === $nowMap->get('ending')) {
  115. continue;
  116. }
  117. $flag = true;
  118. // 最小规则,直接退出
  119. if (1 === $matchType) {
  120. break;
  121. }
  122. }
  123. if (!$flag) {
  124. $matchFlag = 0;
  125. }
  126. // 找到相应key
  127. if ($matchFlag <= 0) {
  128. continue;
  129. }
  130. $badWordList[] = mb_substr($content, $length, $matchFlag, 'utf-8');
  131. // 有返回数量限制
  132. if ($wordNum > 0 && count($badWordList) == $wordNum) {
  133. return $badWordList;
  134. }
  135. // 需匹配内容标志位往后移
  136. $length = $length + $matchFlag - 1;
  137. }
  138. return $badWordList;
  139. }
  140. /**
  141. * 替换敏感字字符
  142. *
  143. * @param $content
  144. * @param $replaceChar
  145. * @param string $sTag
  146. * @param string $eTag
  147. * @param int $matchType
  148. * @return mixed
  149. */
  150. public function replace($content, $replaceChar = '', $sTag = '', $eTag = '', $matchType = 1)
  151. {
  152. if (empty($content)) {
  153. throw new \Exception('请填写检测的内容');
  154. }
  155. if (empty(self::$badWordList)) {
  156. $badWordList = $this->getBadWord($content, $matchType);
  157. } else {
  158. $badWordList = self::$badWordList;
  159. }
  160. // 未检测到敏感词,直接返回
  161. if (empty($badWordList)) {
  162. return $content;
  163. }
  164. foreach ($badWordList as $badWord) {
  165. if ($sTag || $eTag) {
  166. $replaceChar = $sTag . $badWord . $eTag;
  167. }
  168. $content = str_replace($badWord, $replaceChar, $content);
  169. }
  170. return $content;
  171. }
  172. /**
  173. * 被检测内容是否合法,合法返回true,非法返回false
  174. * @param $content
  175. * @return bool
  176. */
  177. public function islegal($content)
  178. {
  179. $this->contentLength = mb_strlen($content, 'utf-8');
  180. for ($length = 0; $length < $this->contentLength; $length++) {
  181. $matchFlag = 0;
  182. $tempMap = $this->wordTree;
  183. for ($i = $length; $i < $this->contentLength; $i++) {
  184. $keyChar = mb_substr($content, $i, 1, 'utf-8');
  185. // 获取指定节点树
  186. $nowMap = $tempMap->get($keyChar);
  187. // 不存在节点树,直接返回
  188. if (empty($nowMap)) {
  189. break;
  190. }
  191. // 找到相应key,偏移量+1
  192. $tempMap = $nowMap;
  193. $matchFlag++;
  194. // 如果为最后一个匹配规则,结束循环,返回匹配标识数
  195. if (false === $nowMap->get('ending')) {
  196. continue;
  197. }
  198. return false;
  199. }
  200. // 找到相应key
  201. if ($matchFlag <= 0) {
  202. continue;
  203. }
  204. // 需匹配内容标志位往后移
  205. $length = $length + $matchFlag - 1;
  206. }
  207. return true;
  208. }
  209. protected function yieldToReadFile($filepath)
  210. {
  211. $fp = fopen($filepath, 'r');
  212. while (!feof($fp)) {
  213. yield fgets($fp);
  214. }
  215. fclose($fp);
  216. }
  217. // 将单个敏感词构建成树结构
  218. protected function buildWordToTree($word = '')
  219. {
  220. if ('' === $word) {
  221. return;
  222. }
  223. $tree = $this->wordTree;
  224. $wordLength = mb_strlen($word, 'utf-8');
  225. for ($i = 0; $i < $wordLength; $i++) {
  226. $keyChar = mb_substr($word, $i, 1, 'utf-8');
  227. // 获取子节点树结构
  228. $tempTree = $tree->get($keyChar);
  229. if ($tempTree) {
  230. $tree = $tempTree;
  231. } else {
  232. // 设置标志位
  233. $newTree = new HashMap();
  234. $newTree->put('ending', false);
  235. // 添加到集合
  236. $tree->put($keyChar, $newTree);
  237. $tree = $newTree;
  238. }
  239. // 到达最后一个节点
  240. if ($i == $wordLength - 1) {
  241. $tree->put('ending', true);
  242. }
  243. }
  244. return;
  245. }
  246. }