非法词汇过滤
采用基于前缀树(Trie)的 DFA 算法,检查一段文本中是否包含词库里的非法词汇。
树节点:
<?php
declare(strict_types=1);
/**
 * One vertex of the sensitive-word trie.
 *
 * A node stores the outgoing edges to its children and a flag telling
 * whether the path leading to it spells a complete word.
 */
class Node
{
    /**
     * True when the characters on the path to this node form a complete word.
     */
    public bool $isEnd = false;

    /**
     * Child nodes, keyed by the character that labels the edge.
     *
     * @var array|Node[]
     */
    public array $child = [];
}
树:
<?php
declare(strict_types=1);
use Generator;
/**
 * Trie-based sensitive-word matcher.
 *
 * Words are registered one at a time with setWords(); checkOnce() and
 * checkFull() then report which registered words occur in a text.
 */
class Tree
{
    /**
     * Root of the trie; it represents the empty prefix and is never a word end.
     *
     * @var Node
     */
    public Node $root;

    public function __construct()
    {
        $this->root = new Node();
    }

    /**
     * Returns the first registered word found in the text.
     *
     * Start positions are tried from left to right; at the first position
     * where a word begins, the shortest word starting there is returned.
     *
     * @param string $text Text to inspect.
     * @return string[] A one-element list holding the word, or an empty list.
     */
    public function checkOnce(string $text): array
    {
        $chars = iterator_to_array(self::eachStr($text), false);
        $total = count($chars);

        for ($start = 0; $start < $total; $start++) {
            $found = $this->matchFrom($chars, $start, true);
            if ($found !== []) {
                return array_values($found);
            }
        }

        return [];
    }

    /**
     * Returns every registered word found while scanning the text left to right.
     *
     * At each start position all registered words beginning there are reported
     * (shortest first). Scanning then resumes right after the longest of them,
     * so reported start positions never overlap. When nothing matches at a
     * position the scan advances by a single character, which guarantees that a
     * failed partial match cannot hide a word starting inside it.
     *
     * @param string $text Text to inspect.
     * @return string[] Matched words in order of appearance; may contain duplicates.
     */
    public function checkFull(string $text): array
    {
        $chars = iterator_to_array(self::eachStr($text), false);
        $total = count($chars);
        $ret = [];
        $start = 0;

        while ($start < $total) {
            $found = $this->matchFrom($chars, $start, false);
            if ($found === []) {
                $start++;
                continue;
            }

            foreach ($found as $word) {
                $ret[] = $word;
            }
            // Keys are match lengths in ascending order, so the last key is the longest.
            $start += array_key_last($found);
        }

        return $ret;
    }

    /**
     * Registers one sensitive word.
     *
     * Despite the plural parameter name, the whole string is inserted as a
     * single word. An empty string is ignored so the root never becomes a
     * word end.
     *
     * @param string $words The word to register.
     * @return void
     */
    public function setWords(string $words): void
    {
        $curNode = $this->root;
        foreach (self::eachStr($words) as $word) {
            if (!isset($curNode->child[$word])) {
                $curNode->child[$word] = new Node();
            }
            $curNode = $curNode->child[$word];
        }

        // Identity check: loose comparison on objects would compare properties recursively.
        if ($curNode !== $this->root) {
            $curNode->isEnd = true;
        }
    }

    /**
     * Walks the trie along the characters beginning at $start.
     *
     * @param string[] $chars       Characters of the text, as produced by eachStr().
     * @param int      $start       Index in $chars where the walk begins.
     * @param bool     $stopAtFirst Stop as soon as one complete word is seen.
     * @return array<int, string> Matched words keyed by their length in characters,
     *                            in ascending length order.
     */
    private function matchFrom(array $chars, int $start, bool $stopAtFirst): array
    {
        $found = [];
        $node = $this->root;
        $prefix = '';
        $total = count($chars);

        for ($pos = $start; $pos < $total; $pos++) {
            $char = $chars[$pos];
            if (!isset($node->child[$char])) {
                break;
            }

            $node = $node->child[$char];
            $prefix .= $char;

            if ($node->isEnd) {
                $found[$pos - $start + 1] = $prefix;
                if ($stopAtFirst) {
                    break;
                }
            }
        }

        return $found;
    }

    /**
     * Iterates over a UTF-8 string one character at a time.
     *
     * The sequence length is derived from the lead byte:
     *   1 byte  : 0x00 - 0x7F
     *   2 bytes : 0xC2 - 0xDF
     *   3 bytes : 0xE0 - 0xEF
     *   4 bytes : 0xF0 - 0xF4
     * Any other byte cannot start a UTF-8 sequence. Such bytes, and lead bytes
     * whose sequence is truncated or not followed by continuation bytes
     * (0x80 - 0xBF), are skipped one byte at a time so that the characters
     * after them are still yielded. Overlong and surrogate encodings are not
     * rejected.
     *
     * @param string $str Byte string expected to contain UTF-8 text.
     * @return Generator Yields one string per decoded character.
     */
    protected static function eachStr(string $str): Generator
    {
        $strLen = strlen($str);
        $i = 0;

        while ($i < $strLen) {
            $ord = ord($str[$i]);

            if ($ord <= 0x7F) {
                $len = 1;
            } elseif ($ord >= 0xC2 && $ord <= 0xDF) {
                $len = 2;
            } elseif ($ord >= 0xE0 && $ord <= 0xEF) {
                $len = 3;
            } elseif ($ord >= 0xF0 && $ord <= 0xF4) {
                $len = 4;
            } else {
                // Stray continuation byte or a value that never starts a sequence.
                $i++;
                continue;
            }

            // The whole sequence must fit in the string and consist of continuation bytes.
            $valid = $i + $len <= $strLen;
            for ($k = 1; $valid && $k < $len; $k++) {
                $valid = (ord($str[$i + $k]) & 0xC0) === 0x80;
            }
            if (!$valid) {
                $i++;
                continue;
            }

            yield substr($str, $i, $len);
            $i += $len;
        }
    }
}
测试:
<?php
declare(strict_types=1);
use PHPUnit\Framework\TestCase;
/**
 * Tests for the trie-based sensitive-word matcher.
 */
class TestDFA extends TestCase
{
    /**
     * Words registered in every tree under test, in insertion order.
     */
    private const WORDS = ['车载群发器', '车载群发', '车载流动群发器'];

    /**
     * Builds a tree preloaded with the fixture words.
     */
    private function buildTree(): Tree
    {
        $tree = new Tree();
        foreach (self::WORDS as $word) {
            $tree->setWords($word);
        }

        return $tree;
    }

    /**
     * checkOnce() reports only the first word it encounters, or nothing.
     */
    public function testCheckOnce(): void
    {
        $tree = $this->buildTree();

        // Comparing the whole array pins both the word and the result size.
        $this->assertSame(['车载群发'], $tree->checkOnce('车载群发'));
        // The shorter registered word is reached first.
        $this->assertSame(['车载群发'], $tree->checkOnce('车载群发器'));
        $this->assertSame(['车载流动群发器'], $tree->checkOnce('车载流动群发器'));

        // An interrupting character breaks the word apart.
        $this->assertSame([], $tree->checkOnce('车载的群发器'));
        $this->assertSame([], $tree->checkOnce('车载|群发器'));
    }

    /**
     * checkFull() reports every word met during the scan, in order.
     */
    public function testCheckFull(): void
    {
        $tree = $this->buildTree();
        $expected = ['车载群发', '车载群发器', '车载流动群发器'];

        $this->assertSame($expected, $tree->checkFull('车载群发器车载流动群发器'));
        // An aborted partial match ("车载x") does not affect the words around it.
        $this->assertSame($expected, $tree->checkFull('车载群发器车载x车载流动群发器'));
    }
}
本作品采用《CC 协议》,转载必须注明作者和本文链接