非法词汇过滤

AI摘要
本文分享基于DFA算法的敏感词检测PHP实现。核心包括:构建Trie树存储敏感词,支持单次匹配和全文本扫描两种检测模式,并正确处理UTF-8字符编码。通过单元测试验证了算法的准确性和边界情况处理能力。

采用 DFA(确定有限状态自动机)算法,借助 Trie 树检查一段文本中是否包含预先登记的敏感词。

树节点:

<?php
declare(strict_types=1);

/**
 * A single node of the sensitive-word trie built by Tree::setWords().
 *
 * Each node stands for the prefix spelled by the path leading to it.
 */
class Node
{
    /**
     * Child nodes, keyed by the next character token (one UTF-8 character
     * as yielded by Tree::eachStr()). PHP stores purely numeric string keys
     * such as "1" as integers, hence the int|string key type.
     *
     * @var array<int|string, Node>
     */
    public array $child = [];
    /**
     * True when the path ending at this node spells a complete registered
     * word; set by Tree::setWords().
     */
    public bool $isEnd = false;
}

树:

<?php
declare(strict_types=1);

use Generator;

/**
 * Trie-backed sensitive-word matcher.
 *
 * Words are registered one at a time with setWords(); texts are then checked
 * with checkOnce() (first hit only) or checkFull() (every hit).
 */
class Tree
{
    /**
     * Root of the trie (the empty prefix). setWords() never marks it as a
     * word end.
     *
     * @var Node
     */
    public Node $root;

    public function __construct()
    {
        $this->root = new Node();
    }

    /**
     * Finds the first registered word occurring in the text.
     *
     * Start positions are tried from left to right; at the first position
     * where a word begins, the shortest word starting there is returned.
     *
     * @param string $text Text to inspect (expected to be UTF-8).
     * @return string[] A one-element list holding the matched word, or an
     *                  empty list when nothing matches.
     */
    public function checkOnce(string $text): array
    {
        foreach ($this->scan($text) as $match) {
            // Stop at the very first hit; the generator is simply abandoned.
            return [$match];
        }

        return [];
    }

    /**
     * Finds every occurrence of every registered word in the text.
     *
     * Matches are ordered by start position, then by length, so a word and
     * a longer word sharing its prefix are both reported. Overlapping
     * occurrences that start inside another match are reported as well.
     *
     * @param string $text Text to inspect (expected to be UTF-8).
     * @return string[] All matched words in scan order (may contain repeats).
     */
    public function checkFull(string $text): array
    {
        return iterator_to_array($this->scan($text), false);
    }

    /**
     * Registers one sensitive word in the trie.
     *
     * An empty string (or one made only of skipped bytes) registers nothing.
     *
     * @param string $words The word to register.
     * @return void
     */
    public function setWords(string $words): void
    {
        $curNode = $this->root;
        foreach (self::eachStr($words) as $word) {
            if (!isset($curNode->child[$word])) {
                $curNode->child[$word] = new Node();
            }
            $curNode = $curNode->child[$word];
        }
        // Identity check: a loose != would recursively compare both subtrees.
        if ($curNode !== $this->root) {
            $curNode->isEnd = true;
        }
    }

    /**
     * Splits a string into UTF-8 character tokens.
     *
     * The width of each token is taken from its lead byte:
     *   - 0-127   : one byte   (U+0000 - U+007F)
     *   - 194-223 : two bytes  (U+0080 - U+07FF)
     *   - 224-239 : three bytes (U+0800 - U+FFFF)
     *   - 240-244 : four bytes (U+10000 - U+10FFFF)
     *
     * Malformed input is tolerated rather than rejected:
     *   - bytes 128-193 cannot start a character and are yielded as
     *     single-byte tokens so the following character is not swallowed;
     *   - a sequence cut short by the end of the string is yielded with
     *     whatever bytes remain (no out-of-range offset is read);
     *   - bytes 245-255 never occur in UTF-8 and are skipped.
     * Continuation bytes themselves are not validated.
     *
     * @see http://shouce.jb51.net/gopl-zh/ch3/ch3-05.html
     * @param string $str
     * @return Generator Yields one string token per character.
     */
    protected static function eachStr(string $str): Generator
    {
        $strLen = strlen($str);
        $i = 0;
        while ($i < $strLen) {
            $ord = ord($str[$i]);
            if ($ord > 244) {
                // Not a legal UTF-8 byte at all: drop it.
                $i++;
                continue;
            }

            if ($ord <= 193) {
                // ASCII, or a byte that cannot start a multi-byte sequence.
                $width = 1;
            } elseif ($ord <= 223) {
                $width = 2;
            } elseif ($ord <= 239) {
                $width = 3;
            } else {
                $width = 4;
            }

            // substr() clamps at the end of the string, so a truncated
            // trailing sequence cannot trigger an uninitialized-offset warning.
            $word = substr($str, $i, $width);
            $i += $width;
            yield $word;
        }
    }

    /**
     * Lazily yields every registered word found in the text.
     *
     * The trie is walked afresh from each character position, so a failed
     * partial match never hides a word that begins inside it. Cost is
     * bounded by (characters in text) x (length of the longest word).
     *
     * @param string $text
     * @return Generator Yields matched words ordered by start, then length.
     */
    private function scan(string $text): Generator
    {
        $chars = iterator_to_array(self::eachStr($text), false);
        $total = count($chars);

        for ($start = 0; $start < $total; $start++) {
            $node = $this->root;
            $matched = '';
            for ($pos = $start; $pos < $total; $pos++) {
                $char = $chars[$pos];
                if (!isset($node->child[$char])) {
                    break;
                }
                $node = $node->child[$char];
                $matched .= $char;
                if ($node->isEnd) {
                    yield $matched;
                }
            }
        }
    }
}

测试:

<?php
declare(strict_types=1);

use PHPUnit\Framework\TestCase;

/**
 * Behavioural tests for the trie-based sensitive-word matcher (Tree).
 *
 * Every expectation compares the complete result list with assertSame(),
 * so unexpected extra or missing matches fail the test.
 */
class TestDFA extends TestCase
{
    public function testCheckOnce(): void
    {
        $tree = $this->createTree();

        // A text that is exactly a registered word is reported as-is.
        $this->assertSame(['车载群发'], $tree->checkOnce('车载群发'));

        // checkOnce() stops at the first word end, so the shorter prefix wins.
        $this->assertSame(['车载群发'], $tree->checkOnce('车载群发器'));

        $this->assertSame(['车载流动群发器'], $tree->checkOnce('车载流动群发器'));

        // A foreign character inside a word breaks the match.
        $this->assertSame([], $tree->checkOnce('车载的群发器'));
        $this->assertSame([], $tree->checkOnce('车载|群发器'));
    }

    public function testCheckFull(): void
    {
        $tree = $this->createTree();
        $expected = ['车载群发', '车载群发器', '车载流动群发器'];

        // Back-to-back words: the prefix word, its longer form, then the next word.
        $this->assertSame($expected, $tree->checkFull('车载群发器车载流动群发器'));

        // An aborted partial match ("车载x") must not add or drop results.
        $this->assertSame($expected, $tree->checkFull('车载群发器车载x车载流动群发器'));
    }

    /**
     * Builds a matcher loaded with the vocabulary shared by all tests.
     */
    private function createTree(): Tree
    {
        $tree = new Tree();
        $tree->setWords('车载群发器');
        $tree->setWords('车载群发');
        $tree->setWords('车载流动群发器');

        return $tree;
    }
}
本作品采用《CC 协议》,转载必须注明作者和本文链接
梦想星辰大海
讨论数量: 0
(= ̄ω ̄=)··· 暂无内容!

讨论应以学习和精进为目的。请勿发布不友善或者负能量的内容,与人为善,比聪明更重要!