算法简介
Aho-Corasick 算法是一种多模式字符串匹配算法,由 Alfred Aho 和 Margaret Corasick 于 1975 年提出。它能够在一次遍历中同时查找多个模式串,效率远高于对每个模式串单独进行 KMP 匹配。
💡 核心思想
将多个模式串构建成 Trie 树,然后添加失配边 (Fail Link),使得匹配失败时能够快速跳转到下一个可能的匹配位置。
O(Σ×m)
构建时间复杂度
O(n + z)
匹配时间复杂度
O(Σ×m)
空间复杂度
其中:Σ 是字符集大小,m 是模式串总长度,n 是文本长度,z 是匹配次数
构建 Trie 树
Trie 树(字典树)是 AC 自动机的基础结构,将所有模式串组织成一棵树形结构。
Trie 树构建可视化
graph TD
A[开始] --> B[创建根节点]
B --> C[遍历每个模式串]
C --> D[逐字符插入 Trie]
D --> E{字符存在?}
E -->|是 | F[移动到子节点]
E -->|否 | G[创建新节点]
G --> F
F --> H{还有字符?}
H -->|是 | D
H -->|否 | I[标记结束节点]
I --> J{还有模式串?}
J -->|是 | C
J -->|否 | K[Trie 构建完成]
style A fill:#48bb78,color:#fff
style K fill:#48bb78,color:#fff
style E fill:#ed8936,color:#fff
style I fill:#667eea,color:#fff
失配边 (Fail Link) 构建
失配边是 AC 自动机的核心,当当前字符无法匹配时,通过失配边跳转到下一个可能的匹配位置。
步骤 1: 根节点子节点
根节点的所有直接子节点的失配边指向根节点
步骤 2: BFS 层次遍历
使用 BFS 按层次遍历 Trie 树
步骤 3: 计算失配边
通过父节点的失配边找到最长后缀匹配
步骤 4: 优化输出
将输出边指向最近的结束节点
失配边计算规则
fail[child] = go(fail[parent], char)
其中 go(state, char) 返回从 state 状态经过字符 char 能到达的状态
其中 go(state, char) 返回从 state 状态经过字符 char 能到达的状态
匹配过程
使用构建好的 AC 自动机在文本中查找所有模式串的出现位置。
文本:ushers
开始匹配...
sequenceDiagram
participant T as 文本指针
participant S as 当前状态
participant F as 失配边
participant O as 输出
T->>S: 读取字符 c
S->>S: 尝试转移 go(s, c)
S->>F: 失败?沿 fail 跳转
F->>S: 找到可转移状态
S->>O: 到达结束节点?输出匹配
O->>T: 继续下一个字符
Note over T,O: 只需遍历文本一次
代码实现
class AhoCorasick {
constructor() {
this.root = { children: {}, fail: null, output: [], depth: 0 };
}
// 插入模式串
insert(word) {
let node = this.root;
for (const char of word) {
if (!node.children[char]) {
node.children[char] = { children: {}, fail: null, output: [], depth: node.depth + 1 };
}
node = node.children[char];
}
node.output.push(word);
}
// 构建失配边
build() {
const queue = [];
// 根节点的子节点失配边指向根
for (const char in this.root.children) {
this.root.children[char].fail = this.root;
queue.push(this.root.children[char]);
}
// BFS 构建失配边
while (queue.length > 0) {
const curr = queue.shift();
for (const char in curr.children) {
const child = curr.children[char];
queue.push(child);
// 通过父节点的失配边找到最长后缀
let fail = curr.fail;
while (fail && !fail.children[char]) {
fail = fail.fail;
}
child.fail = fail ? fail.children[char] : this.root;
// 合并输出
child.output = [...child.output, ...child.fail.output];
}
}
}
// 搜索文本
search(text) {
const results = [];
let node = this.root;
for (let i = 0; i < text.length; i++) {
const char = text[i];
// 沿失配边查找
while (node && !node.children[char]) {
node = node.fail;
}
if (node) {
node = node.children[char];
// 记录所有匹配
for (const pattern of node.output) {
results.push({
pattern,
endPos: i,
startPos: i - pattern.length + 1
});
}
}
}
return results;
}
}
// 使用示例
const ac = new AhoCorasick();
ac.insert("he");
ac.insert("she");
ac.insert("his");
ac.insert("hers");
ac.build();
const results = ac.search("ushers");
console.log(results);
// [{pattern: "she", startPos: 1, endPos: 3},
// {pattern: "he", startPos: 2, endPos: 3},
// {pattern: "hers", startPos: 2, endPos: 5}]from collections import deque, defaultdict
class AhoCorasick:
def __init__(self):
self.root = {'children': {}, 'fail': None, 'output': [], 'depth': 0}
def insert(self, word: str) -> None:
node = self.root
for char in word:
if char not in node['children']:
node['children'][char] = {
'children': {},
'fail': None,
'output': [],
'depth': node['depth'] + 1
}
node = node['children'][char]
node['output'].append(word)
def build(self) -> None:
queue = deque()
# 根节点的子节点失配边指向根
for char, child in self.root['children'].items():
child['fail'] = self.root
queue.append(child)
# BFS 构建失配边
while queue:
curr = queue.popleft()
for char, child in curr['children'].items():
queue.append(child)
# 通过父节点的失配边找到最长后缀
fail = curr['fail']
while fail and char not in fail['children']:
fail = fail['fail']
child['fail'] = fail['children'][char] if fail else self.root
# 合并输出
child['output'].extend(child['fail']['output'])
def search(self, text: str) -> list:
results = []
node = self.root
for i, char in enumerate(text):
# 沿失配边查找
while node and char not in node['children']:
node = node['fail']
if node and char in node['children']:
node = node['children'][char]
# 记录所有匹配
for pattern in node['output']:
results.append({
'pattern': pattern,
'startPos': i - len(pattern) + 1,
'endPos': i
})
return results
# 使用示例
ac = AhoCorasick()
for word in ["he", "she", "his", "hers"]:
ac.insert(word)
ac.build()
results = ac.search("ushers")
print(results)package main
import "fmt"
type Node struct {
children map[rune]*Node
fail *Node
output []string
depth int
}
func newNode(depth int) *Node {
return &Node{
children: make(map[rune]*Node),
output: []string{},
depth: depth,
}
}
type AhoCorasick struct {
root *Node
}
func NewAhoCorasick() *AhoCorasick {
return &AhoCorasick{root: newNode(0)}
}
func (ac *AhoCorasick) Insert(word string) {
node := ac.root
for _, char := range word {
if _, ok := node.children[char]; !ok {
node.children[char] = newNode(node.depth + 1)
}
node = node.children[char]
}
node.output = append(node.output, word)
}
func (ac *AhoCorasick) Build() {
queue := []*Node{}
// 根节点的子节点失配边指向根
for _, child := range ac.root.children {
child.fail = ac.root
queue = append(queue, child)
}
// BFS 构建失配边
for len(queue) > 0 {
curr := queue[0]
queue = queue[1:]
for char, child := range curr.children {
queue = append(queue, child)
// 通过父节点的失配边找到最长后缀
fail := curr.fail
for fail != nil {
if _, ok := fail.children[char]; ok {
break
}
fail = fail.fail
}
if fail != nil {
child.fail = fail.children[char]
} else {
child.fail = ac.root
}
// 合并输出
child.output = append(child.output, child.fail.output...)
}
}
}
func (ac *AhoCorasick) Search(text string) []map[string]interface{} {
results := []map[string]interface{}{}
node := ac.root
for i, char := range text {
// 沿失配边查找
for node != nil {
if _, ok := node.children[char]; ok {
break
}
node = node.fail
}
if node != nil {
node = node.children[char]
// 记录所有匹配
for _, pattern := range node.output {
results = append(results, map[string]interface{}{
"pattern": pattern,
"startPos": i - len(pattern) + 1,
"endPos": i,
})
}
}
}
return results
}
func main() {
ac := NewAhoCorasick()
for _, word := range []string{"he", "she", "his", "hers"} {
ac.Insert(word)
}
ac.Build()
results := ac.Search("ushers")
fmt.Println(results)
}应用场景
🔒 敏感词过滤
内容审核系统一次性查找多个敏感词
🦠 病毒特征匹配
杀毒软件检测病毒特征码
🔍 搜索引擎
高亮显示多个关键词
📊 生物信息学
DNA 序列模式匹配