算法简介

Aho-Corasick 算法是一种多模式字符串匹配算法,由 Alfred Aho 和 Margaret Corasick 于 1975 年提出。它能够在一次遍历中同时查找多个模式串,效率远高于对每个模式串单独进行 KMP 匹配。

💡 核心思想
将多个模式串构建成 Trie 树,然后添加失配边 (Fail Link),使得匹配失败时能够快速跳转到下一个可能的匹配位置。
O(Σ×m)
构建时间复杂度
O(n + z)
匹配时间复杂度
O(Σ×m)
空间复杂度

其中:Σ 是字符集大小,m 是模式串总长度,n 是文本长度,z 是匹配次数

构建 Trie 树

Trie 树(字典树)是 AC 自动机的基础结构,将所有模式串组织成一棵树形结构。

Trie 树构建可视化

graph TD A[开始] --> B[创建根节点] B --> C[遍历每个模式串] C --> D[逐字符插入 Trie] D --> E{字符存在?} E -->|是 | F[移动到子节点] E -->|否 | G[创建新节点] G --> F F --> H{还有字符?} H -->|是 | D H -->|否 | I[标记结束节点] I --> J{还有模式串?} J -->|是 | C J -->|否 | K[Trie 构建完成] style A fill:#48bb78,color:#fff style K fill:#48bb78,color:#fff style E fill:#ed8936,color:#fff style I fill:#667eea,color:#fff

匹配过程

使用构建好的 AC 自动机在文本中查找所有模式串的出现位置。

文本:ushers
开始匹配...
sequenceDiagram participant T as 文本指针 participant S as 当前状态 participant F as 失配边 participant O as 输出 T->>S: 读取字符 c S->>S: 尝试转移 go(s, c) S->>F: 失败?沿 fail 跳转 F->>S: 找到可转移状态 S->>O: 到达结束节点?输出匹配 O->>T: 继续下一个字符 Note over T,O: 只需遍历文本一次

代码实现

class AhoCorasick {
    constructor() {
        this.root = { children: {}, fail: null, output: [], depth: 0 };
    }

    // 插入模式串
    insert(word) {
        let node = this.root;
        for (const char of word) {
            if (!node.children[char]) {
                node.children[char] = { children: {}, fail: null, output: [], depth: node.depth + 1 };
            }
            node = node.children[char];
        }
        node.output.push(word);
    }

    // 构建失配边
    build() {
        const queue = [];
        
        // 根节点的子节点失配边指向根
        for (const char in this.root.children) {
            this.root.children[char].fail = this.root;
            queue.push(this.root.children[char]);
        }

        // BFS 构建失配边
        while (queue.length > 0) {
            const curr = queue.shift();
            
            for (const char in curr.children) {
                const child = curr.children[char];
                queue.push(child);

                // 通过父节点的失配边找到最长后缀
                let fail = curr.fail;
                while (fail && !fail.children[char]) {
                    fail = fail.fail;
                }
                child.fail = fail ? fail.children[char] : this.root;

                // 合并输出
                child.output = [...child.output, ...child.fail.output];
            }
        }
    }

    // 搜索文本
    search(text) {
        const results = [];
        let node = this.root;

        for (let i = 0; i < text.length; i++) {
            const char = text[i];

            // 沿失配边查找
            while (node && !node.children[char]) {
                node = node.fail;
            }

            if (node) {
                node = node.children[char];
                
                // 记录所有匹配
                for (const pattern of node.output) {
                    results.push({
                        pattern,
                        endPos: i,
                        startPos: i - pattern.length + 1
                    });
                }
            }
        }

        return results;
    }
}

// 使用示例
const ac = new AhoCorasick();
ac.insert("he");
ac.insert("she");
ac.insert("his");
ac.insert("hers");
ac.build();

const results = ac.search("ushers");
console.log(results);
// [{pattern: "she", startPos: 1, endPos: 3}, 
//  {pattern: "he", startPos: 2, endPos: 3},
//  {pattern: "hers", startPos: 2, endPos: 5}]
from collections import deque, defaultdict

class AhoCorasick:
    def __init__(self):
        self.root = {'children': {}, 'fail': None, 'output': [], 'depth': 0}

    def insert(self, word: str) -> None:
        node = self.root
        for char in word:
            if char not in node['children']:
                node['children'][char] = {
                    'children': {}, 
                    'fail': None, 
                    'output': [], 
                    'depth': node['depth'] + 1
                }
            node = node['children'][char]
        node['output'].append(word)

    def build(self) -> None:
        queue = deque()
        
        # 根节点的子节点失配边指向根
        for char, child in self.root['children'].items():
            child['fail'] = self.root
            queue.append(child)

        # BFS 构建失配边
        while queue:
            curr = queue.popleft()
            
            for char, child in curr['children'].items():
                queue.append(child)

                # 通过父节点的失配边找到最长后缀
                fail = curr['fail']
                while fail and char not in fail['children']:
                    fail = fail['fail']
                
                child['fail'] = fail['children'][char] if fail else self.root
                
                # 合并输出
                child['output'].extend(child['fail']['output'])

    def search(self, text: str) -> list:
        results = []
        node = self.root

        for i, char in enumerate(text):
            # 沿失配边查找
            while node and char not in node['children']:
                node = node['fail']

            if node and char in node['children']:
                node = node['children'][char]
                
                # 记录所有匹配
                for pattern in node['output']:
                    results.append({
                        'pattern': pattern,
                        'startPos': i - len(pattern) + 1,
                        'endPos': i
                    })

        return results

# 使用示例
ac = AhoCorasick()
for word in ["he", "she", "his", "hers"]:
    ac.insert(word)
ac.build()

results = ac.search("ushers")
print(results)
package main

import "fmt"

type Node struct {
    children map[rune]*Node
    fail     *Node
    output   []string
    depth    int
}

func newNode(depth int) *Node {
    return &Node{
        children: make(map[rune]*Node),
        output:   []string{},
        depth:    depth,
    }
}

type AhoCorasick struct {
    root *Node
}

func NewAhoCorasick() *AhoCorasick {
    return &AhoCorasick{root: newNode(0)}
}

func (ac *AhoCorasick) Insert(word string) {
    node := ac.root
    for _, char := range word {
        if _, ok := node.children[char]; !ok {
            node.children[char] = newNode(node.depth + 1)
        }
        node = node.children[char]
    }
    node.output = append(node.output, word)
}

func (ac *AhoCorasick) Build() {
    queue := []*Node{}

    // 根节点的子节点失配边指向根
    for _, child := range ac.root.children {
        child.fail = ac.root
        queue = append(queue, child)
    }

    // BFS 构建失配边
    for len(queue) > 0 {
        curr := queue[0]
        queue = queue[1:]

        for char, child := range curr.children {
            queue = append(queue, child)

            // 通过父节点的失配边找到最长后缀
            fail := curr.fail
            for fail != nil {
                if _, ok := fail.children[char]; ok {
                    break
                }
                fail = fail.fail
            }
            if fail != nil {
                child.fail = fail.children[char]
            } else {
                child.fail = ac.root
            }

            // 合并输出
            child.output = append(child.output, child.fail.output...)
        }
    }
}

func (ac *AhoCorasick) Search(text string) []map[string]interface{} {
    results := []map[string]interface{}{}
    node := ac.root

    for i, char := range text {
        // 沿失配边查找
        for node != nil {
            if _, ok := node.children[char]; ok {
                break
            }
            node = node.fail
        }

        if node != nil {
            node = node.children[char]
            
            // 记录所有匹配
            for _, pattern := range node.output {
                results = append(results, map[string]interface{}{
                    "pattern":  pattern,
                    "startPos": i - len(pattern) + 1,
                    "endPos":   i,
                })
            }
        }
    }

    return results
}

func main() {
    ac := NewAhoCorasick()
    for _, word := range []string{"he", "she", "his", "hers"} {
        ac.Insert(word)
    }
    ac.Build()

    results := ac.Search("ushers")
    fmt.Println(results)
}

应用场景

🔒 敏感词过滤

内容审核系统一次性查找多个敏感词

🦠 病毒特征匹配

杀毒软件检测病毒特征码

🔍 搜索引擎

高亮显示多个关键词

📊 生物信息学

DNA 序列模式匹配