经典演算法题每日演练——第八题 AC自动机

上一篇我们说了单模式匹配演算法 KMP，现在我们有需求了，我要检查一篇文章中是否有某些敏感词，这其实就是多模式匹配的问题。
当然你也可以用 KMP 演算法求出，那么它的时间复杂度为 O(c*(m+n))，c：为模式串的个数。 m：为模式串的长度,n: 为正文的长度，那
么这个复杂度就不再是线性了，我们学演算法就是希望能把要解决的问题优化到极致，这不，AC 自动机就派上用场了。
其实 AC 自动机就是 Trie 树的一个活用，活用点就是灌输了 kmp 的思想，从而再次把时间复杂度优化到线性的 O(N)，刚好我前面的文
章已经说过了 Trie 树和 KMP，这里还是预设大家都懂。
一：构建 AC 自动机
同样我也用网上的经典例子，现有 say she shr he her 这样 5 个模式串，主串为 yasherhs，我要做的就是哪些模式串在主串中出现过？
1: 构建 trie 树
如果看过我前面的文章，构建 trie 树还是很容易的。

2：失败指标
构建失败指标是 AC 自动机的核心所在，玩转了它也就玩转了 AC 自动机，失败指标非常类似于 KMP 中的 next 阵列，也就是说，
当我的主串在 trie 树中进行匹配的时候，如果当前节点不能再继续进行匹配，那么我们就会走到当前节点的 failNode 节点继续进行
匹配，构建 failnode 节点也是很流程化的。
①：root 节点的子节点的 failnode 都是指向 root 。
②：当走到在 “she” 中的”h“节点时，我们给它的 failnode 设定什么呢？此时就要走该节点（h) 的父节点 (s) 的失败指标，一直回溯直
到找到某个节点的孩子节点也是当初节点同样的字元 (h)，没有找到的话，其失败指标就指向 root 。
比如：h 节点的父节点为 s，s 的 failnode 节点为 root，走到 root 后继续寻找子节点为 h 的节点，恰好我们找到了，（假如还是没
有找到，则继续走该节点的 failnode，嘿嘿，是不是很像一种回溯查询），此时就将 ”she” 中的 “h” 节点的 fainode” 指向
“her” 中的 “h” 节点，好，原理其实就是这样。（看看你的想法是不是跟图一样）

针对图中红线的”h，e“这两个节点，我们想起了什么呢？对”her“中的”e“来说，e 到 root 距离的 n 个字元恰好与”she“中的 e 向上的 n
个字元相等，我也非常类似于 kmp 中 next 函式，当字元失配时，next 阵列中记录著下一次匹配时模式串的起始位置。

1 #region Trie 树节点
2 ///

3 /// Trie 树节点
4 ///

5 public class TrieNode
6 {
7 ///

8 /// 26 个字元，也就是 26 叉树
9 ///

10 public TrieNode[] childNodes;
11
12 ///

13 /// 词频统计
14 ///

15 public int freq;
16
17 ///

18 /// 记录该节点的字元
19 ///

20 public char nodeChar;
21
22 ///

23 /// 失败指标
24 ///

25 public TrieNode faliNode;
26
27 ///

28 /// 插入记录时的编号 id
29 ///

30 public HashSet hashSet = new HashSet();
31
32 ///

33 /// 初始化
34 ///

35 public TrieNode()
36 {
37 childNodes = new TrieNode[26];
38 freq = 0;
39 }
40 }
41 #endregion

刚才我也说到了 parent 和 current 两个节点，在给 trie 中的节点赋 failnode 的时候，如果采用深度优先的话还是很麻烦的，因为我要实时
记录当前节点的父节点，相信写过树的朋友都清楚，除了深搜，我们还有广搜。

1 ///

2 /// 构建失败指标 (这里我们采用 BFS 的做法)
3 ///

4 /// 5 public void BuildFailNodeBFS(ref TrieNode root)
6 {
7 //根节点入队
8 queue.Enqueue(root);
9
10 while (queue.Count != 0)
11 {
12 //出队
13 var temp = queue.Dequeue();
14
15 //失败节点
16 TrieNode failNode = null;
17
18 //26 叉树
19 for (int i = 0; i < 26; i++) 20 { 21 //程式码技巧：用 BFS 方式，从当前节点找其孩子节点，此时孩子节点 22 // 的父亲正是当前节点，（避免了 parent 节点的存在） 23 if (temp.childNodes[i] == null) 24 continue; 25 26 //如果当前是根节点，则根节点的失败指标指向 root 27 if (temp == root) 28 { 29 temp.childNodes[i].faliNode = root; 30 } 31 else 32 { 33 //获取出队节点的失败指标 34 failNode = temp.faliNode; 35 36 //沿著它父节点的失败指标走，一直要找到一个节点，直到它的儿子也包含该节点。 37 while (failNode != null) 38 { 39 //如果不为空，则在父亲失败节点中往子节点中深入。 40 if (failNode.childNodes[i] != null) 41 { 42 temp.childNodes[i].faliNode = failNode.childNodes[i]; 43 break; 44 } 45 //如果无法深入子节点，则退回到父亲失败节点并向 root 节点往根部延伸，直到 null 46 //（一个回溯再深入的过程，非常有意思） 47 failNode = failNode.faliNode; 48 } 49 50 //等于 null 的话，指向 root 节点 51 if (failNode == null) 52 temp.childNodes[i].faliNode = root; 53 } 54 queue.Enqueue(temp.childNodes[i]); 55 } 56 } 57 } 3：模式匹配所有字元在匹配完后都必须要走 failnode 节点来结束自己的旅途, 相当于一个回旋，这样做的目的防止包含节点被忽略掉。比如：我匹配到了”she”，必然会匹配到该字串的字尾”he”，要想在程式中匹配到，则必须节点要走失败指标来结束自己的旅途。从上图中我们可以清楚的看到 “she” 的匹配到字元”e” 后，从 failnode 指标撤退，在撤退途中将其字尾字元 “e” 收入囊肿，这也就是为什么像 kmp 中的 next 函式。 1 ///

2 /// 根据指定的主串，检索是否存在模式串
3 ///

4 /// 5 /// 6 ///
7 public void SearchAC(ref TrieNode root, string s, ref HashSet hashSet)
8 {
9 int freq = 0;
10
11 TrieNode head = root;
12
13 foreach (var c in s)
14 {
15 //计算位置
16 int index = c – ‘a’;
17
18 //如果当前匹配的字元在 trie 树中无子节点并且不是 root，则要走失败指标
19 //回溯的去找它的当前节点的子节点
20 while ((head.childNodes[index] == null) && (head != root))
21 head = head.faliNode;
22
23 //获取该叉树
24 head = head.childNodes[index];
25
26 //如果为空，直接给 root, 表示该字元已经走完毕了
27 if (head == null)
28 head = root;
29
30 var temp = head;
31
32 //在 trie 树中匹配到了字元，标记当前节点为已访问，并继续寻找该节点的失败节点。
33 //直到 root 结束，相当于走了一个回旋。 (注意：最后我们会出现一个 freq=-1 的失败指标链)
34 while (temp != root && temp.freq != -1)
35 {
36 freq += temp.freq;
37
38 //将找到的 id 追加到集合中
39 foreach (var item in temp.hashSet)
40 hashSet.Add(item);
41
42 temp.freq = -1;
43
44 temp = temp.faliNode;
45 }
46 }
47 }

好了，到现在为止，我想大家也比较清楚了，最后上一个总的执行程式码：

1 using System;
2 using System.Collections.Generic;
3 using System.Linq;
4 using System.Text;
5 using System.Diagnostics;
6 using System.Threading;
7 using System.IO;
8
9 namespace ConsoleApplication2
10 {
11 public class Program
12 {
13 public static void Main()
14 {
15 Trie trie = new Trie();
16
17 trie.AddTrieNode(“say”, 1);
18 trie.AddTrieNode(“she”, 2);
19 trie.AddTrieNode(“shr”, 3);
20 trie.AddTrieNode(“her”, 4);
21 trie.AddTrieNode(“he”, 5);
22
23 trie.BuildFailNodeBFS();
24
25 string s = “yasherhs”;
26
27 var hashSet = trie.SearchAC(s);
28
29 Console.WriteLine(“ 在主串 {0} 中存在模式串的编号为:{1}”, s, string.Join(“,”, hashSet));
30
31 Console.Read();
32 }
33 }
34
35 public class Trie
36 {
37 public TrieNode trieNode = new TrieNode();
38
39 ///

40 /// 用光搜的方法来构建失败指标
41 ///

42 public Queue queue = new Queue();
43
44 #region Trie 树节点
45 ///

46 /// Trie 树节点
47 ///

48 public class TrieNode
49 {
50 ///

51 /// 26 个字元，也就是 26 叉树
52 ///

53 public TrieNode[] childNodes;
54
55 ///

56 /// 词频统计
57 ///

58 public int freq;
59
60 ///

61 /// 记录该节点的字元
62 ///

63 public char nodeChar;
64
65 ///

66 /// 失败指标
67 ///

68 public TrieNode faliNode;
69
70 ///

71 /// 插入记录时的编号 id
72 ///

73 public HashSet hashSet = new HashSet();
74
75 ///

76 /// 初始化
77 ///

78 public TrieNode()
79 {
80 childNodes = new TrieNode[26];
81 freq = 0;
82 }
83 }
84 #endregion
85
86 #region 插入操作
87 ///

88 /// 插入操作
89 ///

90 /// 91 /// 92 public void AddTrieNode(string word, int id)
93 {
94 AddTrieNode(ref trieNode, word, id);
95 }
96
97 ///

98 /// 插入操作
99 ///

100 /// 101 /// 102 public void AddTrieNode(ref TrieNode root, string word, int id)
103 {
104 if (word.Length == 0)
105 return;
106
107 //求字元地址，方便将该字元放入到 26 叉树中的哪一叉中
108 int k = word[0] – ‘a’;
109
110 //如果该叉树为空，则初始化
111 if (root.childNodes[k] == null)
112 {
113 root.childNodes[k] = new TrieNode();
114
115 //记录下字元
116 root.childNodes[k].nodeChar = word[0];
117 }
118
119 var nextWord = word.Substring(1);
120
121 //说明是最后一个字元，统计该词出现的次数
122 if (nextWord.Length == 0)
123 {
124 root.childNodes[k].freq++;
125 root.childNodes[k].hashSet.Add(id);
126 }
127
128 AddTrieNode(ref root.childNodes[k], nextWord, id);
129 }
130 #endregion
131
132 #region 构建失败指标
133 ///

134 /// 构建失败指标 (这里我们采用 BFS 的做法)
135 ///

136 public void BuildFailNodeBFS()
137 {
138 BuildFailNodeBFS(ref trieNode);
139 }
140
141 ///

142 /// 构建失败指标 (这里我们采用 BFS 的做法)
143 ///

144 /// 145 public void BuildFailNodeBFS(ref TrieNode root)
146 {
147 //根节点入队
148 queue.Enqueue(root);
149
150 while (queue.Count != 0)
151 {
152 //出队
153 var temp = queue.Dequeue();
154
155 //失败节点
156 TrieNode failNode = null;
157
158 //26 叉树
159 for (int i = 0; i < 26; i++) 160 { 161 //程式码技巧：用 BFS 方式，从当前节点找其孩子节点，此时孩子节点 162 // 的父亲正是当前节点，（避免了 parent 节点的存在） 163 if (temp.childNodes[i] == null) 164 continue; 165 166 //如果当前是根节点，则根节点的失败指标指向 root 167 if (temp == root) 168 { 169 temp.childNodes[i].faliNode = root; 170 } 171 else 172 { 173 //获取出队节点的失败指标 174 failNode = temp.faliNode; 175 176 //沿著它父节点的失败指标走，一直要找到一个节点，直到它的儿子也包含该节点。 177 while (failNode != null) 178 { 179 //如果不为空，则在父亲失败节点中往子节点中深入。 180 if (failNode.childNodes[i] != null) 181 { 182 temp.childNodes[i].faliNode = failNode.childNodes[i]; 183 break; 184 } 185 //如果无法深入子节点，则退回到父亲失败节点并向 root 节点往根部延伸，直到 null 186 //（一个回溯再深入的过程，非常有意思） 187 failNode = failNode.faliNode; 188 } 189 190 //等于 null 的话，指向 root 节点 191 if (failNode == null) 192 temp.childNodes[i].faliNode = root; 193 } 194 queue.Enqueue(temp.childNodes[i]); 195 } 196 } 197 } 198 #endregion 199 200 #region 检索操作 201 ///

202 /// 根据指定的主串，检索是否存在模式串
203 ///

204 /// 205 ///
206 public HashSet SearchAC(string s)
207 {
208 HashSet hash = new HashSet();
209
210 SearchAC(ref trieNode, s, ref hash);
211
212 return hash;
213 }
214
215 ///

216 /// 根据指定的主串，检索是否存在模式串
217 ///

218 /// 219 /// 220 ///
221 public void SearchAC(ref TrieNode root, string s, ref HashSet hashSet)
222 {
223 int freq = 0;
224
225 TrieNode head = root;
226
227 foreach (var c in s)
228 {
229 //计算位置
230 int index = c – ‘a’;
231
232 //如果当前匹配的字元在 trie 树中无子节点并且不是 root，则要走失败指标
233 //回溯的去找它的当前节点的子节点
234 while ((head.childNodes[index] == null) && (head != root))
235 head = head.faliNode;
236
237 //获取该叉树
238 head = head.childNodes[index];
239
240 //如果为空，直接给 root, 表示该字元已经走完毕了
241 if (head == null)
242 head = root;
243
244 var temp = head;
245
246 //在 trie 树中匹配到了字元，标记当前节点为已访问，并继续寻找该节点的失败节点。
247 //直到 root 结束，相当于走了一个回旋。 (注意：最后我们会出现一个 freq=-1 的失败指标链)
248 while (temp != root && temp.freq != -1)
249 {
250 freq += temp.freq;
251
252 //将找到的 id 追加到集合中
253 foreach (var item in temp.hashSet)
254 hashSet.Add(item);
255
256 temp.freq = -1;
257
258 temp = temp.faliNode;
259 }
260 }
261 }
262 #endregion
263 }
264 }

经典演算法题每日演练——第八题 AC 自动机

经典演算法题每日演练——第八题 AC 自动机