Skip to content

Commit 12e6905

Browse files
azhar1038 and Thuva4 authored
Added AhoCorasick - Python (#886)
* Create AhoCorasick.py * Update README.md * Update CONTRIBUTING.md * Update README.md Co-authored-by: Thuvarakan Tharmarajasingam <17289840+Thuva4@users.noreply.github.com>
1 parent eb6677a commit 12e6905

File tree

3 files changed

+198
-1
lines changed

3 files changed

+198
-1
lines changed

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,4 @@ Unfortunately, sometimes the bug can be only reproduced in your project or in yo
186186
- [Esci92](https://github.com/Esci92)
187187
- [ir2010](https://github.com/ir2010)
188188
- [Cc618](https://github.com/Cc618)
189+
- [Md Azharuddin](https://github.com/azhar1038)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,4 +117,4 @@ Folder structure should be like this
117117

118118
## License
119119

120-
[Apache License 2.0](LICENSE)
120+
[Apache License 2.0](LICENSE)
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
# Python program for implementation of
2+
# Aho-Corasick algorithm for string matching
3+
4+
# defaultdict is used only for creating the dictionary
5+
# which is the final output
6+
from collections import defaultdict, deque
7+
8+
# For simplicity, arrays are implemented using plain lists.
# The breadth-first queue is a collections.deque so that removing
# the front state does not shift the remaining elements.
class AhoCorasick:
    """Aho-Corasick matching machine for a fixed list of keywords.

    Only the lowercase alphabet [a, z] is supported. Keywords containing
    any other character are rejected with ValueError; any other character
    met in the searched text simply resets the machine to the root state,
    because no keyword can span it.
    """

    def __init__(self, words):
        """Build the goto, failure and output functions for *words*.

        words -- list of keywords made only of characters in [a, z].

        Raises ValueError if a keyword contains a character outside [a, z].
        """
        # Max number of states in the matching machine.
        # Should be equal to the sum of the length of all keywords.
        self.max_states = sum(len(word) for word in words)

        # Maximum number of characters.
        # Currently supports only alphabets [a,z]
        self.max_characters = 26

        # All the words in dictionary which will be used to create Trie
        self.words = words

        # OUTPUT FUNCTION IS IMPLEMENTED USING out []
        # Bit i in this mask is one if the word with
        # index i appears when the machine enters this state.
        # Lets say, a state outputs two words "he" and "she" and
        # in our provided words list, he has index 0 and she has index 3
        # so value of out[state] for this state will be 1001
        # It has been initialized to all 0.
        # We have taken one extra state for the root.
        self.out = [0] * (self.max_states + 1)

        # FAILURE FUNCTION IS IMPLEMENTED USING fail []
        # There is one value for each state + 1 for the root
        # It has been initialized to all -1
        # This will contain the fail state value for each state
        self.fail = [-1] * (self.max_states + 1)

        # GOTO FUNCTION (OR TRIE) IS IMPLEMENTED USING goto [[]]
        # Number of rows = max_states + 1
        # Number of columns = max_characters i.e 26 in our case
        # It has been initialized to all -1.
        self.goto = [
            [-1] * self.max_characters for _ in range(self.max_states + 1)
        ]

        # Once the Trie has been built, it will contain the number
        # of nodes in Trie which is total number of states
        # required <= max_states + 1 (the root included)
        self.states_count = self.__build_matching_machine()

    def __char_index(self, character):
        """Return the goto column for *character*, or -1 if unsupported.

        The explicit range check matters: a negative list index would
        silently wrap around to another letter's column.
        """
        index = ord(character) - 97  # Ascii value of 'a' is 97
        if 0 <= index < self.max_characters:
            return index
        return -1

    # Builds the String matching machine.
    # Returns the number of states that the built machine has.
    # States are numbered 0 up to the return value - 1, inclusive.
    def __build_matching_machine(self):
        # Initially, we just have the 0 state
        states = 1

        # Compute values for goto function, i.e., fill goto
        # This is same as building a Trie for words[]
        for i, word in enumerate(self.words):
            current_state = 0

            # Process all the characters of the current word
            for character in word:
                ch = self.__char_index(character)
                if ch == -1:
                    raise ValueError(
                        "Unsupported character %r in word %r; "
                        "only [a, z] is supported" % (character, word)
                    )

                # Allocate a new node (create a new state)
                # if a node for ch doesn't exist.
                if self.goto[current_state][ch] == -1:
                    self.goto[current_state][ch] = states
                    states += 1

                current_state = self.goto[current_state][ch]

            # Add current word in output function
            self.out[current_state] |= (1 << i)

        # For all characters which don't have
        # an edge from root (or state 0) in Trie,
        # add a goto edge to state 0 itself
        for ch in range(self.max_characters):
            if self.goto[0][ch] == -1:
                self.goto[0][ch] = 0

        # Failure function is computed in
        # breadth first order using a queue
        queue = deque()

        # Iterate over every possible input
        for ch in range(self.max_characters):
            first_level = self.goto[0][ch]

            # All nodes of depth 1 have failure function value as 0.
            if first_level != 0:
                self.fail[first_level] = 0
                queue.append(first_level)

        # Now queue holds every state of depth 1
        while queue:
            # Remove the front state from queue
            state = queue.popleft()

            # For the removed state, compute the failure value of
            # every child, i.e. of every character for which the
            # goto function is defined.
            for ch in range(self.max_characters):
                child = self.goto[state][ch]
                if child == -1:
                    continue

                # Find failure state of removed state
                failure = self.fail[state]

                # Find the deepest node labeled by proper
                # suffix of String from root to current state.
                # This always terminates: the root has an edge
                # for every character.
                while self.goto[failure][ch] == -1:
                    failure = self.fail[failure]

                failure = self.goto[failure][ch]
                self.fail[child] = failure

                # Merge output values
                self.out[child] |= self.out[failure]

                # Insert the next level node (of Trie) in Queue
                queue.append(child)

        return states

    # Returns the next state the machine will transition to using goto
    # and failure functions.
    # current_state - The current state of the machine. Must be between
    #                 0 and the number of states - 1, inclusive.
    # next_input - The next character that enters into the machine.
    def __find_next_state(self, current_state, next_input):
        ch = self.__char_index(next_input)

        # A character outside the alphabet cannot be part of any
        # word, so the machine restarts from the root.
        if ch == -1:
            return 0

        # If goto is not defined, use
        # failure function
        answer = current_state
        while self.goto[answer][ch] == -1:
            answer = self.fail[answer]

        return self.goto[answer][ch]

    # This function finds all occurrences of all words in text.
    def search_words(self, text):
        """Return a defaultdict mapping each found word to its start indices.

        text -- the string to scan. Characters outside [a, z] never match
                and break any partial match in progress.
        """
        # Initialize current_state to 0
        current_state = 0

        # A dictionary to store the result.
        # Key here is the found word
        # Value is a list of all occurrences start index
        result = defaultdict(list)

        # Traverse the text through the built machine
        # to find all occurrences of words
        for i, character in enumerate(text):
            current_state = self.__find_next_state(current_state, character)

            # If match not found, move to next state
            matched = self.out[current_state]
            if matched == 0:
                continue

            # Match found, store the word in result dictionary
            for j, word in enumerate(self.words):
                if matched & (1 << j):
                    # Start index of word is (i-len(word)+1)
                    result[word].append(i - len(word) + 1)

        # Return the final result dictionary
        return result
181+
182+
# Driver code
183+
if __name__ == "__main__":
    # Sample keywords and the text that is scanned for them
    words = ["he", "she", "hers", "his"]
    text = "ahishers"

    # Constructing the object builds the whole matching machine
    aho_chorasick = AhoCorasick(words)

    # Maps every found word to the list of its start indices
    result = aho_chorasick.search_words(text)

    # Report each occurrence as an inclusive index range
    for word, starts in result.items():
        for i in starts:
            last = i + len(word) - 1
            print("Word", word, "appears from", i, "to", last)

0 commit comments

Comments
 (0)