|
| 1 | +# Python program for implementation of |
| 2 | +# Aho-Corasick algorithm for string matching |
| 3 | + |
| 4 | +# defaultdict is used only for creating dictionary |
| 5 | +# which is the final output |
| 6 | +from collections import defaultdict |
| 7 | + |
| 8 | +# For simplicity, arrays and queues are implemented using plain lists. |
| 9 | +# To improve performance, use dedicated array and queue types (e.g. collections.deque) instead. |
class AhoCorasick:
    """Aho-Corasick automaton that finds every occurrence of a set of keywords.

    The alphabet is limited to the lowercase letters 'a'..'z'. Keywords that
    contain any other character are rejected with ``ValueError``. Characters
    of the searched text that fall outside the alphabet cannot be part of any
    keyword, so they simply reset the automaton to its root state.

    Attributes:
        words: Private copy of the keywords; the index of a keyword in this
            list is the bit position used for it in ``out``.
        max_states: Upper bound on the number of non-root states (sum of the
            keyword lengths).
        max_characters: Alphabet size (26).
        out: Output function. Bit ``i`` of ``out[state]`` is set when keyword
            ``i`` ends at ``state``.
        fail: Failure function; ``fail[state]`` is the state to fall back to
            when ``state`` has no goto edge for the current character.
        goto: Goto function (the trie); ``goto[state][ch]`` is the next state
            or -1 when no edge exists.
        states_count: Number of states actually created, root included.
    """

    def __init__(self, words):
        """Build the matching machine for ``words``.

        Args:
            words: Iterable of keywords made only of the letters 'a'..'z'.

        Raises:
            ValueError: If a keyword contains a character outside 'a'..'z'.
        """
        # Keep a private copy: the bit positions stored in ``out`` refer to
        # indices in this list, so it must not change behind our back.
        self.words = list(words)

        # Max number of states in the matching machine (root excluded).
        # The trie can never have more nodes than the total keyword length.
        self.max_states = sum(len(word) for word in self.words)

        # Maximum number of characters. Only the letters [a, z] are supported.
        self.max_characters = 26

        # OUTPUT FUNCTION, stored as one bit mask per state.
        # Example: if a state outputs "he" (index 0) and "she" (index 3),
        # out[state] is 0b1001. One extra slot is reserved for the root.
        self.out = [0] * (self.max_states + 1)

        # FAILURE FUNCTION, one entry per state plus the root, -1 = unset.
        self.fail = [-1] * (self.max_states + 1)

        # GOTO FUNCTION (the trie): (max_states + 1) rows of max_characters
        # columns, -1 = no edge.
        self.goto = [
            [-1] * self.max_characters for _ in range(self.max_states + 1)
        ]

        # Number of trie nodes actually used (<= max_states + 1).
        self.states_count = self.__build_matching_machine()

    def _char_index(self, character):
        """Return the goto column for ``character``, or -1 if unsupported."""
        index = ord(character) - ord("a")
        if 0 <= index < self.max_characters:
            return index
        return -1

    def __build_matching_machine(self):
        """Fill ``goto``, ``fail`` and ``out`` from ``self.words``.

        Returns:
            The number of states of the built machine. States are numbered
            from 0 up to the return value - 1, inclusive.

        Raises:
            ValueError: If a keyword contains a character outside 'a'..'z'.
        """
        # Initially, we just have the 0 (root) state.
        states = 1

        # Construct the goto function: this is the same as building a trie.
        for i, word in enumerate(self.words):
            current_state = 0

            for character in word:
                ch = self._char_index(character)
                if ch == -1:
                    # A negative or too-large index would otherwise alias
                    # another column or crash with an obscure IndexError.
                    raise ValueError(
                        "unsupported character %r in word %r; "
                        "only 'a'-'z' are allowed" % (character, word)
                    )

                # Allocate a new state if there is no edge for ch yet.
                if self.goto[current_state][ch] == -1:
                    self.goto[current_state][ch] = states
                    states += 1

                current_state = self.goto[current_state][ch]

            # The state reached at the end of the word outputs that word.
            self.out[current_state] |= 1 << i

        # Characters without an edge from the root loop back to the root, so
        # following failure links always terminates there.
        root_edges = self.goto[0]
        for ch in range(self.max_characters):
            if root_edges[ch] == -1:
                root_edges[ch] = 0

        # The failure function is computed in breadth-first order. The list
        # is consumed through a moving head index, which avoids the O(n)
        # cost of list.pop(0) on every step.
        queue = []

        # All states of depth 1 fail back to the root.
        for ch in range(self.max_characters):
            child = root_edges[ch]
            if child != 0:
                self.fail[child] = 0
                queue.append(child)

        head = 0
        while head < len(queue):
            state = queue[head]
            head += 1

            for ch in range(self.max_characters):
                child = self.goto[state][ch]
                if child == -1:
                    continue

                # Walk the failure chain of the parent until a state with an
                # edge for ch is found: that is the deepest state labelled by
                # a proper suffix of the string leading to ``child``.
                failure = self.fail[state]
                while self.goto[failure][ch] == -1:
                    failure = self.fail[failure]
                failure = self.goto[failure][ch]
                self.fail[child] = failure

                # Every word recognised at the failure state is a suffix of
                # the current string, so it is recognised here as well.
                self.out[child] |= self.out[failure]

                # Process the next trie level later.
                queue.append(child)

        return states

    def __find_next_state(self, current_state, next_input):
        """Return the state reached from ``current_state`` on ``next_input``.

        Args:
            current_state: Current state of the machine, between 0 and the
                number of states - 1, inclusive.
            next_input: The next character fed to the machine.

        Returns:
            The next state. Characters outside 'a'..'z' lead to the root
            because no keyword can contain them.
        """
        ch = self._char_index(next_input)
        if ch == -1:
            return 0

        # While goto is not defined, fall back through the failure function.
        answer = current_state
        while self.goto[answer][ch] == -1:
            answer = self.fail[answer]

        return self.goto[answer][ch]

    def search_words(self, text):
        """Find all occurrences of all keywords in ``text``.

        Args:
            text: The string to scan.

        Returns:
            A ``defaultdict(list)`` mapping each keyword that was found to
            the list of start indices of its occurrences, in increasing order.
        """
        current_state = 0
        result = defaultdict(list)

        # Run the text through the machine one character at a time.
        for i, character in enumerate(text):
            current_state = self.__find_next_state(current_state, character)

            matched = self.out[current_state]
            if matched == 0:
                # No keyword ends at this position.
                continue

            for j, word in enumerate(self.words):
                if matched & (1 << j):
                    # The word ends at i, so it starts at i - len(word) + 1.
                    result[word].append(i - len(word) + 1)

        return result
| 181 | + |
# Driver code: small demonstration of the matcher on a fixed example.
if __name__ == "__main__":
    patterns = ["he", "she", "hers", "his"]
    haystack = "ahishers"

    # Building the object constructs the trie and the failure links.
    matcher = AhoCorasick(patterns)

    # Maps each found word to the list of its start indices.
    occurrences = matcher.search_words(haystack)

    # Report every occurrence with its inclusive start and end index.
    for found, starts in occurrences.items():
        for start in starts:
            end = start + len(found) - 1
            print("Word", found, "appears from", start, "to", end)
0 commit comments