Motivation
I have this repository. It contains a program that analyzes an input text file and builds a word graph: in the graph, each node represents a word in the analyzed text. Now, if there are two consecutive words, A and B, we create a directed arc from A to B. When the graph is built, the last step is to draw a probabilistic distribution of children and parents of a node, for each node.
When the graph is complete, in order to generate a sentence, we choose the node corresponding to the single dot (.), and go backwards via parent links until the requested number of nodes has been accummulated. When considering at a node what parent to generate, we watch at all the parents \$w_1, \ldots, w_n\$: then the statistical probabilities of each parent becomes \$p_1, \ldots, p_n\$. Finally, we toss a random coin \$c \in [0, \sum_{i = 1}^{n} p_i)\$, and choose the \$w_j\$ such that \$\sum_{i = 1}^j p_i \leq c\$, where \$j\$ is maximized.
In this program, I did not bother to come up with a fancy framework, but instead concentrate on getting the job done; no more, no less.
The book from which the word graph was trained is War and Peace.
Finally, the typical output might look like this:
Producing sentences took 68 ms. Producing word data took 295 ms. Building word graph took 300 ms. Total preprocessing took 663 ms. > stat belong --- Outgoing word arcs: . , w = 1,0000, p = 0,071429 ? , w = 1,0000, p = 0,071429 entirely, w = 1,0000, p = 0,071429 to , w = 10,0000, p = 0,714286 you , w = 1,0000, p = 0,071429 Total of 5 outgoing arcs. --- Incoming word arcs: could , w = 1,000, p = 0,071429 i , w = 2,000, p = 0,142857 may , w = 1,000, p = 0,071429 might , w = 1,000, p = 0,071429 not , w = 1,000, p = 0,071429 should, w = 1,000, p = 0,071429 to , w = 2,000, p = 0,142857 we , w = 1,000, p = 0,071429 you , w = 4,000, p = 0,285714 Total of 9 incoming arcs. > gen 15 >>> The mechanism which hampered his gleaming on pass by unseen hands clenched his imagination . > gen 15 >>> You she would not our as may turn to remain what had never breathe . > gen 15 >>> Lighting the married her hand more readily as cruel he was gazing at last . > gen 15 >>> Secret reasons why the table to the crowds of the transport having limbered up . Program code
com.github.coderodde.ai.sentencegenerator.BinaryTreeProbabilityDistribution.java:
package com.github.coderodde.ai.sentencegenerator; import java.util.HashMap; import java.util.Map; import java.util.NoSuchElementException; import java.util.Random; final class BinaryTreeProbabilityDistribution { public static final class FieldLengths { public final int maximumWordLength; public final int maximumWeightLength; private FieldLengths(int maximumWordLength, int maximumWeightLength) { this.maximumWordLength = maximumWordLength; this.maximumWeightLength = maximumWeightLength; } } private static final class Node { private final DirectedWordGraphNode element; private double weight; private final boolean isRelayNode; private Node leftChild; private Node rightChild; private Node parent; private int numberOfLeafNodes; Node(DirectedWordGraphNode element, double weight) { this.element = element; this.weight = weight; this.numberOfLeafNodes = 1; this.isRelayNode = false; } Node(double weight) { this.element = null; this.weight = weight; this.numberOfLeafNodes = 1; this.isRelayNode = true; } @Override public String toString() { if (isRelayNode) { return "[" + String.format("%.3f", getWeight()) + " : " + numberOfLeafNodes + "]"; } return "(" + String.format("%.3f", getWeight()) + " : " + element + ")"; } DirectedWordGraphNode getElement() { return element; } double getWeight() { return weight; } void setWeight(double weight) { this.weight = weight; } int getNumberOfLeaves() { return numberOfLeafNodes; } void setNumberOfLeaves(int numberOfLeaves) { this.numberOfLeafNodes = numberOfLeaves; } Node getLeftChild() { return leftChild; } void setLeftChild(Node block) { this.leftChild = block; } Node getRightChild() { return rightChild; } void setRightChild(Node block) { this.rightChild = block; } Node getParent() { return parent; } void setParent(Node block) { this.parent = block; } boolean isRelayNode() { return isRelayNode; } } private final Map<DirectedWordGraphNode, Node> map = new HashMap<>(); private Node root; private double totalWeight; private final Random random; public BinaryTreeProbabilityDistribution() { this(new Random()); } public BinaryTreeProbabilityDistribution(Random random) { this.random = random; } public boolean addElement(DirectedWordGraphNode element, double weight) { checkWeightNotNaNAndIsPositive(weight); Node node = map.get(element); if (node == null) { node = new Node(element, weight); insert(node); map.put(element, node); } else { node.setWeight(node.getWeight() + weight); updateMetadata(node, weight, 0); } totalWeight += weight; return true; } public boolean contains(DirectedWordGraphNode element) { return map.containsKey(element); } public DirectedWordGraphNode sampleElement() { checkNotEmpty(map.size()); double value = totalWeight * random.nextDouble(); Node node = root; while (node.isRelayNode()) { if (value < node.getLeftChild().getWeight()) { node = node.getLeftChild(); } else { value -= node.getLeftChild().getWeight(); node = node.getRightChild(); } } return node.getElement(); } public double getTotalWeight() { return totalWeight; } public double getWeight(DirectedWordGraphNode element) { Node node = map.get(element); if (node == null) { throw new NoSuchElementException( "The input element not found in the distribution."); } return node.getWeight(); } public double getProbability(DirectedWordGraphNode element) { Node node = map.get(element); if (node == null) { throw new NoSuchElementException( "The input element not found in the distribution."); } return node.getWeight() / totalWeight; } public boolean removeElement(DirectedWordGraphNode element) { Node node = map.remove(element); if (node == null) { return false; } delete(node); totalWeight -= node.getWeight(); return true; } public void clear() { root = null; map.clear(); totalWeight = 0.0; } public boolean isEmpty() { return map.isEmpty(); } public int size() { return map.size(); } public FieldLengths getFieldLengths() { int maximumWordLength = 0; int maximumWeightLength = 0; for (Node node : map.values()) { String word = node.getElement().getWord(); double weight = node.getWeight(); int wordLength = word.length(); int weightLength = Double.toString(weight).length(); maximumWordLength = Math.max(maximumWordLength, wordLength); maximumWeightLength = Math.max(maximumWeightLength, weightLength); } return new FieldLengths(maximumWordLength, maximumWeightLength); } public String getEntryString(DirectedWordGraphNode element) { Node node = map.get(element); if (node == null) { return null; } StringBuilder stringBuilder = new StringBuilder(); double probability = node.getWeight() / totalWeight; stringBuilder.append(element.toString()) .append(", w = ") .append(node.getWeight()) .append(", p = ") .append(probability); return stringBuilder.toString(); } // @Override // public String toString() { // StringBuilder stringBuilder = new StringBuilder(); // Node node = getMinimumNode(); // // int maximumWordLength = getMaximumWordLength(); // // while (node != null) { // String str = // String.format( // "%+" + (maximumWordLength + 1) + "s", // node.element.toString()); // // stringBuilder.append(str) // .append(", w = ") // .append(node.getWeight()) // .append(", p = ") // .append(node.getWeight() / totalWeight) // .append("\n"); // // node = getSuccessorOf(node); // } // // return stringBuilder.toString(); // } private Node getMinimumNode() { if (isEmpty()) { return null; } Node node = root; while (node.getLeftChild() != null) { node = node.getLeftChild(); } return node; } private Node getMinimumNode(Node node) { while (node.getLeftChild() != null) { node = node.getLeftChild(); } return node; } private Node getSuccessorOf(Node node) { if (node.getRightChild() != null) { return getMinimumNode(node.getRightChild()); } Node parent = node.getParent(); while (parent != null && parent.getRightChild() == node) { node = parent; parent = parent.getParent(); } return parent; } private void bypassLeafNode(Node leafNodeToBypass, Node newNode) { Node relayNode = new Node(leafNodeToBypass.getWeight()); Node parentOfCurrentNode = leafNodeToBypass.getParent(); relayNode.setLeftChild(leafNodeToBypass); relayNode.setRightChild(newNode); leafNodeToBypass.setParent(relayNode); newNode.setParent(relayNode); if (parentOfCurrentNode == null) { root = relayNode; } else if (parentOfCurrentNode.getLeftChild() == leafNodeToBypass) { relayNode.setParent(parentOfCurrentNode); parentOfCurrentNode.setLeftChild(relayNode); } else { relayNode.setParent(parentOfCurrentNode); parentOfCurrentNode.setRightChild(relayNode); } updateMetadata(relayNode, newNode.getWeight(), 1); } private void insert(Node node) { if (root == null) { root = node; return; } Node currentNode = root; while (currentNode.isRelayNode()) { if (currentNode.getLeftChild().getNumberOfLeaves() < currentNode.getRightChild().getNumberOfLeaves()) { currentNode = currentNode.getLeftChild(); } else { currentNode = currentNode.getRightChild(); } } bypassLeafNode(currentNode, node); } private void delete(Node leafToDelete) { Node relayNode = leafToDelete.getParent(); if (relayNode == null) { root = null; return; } Node parentOfRelayNode = relayNode.getParent(); Node siblingLeaf = relayNode.getLeftChild() == leafToDelete ? relayNode.getRightChild() : relayNode.getLeftChild(); if (parentOfRelayNode == null) { root = siblingLeaf; siblingLeaf.setParent(null); return; } if (parentOfRelayNode.getLeftChild() == relayNode) { parentOfRelayNode.setLeftChild(siblingLeaf); } else { parentOfRelayNode.setRightChild(siblingLeaf); } siblingLeaf.setParent(parentOfRelayNode); updateMetadata(leafToDelete.getParent(), -leafToDelete.getWeight(), -1); } private void updateMetadata(Node node, double weightDelta, int nodeDelta) { while (node != null) { node.setNumberOfLeaves(node.getNumberOfLeaves() + nodeDelta); node.setWeight(node.getWeight() + weightDelta); node = node.getParent(); } } private void checkWeightNotNaNAndIsPositive(double weight) { if (Double.isNaN(weight)) { throw new IllegalArgumentException("The element weight is NaN."); } if (weight <= 0.0) { throw new IllegalArgumentException( "The element weight must be positive. Received " + weight); } if (Double.isInfinite(weight)) { // Once here, 'weight' is positive infinity. throw new IllegalArgumentException( "The element weight is infinite."); } } private void checkNotEmpty(int size) { if (size == 0) { throw new IllegalStateException( "This probability distribution is empty."); } } } com.github.coderodde.ai.sentencegenerator.DirectedWordGraphNode.java:
package com.github.coderodde.ai.sentencegenerator; import java.util.HashMap; import java.util.Map; import java.util.Set; final class DirectedWordGraphNode implements Comparable<DirectedWordGraphNode> { private final String word; private final BinaryTreeProbabilityDistribution parentProbabilityDistribution = new BinaryTreeProbabilityDistribution(); private final BinaryTreeProbabilityDistribution childProbabilityDistribution = new BinaryTreeProbabilityDistribution(); private final Map<DirectedWordGraphNode, Integer> childMap = new HashMap<>(); private final Map<DirectedWordGraphNode, Integer> parentMap = new HashMap<>(); public DirectedWordGraphNode(String word) { this.word = word; } public String getWord() { return word; } public double getChildWeight(DirectedWordGraphNode child) { return getParentProbabilityDistribution().getWeight(child); } public double getParentWeight(DirectedWordGraphNode parent) { return getChildProbabilityDistribution().getWeight(parent); } public void connectToParent(DirectedWordGraphNode parentNode) { if (!parentMap.containsKey(parentNode)) { parentMap.put(parentNode, 1); } else { parentMap.put( parentNode, parentMap.get(parentNode) + 1); } if (!parentNode.childMap.containsKey(this)) { parentNode.childMap.put(this, 1); } else { parentNode.childMap.put( this, parentNode.childMap.get(this) + 1); } } public Set<DirectedWordGraphNode> getChildren() { return childMap.keySet(); } public Set<DirectedWordGraphNode> getParents() { return parentMap.keySet(); } public void computeProbabilityDistribution() { for (Map.Entry<DirectedWordGraphNode, Integer> entry : parentMap.entrySet()) { double weight = (1.0) * entry.getValue(); parentProbabilityDistribution.addElement( entry.getKey(), weight); } for (Map.Entry<DirectedWordGraphNode, Integer> entry : childMap.entrySet()) { double weight = (1.0) * entry.getValue(); childProbabilityDistribution.addElement( entry.getKey(), weight); } } public DirectedWordGraphNode sampleParent() { if (parentProbabilityDistribution.isEmpty()) { return null; } return parentProbabilityDistribution.sampleElement(); } public BinaryTreeProbabilityDistribution getParentProbabilityDistribution() { return parentProbabilityDistribution; } public BinaryTreeProbabilityDistribution getChildProbabilityDistribution() { return childProbabilityDistribution; } @Override public String toString() { return word; } @Override public boolean equals(Object o) { DirectedWordGraphNode other = (DirectedWordGraphNode) o; return word.equals(other.word); } @Override public int hashCode() { return word.hashCode(); } @Override public int compareTo(DirectedWordGraphNode o) { return word.compareTo(o.word); } } com.github.coderodde.ai.sentencegenerator.SentenceGenerator.java:
package com.github.coderodde.ai.sentencegenerator; import com.github.coderodde.ai.sentencegenerator.WordGraphBuilder.Data; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Scanner; final class SentenceGenerator { private static final class CommandNames { private static final String GENERATE_SENTENCE = "gen"; private static final String GET_NUMBER_OF_WORDS = "words"; private static final String GET_NUMBER_OF_SENTENCES = "sentences"; private static final String LIST_ALL_WORDS = "list"; private static final String LIST_WORD_RANGE = "range"; private static final String WORD_STAT = "stat"; private static final String QUIT = "quit"; } public static void main(String[] args) { if (args.length != 1) { System.exit(1); } List<String> sentences = null; long totalPreprocessingDuration = 0L; try { long startTime = System.currentTimeMillis(); sentences = new SentenceProducer(args[0]).getSentences(); long endTime = System.currentTimeMillis(); long duration = endTime - startTime; totalPreprocessingDuration += duration; System.out.println("Producing sentences took " + duration + " ms."); } catch (IOException ex) { System.out.println(ex); System.exit(2); } long startTime = System.currentTimeMillis(); List<List<String>> words = WordProvider.getWords(sentences); long endTime = System.currentTimeMillis(); long duration = endTime - startTime; totalPreprocessingDuration += duration; System.out.println("Producing word data took " + duration + " ms."); startTime = System.currentTimeMillis(); Data data = WordGraphBuilder.buildGraph(words); Collections.<DirectedWordGraphNode>sort(data.graph); endTime = System.currentTimeMillis(); duration = endTime - startTime; totalPreprocessingDuration += duration; System.out.println("Building word graph took " + duration + " ms."); System.out.println( "Total preprocessing took " + totalPreprocessingDuration + " ms."); repl(data); } private static void repl(Data data) { Scanner scanner = new Scanner(System.in); while (true) { System.out.print("> "); String cmdString = scanner.nextLine(); if (cmdString.startsWith(CommandNames.GENERATE_SENTENCE)) { processCommandGenerateSentence(cmdString, data); } else if (cmdString.startsWith( CommandNames.GET_NUMBER_OF_SENTENCES)) { processCommandGetNumberOfSentences( data.numberOfSentences); } else if (cmdString.startsWith( CommandNames.GET_NUMBER_OF_WORDS)) { processCommandGetNumberOfWords(cmdString, data); } else if (cmdString.startsWith( CommandNames.LIST_ALL_WORDS)) { processCommandListAllWords(data.graph); } else if (cmdString.startsWith( CommandNames.LIST_WORD_RANGE)) { processCommandListWordRange(cmdString, data.graph); } else if (cmdString.startsWith(CommandNames.WORD_STAT)) { processShowNodeStats(cmdString, data.graphMap); } else if (cmdString.startsWith(CommandNames.QUIT)) { processCommandQuit(); } else { System.out.println( ">>> Warning: \"" + cmdString + "\" has not parsed."); } } } private static void processCommandGenerateSentence( String cmd, Data data) { String[] lineParts = cmd.trim().split("\\s+"); if (!isWithinRange(lineParts.length, 1, 2)) { System.out.println( ">>> Warning: Command \"" + cmd + "\" not recognized."); return; } String commandWord2 = lineParts.length > 1 ? lineParts[1] : null; int maximumSentenceLength = commandWord2 != null ? Integer.parseInt(commandWord2) : Integer.MAX_VALUE; int currentSentenceLength = 1; DirectedWordGraphNode node = data.graphMap.get("."); List<DirectedWordGraphNode> path = new ArrayList<>(); path.add(node); while (currentSentenceLength < maximumSentenceLength) { node = node.sampleParent(); if (node == null) { break; } path.add(node); currentSentenceLength++; } Collections.<DirectedWordGraphNode>reverse(path); print(path); } private static void processCommandGetNumberOfSentences(int numberOfSentences) { System.out.println(">>> " + numberOfSentences); } private static void processCommandGetNumberOfWords(String cmd, Data data) { String[] parts = cmd.trim().split("\\s+"); boolean distinct = false; if (parts.length == 2) { distinct = (parts[1].equals("-d")); } System.out.println( ">>> " + (distinct ? data.numberOfDistinctWords : data.numberOfWords)); } private static void processCommandListAllWords(List<DirectedWordGraphNode> graph) { graph.forEach(System.out::println); } private static void processCommandListWordRange( String cmd, List<DirectedWordGraphNode> graph) { String[] parts = cmd.trim().split("\\s+"); if (!isWithinRange(parts.length, 2, 3)) { System.out.println("Command \"" + cmd + "\" could not be parsed."); return; } int index1; int index2; try { index1 = Integer.parseInt(parts[1]); } catch (NumberFormatException ex) { System.out.println(parts[1] + " is not an index expression."); return; } if (!isWithinRange(index1, 0, graph.size() - 1)) { System.out.println( "Index " + index1 + " is not within bounds: " + index1 + ", words: " + graph.size() + "."); return; } if (parts.length == 2) { System.out.println(graph.get(index1).getWord()); return; } try { index2 = Integer.parseInt(parts[2]); } catch (NumberFormatException ex) { System.out.println(parts[2] + " is not an index expression."); return; } if (!isWithinRange(index2, 0, graph.size() - 1)) { System.out.println( "Index " + index1 + " is not within bounds: " + index1 + ", words: " + graph.size() + "."); return; } if (index1 > index2) { System.out.println(">>> Indices are reversed."); return; } for (int i = index1; i <= index2; i++) { System.out.println(graph.get(i)); } } private static void processShowNodeStats( String cmd, Map<String, DirectedWordGraphNode> graphMap) { String[] parts = cmd.split("\\s+"); String word = parts[1]; DirectedWordGraphNode node = graphMap.get(word); if (node == null) { System.out.println("\"" + word + "\": no such word."); return; } System.out.println("--- Outgoing word arcs:"); BinaryTreeProbabilityDistribution.FieldLengths fieldLengths = node.getChildProbabilityDistribution().getFieldLengths(); double totalWeight = node.getChildProbabilityDistribution() .getTotalWeight(); List<DirectedWordGraphNode> children = new ArrayList<>(node.getChildren()); Collections.<DirectedWordGraphNode>sort(children); String fmt = "%-" + fieldLengths.maximumWordLength + "s, w = %.0" + fieldLengths.maximumWeightLength + "f, p = %f"; for (DirectedWordGraphNode child : children) { System.out.println( String.format( fmt, child.getWord(), child.getChildWeight(node), child.getChildWeight(node) / totalWeight)); } System.out.println("Total of " + children.size() + " outgoing arcs."); children.clear(); System.out.println("--- Incoming word arcs:"); fieldLengths = node.getParentProbabilityDistribution() .getFieldLengths(); List<DirectedWordGraphNode> parents = new ArrayList<>(node.getParents()); Collections.<DirectedWordGraphNode>sort(parents); fmt = "%-" + fieldLengths.maximumWordLength + "s, w = %.0" + fieldLengths.maximumWeightLength + "f, p = %f"; totalWeight = node.getParentProbabilityDistribution().getTotalWeight(); for (DirectedWordGraphNode parent : parents) { System.out.println( String.format( fmt, parent.getWord(), parent.getParentWeight(node), parent.getParentWeight(node) / totalWeight)); } System.out.println("Total of " + parents.size() + " incoming arcs."); parents.clear(); } private static void processCommandQuit() { System.out.println(">>> Bye!"); System.exit(0); } private static boolean isWithinRange(int value, int min, int max) { return !(value < min || value > max); } private static void print(List<DirectedWordGraphNode> path) { DirectedWordGraphNode node = path.get(0); StringBuilder stringBuilder = new StringBuilder() .append(Character.toUpperCase( node.getWord().charAt(0))) .append(node.getWord().substring(1)); for (int i = 1; i < path.size(); i++) { stringBuilder.append(" ") .append(path.get(i).getWord()); } System.out.println(">>> " + stringBuilder.toString()); } } com.github.coderodde.ai.sentencegenerator.SentenceProducer.java:
package com.github.coderodde.ai.sentencegenerator; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.util.ArrayList; import java.util.List; final class SentenceProducer { private static final String SPLIT_REGEX = "\\s*(\\.|\\?|;|!|—)\\s*"; private final File file; public SentenceProducer(String fileName) { this.file = new File(fileName); } public List<String> getSentences() throws IOException { String allText = Files.readString(file.toPath()); return splitEntireTextToSentences(allText); } private static List<String> splitEntireTextToSentences(String text) { char[] chars = text.toCharArray(); List<String> sentences = new ArrayList<>(); outerLoop: for (int i = 0; i < chars.length; i++) { StringBuilder stringBuilder = new StringBuilder(); for (int j = i; j < chars.length; j++, i++) { char ch = chars[j]; switch (ch) { case '.', '?', '!' -> { stringBuilder.append(ch); String newSentence = stringBuilder.toString(); sentences.add(newSentence); continue outerLoop; } default -> stringBuilder.append(ch); } } } return sentences; } } com.github.coderodde.ai.sentencegenerator.WordGraphBuilder.java:
package com.github.coderodde.ai.sentencegenerator; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; final class WordGraphBuilder { public static final class Data { public final List<DirectedWordGraphNode> graph; public final Map<String, DirectedWordGraphNode> graphMap; public final int numberOfSentences; public final int numberOfDistinctWords; public final int numberOfWords; private Data(List<DirectedWordGraphNode> graph, Map<String, DirectedWordGraphNode> graphMap, int numberOfSentences, int numberOfDistinctWords, int numberOfWords) { this.graph = graph; this.graphMap = graphMap; this.numberOfSentences = numberOfSentences; this.numberOfDistinctWords = numberOfDistinctWords; this.numberOfWords = numberOfWords; } } public static Data buildGraph(List<List<String>> sentenceList) { Map<String, DirectedWordGraphNode> nodeMap = new HashMap<>(); List<DirectedWordGraphNode> graph = new ArrayList<>(); int numberOfDistinctWords = 0; int numberOfWords = 0; for (List<String> sentence : sentenceList) { numberOfWords += sentence.size(); for (String word : sentence) { if (!nodeMap.containsKey(word)) { numberOfDistinctWords++; DirectedWordGraphNode directedWordGraphNode = new DirectedWordGraphNode(word); nodeMap.put(word, directedWordGraphNode); graph.add(directedWordGraphNode); } } } for (List<String> sentence : sentenceList) { if (sentence.isEmpty()) { continue; } for (int i = 0; i < sentence.size() - 1; i++) { String word1 = sentence.get(i); String word2 = sentence.get(i + 1); nodeMap.get(word2) .connectToParent( nodeMap.get(word1)); } } for (DirectedWordGraphNode node : nodeMap.values()) { node.computeProbabilityDistribution(); } return new Data(graph, getGraphMap(graph), sentenceList.size(), numberOfDistinctWords, numberOfWords); } private static Map<String, DirectedWordGraphNode> getGraphMap(List<DirectedWordGraphNode> graph) { Map<String, DirectedWordGraphNode> graphMap = new HashMap<>(graph.size()); for (DirectedWordGraphNode node : graph) { graphMap.put(node.getWord(), node); } return graphMap; } } com.github.coderodde.ai.sentencegenerator.WordProvider.java:
package com.github.coderodde.ai.sentencegenerator; import java.util.ArrayList; import java.util.List; final class WordProvider { public static List<List<String>> getWords(List<String> sentences) { List<List<String>> returnList = new ArrayList<>(sentences.size()); for (String sentence : sentences) { List<String> words = splitSentenceToWords(sentence); List<String> wordList = new ArrayList<>(); boolean addedWord = false; for (String word : words) { word = cleanWord(word.toLowerCase()); if (word == null) { continue; } if (word.isBlank()) { continue; } wordList.add(word); addedWord = true; } if (addedWord) { char lastChar = sentence.charAt(sentence.length() - 1); switch (lastChar) { case '.' -> wordList.add("."); case '?' -> wordList.add("?"); case '!' -> wordList.add("!"); } returnList.add(wordList); } } return returnList; } private static String cleanWord(String word) { word = word.replace("“", "") .replace("(", "") .replace(")", ""); if (word.isBlank()) { return null; } int i = 0; int cutOffIndex = -1; for (; i < word.length(); i++, cutOffIndex++) { char ch = word.charAt(i); if (ch != '‘') { break; } } if (cutOffIndex >= 0) { word = word.substring(cutOffIndex + 1); } switch (word) { case "’": case "”": return null; } if (!wordIsAlphaNumeric(word)) { return null; } return word; } private static boolean wordIsAlphaNumeric(String s) { for (int i = 0; i < s.length(); i++) { char ch = s.charAt(i); if (!Character.isLetterOrDigit(ch)) { return false; } } return true; } private static List<String> splitSentenceToWords(String sentence) { sentence = sentence.trim(); String[] wordArray = sentence.split("\\s+|\\.|,|;|:|\"|\r|\n"); List<String> returnArray = new ArrayList<>(wordArray.length + 1); for (String word : wordArray) { returnArray.add(word); } char lastSentenceCharacter = sentence.charAt(sentence.length() - 1); switch (lastSentenceCharacter) { case '.' -> returnArray.add("."); case '?' -> returnArray.add("?"); case '!' -> returnArray.add("!"); } return returnArray; } } Critique request
As always, I would be glad to hear improvement comments.