I wrote a scanner/lexer to parse my language. Some notes:
- I'm scanning files written in UTF-8.
- I avoided regex for two reasons:
- It feels far easier to simply handle codepoints one at a time than to get regex to handle Unicode properly.
- As far as I can tell, there's no efficient way for me to match a regex starting at a certain position in a string. (The best I can do is O(N) where N is the length of the source string)
- I feel like NightScanner is too big, and the cyclomatic complexity of each of its functions is too high. I don't really want to make a "NumberScanner", "SymbolScanner", etc. though, because the classes would be really tightly coupled.
NightScanner.java:
package com.nmerrill.night.parsing; import java.util.ArrayList; import java.util.List; public class NightScanner { public ScanResults scan(String text) { List<ScanError> errors = new ArrayList<>(); List<Token> tokens = new ArrayList<>(); TextIterator textIterator = new TextIterator(text); while (textIterator.valid()) { int current = textIterator.current(); if (!Character.isValidCodePoint(current)) { errors.add(new ScanError("Invalid code point: " +toString(current), textIterator.getPosition())); textIterator.next(); continue; } if (Character.isWhitespace(current)){ continue; } OptionalToken token = matchToken(textIterator); if (token.hasError()){ errors.add(new ScanError(token.getError(), textIterator.getPosition())); textIterator.next(); continue; } if (!token.exists()){ errors.add(new ScanError("Unrecognized code point: "+toString(current), textIterator.getPosition())); textIterator.next(); continue; } tokens.add(token.getToken()); } if (errors.isEmpty()){ return ScanResults.success(tokens); } else { return ScanResults.error(errors); } } private String toString(int codePoint){ return "0x"+Integer.toHexString(codePoint); } private OptionalToken matchSymbol(int codePoint, TextIterator iterator) { switch (codePoint) { case Codepoints.LEFT_PARENTHESIS: return OptionalToken.of(TokenType.LEFT_PAREN, iterator); case Codepoints.RIGHT_PARENTHESIS: return OptionalToken.of(TokenType.RIGHT_PAREN, iterator); case Codepoints.LEFT_CURLY_BRACKET: return OptionalToken.of(TokenType.LEFT_BRACE, iterator); case Codepoints.RIGHT_CURLY_BRACKET: return OptionalToken.of(TokenType.RIGHT_BRACE, iterator); case Codepoints.LEFT_SQUARE_BRACKET: return OptionalToken.of(TokenType.LEFT_BRACKET, iterator); case Codepoints.RIGHT_SQUARE_BRACKET: return OptionalToken.of(TokenType.RIGHT_BRACKET, iterator); case Codepoints.LESS_THAN_SIGN: return OptionalToken.of(TokenType.LEFT_ANGLE, iterator); case Codepoints.GREATER_THAN_SIGN: return OptionalToken.of(TokenType.RIGHT_ANGLE, iterator); case 
Codepoints.COMMA: return OptionalToken.of(TokenType.COMMA, iterator); case Codepoints.FULL_STOP: OptionalToken dotQuestion = matchWord(Codepoints.FULL_STOP, Codepoints.QUESTION_MARK, TokenType.DOT_QUESTION, iterator); if (dotQuestion.exists()) { return dotQuestion; } return matchWord(Codepoints.FULL_STOP, TokenType.DOT, iterator); case Codepoints.HYPHEN_MINUS: OptionalToken arrow = matchWord(Codepoints.GREATER_THAN_SIGN, TokenType.ARROW, iterator); if (arrow.exists()) { return arrow; } if (Character.isDigit(iterator.peek())){ return matchNumber(iterator); } return matchWord(Codepoints.HYPHEN_MINUS, TokenType.MINUS, iterator); case Codepoints.PLUS_SIGN: return OptionalToken.of(TokenType.PLUS, iterator); case Codepoints.SEMICOLON: return OptionalToken.of(TokenType.SEMICOLON, iterator); case Codepoints.SOLIDUS: return OptionalToken.of(TokenType.SLASH, iterator); case Codepoints.ASTERISK: return OptionalToken.of(TokenType.ASTERISK, iterator); case Codepoints.QUESTION_MARK: return OptionalToken.of(TokenType.QUESTION, iterator); case Codepoints.VERTICAL_LINE: return OptionalToken.of(TokenType.BAR, iterator); case Codepoints.AMPERSAND: return OptionalToken.of(TokenType.AMPERSAND, iterator); case Codepoints.EQUALS_SIGN: return OptionalToken.of(TokenType.ASSIGNMENT, iterator); } return OptionalToken.notFound(); } private void progressIterator(Token token, TextIterator iterator) { int length = token.getLexeme().length(); for (int i = 0; i <= length; i++) { iterator.next(); } } private TokenType matchKeyword(String keyword) { switch (keyword) { case "and": return TokenType.AND; case "await": return TokenType.AWAIT; case "break": return TokenType.BREAK; case "continue": return TokenType.CONTINUE; case "class": return TokenType.CLASS; case "else": return TokenType.ELSE; case "elseif": return TokenType.ELSEIF; case "end": return TokenType.END; case "enum": return TokenType.ENUM; case "eq": return TokenType.EQUAL; case "false": return TokenType.FALSE; case "fun": return 
TokenType.FUNCTION; case "gt": return TokenType.GREATER; case "gteq": return TokenType.GREATER_EQUAL; case "if": return TokenType.IF; case "let": return TokenType.LET; case "lt": return TokenType.LESS; case "lteq": return TokenType.LESS_EQUAL; case "mut": return TokenType.MUTABLE; case "neq": return TokenType.NOT_EQUAL; case "not": return TokenType.NOT; case "or": return TokenType.OR; case "panic": return TokenType.PANIC; case "protocol": return TokenType.PROTOCOL; case "repeat": return TokenType.REPEAT; case "return": return TokenType.RETURN; case "struct": return TokenType.STRUCT; case "true": return TokenType.TRUE; case "var": return TokenType.VAR; case "xor": return TokenType.XOR; case "yield": return TokenType.YIELD; } return null; } private OptionalToken matchToken(TextIterator iterator) { int currentCodePoint = iterator.current(); OptionalToken token = matchSymbol(currentCodePoint, iterator); if (token.exists()) { progressIterator(token.getToken(), iterator); return token; } if (currentCodePoint == Codepoints.QUOTATION_MARK){ return matchString(iterator); } if (currentCodePoint == Codepoints.GRAVE_ACCENT){ return matchRawString(iterator); } if (Character.isUnicodeIdentifierStart(currentCodePoint) || currentCodePoint == Codepoints.LOW_LINE) { int startPosition = iterator.current(); String identifier = matchIdentifier(iterator); TokenType type = matchKeyword(identifier); if (type == null) { type = TokenType.IDENTIFIER; } return OptionalToken.of(new Token(type, identifier, startPosition)); } if (Character.isDigit(currentCodePoint)) { return matchNumber(iterator); } return OptionalToken.error("Unrecognized symbol: "+toString(currentCodePoint)); } private OptionalToken matchString(TextIterator iterator){ StringBuilder stringBuilder = new StringBuilder(); int startPosition = iterator.getPosition(); int codePoint = iterator.next(); while (true){ if (codePoint == Codepoints.REVERSE_SOLIDUS){ codePoint = parseEscapeCode(iterator); if (codePoint == -1){ return 
OptionalToken.error("Invalid escape code"); } stringBuilder.appendCodePoint(codePoint); codePoint = iterator.next(); continue; } if (codePoint == Codepoints.LINE_FEED || codePoint == Codepoints.CARRIAGE_RETURN || codePoint == Codepoints.END_OF_TEXT){ return OptionalToken.error("String not finished"); } if (codePoint == Codepoints.QUOTATION_MARK){ break; } stringBuilder.appendCodePoint(codePoint); codePoint = iterator.next(); } iterator.next(); return OptionalToken.of(new Token(TokenType.STRING, stringBuilder.toString(), startPosition)); } private OptionalToken matchRawString(TextIterator iterator){ StringBuilder stringBuilder = new StringBuilder(); int startPosition = iterator.getPosition(); int codePoint = iterator.next(); while (iterator.valid()){ if (codePoint == Codepoints.GRAVE_ACCENT){ break; } if (codePoint != Codepoints.CARRIAGE_RETURN){ stringBuilder.appendCodePoint(codePoint); } codePoint = iterator.next(); } iterator.next(); return OptionalToken.of(new Token(TokenType.STRING, stringBuilder.toString(), startPosition)); } private int parseEscapeCode(TextIterator iterator){ int code = iterator.next(); switch (code){ case Codepoints.LATIN_SMALL_LETTER_N004E: return Codepoints.LINE_FEED; case Codepoints.LATIN_SMALL_LETTER_R0052: return Codepoints.CARRIAGE_RETURN; case Codepoints.LATIN_SMALL_LETTER_T0054: return Codepoints.TAB; case Codepoints.LATIN_SMALL_LETTER_U0055: return parseUnicode(iterator, 4); case Codepoints.LATIN_CAPITAL_LETTER_U: return parseUnicode(iterator, 8); case Codepoints.QUOTATION_MARK: case Codepoints.APOSTROPHE: case Codepoints.REVERSE_SOLIDUS: return code; } return -1; } private int parseUnicode(TextIterator iterator, int size){ StringBuilder codepoint = new StringBuilder(); for (int i = 0; i < size; i++){ codepoint.appendCodePoint(iterator.next()); } try { return Integer.parseInt(codepoint.toString(), 16); } catch (NumberFormatException e){ return -1; } } private String matchIdentifier(TextIterator textIterator) { StringBuilder 
identifierBuilder = new StringBuilder(); identifierBuilder.appendCodePoint(textIterator.current()); int current = textIterator.next(); while (true) { if (current == Codepoints.END_OF_TEXT || Character.isWhitespace(current)){ break; } if (Character.isIdentifierIgnorable(current)){ current = textIterator.next(); } else if (Character.isUnicodeIdentifierPart(current)) { identifierBuilder.appendCodePoint(current); current = textIterator.next(); } else { break; } } return identifierBuilder.toString(); } private OptionalToken matchNumber(TextIterator textIterator) { StringBuilder numberBuilder = new StringBuilder(); numberBuilder.appendCodePoint(textIterator.current()); int start = textIterator.getPosition(); int codePoint = textIterator.next(); boolean seenDecimal = false; while (true) { if (Character.isDigit(codePoint)) { numberBuilder.appendCodePoint(codePoint); codePoint = textIterator.next(); continue; } if (codePoint == Codepoints.LOW_LINE) { codePoint = textIterator.next(); continue; //Ignoring _ so numbers can be readable } if (codePoint == Codepoints.FULL_STOP) { if (seenDecimal) { return OptionalToken.error("Cannot have two decimal points in a number"); } if (!Character.isDigit(textIterator.peek())){ return OptionalToken.error("Number cannot end with a decimal point"); } seenDecimal = true; numberBuilder.appendCodePoint(codePoint); codePoint = textIterator.next(); continue; } if (Character.isUnicodeIdentifierStart(codePoint)) { StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append("Invalid character found in number: "); stringBuilder.appendCodePoint(codePoint); return OptionalToken.error(stringBuilder.toString()); } return OptionalToken.of(new Token(TokenType.NUMBER, numberBuilder.toString(), start)); } } private OptionalToken matchWord(int part1, TokenType tokenType, TextIterator iterator) { if (iterator.current() == part1) { return OptionalToken.of(tokenType, iterator, 1); } return OptionalToken.notFound(); } private OptionalToken 
matchWord(int part1, int part2, TokenType tokenType, TextIterator iterator) { if (iterator.current() == part1 && iterator.peek() == part2) { return OptionalToken.of(tokenType, iterator, 2); } return OptionalToken.notFound(); } } TextIterator.java
package com.nmerrill.night.parsing;

/**
 * Cursor over a string that moves one Unicode code point at a time.
 *
 * <p>Once the cursor is at or past the end of the text, {@link #current()}, {@link #next()}
 * and {@link #peek()} return {@link Codepoints#END_OF_TEXT}. That sentinel is also a real
 * code point, so callers that must tell a literal U+0003 in the text apart from the end
 * should check {@link #valid()}.
 */
public class TextIterator {
    private final String text;
    /** Offset of the current code point in {@code text}, in UTF-16 code units. */
    private int position;
    private int current;

    /**
     * Creates a cursor positioned on the first code point of {@code text}.
     *
     * @param text the text to iterate over
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public TextIterator(String text) {
        this.text = java.util.Objects.requireNonNull(text, "text");
        this.position = 0;
        this.current = codePointAt(0);
    }

    /** Returns the offset of the current code point; never greater than the text length. */
    public int getPosition() {
        return position;
    }

    /** Returns the text being iterated over. */
    public String getText() {
        return text;
    }

    /**
     * Moves to the next code point and returns it. At the end of the text the cursor stays
     * where it is, so {@link #getPosition()} never runs past the text length.
     *
     * @return the new current code point, or {@link Codepoints#END_OF_TEXT} at the end
     */
    public int next() {
        if (!valid()) {
            return current; // already END_OF_TEXT; do not let the position grow unbounded
        }
        position += Character.charCount(current);
        current = codePointAt(position);
        return current;
    }

    /** Returns {@code true} while the cursor is on a code point of the text. */
    public boolean valid() {
        return position < text.length();
    }

    /** Returns the code point after the current one without moving the cursor. */
    public int peek() {
        return codePointAt(position + Character.charCount(current));
    }

    /** Returns the code point under the cursor. */
    public int current() {
        return current;
    }

    /** Returns the code point at {@code index}, or END_OF_TEXT when {@code index} is past the end. */
    private int codePointAt(int index) {
        if (index >= text.length()) {
            return Codepoints.END_OF_TEXT;
        }
        return text.codePointAt(index);
    }
}
The following code doesn't need review (AFAIK); it's more here to help you test/run my code:
My testing code:
package com.nmerrill.night.parsing;

import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;

/**
 * TestNG tests for {@link NightScanner}. Every test scans a single lexeme and checks the
 * resulting tokens or errors.
 */
public class ScannerTest {
    private final NightScanner scanner;

    public ScannerTest() {
        this.scanner = new NightScanner();
    }

    /** Empty input produces no tokens and no errors. */
    @Test
    public void emptyString() {
        ScanResults results = scanner.scan("");
        assertNoErrors(results);
        Assert.assertTrue(results.getTokens().isEmpty());
    }

    /** Number literals that must scan as a single NUMBER token. */
    @DataProvider(name = "numbers")
    public Object[][] numberData() {
        return new Object[][]{
                {"0"},
                {"2"},
                {"-4"},
                {"100"},
                {"4.5"},
                {"-4.5"},
                {"0.5"},
                {"400000000000000000000000000"},
                {"-400000000000000000000000000"},
                {"1_000_000"},
                {"1_000.5"},
                {"1.0_5"},
                {"-1_0.5"},
        };
    }

    /** The expected lexeme is the input with every underscore removed. */
    @Test(dataProvider = "numbers")
    public void numbers(String number) {
        ScanResults results = scanner.scan(number);
        String expected = number.replaceAll("_", "");
        assertTokens(results, expected);
        assertTokenTypes(results, TokenType.NUMBER);
    }

    /** Malformed number literals; each is expected to yield exactly one error. */
    @DataProvider(name = "invalidNumbers")
    public Object[][] invalidNumberData() {
        return new Object[][]{
                {"0a"},
                {"4b0"},
                {"455x"},
                {"1..23"},
                {"3.6.7"},
                {"-3.4.800"},
                {"2."},
                {"-55t"},
        };
    }

    @Test(dataProvider = "invalidNumbers")
    public void invalidNumbers(String invalid) {
        ScanResults results = scanner.scan(invalid);
        assertError(results);
    }

    /** Identifiers in several scripts and naming styles, none of which is a keyword. */
    @DataProvider(name = "identifiers")
    public Object[][] identifierData(){
        return new Object[][]{
                {"a"},
                {"abc"},
                {"camelCase"},
                {"snake_case"},
                {"TitleCase"},
                {"ALL_CAPS"},
                {"withNumber99"},
                {"_underscore"},
                {"_"},
                {"Español"},
                {"français"},
                {"日本語"},
                {"中文"},
                {"हिंदी"},
                {"русский"},
                {"عربى"},
        };
    }

    @Test(dataProvider = "identifiers")
    public void identifiers(String identifier){
        ScanResults results = scanner.scan(identifier);
        assertTokenTypes(results, TokenType.IDENTIFIER);
        assertTokens(results, identifier);
    }

    /** Control characters placed between "a" and "b"; the scanner is expected to drop them. */
    @DataProvider(name = "ignorables")
    public Object[][] ignorableData(){
        return new Object[][]{
                {"a\u0000\u0001\u0002\u0004\u0005\u0006\u0007\u0008b"},// Intentionally skip \u0003
                {"a\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001Bb"},
                {"a\u007Fb"},
                {"a\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008Fb"},
                {"a\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009Fb"},
        };
    }

    @Test(dataProvider = "ignorables")
    public void identifierIgnores(String ignorable){
        ScanResults results = scanner.scan(ignorable);
        assertTokenTypes(results, TokenType.IDENTIFIER);
        assertTokens(results, "ab");
    }

    /**
     * Pairs of (source text, expected token contents) covering double-quoted strings,
     * grave-accent raw strings, escape codes, and carriage-return handling.
     */
    @DataProvider(name="strings")
    public Iterator<Object[]> stringData(){
        List<Object[]> data = new ArrayList<>();
        String[] literals = new String[] {
                "",
                "abc",
                "Español",
                "français",
                "日本語",
                "中文",
                "हिंदी",
                "русский",
                "عربى",
        };
        // Each literal is tried in both quoting styles.
        for (String literal : literals) {
            data.add(new Object[]{ "\"" + literal + "\"", literal });
            data.add(new Object[]{ "`" + literal + "`", literal });
        }
        // One delimiter may appear unescaped inside the other kind of string.
        data.add(new Object[]{"\"`\"", "`"});
        data.add(new Object[]{"`\"`", "\""});
        // { escape as written in source, code points it decodes to }
        String[][] escapes = new String[][]{
                {"\\n", "\n"},
                {"\\\\", "\\"},
                {"\\u0035", "5"},
                {"\\U0001D49E", "\uD835\uDC9E"},
                {"\\\"", "\""},
                {"\\'", "'"}
        };
        for (String[] escape: escapes){
            data.add(new Object[]{"\""+escape[0]+"\"", escape[1]});
        }
        // Raw strings take the decoded characters literally.
        for (String[] escape: escapes){
            data.add(new Object[]{"`"+escape[1]+"`", escape[1]});
        }
        // Carriage return: kept when written as an escape, dropped when literal in a raw string.
        data.add(new Object[]{"\"\\r\"", "\r"});
        data.add(new Object[]{"`\r`", ""});
        return data.iterator();
    }

    @Test(dataProvider = "strings")
    public void strings(String string, String match){
        ScanResults results = scanner.scan(string);
        assertTokenTypes(results, TokenType.STRING);
        assertTokens(results, match);
    }

    /** Asserts there are no errors and that the token lexemes equal {@code tokens}, in order. */
    private void assertTokens(ScanResults results, String... tokens) {
        assertTokensLength(results, tokens.length);
        for (int i = 0; i < tokens.length; i++) {
            Assert.assertEquals(results.getTokens().get(i).getLexeme(), tokens[i], "Wrong token contents");
        }
    }

    /** Asserts there are no errors and that the token types equal {@code tokens}, in order. */
    private void assertTokenTypes(ScanResults results, TokenType... tokens) {
        assertTokensLength(results, tokens.length);
        for (int i = 0; i < tokens.length; i++) {
            Assert.assertEquals(results.getTokens().get(i).getTokenType(), tokens[i], "Wrong token type");
        }
    }

    private void assertTokensLength(ScanResults results, int length) {
        assertNoErrors(results);
        Assert.assertEquals(results.getTokens().size(), length, "Wrong number of tokens returned");
    }

    /** Fails with the concatenated error messages when any scan error is present. */
    private void assertNoErrors(ScanResults scanResults) {
        Assert.assertTrue(scanResults.getErrors().isEmpty(), "Scanner errors:" + scanResults.getErrors().stream().map(ScanError::getMessage).collect(Collectors.joining()));
    }

    private void assertError(ScanResults scanResults) {
        assertErrors(scanResults, 1);
    }

    /** Asserts the scan failed with exactly {@code count} errors. */
    private void assertErrors(ScanResults scanResults, int count) {
        Assert.assertFalse(scanResults.isSuccess(), "No errors returned");
        Assert.assertEquals(scanResults.getErrors().size(), count, "Wrong number of errors returned");
    }
}
Codepoints.java:
package com.nmerrill.night.parsing;

/**
 * Named constants for individual Unicode code points, so the scanner can compare against
 * names instead of numeric or character literals. Not instantiable.
 */
@SuppressWarnings({"unused", "WeakerAccess"})
public class Codepoints {
    private Codepoints(){}

    // Control characters
    public final static int END_OF_TEXT = 0x0003;
    public final static int BELL = 0x0007;
    public final static int BACKSPACE = 0x0008;
    public final static int TAB = 0x0009;
    public final static int LINE_FEED = 0x000A;
    public final static int FORM_FEED = 0x000C;
    public final static int CARRIAGE_RETURN = 0x000D;

    // Space and punctuation
    public final static int SPACE = 0x0020;
    public final static int EXCLAMATION_MARK = 0x0021;
    public final static int QUOTATION_MARK = 0x0022;
    public final static int NUMBER_SIGN = 0x0023;
    public final static int DOLLAR_SIGN = 0x0024;
    public final static int PERCENT_SIGN = 0x0025;
    public final static int AMPERSAND = 0x0026;
    public final static int APOSTROPHE = 0x0027;
    public final static int LEFT_PARENTHESIS = 0x0028;
    public final static int RIGHT_PARENTHESIS = 0x0029;
    public final static int ASTERISK = 0x002A;
    public final static int PLUS_SIGN = 0x002B;
    public final static int COMMA = 0x002C;
    public final static int HYPHEN_MINUS = 0x002D;
    public final static int FULL_STOP = 0x002E;
    public final static int SOLIDUS = 0x002F;

    // Digits
    public final static int DIGIT_ZERO = 0x0030;
    public final static int DIGIT_ONE = 0x0031;
    public final static int DIGIT_TWO = 0x0032;
    public final static int DIGIT_THREE = 0x0033;
    public final static int DIGIT_FOUR = 0x0034;
    public final static int DIGIT_FIVE = 0x0035;
    public final static int DIGIT_SIX = 0x0036;
    public final static int DIGIT_SEVEN = 0x0037;
    public final static int DIGIT_EIGHT = 0x0038;
    public final static int DIGIT_NINE = 0x0039;

    // Punctuation between the digits and the capital letters
    public final static int COLON = 0x003A;
    public final static int SEMICOLON = 0x003B;
    public final static int LESS_THAN_SIGN = 0x003C;
    public final static int EQUALS_SIGN = 0x003D;
    public final static int GREATER_THAN_SIGN = 0x003E;
    public final static int QUESTION_MARK = 0x003F;

    // Capital letters
    public final static int LATIN_CAPITAL_LETTER_A = 0x0041;
    public final static int LATIN_CAPITAL_LETTER_B = 0x0042;
    public final static int LATIN_CAPITAL_LETTER_C = 0x0043;
    public final static int LATIN_CAPITAL_LETTER_D = 0x0044;
    public final static int LATIN_CAPITAL_LETTER_E = 0x0045;
    public final static int LATIN_CAPITAL_LETTER_F = 0x0046;
    public final static int LATIN_CAPITAL_LETTER_G = 0x0047;
    public final static int LATIN_CAPITAL_LETTER_H = 0x0048;
    public final static int LATIN_CAPITAL_LETTER_I = 0x0049;
    public final static int LATIN_CAPITAL_LETTER_J = 0x004A;
    public final static int LATIN_CAPITAL_LETTER_K = 0x004B;
    public final static int LATIN_CAPITAL_LETTER_L = 0x004C;
    public final static int LATIN_CAPITAL_LETTER_M = 0x004D;
    public final static int LATIN_CAPITAL_LETTER_N = 0x004E;
    public final static int LATIN_CAPITAL_LETTER_O = 0x004F;
    public final static int LATIN_CAPITAL_LETTER_P = 0x0050;
    public final static int LATIN_CAPITAL_LETTER_Q = 0x0051;
    public final static int LATIN_CAPITAL_LETTER_R = 0x0052;
    public final static int LATIN_CAPITAL_LETTER_S = 0x0053;
    public final static int LATIN_CAPITAL_LETTER_T = 0x0054;
    public final static int LATIN_CAPITAL_LETTER_U = 0x0055;
    public final static int LATIN_CAPITAL_LETTER_V = 0x0056;
    public final static int LATIN_CAPITAL_LETTER_W = 0x0057;
    public final static int LATIN_CAPITAL_LETTER_X = 0x0058;
    public final static int LATIN_CAPITAL_LETTER_Y = 0x0059;
    public final static int LATIN_CAPITAL_LETTER_Z = 0x005A;

    // Punctuation between the capital and the small letters
    public final static int LEFT_SQUARE_BRACKET = 0x005B;
    public final static int REVERSE_SOLIDUS = 0x005C;
    public final static int RIGHT_SQUARE_BRACKET = 0x005D;
    public final static int CIRCUMFLEX_ACCENT = 0x005E;
    public final static int LOW_LINE = 0x005F;
    public final static int GRAVE_ACCENT = 0x0060;

    // Small letters.
    // NOTE(review): the hex suffix in each name equals the value of the matching
    // LATIN_CAPITAL_LETTER_* constant (e.g. A0041 vs 0x0041), not this constant's own value.
    // Presumably an artifact of how the list was generated - confirm before renaming, since
    // NightScanner refers to some of these names.
    public final static int LATIN_SMALL_LETTER_A0041 = 0x0061;
    public final static int LATIN_SMALL_LETTER_B0042 = 0x0062;
    public final static int LATIN_SMALL_LETTER_C0043 = 0x0063;
    public final static int LATIN_SMALL_LETTER_D0044 = 0x0064;
    public final static int LATIN_SMALL_LETTER_E0045 = 0x0065;
    public final static int LATIN_SMALL_LETTER_F0046 = 0x0066;
    public final static int LATIN_SMALL_LETTER_G0047 = 0x0067;
    public final static int LATIN_SMALL_LETTER_H0048 = 0x0068;
    public final static int LATIN_SMALL_LETTER_I0049 = 0x0069;
    public final static int LATIN_SMALL_LETTER_J004A = 0x006A;
    public final static int LATIN_SMALL_LETTER_K004B = 0x006B;
    public final static int LATIN_SMALL_LETTER_L004C = 0x006C;
    public final static int LATIN_SMALL_LETTER_M004D = 0x006D;
    public final static int LATIN_SMALL_LETTER_N004E = 0x006E;
    public final static int LATIN_SMALL_LETTER_O004F = 0x006F;
    public final static int LATIN_SMALL_LETTER_P0050 = 0x0070;
    public final static int LATIN_SMALL_LETTER_Q0051 = 0x0071;
    public final static int LATIN_SMALL_LETTER_R0052 = 0x0072;
    public final static int LATIN_SMALL_LETTER_S0053 = 0x0073;
    public final static int LATIN_SMALL_LETTER_T0054 = 0x0074;
    public final static int LATIN_SMALL_LETTER_U0055 = 0x0075;
    public final static int LATIN_SMALL_LETTER_V0056 = 0x0076;
    public final static int LATIN_SMALL_LETTER_W0057 = 0x0077;
    public final static int LATIN_SMALL_LETTER_X0058 = 0x0078;
    public final static int LATIN_SMALL_LETTER_Y0059 = 0x0079;
    public final static int LATIN_SMALL_LETTER_Z005A = 0x007A;

    // Punctuation after the small letters
    public final static int LEFT_CURLY_BRACKET = 0x007B;
    public final static int VERTICAL_LINE = 0x007C;
    public final static int RIGHT_CURLY_BRACKET = 0x007D;
    public final static int TILDE = 0x007E;

    // Code points above 0x007F
    public final static int NO_BREAK_SPACE = 0x00A0;
    public final static int INVERTED_EXCLAMATION_MARK = 0x00A1;
    public final static int CENT_SIGN = 0x00A2;
    public final static int POUND_SIGN = 0x00A3;
    public final static int CURRENCY_SIGN = 0x00A4;
}
OptionalToken.java
package com.nmerrill.night.parsing;

/**
 * Outcome of one matching attempt: a token, an error message, or neither ("not found").
 * Instances are created through the static factories only.
 */
public class OptionalToken {
    private final Token token;
    private final String error;

    private OptionalToken(Token token, String error) {
        this.token = token;
        this.error = error;
    }

    /** Result meaning "nothing matched": no token and no error. */
    public static OptionalToken notFound() {
        return new OptionalToken(null, null);
    }

    /** Result carrying only the given error message. */
    public static OptionalToken error(String error) {
        return new OptionalToken(null, error);
    }

    /** Result wrapping an already built token. */
    public static OptionalToken of(Token token) {
        return new OptionalToken(token, null);
    }

    /** Same as {@link #of(TokenType, TextIterator, int)} with a length of 1. */
    public static OptionalToken of(TokenType tokenType, TextIterator iterator) {
        return of(tokenType, iterator, 1);
    }

    /**
     * Builds a token whose lexeme is the {@code length} characters of the iterator's text
     * starting at its current position. The iterator itself is not advanced.
     */
    public static OptionalToken of(TokenType tokenType, TextIterator iterator, int length) {
        int start = iterator.getPosition();
        int end = start + length;
        String lexeme = iterator.getText().substring(start, end);
        Token built = new Token(tokenType, lexeme, start);
        return of(built);
    }

    /** Returns the token, or {@code null} when this result holds none. */
    public Token getToken() {
        return token;
    }

    /** Returns the error message, or {@code null} when this result holds none. */
    public String getError() {
        return error;
    }

    /** Returns {@code true} when a token is present. */
    public boolean exists() {
        return !(token == null);
    }

    /** Returns {@code true} when an error message is present. */
    public boolean hasError() {
        return !(error == null);
    }
}
Token.java
package com.nmerrill.night.parsing;

/**
 * Immutable value describing one scanned token: its type, its lexeme, and the position
 * it was given when it was created.
 */
public class Token {
    private final TokenType tokenType;
    private final String lexeme;
    private final int characterPosition;

    /**
     * @param tokenType         kind of token
     * @param lexeme            text stored for the token
     * @param characterPosition position recorded for the token
     */
    public Token(TokenType tokenType, String lexeme, int characterPosition) {
        this.characterPosition = characterPosition;
        this.lexeme = lexeme;
        this.tokenType = tokenType;
    }

    /** Returns the position recorded for this token. */
    public int getCharacterPosition() {
        return this.characterPosition;
    }

    /** Returns the text stored for this token. */
    public String getLexeme() {
        return this.lexeme;
    }

    /** Returns the kind of this token. */
    public TokenType getTokenType() {
        return this.tokenType;
    }
}
TokenType.java
package com.nmerrill.night.parsing;

/** Every kind of token a {@link Token} can carry, grouped by role. */
public enum TokenType {
    // Braces
    LEFT_PAREN, RIGHT_PAREN, LEFT_BRACE, RIGHT_BRACE, LEFT_BRACKET, RIGHT_BRACKET, LEFT_ANGLE, RIGHT_ANGLE,

    // Operators
    COMMA, DOT, DOT_QUESTION, MINUS, PLUS, SEMICOLON, SLASH, ASTERISK, QUESTION, ARROW, BAR, AMPERSAND, ASSIGNMENT,

    // Comparators
    EQUAL, NOT_EQUAL, GREATER, GREATER_EQUAL, LESS, LESS_EQUAL,

    // Booleans
    AND, OR, XOR, NOT, TRUE, FALSE,

    // Structures
    CLASS, STRUCT, PROTOCOL, ENUM,

    // Control Flow
    IF, ELSE, ELSEIF, END, REPEAT, CONTINUE, BREAK, FUNCTION, RETURN, YIELD, AWAIT, PANIC,

    // Modifiers
    LET, VAR, MUTABLE,

    // Literals
    IDENTIFIER, STRING, NUMBER,

    // NOTE(review): NightScanner as shown never emits EOF - presumably reserved for the parser; confirm.
    EOF
}
ScanResults.java
package com.nmerrill.night.parsing;

import java.util.ArrayList;
import java.util.List;

/**
 * Result of a scan: either the tokens of a successful scan or the errors of a failed one.
 * Created through {@link #success(List)} and {@link #error(List)}.
 */
public class ScanResults {
    private final List<Token> tokens;
    private final List<ScanError> errors;

    private ScanResults(List<Token> tokens, List<ScanError> errors) {
        this.tokens = tokens;
        this.errors = errors;
    }

    /** Creates a successful result holding {@code tokens} and an empty error list. */
    public static ScanResults success(List<Token> tokens) {
        return new ScanResults(tokens, new ArrayList<>());
    }

    /** Creates a failed result holding {@code errors} and an empty token list. */
    public static ScanResults error(List<ScanError> errors) {
        return new ScanResults(new ArrayList<>(), errors);
    }

    /** Returns the scanned tokens; empty for a result created by {@link #error(List)}. */
    public List<Token> getTokens() {
        return tokens;
    }

    /** Returns the scan errors; empty for a result created by {@link #success(List)}. */
    public List<ScanError> getErrors() {
        return errors;
    }

    /**
     * Returns {@code true} when the scan produced no errors.
     *
     * <p>Both factories always store a list, so success is "the error list is empty". A
     * {@code null} list (only possible via {@code error(null)}) still counts as success,
     * as it did before.
     */
    public boolean isSuccess() {
        return errors == null || errors.isEmpty();
    }
}
ScanError.java
package com.nmerrill.night.parsing; public class ScanError { private final String message; private final int position; public ScanError(String message, int position) { this.message = message; this.position = position; } public String getMessage() { return message; } public int getPosition(){ return position; } @Override public String toString() { return message; } }