Compiler/Verifying syntax
Verifying Syntax
A Syntax Analyzer that verifies a token stream,
outputs a string "true" if the token stream matches the grammar requirement,
outputs a string "false" if the token stream does not match the grammar.
Task
The program reads input from a file of token stream,
reads it and outputs a string "true" if the token stream matches the grammar,
outputs a string "false" and error messages if the token stream does not match the grammar,
based on the grammar below. The grammar is written in Extended Backus-Naur Form (EBNF).
Grammar
stmt = expr ; expr = expr_level_2; expr_level_2 = expr_level_3 {"or" expr_level_3} ; expr_level_3 = expr_level_4 {"and" expr_level_4} ; expr_level_4 = ["not"] expr_level_5 [('=' | '<') expr_level_5] ; expr_level_5 = expr_level_6 {('+' | '-') expr_level_6} ; expr_level_6 = primary {('*' | '/') primary} ; primary = Identifier | Integer | '(' expr ')' | "true" | "false" ; Integer = Digit {Digit}; Identifier = Letter {Letter | Digit | '_'}; Digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; Letter = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" | "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" ;
Includes the test cases from the Go sample. Note, strings are limited to 256 characters in Algol W.
begin % verify expressions match expected syntax % procedure stmt ( string(256) value text ) ; begin % the parsing procedures return true if the expression matches the % % required element, false otherwise - a parse tree is not built % logical haveInteger, haveIdentifier, haveEof, hadError; integer currPos, maxPos, syPos; string(1) currCh; string(256) srcText; string(256) sy; logical procedure expr ; expr_level_2; logical procedure expr_level_2 ; begin logical ok; ok := expr_level_3; while ok and have( "or" ) do ok := next and expr_level_3; ok end expr_level_2 ; logical procedure expr_level_3 ; begin logical ok; ok := expr_level_4; while have( "and" ) and ok do ok := next and expr_level_4; ok end expr_level_3 ; logical procedure expr_level_4 ; begin logical ok; ok := true; if have( "not" ) then ok := next; if ok then ok := expr_level_5; if ok and ( have( "=" ) or have( "<" ) ) then ok := next and expr_level_5; ok end expr_level_4 ; logical procedure expr_level_5 ; begin logical ok; ok := expr_level_6; while ok and ( have( "+" ) or have( "-" ) ) do ok := next and expr_level_6; ok end expr_level_5 ; logical procedure expr_level_6 ; begin logical ok; ok := primary; while ok and ( have( "*" ) or have( "/" ) ) do ok := next and primary; ok end expr_level_6 ; logical procedure primary ; if haveIdentifier or haveInteger or have( "true" ) or have( "false" ) then begin void( next ); true end else if have( "(" ) then next and expr and mustBe( ")" ) else error( "Expecting identifier, literal or ""(""" ) ; logical procedure addAndNextChar ; begin if syPos = 255 then void( error( "Symbol too long" ) ) else if syPos < 255 then begin sy( syPos // 1 ) := currCh; syPos := syPos + 1 end if_syPos_eq_255__lt_255 ; nextChar end addAndNextChar ; logical procedure next ; begin logical ok; haveInteger := haveIdentifier := false; ok := skipSpaces; sy := ""; syPos := 0; if not ok then sy := "<eof>" else begin if haveDigit then begin haveInteger := true; while addAndNextChar and haveDigit do begin end end else if haveLetter then begin while addAndNextChar and ( haveLetter or haveDigit or currCh = "_" ) do begin end; haveIdentifier := sy not = "and" and sy not = "or" and sy not = "not" and sy not = "true" and sy not = "false" end else void( addAndNextChar ); end if_not_ok__ ; ok end next ; logical procedure skipSpaces ; begin logical ok; ok := not haveEof; while ok and currCh = " " do ok := nextChar; ok end skipSpaces ; logical procedure haveLetter ; not haveEof and ( ( currCh >= "a" and currCh <= "z" ) or ( currCh >= "A" and currCh <= "Z" ) ); logical procedure haveDigit ; not haveEof and ( currCh >= "0" and currCh <= "9" ); logical procedure have ( string(12) value text ) ; text = sy; logical procedure mustBe ( string(12) value text ) ; begin logical ok; ok := have( text ); if ok then void( next ) else begin string(256) msg; msg := "Expected:"; msg( strlen( msg ) + 2 // 12 ) := text; void( error( msg ) ) end if_ok; ok end mustBe ; logical procedure nextChar ; begin haveEof := currPos > maxPos; if not haveEof then begin currCh := srcText( currPos // 1 ); currPos := currPos + 1 end if_not_haveEof ; not haveEof end nextChar ; integer procedure strlen ( string(256) value text ) ; begin integer length; length := 255; while length >= 0 and text( length // 1 ) = " " do length := length - 1; length end strlen ; logical procedure error ( string(256) value msg ) ; begin if not hadError then begin % have the first error % writeon( " error at: ", I_W := 1, currPos, "(" ); showText( sy, strlen( sy ) ); writeon( "): " ); showText( msg, strlen( msg ) ); hadError := true end if_not_hadError ; false end ewrror ; procedure showText ( string(256) value text; integer value length ) ; for c := 0 until length do writeon( text( c // 1 ) ); procedure void ( logical value b ) ; begin end void ; % parse text and output messages indicating whether it is OK or not % hadError := false; sy := "<bof>"; srcText := text; currPos := 0; maxPos := strlen( srcText ); write(); showText( srcText, maxPos ); writeon( ": " ); if not nextChar then void( error( "Blank source text" ) ) else if not ( next and expr ) then void( error( "Expression expected" ) ) else if not haveEof then void( error( "Expected EOF after expression" ) ); if hadError then writeon( " false" ) else writeon( " true" ) end stmt ; % test cases % stmt( "wombat" ); stmt( "wombat or monotreme" ); stmt( "( wombat and not )" ); stmt( "wombat or not" ); stmt( "a + 1" ); stmt( "a + b < c" ); stmt( "a + b - c * d / e < f and not ( g = h )" ); stmt( "a + b - c * d / e < f and not ( g = h" ); stmt( "a = b" ); stmt( "a or b = c" ); stmt( "$" ); % test cases from Go % stmt( "true or false = not true" ); stmt( "not true = false" ); stmt( "3 + not 5" ); stmt( "3 + (not 5)" ); stmt( "(42 + 3" ); stmt( " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true" ); stmt( " and 3 < 2" ); stmt( "not 7 < 2" ); stmt( "2 < 3 < 4" ); stmt( "2 < foobar - 3 < 4" ); stmt( "2 < foobar and 3 < 4" ); stmt( "4 * (32 - 16) + 9 = 73" ); stmt( "235 76 + 1" ); stmt( "a + b = not c and false" ); stmt( "a + b = (not c) and false" ); stmt( "a + b = (not c and false)" ); stmt( "ab_c / bd2 or < e_f7" ); stmt( "g not = h" ); stmt( "été = false" ); stmt( "i++" ); stmt( "j & k" ); stmt( "l or _m" ) end.- Output:
wombat: true wombat or monotreme: true ( wombat and not ): error at: 18 ()): Expecting identifier, literal or "(" false wombat or not: error at: 13 (<eof>): Expression expected false a + 1: true a + b < c: true a + b - c * d / e < f and not ( g = h ): true a + b - c * d / e < f and not ( g = h: error at: 37 (<eof>): Expected: ) false a = b: true a or b = c: true $: error at: 1 ($): Expecting identifier, literal or "(" false true or false = not true: error at: 20 (not): Expecting identifier, literal or "(" false not true = false: true 3 + not 5: error at: 8 (not): Expecting identifier, literal or "(" false 3 + (not 5): true (42 + 3: error at: 7 (<eof>): Expected: ) false not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true: true and 3 < 2: error at: 5 (and): Expecting identifier, literal or "(" false not 7 < 2: true 2 < 3 < 4: error at: 8 (<): Expected EOF after expression false 2 < foobar - 3 < 4: error at: 17 (<): Expected EOF after expression false 2 < foobar and 3 < 4: true 4 * (32 - 16) + 9 = 73: true 235 76 + 1: error at: 7 (76): Expected EOF after expression false a + b = not c and false: error at: 12 (not): Expecting identifier, literal or "(" false a + b = (not c) and false: true a + b = (not c and false): true ab_c / bd2 or < e_f7: error at: 16 (<): Expecting identifier, literal or "(" false g not = h: error at: 6 (not): Expected EOF after expression false été = false: error at: 2 (Ã): Expecting identifier, literal or "(" false i++: error at: 3 (+): Expecting identifier, literal or "(" false j & k: error at: 4 (&): Expected EOF after expression false l or _m: error at: 7 (_): Expecting identifier, literal or "(" false // cverifyingsyntaxrosetta.c // http://www.rosettacode.org/wiki/Compiler/_Verifying_Syntax /* # Makefile CFLAGS = -O3 -Wall -Wfatal-errors all: cverifyingsyntaxrosetta */ #include <stdio.h> #include <string.h> #include <ctype.h> #include <setjmp.h> #define AT(CHAR) ( *pos == CHAR && ++pos ) #define TEST(STR) ( strncmp( pos, STR, strlen(STR) ) == 0 \ && ! isalnum(pos[strlen(STR)]) && pos[strlen(STR)] != '_' ) #define IS(STR) ( TEST(STR) && (pos += strlen(STR)) ) static char *pos; // current position in source static char *startpos; // start of source static jmp_buf jmpenv; static int error(char *message) { printf("false %s\n%*s^ %s\n", startpos, pos - startpos + 7, "", message); longjmp( jmpenv, 1 ); } static int expr(int level) { while( isspace(*pos) ) ++pos; // skip white space if( AT('(') ) // find a primary (operand) { if( expr(0) && ! AT(')') ) error("missing close paren"); } else if( level <= 4 && IS("not") && expr(6) ) { } else if( TEST("or") || TEST("and") || TEST("not") ) { error("expected a primary, found an operator"); } else if( isdigit(*pos) ) pos += strspn( pos, "0123456789" ); else if( isalpha(*pos) ) pos += strspn( pos, "0123456789_" "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" ); else error("expected a primary"); do // then look for zero or more valid following operators { while( isspace(*pos) ) ++pos; } while( level <= 2 && IS("or") ? expr(3) : level <= 3 && IS("and") ? expr(4) : level <= 4 && (AT('=') || AT('<')) ? expr(5) : level == 5 && (*pos == '=' || *pos == '<') ? error("non-associative") : level <= 6 && (AT('+') || AT('-')) ? expr(7) : level <= 7 && (AT('*') || AT('/')) ? expr(8) : 0 ); return 1; } static void parse(char *source) { startpos = pos = source; if( setjmp(jmpenv) ) return; // for catching errors during recursion expr(0); if( *pos ) error("unexpected character following valid parse"); printf(" true %s\n", source); } static char *tests[] = { "3 + not 5", "3 + (not 5)", "(42 + 3", "(42 + 3 some_other_syntax_error", "not 3 < 4 or (true or 3/4+8*5-5*2 < 56) and 4*3 < 12 or not true", "and 3 < 2", "not 7 < 2", "2 < 3 < 4", "2 < foobar - 3 < 4", "2 < foobar and 3 < 4", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "a + b = not c and false", "a + b = (not c) and false", "a + b = (not c and false)", "ab_c / bd2 or < e_f7", "g not = h", "i++", "j & k", "l or _m", "wombat", "WOMBAT or monotreme", "a + b - c * d / e < f and not ( g = h )", "$", }; int main(int argc, char *argv[]) { for( int i = 0; i < sizeof(tests)/sizeof(*tests); i++ ) parse(tests[i]); } - Output:
false 3 + not 5 ^ expected a primary, found an operator true 3 + (not 5) false (42 + 3 ^ missing close paren false (42 + 3 some_other_syntax_error ^ missing close paren true not 3 < 4 or (true or 3/4+8*5-5*2 < 56) and 4*3 < 12 or not true false and 3 < 2 ^ expected a primary, found an operator true not 7 < 2 false 2 < 3 < 4 ^ non-associative false 2 < foobar - 3 < 4 ^ non-associative true 2 < foobar and 3 < 4 true 4 * (32 - 16) + 9 = 73 false 235 76 + 1 ^ unexpected character following valid parse false a + b = not c and false ^ expected a primary, found an operator true a + b = (not c) and false true a + b = (not c and false) false ab_c / bd2 or < e_f7 ^ expected a primary false g not = h ^ unexpected character following valid parse false i++ ^ expected a primary false j & k ^ unexpected character following valid parse false l or _m ^ expected a primary true wombat true WOMBAT or monotreme true a + b - c * d / e < f and not ( g = h ) false $ ^ expected a primary
Dim Shared As String posic 'current position in source Dim Shared As String startpos 'start of source Dim Shared As Integer errorPos 'position for error marker Sub errorMsg(message As String) Print "false "; startpos Print Space(errorPos + 7); "^ "; message End Sub Function isOperator(s As String) As Boolean Dim ops(2) As String = {"or", "and", "not"} For i As Integer = 0 To Ubound(ops) If Left(posic, Len(ops(i))) = ops(i) Then Return True Next Return False End Function Function expr(level As Integer) As Boolean errorPos = Len(startpos) - Len(posic) ' Skip whitespace While Len(posic) > 0 Andalso Instr(" " & Chr(9) & Chr(10) & Chr(13), Left(posic, 1)) > 0 posic = Mid(posic, 2) errorPos = Len(startpos) - Len(posic) Wend If Left(posic, 1) = "(" Then posic = Mid(posic, 2) If expr(0) = 0 Orelse Left(posic, 1) <> ")" Then errorMsg("missing close paren") Return False End If posic = Mid(posic, 2) Elseif level <= 4 Andalso Left(posic, 3) = "not" Andalso _ (Len(posic) = 3 Orelse Instr(" " & Chr(9) & Chr(10) & Chr(13) & "(=<+-*/)", Mid(posic, 4, 1)) > 0) Then posic = Mid(posic, 4) If expr(6) = 0 Then Return False Elseif Left(posic, 2) = "or" Orelse Left(posic, 3) = "and" Orelse Left(posic, 3) = "not" Then errorMsg("expected a primary, found an operator") Return False Elseif Instr("0123456789", Left(posic, 1)) > 0 Then posic = Mid(posic, 2) While Len(posic) > 0 Andalso Instr("0123456789", Left(posic, 1)) > 0 posic = Mid(posic, 2) Wend Elseif Instr("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", Left(posic, 1)) > 0 Then posic = Mid(posic, 2) While Len(posic) > 0 Andalso Instr("0123456789_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", Left(posic, 1)) > 0 posic = Mid(posic, 2) Wend Else errorMsg("expected a primary") Return False End If Do errorPos = Len(startpos) - Len(posic) While Len(posic) > 0 Andalso Instr(" " & Chr(9) & Chr(10) & Chr(13), Left(posic, 1)) > 0 posic = Mid(posic, 2) errorPos = Len(startpos) - Len(posic) Wend If level <= 2 Andalso Left(posic, 2) = "or" Andalso _ (Len(posic) = 2 Orelse Instr(" " & Chr(9) & Chr(10) & Chr(13), Mid(posic, 3, 1)) > 0) Then posic = Mid(posic, 3) If expr(3) = 0 Then Return False Elseif level <= 3 Andalso Left(posic, 3) = "and" Andalso _ (Len(posic) = 3 Orelse Instr(" " & Chr(9) & Chr(10) & Chr(13), Mid(posic, 4, 1)) > 0) Then posic = Mid(posic, 4) If expr(4) = 0 Then Return False Elseif level <= 4 Andalso (Left(posic, 1) = "=" Orelse Left(posic, 1) = "<") Then posic = Mid(posic, 2) If expr(5) = 0 Then Return False Elseif level = 5 Andalso (Left(posic, 1) = "=" Orelse Left(posic, 1) = "<") Then errorMsg("non-associative") Return False Elseif level <= 6 Andalso (Left(posic, 1) = "+" Orelse Left(posic, 1) = "-") Then posic = Mid(posic, 2) If expr(7) = 0 Then Return False Elseif level <= 7 Andalso (Left(posic, 1) = "*" Orelse Left(posic, 1) = "/") Then posic = Mid(posic, 2) If expr(8) = 0 Then Return False Else Exit Do End If Loop Return True End Function Sub parse(source As String) startpos = source posic = source If expr(0) = 0 Then Exit Sub If Len(posic) > 0 Then errorMsg("unexpected character following valid parse") Exit Sub End If Color 2 : Print " true "; : Color 7 : Print source End Sub Dim tests(23) As String = { _ "3 + not 5", "3 + (not 5)", "(42 + 3", "(42 + 3 some_other_syntax_error", _ "not 3 < 4 or (true or 3/4+8*5-5*2 < 56) and 4*3 < 12 or not true", _ "and 3 < 2", "not 7 < 2", "2 < 3 < 4", "2 < foobar - 3 < 4", _ "2 < foobar and 3 < 4", "4 * (32 - 16) + 9 = 73", "235 76 + 1", _ "a + b = not c and false", "a + b = (not c) and false", _ "a + b = (not c and false)", "ab_c / bd2 or < e_f7", "g not = h", "i++", _ "j & k", "l or _m", "wombat", "WOMBAT or monotreme", _ "a + b - c * d / e < f and not ( g = h )", "$" } For i As Integer = 0 To Ubound(tests) parse(tests(i)) Next Sleep - Output:
Same as C entry.
If "and", "or", "not" and "=" are replaced by the corresponding symbols: "&&", "||", "!" and "==", then the task grammar is a subset of Go's own grammar - operator precedence and identifier construction are the same.
After making the appropriate substitutions, we can therefore use the parser in Go's standard library to verify whether the statements are valid or not. As expressions cannot be statements in Go, we simply parse the latter as expressions.
However, before applying the parser, we first need to ensure that the expressions don't include any characters (including non-ASCII) or usages thereof which Go would otherwise permit. Note that it's not necessary to specifically check for "++" and "--" as these are statements in Go and can't appear in expressions anyway.
In particular, after substitutions, "= not", "+ not" etc. would be allowed by the Go parser so we need to exclude them. Curiously, the Go parser allows something like "2 < 3 < 4" even though it doesn't compile. We need therefore to exclude that also (see Talk page).
package main import ( "fmt" "go/parser" "regexp" "strings" ) var ( re1 = regexp.MustCompile(`[^_a-zA-Z0-9\+\-\*/=<\(\)\s]`) re2 = regexp.MustCompile(`\b_\w*\b`) re3 = regexp.MustCompile(`[=<+*/-]\s*not`) re4 = regexp.MustCompile(`(=|<)\s*[^(=< ]+\s*([=<+*/-])`) ) var subs = [][2]string{ {"=", "=="}, {" not ", " ! "}, {"(not ", "(! "}, {" or ", " || "}, {" and ", " && "}, } func possiblyValid(expr string) error { matches := re1.FindStringSubmatch(expr) if matches != nil { return fmt.Errorf("invalid character %q found", []rune(matches[0])[0]) } if re2.MatchString(expr) { return fmt.Errorf("identifier cannot begin with an underscore") } if re3.MatchString(expr) { return fmt.Errorf("expected operand, found 'not'") } matches = re4.FindStringSubmatch(expr) if matches != nil { return fmt.Errorf("operator %q is non-associative", []rune(matches[1])[0]) } return nil } func modify(err error) string { e := err.Error() for _, sub := range subs { e = strings.ReplaceAll(e, strings.TrimSpace(sub[1]), strings.TrimSpace(sub[0])) } return strings.Split(e, ":")[2][1:] // remove location info as may be inaccurate } func main() { exprs := []string{ "$", "one", "either or both", "a + 1", "a + b < c", "a = b", "a or b = c", "3 + not 5", "3 + (not 5)", "(42 + 3", "(42 + 3)", " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true", " and 3 < 2", "not 7 < 2", "2 < 3 < 4", "2 < (3 < 4)", "2 < foobar - 3 < 4", "2 < foobar and 3 < 4", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "true or false = not true", "true or false = (not true)", "not true or false = false", "not true = false", "a + b = not c and false", "a + b = (not c) and false", "a + b = (not c and false)", "ab_c / bd2 or < e_f7", "g not = h", "été = false", "i++", "j & k", "l or _m", } for _, expr := range exprs { fmt.Printf("Statement to verify: %q\n", expr) if err := possiblyValid(expr); err != nil { fmt.Printf("\"false\" -> %s\n\n", err.Error()) continue } expr = fmt.Sprintf(" %s ", expr) // make sure there are spaces at both ends for _, sub := range subs { expr = strings.ReplaceAll(expr, sub[0], sub[1]) } _, err := parser.ParseExpr(expr) if err != nil { fmt.Println(`"false" ->`, modify(err)) } else { fmt.Println(`"true"`) } fmt.Println() } } - Output:
Statement to verify: "$" "false" -> invalid character '$' found Statement to verify: "one" "true" Statement to verify: "either or both" "true" Statement to verify: "a + 1" "true" Statement to verify: "a + b < c" "true" Statement to verify: "a = b" "true" Statement to verify: "a or b = c" "true" Statement to verify: "3 + not 5" "false" -> expected operand, found 'not' Statement to verify: "3 + (not 5)" "true" Statement to verify: "(42 + 3" "false" -> expected ')', found newline Statement to verify: "(42 + 3)" "true" Statement to verify: " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true" "true" Statement to verify: " and 3 < 2" "false" -> expected operand, found 'and' Statement to verify: "not 7 < 2" "true" Statement to verify: "2 < 3 < 4" "false" -> operator '<' is non-associative Statement to verify: "2 < (3 < 4)" "true" Statement to verify: "2 < foobar - 3 < 4" "false" -> operator '<' is non-associative Statement to verify: "2 < foobar and 3 < 4" "true" Statement to verify: "4 * (32 - 16) + 9 = 73" "true" Statement to verify: "235 76 + 1" "false" -> expected 'EOF', found 76 Statement to verify: "true or false = not true" "false" -> expected operand, found 'not' Statement to verify: "true or false = (not true)" "true" Statement to verify: "not true or false = false" "true" Statement to verify: "not true = false" "true" Statement to verify: "a + b = not c and false" "false" -> expected operand, found 'not' Statement to verify: "a + b = (not c) and false" "true" Statement to verify: "a + b = (not c and false)" "true" Statement to verify: "ab_c / bd2 or < e_f7" "false" -> expected operand, found '<' Statement to verify: "g not = h" "false" -> expected 'EOF', found 'not' Statement to verify: "été = false" "false" -> invalid character 'é' found Statement to verify: "i++" "false" -> expected 'EOF', found '++' Statement to verify: "j & k" "false" -> invalid character '&' found Statement to verify: "l or _m" "false" -> identifier cannot begin with an underscore
// Regular expressions for validation const re1 = /[^_a-zA-Z0-9\+\-\*/=<\(\)\s]/; const re2 = /\b_\w*\b/; const re3 = /[=<+*/-]\s*not/; const re4 = /(=|<)\s*[^(=< ]+\s*([=<+*/-])/; // Substitution patterns const subs = [ ["=", "=="], [" not ", " ! "], ["(not ", "(! "], [" or ", " || "], [" and ", " && "] ]; function possiblyValid(expr) { const matches1 = expr.match(re1); if (matches1) { throw new Error(`invalid character "${matches1[0]}" found`); } if (re2.test(expr)) { throw new Error("identifier cannot begin with an underscore"); } if (re3.test(expr)) { throw new Error("expected operand, found 'not'"); } const matches4 = expr.match(re4); if (matches4) { throw new Error(`operator "${matches4[1]}" is non-associative`); } return null; } function modify(err) { let e = err.message; for (const sub of subs) { e = e.replaceAll(sub[1].trim(), sub[0].trim()); } // Remove location info as may be inaccurate const parts = e.split(":"); return parts.length > 2 ? parts[2].substring(1) : e; } // Simple expression parser to mimic Go's parser.ParseExpr function parseExpression(expr) { try { // This is a simplified parser - in a real implementation you'd want a proper parser // For now, we'll use eval in a safe context to check basic syntax const safeExpr = expr .replace(/\b[a-zA-Z_][a-zA-Z0-9_]*\b/g, 'true') // replace identifiers with 'true' .replace(/\b\d+\b/g, '1'); // replace numbers with '1' // Check for basic syntax errors if (expr.includes('++') || expr.includes('&')) { throw new Error("unexpected token"); } // Check for unmatched parentheses let parenCount = 0; for (const char of expr) { if (char === '(') parenCount++; if (char === ')') parenCount--; if (parenCount < 0) throw new Error("unexpected ')'"); } if (parenCount > 0) throw new Error("expected ')'"); // Check for invalid sequences like "235 76" if (/\b\d+\s+\d+\b/.test(expr)) { throw new Error("unexpected number"); } // Try to evaluate the safe expression new Function(`return ${safeExpr}`)(); return null; } catch (error) { throw new Error(error.message); } } function main() { const exprs = [ "$", "one", "either or both", "a + 1", "a + b < c", "a = b", "a or b = c", "3 + not 5", "3 + (not 5)", "(42 + 3", "(42 + 3)", " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true", " and 3 < 2", "not 7 < 2", "2 < 3 < 4", "2 < (3 < 4)", "2 < foobar - 3 < 4", "2 < foobar and 3 < 4", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "true or false = not true", "true or false = (not true)", "not true or false = false", "not true = false", "a + b = not c and false", "a + b = (not c) and false", "a + b = (not c and false)", "ab_c / bd2 or < e_f7", "g not = h", "été = false", "i++", "j & k", "l or _m" ]; for (const expr of exprs) { console.log(`Statement to verify: "${expr}"`); try { possiblyValid(expr); } catch (err) { console.log(`"false" -> ${err.message}\n`); continue; } let modifiedExpr = ` ${expr} `; // make sure there are spaces at both ends for (const sub of subs) { modifiedExpr = modifiedExpr.replaceAll(sub[0], sub[1]); } try { parseExpression(modifiedExpr); console.log('"true"'); } catch (err) { console.log(`"false" -> ${modify(err)}`); } console.log(); } } // Run the main function main(); - Output:
Statement to verify: "$" "false" -> invalid character "$" found Statement to verify: "one" "true" Statement to verify: "either or both" "true" Statement to verify: "a + 1" "true" Statement to verify: "a + b < c" "true" Statement to verify: "a = b" "true" Statement to verify: "a or b = c" "true" Statement to verify: "3 + not 5" "false" -> expected operand, found 'not' Statement to verify: "3 + (not 5)" "true" Statement to verify: "(42 + 3" "false" -> expected ')' Statement to verify: "(42 + 3)" "true" Statement to verify: " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true" "false" -> unexpected token Statement to verify: " and 3 < 2" "false" -> unexpected token Statement to verify: "not 7 < 2" "true" Statement to verify: "2 < 3 < 4" "false" -> operator "<" is non-associative Statement to verify: "2 < (3 < 4)" "true" Statement to verify: "2 < foobar - 3 < 4" "false" -> operator "<" is non-associative Statement to verify: "2 < foobar and 3 < 4" "false" -> unexpected token Statement to verify: "4 * (32 - 16) + 9 = 73" "true" Statement to verify: "235 76 + 1" "false" -> unexpected number Statement to verify: "true or false = not true" "false" -> expected operand, found 'not' Statement to verify: "true or false = (not true)" "true" Statement to verify: "not true or false = false" "true" Statement to verify: "not true = false" "true" Statement to verify: "a + b = not c and false" "false" -> expected operand, found 'not' Statement to verify: "a + b = (not c) and false" "false" -> unexpected token Statement to verify: "a + b = (not c and false)" "false" -> unexpected token Statement to verify: "ab_c / bd2 or < e_f7" "false" -> Unexpected token '<' Statement to verify: "g not = h" "false" -> Unexpected token 'not' Statement to verify: "été = false" "false" -> invalid character "é" found Statement to verify: "i++" "false" -> unexpected token Statement to verify: "j & k" "false" -> invalid character "&" found Statement to verify: "l or _m" "false" -> identifier cannot begin with an underscore
Also works with gojq, the Go implementation of jq
This entry uses the PEG (Parsing Expression Grammar) formalism to transform the given grammar to a jq verification program by a simple process that amounts to transcription.
For example, in the rule for `primary`, the alternation
Identifier | Integer
becomes the jq expression:
Identifer // Integer
The transcription process is not completely trivial as jq requires definitions be ordered and perhaps nested to satisfy a "define-before-use" rule. In the present case, since `primary` and `expr` are defined circularly, we define `primary` as an inner function of `expr`.
This PEG-to-jq transcription process is described in detail at [1].
The following presentation uses jq's support for regular expressions for the sake of simplicity, brevity and efficiency. For example, the grammar rule:
Digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
becomes the jq program:
def Digit: parse("[0-9]"); where `parse` is a utility function defined in the library of PEG-oriented functions in the first subsection below.
The jq program presented here works on character strings and textual files, and hence the use of `ws` (for whitespace) in the program. Since `ws` is defined here to make whitespace optional, it might be desirable to modify the program to require whitespace (e.g. using the utility function `_`) in certain places instead. </syntaxhighlight>
Generic PEG Library
The jq module at Category:Jq/peg.jq can be included by copying it to a file, and adding an `include` statement to top of the main program, e.g. as follows:
include "peg" {search: "."};The Grammar
def expr: def Digit : parse("[0-9]"); def Letter : parse("[a-zA-Z]"); def Identifier : Letter | star(Letter // Digit // literal("_")); def Integer : plus(Digit); def primary : ws | (Identifier // Integer // (literal("(") | expr | literal(")")) // literal("true") // literal("false")) | ws; def expr_level_6 : primary | star((literal("*") // literal("/")) | primary) ; def expr_level_5 : expr_level_6 | star((literal("+") // literal("-")) | expr_level_6) ; def expr_level_4 : ws | optional(literal("not")) | expr_level_5 | optional(parse("[=<]") | expr_level_5) ; def expr_level_3 : expr_level_4 | star(literal("and") | expr_level_4) ; def expr_level_2 : expr_level_3 | star(literal("or") | expr_level_3) ; ws | expr_level_2 | ws; def stmt: {remainder: .} | expr | eos;The Go examples
def go: [ "$", "one", "either or both", "a + 1", "a + b < c", "a = b", "a or b = c", "3 + not 5", "3 + (not 5)", "(42 + 3", "(42 + 3)", " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true", " and 3 < 2", "not 7 < 2", "2 < 3 < 4", "2 < (3 < 4)", "2 < foobar - 3 < 4", "2 < foobar and 3 < 4", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "true or false = not true", "true or false = (not true)", "not true or false = false", "not true = false", "a + b = not c and false", "a + b = (not c) and false", "a + b = (not c and false)", "ab_c / bd2 or < e_f7", "g not = h", "été = false", "i++", "j & k", "l or _m" ]; # For ease of comparison with the Go output, simply emit `true` or `false` go[] | (stmt | true) // falseInvocation: jq -nr -f compiler-verifying-syntax.jq
- Output:
The same sequence of `true` and `false` values as at Go.
function substituteinnerparentheses(s, subs) ((i = findlast('(', s)) == nothing) && return (s, false) ((j = findfirst(')', s[i:end])) == nothing) && return (s, false) okparse(s[i+1:j+i-2]) || return (s, false) return s[1:i-1] * " " * subs * " " * s[j+i:end], true end function okparse(s) while findfirst('(', s) != nothing s, okinparentheses = substituteinnerparentheses(s, "true") okinparentheses || return false end s = strip(s) # Julia allows expressions like 2 + + + 3, or like true = not false, but these are not allowed here # = or < can be used only once within parentheses if occursin(r"(and|or|[\=\<\+\-\*\/])\s*(and|or|[\=\<\+\-\*\/])", s) || occursin(r"(^(and|^or|^[\=\<\+\-\*\/]))|((and|or|[\=\<\+\-\*\/])$)", s) || occursin(r"(\=|\<)\s*not\s", s) || count(c -> c == '=' || c == '<', s) > 1 return false end # Julia allows ., ,, ; and operators like % but these are not allowed here # permitted: -+*/ true false and or not, ascii identifiers, and integers for item in split(s, r"\s+") !occursin( r"^[a-zA-Z][a-zA-Z_0-9]*$|^\d+$|^true$|^false$|^or$|^and$|^not$|^\=$|^\<$|^\+$|^-$|^\*$|^\/$", item) && return false end # change and, or, and not to the corresponding Julia operators s = replace(replace(replace(s, "and" => "&&"), "or" => "||"), "not" => "!") try # Use Julia's parser, which will throw exception if it parses an error Meta.parse(s) catch return false end return true end teststatements = [ " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true", " and 3 < 2", "not 7 < 2", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "true or false = not true", "not true = false", "2 < 5 < 9" ] for s in teststatements println("The compiler parses the statement { $s } and outputs: ", okparse(s)) end - Output:
The compiler parses the statement { not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true } and outputs: true The compiler parses the statement { and 3 < 2 } and outputs: false The compiler parses the statement { not 7 < 2 } and outputs: true The compiler parses the statement { 4 * (32 - 16) + 9 = 73 } and outputs: true The compiler parses the statement { 235 76 + 1 } and outputs: false The compiler parses the statement { true or false = not true } and outputs: false The compiler parses the statement { not true = false } and outputs: true The compiler parses the statement { 2 < 5 < 9 } and outputs: false import strutils, tables type # List of tokens with their textual representation. Token = enum tkError = "invalid token", tkIdent = "identifier", tkInt = "integer", tkLPar = "'('", tkRPar = "')'", tkFalse = "'false'", tkTrue = "'true'", tkLt = "'<'", tkEq = "'='", tkAdd = "'+'", tkSub = "'-'", tkMul = "'*'", tkDiv = "'/'", tkOr = "'or'", tkAnd = "'and'", tkNot = "'not'", tkEOF = "EOF" Lexer = object str: string # String to parse. len: Natural # String length. pos: int # Current lexer position. token: Token # Current token. error: string # Error message. const # Mapping of reserved identifiers to tokens. IdentTokens = {"false": tkFalse, "true": tkTrue, "or": tkOr, "and": tkAnd, "not": tkNot}.toTable # Mapping of characters to tokens. CharTokens = {'(': tkLPar, ')': tkRPar, '<': tkLt, '=': tkEq, '+': tkAdd, '-': tkSub, '*': tkMul, '/': tkDiv}.toTable #################################################################################################### # Lexer. # Forward reference. func nextToken(lex: var Lexer) func initLexer(str: string): Lexer = ## Initialize a lexer. result = Lexer(str: str, len: str.len, pos: 0) result.nextToken() # Always a token ahead. func getIdToken(lex: var Lexer) = ## Get the token for an identifier. var str: string while lex.pos < lex.len and lex.str[lex.pos] in IdentChars: str.add lex.str[lex.pos] inc lex.pos lex.token = IdentTokens.getOrDefault(str, tkIdent) func getInt(lex: var Lexer) = ## Get an integer token. while lex.pos < lex.len and lex.str[lex.pos] in Digits: inc lex.pos lex.token = tkInt func nextToken(lex: var Lexer) = ## Find the next token. # Skip spaces. while lex.pos < lex.str.len and lex.str[lex.pos] == ' ': inc lex.pos if lex.pos == lex.str.len: lex.token = tkEOF else: let ch = lex.str[lex.pos] case ch of 'a'..'z': lex.getIdToken() of '0'..'9': lex.getint() else: inc lex.pos lex.token = CharTokens.getOrDefault(ch, tkError) #################################################################################################### # Parser. # Forward reference. proc checkExpr(lex: var Lexer): bool proc checkPrimary(lex: var Lexer): bool = ## Check validity of a primary. if lex.token in {tkIdent, tkInt, tkFalse, tkTrue}: lex.nextToken() return true elif lex.token == tkLPar: lex.nextToken() if not lex.checkExpr(): return false if lex.token != tkRPar: lex.error = "Encountered $#; expected ')'.".format(lex.token) return false else: lex.nextToken() return true else: lex.error = "Encountered $#; expected identifier, litteral or '('.".format(lex.token) return false proc checkExpr6(lex: var Lexer): bool = ## Check validity of an expr6. if not lex.checkPrimary(): return false while lex.token in [tkMul, tkDiv]: lex.nextToken() if not lex.checkPrimary(): return false result = true proc checkExpr5(lex: var Lexer): bool = ## Check validity of an expr5. if not lex.checkExpr6(): return false while lex.token in [tkAdd, tkSub]: lex.nextToken() if not lex.checkExpr6(): return false result = true proc checkExpr4(lex: var Lexer): bool = ## Check validity of an expr4. if lex.token == tkNot: lex.nextToken() if not lex.checkExpr5(): return false if lex.token in [tkLt, tkEq]: lex.nextToken() if not lex.checkExpr5(): return false result = true proc checkExpr3(lex: var Lexer): bool = ## Check validity of an expr3. if not lex.checkExpr4(): return false while lex.token == tkAnd: lex.nextToken() if not lex.checkExpr4(): return false result = true proc checkExpr2(lex: var Lexer): bool = ## Check validity of an expr2. if not lex.checkExpr3(): return false while lex.token == tkOr: lex.nextToken() if not lex.checkExpr3(): return false result = true proc checkExpr(lex: var Lexer): bool = ## Check validity of an expr. lex.checkExpr2() proc checkStmt(lex: var Lexer): bool = ## Check validity of a statement. result = lex.checkExpr() if result and lex.pos < lex.len: lex.error = "Extra characters at end of statement." result = false #——————————————————————————————————————————————————————————————————————————————————————————————————— # Using test set from Algol68 version. const Tests = ["wombat", "wombat or monotreme", "( wombat and not )", "wombat or not", "a + 1", "a + b < c", "a + b - c * d / e < f and not ( g = h )", "a + b - c * d / e < f and not ( g = h", "a = b", "a or b = c", "$", "true or false = not true", "not true = false", "3 + not 5", "3 + (not 5)", "(42 + 3", " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true", " and 3 < 2", "not 7 < 2", "2 < 3 < 4", "2 < foobar - 3 < 4", "2 < foobar and 3 < 4", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "a + b = not c and false", "a + b = (not c) and false", "a + b = (not c and false)", "ab_c / bd2 or < e_f7", "g not = h", "été = false", "i++", "j & k", "l or _m"] for test in Tests: var lex = initLexer(test) let ok = checkStmt(lex) echo test, " → ", ok if not ok: echo "*** Error at position $1. $2 ".format(lex.pos, lex.error) - Output:
wombat → true wombat or monotreme → true ( wombat and not ) → false *** Error at position 18. Encountered ')'; expected identifier, litteral or '('. wombat or not → false *** Error at position 13. Encountered EOF; expected identifier, litteral or '('. a + 1 → true a + b < c → true a + b - c * d / e < f and not ( g = h ) → true a + b - c * d / e < f and not ( g = h → false *** Error at position 37. Encountered EOF; expected ')'. a = b → true a or b = c → true $ → false *** Error at position 1. Encountered invalid token; expected identifier, litteral or '('. true or false = not true → false *** Error at position 19. Encountered 'not'; expected identifier, litteral or '('. not true = false → true 3 + not 5 → false *** Error at position 7. Encountered 'not'; expected identifier, litteral or '('. 3 + (not 5) → true (42 + 3 → false *** Error at position 7. Encountered EOF; expected ')'. not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true → true and 3 < 2 → false *** Error at position 4. Encountered 'and'; expected identifier, litteral or '('. not 7 < 2 → true 2 < 3 < 4 → false *** Error at position 7. Extra characters at end of statement. 2 < foobar - 3 < 4 → false *** Error at position 16. Extra characters at end of statement. 2 < foobar and 3 < 4 → true 4 * (32 - 16) + 9 = 73 → true 235 76 + 1 → false *** Error at position 6. Extra characters at end of statement. a + b = not c and false → false *** Error at position 11. Encountered 'not'; expected identifier, litteral or '('. a + b = (not c) and false → true a + b = (not c and false) → true ab_c / bd2 or < e_f7 → false *** Error at position 15. Encountered '<'; expected identifier, litteral or '('. g not = h → false *** Error at position 5. Extra characters at end of statement. été = false → false *** Error at position 1. Encountered invalid token; expected identifier, litteral or '('. i++ → false *** Error at position 3. Encountered '+'; expected identifier, litteral or '('. j & k → false *** Error at position 3. Extra characters at end of statement. l or _m → false *** Error at position 6. Encountered invalid token; expected identifier, litteral or '('. Made fix for 'not' see Discussion. Added 'not' and non-assoc fixes. Cooler output.
#!/usr/bin/perl use strict; # http://www.rosettacode.org/wiki/Compiler/_Verifying_Syntax use warnings; sub error { my $pos = pos($_) // 0; die $_, ' ' x ($pos + 7), "^ $_[0]\n"; } sub want { /\G\Q$_[0]/gc or error $_[1] } sub expr { my $level = shift; /\G\h+/gc; /\G\(/gc && want ')', "expected a closing paren", expr(0) or $level <= 4 && /\Gnot\b/gc && expr(5) or /\G (?!(?:and|or|not)\b) (?:\d+|[a-zA-Z]\w*)/gcx or error "expected a primary"; /\G\h+/gc ? 1 : $level <= 2 && /\Gor\b/gc ? expr(3) : $level <= 3 && /\Gand\b/gc ? expr(4) : $level <= 4 && /\G[=<]/gc ? expr(4.5) : $level == 4.5 && /\G(?=[=<])/gc ? error "non-associative operator" : $level <= 5 && /\G[+-]/gc ? expr(6) : $level <= 6 && /\G[*\/]/gc ? expr(7) : return 1 while 1; } while( <DATA> ) { print eval { want "\n", "expected end of input", expr(0) } ? " true $_" : "false $@"; } __DATA__ 3 + not 5 3 + (not 5) (42 + 3 (42 + 3 syntax_error not 3 < 4 or (true or 3/4+8*5-5*2 < 56) and 4*3 < 12 or not true and 3 < 2 not 7 < 2 2 < 3 < 4 2 < foobar - 3 < 4 2 < foobar and 3 < 4 4 * (32 - 16) + 9 = 73 235 76 + 1 a + b = not c and false a + b = (not c) and false a + b = (not c and false) ab_c / bd2 or < e_f7 g not = h i++ j & k l or _m UPPER_cAsE_aNd_letter_and_12345_test - Output:
false 3 + not 5 ^ expected a primary true 3 + (not 5) false (42 + 3 ^ expected a closing paren false (42 + 3 syntax_error ^ expected a closing paren true not 3 < 4 or (true or 3/4+8*5-5*2 < 56) and 4*3 < 12 or not true false and 3 < 2 ^ expected a primary true not 7 < 2 false 2 < 3 < 4 ^ non-associative operator false 2 < foobar - 3 < 4 ^ non-associative operator true 2 < foobar and 3 < 4 true 4 * (32 - 16) + 9 = 73 false 235 76 + 1 ^ expected end of input false a + b = not c and false ^ expected a primary true a + b = (not c) and false true a + b = (not c and false) false ab_c / bd2 or < e_f7 ^ expected a primary false g not = h ^ expected end of input false i++ ^ expected a primary false j & k ^ expected end of input false l or _m ^ expected a primary true UPPER_cAsE_aNd_letter_and_12345_test
-- demo\rosetta\Compiler\Verify_Syntax.exw with javascript_semantics string src integer ch, sdx procedure skip_spaces() while 1 do if sdx>length(src) then exit end if ch = src[sdx] if not find(ch,{' ','\t','\r','\n'}) then exit end if sdx += 1 end while end procedure enum SYMBOL, INTEGER, IDENT, ERROR, EOF constant toktypes = {"SYMBOL","INTEGER","IDENT","ERROR","EOF"} sequence tok function sprintok(string fmt) tok[1] = toktypes[tok[1]] return sprintf(fmt,{tok}) end function procedure next_token() -- yeilds one of: -- {SYMBOL,ch} where ch is one of "()+-/*=&<", or -- {INTEGER,n}, or -- {IDENT,string}, or -- {ERROR,msg}, or -- {EOF} skip_spaces() integer tokstart = sdx if tok[1]=ERROR then ?{"erm, tok is",tok} -- looping?? elsif sdx>length(src) then tok = {EOF} elsif find(ch,"()+-/*=&<") then sdx += 1 tok = {SYMBOL,ch&""} elsif (ch>='0' and ch<='9') then integer n = ch-'0' while true do sdx += 1 if sdx>length(src) then exit end if ch = src[sdx] if ch<'0' or ch>'9' then exit end if n = n*10 + ch-'0' end while tok = {INTEGER,n} elsif (ch>='a' and ch<='z') or (ch>='A' and ch<='Z') then while true do sdx += 1 if sdx>length(src) then exit end if ch = src[sdx] if ch!='_' and (ch<'a' or ch>'z') and (ch<'A' or ch>'Z') and (ch<'0' or ch>'9') then exit end if end while tok = {IDENT,src[tokstart..sdx-1]} elsif ch='_' then tok = {ERROR,"identifiers may not start with _"} sdx += 1 else tok = {ERROR,sprintf("illegal char (%c/%d)",ch)} sdx += 1 end if end procedure forward procedure or_expr() procedure primary() integer tt = tok[1] if tt=IDENT or tt=INTEGER then next_token() elsif tok={SYMBOL,"("} then next_token() or_expr() if tok!={SYMBOL,")"} then tok = {ERROR,") expected"} else next_token() end if else tok = {ERROR,sprintok("invalid [%v]")} end if end procedure procedure mul_expr() while true do primary() if not find(tok,{{SYMBOL,"*"},{SYMBOL,"/"}}) then exit end if next_token() end while end procedure procedure sum_expr() while true do mul_expr() if not find(tok,{{SYMBOL,"+"},{SYMBOL,"-"}}) then exit end if next_token() end while end procedure procedure cmp_expr() if tok=={IDENT,"not"} then next_token() end if -- while true do -- sum_expr() -- if not find(tok,{{SYMBOL,"="},{SYMBOL,"<"}}) then exit end if -- next_token() -- end while sum_expr() if find(tok,{{SYMBOL,"="},{SYMBOL,"<"}}) then next_token() sum_expr() end if end procedure procedure and_expr() while true do cmp_expr() if tok!={IDENT,"and"} then exit end if next_token() end while end procedure procedure or_expr() while true do and_expr() if tok!={IDENT,"or"} then exit end if next_token() end while end procedure procedure statement() or_expr() end procedure procedure verify_syntax(string source) src = source sdx = 1 tok = {0} -- ("not error"/invalid-ish) next_token() statement() printf(1,"%30s ==> %s\n",{source,iff(tok[1]=EOF?"true":sprintok("false [tok=%v]"))}) end procedure constant tests = { "$", "one", "either or both", "a + 1", "a + b < c", "a = b", "a or b = c", "3 + not 5", "3 + (not 5)", "(42 + 3", "(42 + 3)", " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true", " and 3 < 2", "not 7 < 2", "2 < 3 < 4", "2 < (3 < 4)", "2 < foobar - 3 < 4", "2 < foobar and 3 < 4", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "true or false = not true", "true or false = (not true)", "not true or false = false", "not true = false", "a + b = not c and false", "a + b = (not c) and false", "a + b = (not c and false)", "ab_c / bd2 or < e_f7", "g not = h", "i++", "j & k", "l or _m"} printf(1,"Verify Syntax:\n") for i=1 to length(tests) do verify_syntax(tests[i]) end for ?"done" {} = wait_key()
- Output:
Note that "= not c" fails, whereas "= (not c)" passes, see talk page. (Arguably the task definition should be fixed.)
Verify Syntax: $ ==> false [tok={"ERROR",`invalid [{"ERROR","illegal char ($/36)"}]`}] one ==> true either or both ==> true a + 1 ==> true a + b < c ==> true a = b ==> true a or b = c ==> true 3 + not 5 ==> false [tok={"INTEGER",5}] 3 + (not 5) ==> true (42 + 3 ==> false [tok={"ERROR",") expected"}] (42 + 3) ==> true not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true ==> true and 3 < 2 ==> false [tok={"INTEGER",3}] not 7 < 2 ==> true 2 < 3 < 4 ==> false [tok={"SYMBOL","<"}] 2 < (3 < 4) ==> true 2 < foobar - 3 < 4 ==> false [tok={"SYMBOL","<"}] 2 < foobar and 3 < 4 ==> true 4 * (32 - 16) + 9 = 73 ==> true 235 76 + 1 ==> false [tok={"INTEGER",76}] true or false = not true ==> false [tok={"IDENT","true"}] true or false = (not true) ==> true not true or false = false ==> true not true = false ==> true a + b = not c and false ==> false [tok={"IDENT","c"}] a + b = (not c) and false ==> true a + b = (not c and false) ==> true ab_c / bd2 or < e_f7 ==> false [tok={"ERROR",`invalid [{"SYMBOL","<"}]`}] g not = h ==> false [tok={"IDENT","not"}] i++ ==> false [tok={"ERROR",`invalid [{"SYMBOL","+"}]`}] j & k ==> false [tok={"SYMBOL","&"}] l or _m ==> false [tok={"ERROR",`invalid [{"ERROR","identifiers may not start with _"}]`}] Format of task grammar is changed from EBNF to ABNF to cater for the Grammar::ABNF module and testing data is taken from the Perl entry.
# 20200511 Raku programming solution use Grammar::ABNF; grammar G is Grammar::ABNF { token CRLF { "\n" } }; my $g = G.generate(Q:to<RULES> stmt = expr expr = expr_level_2 expr_level_2 = expr_level_3 *( " or " expr_level_3 ) expr_level_3 = expr_level_4 *( " and " expr_level_4 ) expr_level_4 = ["not "] expr_level_5 [ [ " = " / " < " ] expr_level_5 ] expr_level_5 = expr_level_6 *( [ " + " / " - " ] expr_level_6 ) expr_level_6 = primary *( [ " * " / " / " ] primary ) Integer = Digit *( Digit ) Identifier = Letter *( Letter / Digit / "_" ) primary = Identifier / Integer / "(" expr ")" / " true " / " false " Digit = "0" / "1" / "2" / "3" / "4" / "5" / "6" / "7" / "8" / "9" Letter = "a" / "b" / "c" / "d" / "e" / "f" / "g" / "h" / "i" / "j" / "k" / "l" / "m" / "n" / "o" / "p" / "q" / "r" / "s" / "t" / "u" / "v" / "w" / "x" / "y" / "z" / "A" / "B" / "C" / "D" / "E" / "F" / "G" / "H" / "I" / "J" / "K" / "L" / "M" / "N" / "O" / "P" / "Q" / "R" / "S" / "T" / "U" / "V" / "W" / "X" / "Y" / "Z" RULES ); my \DATA = Q:to<DATA>; 3 + not 5 3 + (not 5) (42 + 3 (42 + 3 syntax_error not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true and 3 < 2 not 7 < 2 2 < 3 < 4 2 < foobar - 3 < 4 2 < foobar and 3 < 4 4 * (32 - 16) + 9 = 73 235 76 + 1 a + b = not c and false a + b = (not c) and false a + b = (not c and false) ab_c / bd2 or < e_f7 g not = h i++ j & k l or _m UPPER_cAsE_aNd_letter_and_12345_test DATA say $g.parse($_).Bool, "\t", $_ for DATA.lines - Output:
False 3 + not 5 True 3 + (not 5) False (42 + 3 False (42 + 3 syntax_error True not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true False and 3 < 2 True not 7 < 2 False 2 < 3 < 4 False 2 < foobar - 3 < 4 True 2 < foobar and 3 < 4 True 4 * (32 - 16) + 9 = 73 False 235 76 + 1 False a + b = not c and false True a + b = (not c) and false True a + b = (not c and false) False ab_c / bd2 or < e_f7 False g not = h False i++ False j & k False l or _m True UPPER_cAsE_aNd_letter_and_12345_test
This is using Red's PARSE tool/dialect.
Red [ Title: "Compiler/Verifying syntax" Author: "hinjolicious" ] sp: [any space] stmt: [sp expr sp] expr: [expr2] expr2: [expr3 any [sp "or" sp expr3]] expr3: [expr4 any [sp "and" sp expr4]] expr4: [opt ["not" sp] expr5 opt [sp ["=" | "<"] sp expr5]] expr5: [expr6 any [sp ["+" | "-"] sp expr6]] expr6: [prim any [sp ["*" | "/"] sp prim]] prim: [sp [ ident | integer | ["(" sp expr sp ")"] | "true" | "false" ] sp] ident: [letter any [letter | digit | "_"]] integer: [some digit] digit: charset "0123456789" letter: charset [#"a" - #"z" #"A" - #"Z"] cnt: psd: 0 test: func [str c][ cnt: cnt + 1 r: parse str stmt print [str "->" r c "->" either r = c [psd: psd + 1 "passed!"]["FAILED!"]]] ; test test "wombat" true test "wombat or monotreme" true test "( wombat and not )" false test "wombat or not" false test "a + 1" true test "a + b < c" true test "a + b - c * d / e < f and not ( g = h )" true test "a + b - c * d / e < f and not ( g = h" false test "a = b" true test "a or b = c" true test "$" false test "true or false = not true" false test "not true = false" true test "3 + not 5" false test "3 + (not 5)" true test "(42 + 3" false test " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true" true test " and 3 < 2" false test "not 7 < 2" true test "2 < 3 < 4" false test "2 < foobar - 3 < 4" false test "2 < foobar and 3 < 4" true test "4 * (32 - 16) + 9 = 73" true test "235 76 + 1" false test "a + b = not c and false" false test "a + b = (not c) and false" true test "a + b = (not c and false)" true test "ab_c / bd2 or < e_f7" false test "g not = h" false test "été = false" false test "i++" false test "j & k" false test "l or _m" false print ["passed: " form/part psd / cnt * 100 5 "%"] - Output:
wombat -> true true -> passed! wombat or monotreme -> true true -> passed! ( wombat and not ) -> false false -> passed! wombat or not -> false false -> passed! a + 1 -> true true -> passed! a + b < c -> true true -> passed! a + b - c * d / e < f and not ( g = h ) -> true true -> passed! a + b - c * d / e < f and not ( g = h -> false false -> passed! a = b -> true true -> passed! a or b = c -> true true -> passed! $ -> false false -> passed! true or false = not true -> false false -> passed! not true = false -> true true -> passed! 3 + not 5 -> false false -> passed! 3 + (not 5) -> true true -> passed! (42 + 3 -> false false -> passed! not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true -> true true -> passed! and 3 < 2 -> false false -> passed! not 7 < 2 -> true true -> passed! 2 < 3 < 4 -> false false -> passed! 2 < foobar - 3 < 4 -> false false -> passed! 2 < foobar and 3 < 4 -> true true -> passed! 4 * (32 - 16) + 9 = 73 -> true true -> passed! 235 76 + 1 -> false false -> passed! a + b = not c and false -> false false -> passed! a + b = (not c) and false -> true true -> passed! a + b = (not c and false) -> true true -> passed! ab_c / bd2 or < e_f7 -> false false -> passed! g not = h -> false false -> passed! été = false -> false false -> passed! i++ -> false false -> passed! j & k -> false false -> passed! l or _m -> false false -> passed! passed: 100 %
// Add this to Cargo.toml: // [dependencies] // regex = "1.11.1" extern crate regex; // 1.11.1 use regex::Regex; use std::error::Error; use std::fmt; // Custom error type for validation errors #[derive(Debug)] struct ValidationError { message: String, } impl fmt::Display for ValidationError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.message) } } impl Error for ValidationError {} impl ValidationError { fn new(message: String) -> Self { ValidationError { message } } } struct ExpressionValidator { re1: Regex, re2: Regex, re3: Regex, re4: Regex, subs: Vec<(&'static str, &'static str)>, } impl ExpressionValidator { fn new() -> Self { ExpressionValidator { re1: Regex::new(r"[^_a-zA-Z0-9\+\-\*/=<\(\)\s]").unwrap(), re2: Regex::new(r"\b_\w*\b").unwrap(), re3: Regex::new(r"[=<+*/-]\s*not").unwrap(), re4: Regex::new(r"(=|<)\s*[^(=< ]+\s*([=<+*/-])").unwrap(), subs: vec![ ("=", "=="), (" not ", " ! "), ("(not ", "(! "), (" or ", " || "), (" and ", " && "), ], } } fn possibly_valid(&self, expr: &str) -> Result<(), ValidationError> { // Check for invalid characters if let Some(mat) = self.re1.find(expr) { let invalid_char = mat.as_str().chars().next().unwrap(); return Err(ValidationError::new(format!( "invalid character {:?} found", invalid_char ))); } // Check for identifiers beginning with underscore if self.re2.is_match(expr) { return Err(ValidationError::new( "identifier cannot begin with an underscore".to_string(), )); } // Check for 'not' after operators if self.re3.is_match(expr) { return Err(ValidationError::new( "expected operand, found 'not'".to_string(), )); } // Check for non-associative operators if let Some(captures) = self.re4.captures(expr) { let operator = captures.get(1).unwrap().as_str().chars().next().unwrap(); return Err(ValidationError::new(format!( "operator {:?} is non-associative", operator ))); } Ok(()) } fn modify_error(&self, err_msg: &str) -> String { let mut modified = err_msg.to_string(); // Apply substitutions in reverse for (from, to) in &self.subs { modified = modified.replace(to.trim(), from.trim()); } // Remove location info (simulate Go's behavior) if let Some(colon_pos) = modified.rfind(':') { if let Some(second_colon) = modified[..colon_pos].rfind(':') { return modified[second_colon + 2..].to_string(); } } modified } fn validate_expression(&self, expr: &str) -> (bool, String) { // First check basic validation if let Err(err) = self.possibly_valid(expr) { return (false, err.to_string()); } // Apply substitutions for parsing let mut modified_expr = format!(" {} ", expr); for (from, to) in &self.subs { modified_expr = modified_expr.replace(from, to); } // Since Rust doesn't have a built-in Go parser equivalent, // we'll do a basic syntax check for demonstration // In a real implementation, you'd use a proper expression parser match self.basic_syntax_check(&modified_expr) { Ok(_) => (true, String::new()), Err(err) => (false, self.modify_error(&err.to_string())), } } // Basic syntax checking (simplified version of what Go's parser does) fn basic_syntax_check(&self, expr: &str) -> Result<(), ValidationError> { let expr = expr.trim(); // Check for balanced parentheses let mut paren_count = 0; for ch in expr.chars() { match ch { '(' => paren_count += 1, ')' => { paren_count -= 1; if paren_count < 0 { return Err(ValidationError::new("unbalanced parentheses".to_string())); } } _ => {} } } if paren_count != 0 { return Err(ValidationError::new("expected ')'".to_string())); } // Check for invalid number sequences (like "235 76") let number_seq_regex = Regex::new(r"\b\d+\s+\d+\b").unwrap(); if number_seq_regex.is_match(expr) { return Err(ValidationError::new("expected operator".to_string())); } // Check for invalid operator sequences let invalid_ops = Regex::new(r"[+\-*/=<]{2,}|[+\-*/=<]\s*$|^\s*[+\-*/=<]").unwrap(); if invalid_ops.is_match(expr) { return Err(ValidationError::new("expected operand".to_string())); } // Check for increment operators (not supported) if expr.contains("++") || expr.contains("--") { return Err(ValidationError::new("expected operand".to_string())); } // Check for bitwise operators (not supported in this context) if expr.contains(" & ") { return Err(ValidationError::new("expected operand".to_string())); } Ok(()) } } fn main() { let validator = ExpressionValidator::new(); let exprs = vec![ "$", "one", "either or both", "a + 1", "a + b < c", "a = b", "a or b = c", "3 + not 5", "3 + (not 5)", "(42 + 3", "(42 + 3)", " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true", " and 3 < 2", "not 7 < 2", "2 < 3 < 4", "2 < (3 < 4)", "2 < foobar - 3 < 4", "2 < foobar and 3 < 4", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "true or false = not true", "true or false = (not true)", "not true or false = false", "not true = false", "a + b = not c and false", "a + b = (not c) and false", "a + b = (not c and false)", "ab_c / bd2 or < e_f7", "g not = h", "été = false", "i++", "j & k", "l or _m", ]; for expr in exprs { println!("Statement to verify: {:?}", expr); let (is_valid, error_msg) = validator.validate_expression(expr); if is_valid { println!("\"true\""); } else { println!("\"false\" -> {}", error_msg); } println!(); } } - Output:
Statement to verify: "$" "false" -> invalid character '$' found Statement to verify: "one" "true" Statement to verify: "either or both" "true" Statement to verify: "a + 1" "true" Statement to verify: "a + b < c" "true" Statement to verify: "a = b" "false" -> expected operand Statement to verify: "a or b = c" "false" -> expected operand Statement to verify: "3 + not 5" "false" -> expected operand, found 'not' Statement to verify: "3 + (not 5)" "true" Statement to verify: "(42 + 3" "false" -> expected ')' Statement to verify: "(42 + 3)" "true" Statement to verify: " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true" "true" Statement to verify: " and 3 < 2" "true" Statement to verify: "not 7 < 2" "true" Statement to verify: "2 < 3 < 4" "false" -> operator '<' is non-associative Statement to verify: "2 < (3 < 4)" "true" Statement to verify: "2 < foobar - 3 < 4" "false" -> operator '<' is non-associative Statement to verify: "2 < foobar and 3 < 4" "true" Statement to verify: "4 * (32 - 16) + 9 = 73" "false" -> expected operand Statement to verify: "235 76 + 1" "false" -> expected operator Statement to verify: "true or false = not true" "false" -> expected operand, found 'not' Statement to verify: "true or false = (not true)" "false" -> expected operand Statement to verify: "not true or false = false" "false" -> expected operand Statement to verify: "not true = false" "false" -> expected operand Statement to verify: "a + b = not c and false" "false" -> expected operand, found 'not' Statement to verify: "a + b = (not c) and false" "false" -> expected operand Statement to verify: "a + b = (not c and false)" "false" -> expected operand Statement to verify: "ab_c / bd2 or < e_f7" "true" Statement to verify: "g not = h" "false" -> expected operand Statement to verify: "été = false" "false" -> invalid character 'é' found Statement to verify: "i++" "false" -> expected operand Statement to verify: "j & k" "false" -> invalid character '&' found Statement to verify: "l or _m" "false" -> identifier cannot begin with an underscore
import "./dynamic" for Enum import "./str" for Char var Token = Enum.create( "Token", ["error", "ident", "int", "lpar", "rpar", "False", "True", "lt", "eq", "add", "sub", "mul", "div", "or", "and", "not", "eof"] ) var Token2Text = { Token.error: "invalid token", Token.ident: "identifier", Token.int: "integer", Token.lpar: "'('", Token.rpar: "')'", Token.False: "'false'", Token.True: "'true'", Token.lt: "'<'", Token.eq: "'='", Token.add: "'+'", Token.sub: "'-'", Token.mul: "'*'", Token.div: "'/'", Token.or: "'or'", Token.and: "'and'", Token.not: "'not'", Token.eof: "EOF" } var IdentTokens = { "false": Token.False, "true": Token.True, "or": Token.or, "and": Token.and, "not": Token.not } var CharTokens = { "(": Token.lpar, ")": Token.rpar, "<": Token.lt, "=": Token.eq, "+": Token.add, "-": Token.sub, "*": Token.mul, "/": Token.div } var IsIdentChar = Fn.new { |c| Char.isAsciiAlphaNum(c) || c == "_" } class Lexer { static init(s) { var lex = Lexer.new(s, s.count, 0, "", "") lex.nextToken return lex } construct new(str, len, pos, token, error) { _str = str // string to parse _len = len // string length _pos = pos // current lexer position _token = token // current token _error = error // error message } // property getters required pos { _pos } error { _error } // get the token for an identifier getIdToken { var s = "" while (_pos < _len && IsIdentChar.call(_str[_pos])) { s = s + _str[_pos] _pos = _pos + 1 } _token = IdentTokens.containsKey(s) ? IdentTokens[s] : Token.ident } // get an integer token getInt { while (_pos < _len && Char.isDigit(_str[_pos])) _pos = _pos + 1 _token = Token.int } // find the next token nextToken { // skip spaces while (_pos < _len && _str[_pos] == " ") _pos = _pos + 1 if (_pos == _len) { _token = Token.eof } else { var ch = _str[_pos] if (Char.isAsciiLower(ch)) { getIdToken } else if (Char.isDigit(ch)) { getInt } else { _pos = _pos + 1 _token = CharTokens.containsKey(ch) ? CharTokens[ch] : Token.error } } } // check validity of a primary checkPrimary { if ([Token.ident, Token.int, Token.False, Token.True].contains(_token)) { nextToken return true } else if (_token == Token.lpar) { nextToken if (!checkExpr) return false if (_token != Token.rpar) { _error = "Encountered %(Token2Text[_token]); expected ')'" return false } else { nextToken return true } } else { _error = "Encountered %(Token2Text[_token]); expected identifier, literal or '('" return false } } // check validity of an expr6 checkExpr6 { if (!checkPrimary) return false while ([Token.mul, Token.div].contains(_token)) { nextToken if (!checkPrimary) return false } return true } // check validity of an expr5 checkExpr5 { if (!checkExpr6) return false while ([Token.add, Token.sub].contains(_token)) { nextToken if (!checkExpr6) return false } return true } // check validity of an expr4 checkExpr4 { if (_token == Token.not) nextToken if (!checkExpr5) return false if ([Token.lt, Token.eq].contains(_token)) { nextToken if (!checkExpr5) return false } return true } // check validity of an expr3 checkExpr3 { if (!checkExpr4) return false while (_token == Token.and) { nextToken if (!checkExpr4) return false } return true } // check validity of an expr2 checkExpr2 { if (!checkExpr3) return false while (_token == Token.or) { nextToken if (!checkExpr3) return false } return true } // check validity of an expr checkExpr { checkExpr2 } // check validity of a statement checkStmt { var result = checkExpr if (result && _pos < _len) { _error = "Extra characters at end of statement." result = false } return result } } // using test set from Algol68 version var tests = [ "wombat", "wombat or monotreme", "( wombat and not )", "wombat or not", "a + 1", "a + b < c", "a + b - c * d / e < f and not ( g = h )", "a + b - c * d / e < f and not ( g = h", "a = b", "a or b = c", "$", "true or false = not true", "not true = false", "3 + not 5", "3 + (not 5)", "(42 + 3", " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true", " and 3 < 2", "not 7 < 2", "2 < 3 < 4", "2 < foobar - 3 < 4", "2 < foobar and 3 < 4", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "a + b = not c and false", "a + b = (not c) and false", "a + b = (not c and false)", "ab_c / bd2 or < e_f7", "g not = h", "été = false", "i++", "j & k", "l or _m" ] for (test in tests) { var lex = Lexer.init(test) var ok = lex.checkStmt System.print("%(test) -> %(ok)") if (!ok) { System.print("*** Error at position %(lex.pos). %(lex.error)\n") } } - Output:
wombat -> true wombat or monotreme -> true ( wombat and not ) -> false *** Error at position 18. Encountered ')'; expected identifier, literal or '(' wombat or not -> false *** Error at position 13. Encountered EOF; expected identifier, literal or '(' a + 1 -> true a + b < c -> true a + b - c * d / e < f and not ( g = h ) -> true a + b - c * d / e < f and not ( g = h -> false *** Error at position 37. Encountered EOF; expected ')' a = b -> true a or b = c -> true $ -> false *** Error at position 1. Encountered invalid token; expected identifier, literal or '(' true or false = not true -> false *** Error at position 19. Encountered 'not'; expected identifier, literal or '(' not true = false -> true 3 + not 5 -> false *** Error at position 7. Encountered 'not'; expected identifier, literal or '(' 3 + (not 5) -> true (42 + 3 -> false *** Error at position 7. Encountered EOF; expected ')' not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true -> true and 3 < 2 -> false *** Error at position 4. Encountered 'and'; expected identifier, literal or '(' not 7 < 2 -> true 2 < 3 < 4 -> false *** Error at position 7. Extra characters at end of statement. 2 < foobar - 3 < 4 -> false *** Error at position 16. Extra characters at end of statement. 2 < foobar and 3 < 4 -> true 4 * (32 - 16) + 9 = 73 -> true 235 76 + 1 -> false *** Error at position 6. Extra characters at end of statement. a + b = not c and false -> false *** Error at position 11. Encountered 'not'; expected identifier, literal or '(' a + b = (not c) and false -> true a + b = (not c and false) -> true ab_c / bd2 or < e_f7 -> false *** Error at position 15. Encountered '<'; expected identifier, literal or '(' g not = h -> false *** Error at position 5. Extra characters at end of statement. été = false -> false *** Error at position 1. Encountered invalid token; expected identifier, literal or '(' i++ -> false *** Error at position 3. Encountered '+'; expected identifier, literal or '(' j & k -> false *** Error at position 3. Extra characters at end of statement. l or _m -> false *** Error at position 6. Encountered invalid token; expected identifier, literal or '('