diff --git a/c-grammar.jsonic b/c-grammar.jsonic index 3d85776..fe7acb4 100644 --- a/c-grammar.jsonic +++ b/c-grammar.jsonic @@ -61,11 +61,11 @@ external_declaration: { open: [ { s: '#ZZ' b: 1 g: 'extdecl-eof' } - # Phase H: PP_HASH dispatches to preprocessor_directive. - { s: 'PP_HASH PP_HASH' c: '@is-first-iter' b: 2 + # [extension: preprocessor] PP_HASH dispatches to preprocessor_directive. + { s: 'PP_HASH PP_HASH' c: '@ext-and-first-iter' b: 2 p: 'preprocessor_directive' a: '@mark-new-path' g: 'extdecl-pp-2' } - { s: 'PP_HASH #ANY_C_TOKEN' c: '@is-first-iter' b: 2 + { s: 'PP_HASH #ANY_C_TOKEN' c: '@ext-and-first-iter' b: 2 p: 'preprocessor_directive' a: '@mark-new-path' g: 'extdecl-pp' } # Phase O: top-level static_assert dispatches into the @@ -79,16 +79,37 @@ { s: 'KW__STATIC_ASSERT' c: '@is-first-iter' b: 1 p: 'static_assert_declaration' a: '@mark-new-path' g: 'extdecl-sa-1' } - # Phase I.2: top-level GCC __asm__ block. - { s: 'KW_ASM' c: '@is-first-iter' b: 1 + # [extension: gcc-asm] top-level inline assembly block. + { s: 'KW_ASM' c: '@ext-and-first-iter' b: 1 p: 'asm_statement' a: '@mark-new-path' g: 'extdecl-asm' } - { s: 'KW___ASM' c: '@is-first-iter' b: 1 + { s: 'KW___ASM' c: '@ext-and-first-iter' b: 1 p: 'asm_statement' a: '@mark-new-path' g: 'extdecl-asm-1' } - { s: 'KW___ASM__' c: '@is-first-iter' b: 1 + { s: 'KW___ASM__' c: '@ext-and-first-iter' b: 1 p: 'asm_statement' a: '@mark-new-path' g: 'extdecl-asm-2' } + # Plain-mode direct dispatches. When the head token clearly + # starts a declaration, push simple_declaration without a + # lookahead validator — the rule's own alts and per-rule k + # state disambiguate the actual shape. These run only when + # `extended: false` so we don't bypass the wildcard alts' + # @looks-simple-decl + isFunctionBodySupported gate that + # routes asm-body / pp-line function definitions to the + # legacy structuring path (those constructs only matter in + # extended mode anyway). 
+ { s: '#SIMPLE_TYPE_HEAD' c: '@plain-and-first-iter' b: 1 + p: 'simple_declaration' a: '@mark-new-path' + g: 'extdecl-plain-type' } + { s: '#STORAGE_PREFIX' c: '@plain-and-first-iter' b: 1 + p: 'simple_declaration' a: '@mark-new-path' + g: 'extdecl-plain-storage' } + { s: 'KW__BITINT' c: '@plain-and-first-iter' b: 1 + p: 'simple_declaration' a: '@mark-new-path' + g: 'extdecl-plain-bitint' } + { s: 'PUNC_LBRACKET PUNC_LBRACKET' c: '@plain-as23-and-first' + b: 2 p: 'simple_declaration' a: '@mark-new-path' + g: 'extdecl-plain-c23-attr' } # Phase B2.3 dispatch: cascading wildcard-token alts. Each one # matches a fixed number of tokens to force lookahead, then the # @looks-simple-decl cond validates the actual shape — optional @@ -141,16 +162,16 @@ # has wired up for full C precedence). simple_declaration: { open: [ - # Phase G: leading attribute specs dispatch into spec_loop - # which then consumes the attribute-spec sub-rule plus any - # following storage / type / tagged specifiers. + # Leading C23 attribute spec — plain C23. { s: 'PUNC_LBRACKET PUNC_LBRACKET' c: '@as23-adjacent-open' b: 2 p: 'spec_loop' g: 'simple-decl-attr-c23' } - { s: 'KW___ATTRIBUTE__' b: 1 p: 'spec_loop' + # [extension: gcc-attr] leading GCC __attribute__((…)) spec. + { s: 'KW___ATTRIBUTE__' c: '@extended-on' b: 1 p: 'spec_loop' g: 'simple-decl-attr-gcc' } - { s: 'KW___ATTRIBUTE' b: 1 p: 'spec_loop' + { s: 'KW___ATTRIBUTE' c: '@extended-on' b: 1 p: 'spec_loop' g: 'simple-decl-attr-gcc-1' } - { s: 'KW___DECLSPEC' b: 1 p: 'spec_loop' + # [extension: msvc-attr] leading __declspec(…) spec. + { s: 'KW___DECLSPEC' c: '@extended-on' b: 1 p: 'spec_loop' g: 'simple-decl-attr-msvc' } { s: '#STORAGE_PREFIX' a: '@absorb-spec-storage' p: 'spec_loop' g: 'simple-decl-storage' } @@ -163,6 +184,12 @@ g: 'simple-decl-union' } { s: 'KW_ENUM' b: 1 p: 'enum_specifier' g: 'simple-decl-enum' } + # C23 _BitInt(N) — leading. 
Back up over the keyword + # (b: 1) and push spec_loop, whose KW__BITINT alt absorbs it as + # a type spec and descends into bit_int_paren to capture the + # `(N)` width argument. + { s: 'KW__BITINT' b: 1 p: 'spec_loop' + g: 'simple-decl-bitint' } { s: '#SIMPLE_TYPE_HEAD' a: '@absorb-spec-type' p: 'spec_loop' g: 'simple-decl-type' } ] @@ -211,41 +238,75 @@ # owning declaration_specifiers list. spec_loop: { open: [ - # Phase G: attribute specs interleave freely with simple - # specifiers and tagged-type heads. + # Attribute specs interleave freely with simple specifiers and + # tagged-type heads. C23 [[…]] is plain; GCC __attribute__ / + # __attribute / MSVC __declspec are extensions. { s: 'PUNC_LBRACKET PUNC_LBRACKET' c: '@as23-adjacent-open' b: 2 p: 'attribute_spec_c23' g: 'spec-loop-attr-c23' } - { s: 'KW___ATTRIBUTE__' b: 1 p: 'attribute_spec_gcc' - g: 'spec-loop-attr-gcc' } - { s: 'KW___ATTRIBUTE' b: 1 p: 'attribute_spec_gcc' - g: 'spec-loop-attr-gcc-1' } - { s: 'KW___DECLSPEC' b: 1 p: 'attribute_spec_msvc' - g: 'spec-loop-attr-msvc' } - { s: '#SIMPLE_TYPE_HEAD' a: '@absorb-spec-type' g: 'spec-loop-type' } + # [extension: gcc-attr] + { s: 'KW___ATTRIBUTE__' c: '@extended-on' b: 1 + p: 'attribute_spec_gcc' g: 'spec-loop-attr-gcc' } + { s: 'KW___ATTRIBUTE' c: '@extended-on' b: 1 + p: 'attribute_spec_gcc' g: 'spec-loop-attr-gcc-1' } + # [extension: msvc-attr] + { s: 'KW___DECLSPEC' c: '@extended-on' b: 1 + p: 'attribute_spec_msvc' g: 'spec-loop-attr-msvc' } + # Tagged-type heads dispatch into struct_specifier / + # enum_specifier. These must come BEFORE #SIMPLE_TYPE_HEAD + # because KW_STRUCT/KW_UNION/KW_ENUM are members of that set + # too — without ordering, the generic alt would absorb them + # as plain tokens instead of producing a struct_specifier + # subtree. 
{ s: 'KW_STRUCT' b: 1 p: 'struct_specifier' g: 'spec-loop-struct' } { s: 'KW_UNION' b: 1 p: 'struct_specifier' g: 'spec-loop-union' } { s: 'KW_ENUM' b: 1 p: 'enum_specifier' g: 'spec-loop-enum' } + # C23 _BitInt(N) — width-parameterised integer type. Absorb + # the keyword as a normal type spec, then descend into the + # bit_int_paren sub-rule which captures `( )`. + { s: 'KW__BITINT' a: '@absorb-spec-type' + p: 'bit_int_paren' g: 'spec-loop-bitint' } + { s: '#SIMPLE_TYPE_HEAD' a: '@absorb-spec-type' g: 'spec-loop-type' } # If the next token isn't a specifier, fall through without # consuming so the parent can pick up the declarator. { s: [] g: 'spec-loop-empty' } ] close: [ + # See open above for the plain-vs-extension split. { s: 'PUNC_LBRACKET PUNC_LBRACKET' c: '@as23-adjacent-open' b: 2 p: 'attribute_spec_c23' g: 'spec-loop-more-attr-c23' } - { s: 'KW___ATTRIBUTE__' b: 1 p: 'attribute_spec_gcc' - g: 'spec-loop-more-attr-gcc' } - { s: 'KW___ATTRIBUTE' b: 1 p: 'attribute_spec_gcc' - g: 'spec-loop-more-attr-gcc-1' } - { s: 'KW___DECLSPEC' b: 1 p: 'attribute_spec_msvc' - g: 'spec-loop-more-attr-msvc' } - { s: '#SIMPLE_TYPE_HEAD' b: 1 r: 'spec_loop' g: 'spec-loop-more' } + # [extension: gcc-attr] + { s: 'KW___ATTRIBUTE__' c: '@extended-on' b: 1 + p: 'attribute_spec_gcc' g: 'spec-loop-more-attr-gcc' } + { s: 'KW___ATTRIBUTE' c: '@extended-on' b: 1 + p: 'attribute_spec_gcc' g: 'spec-loop-more-attr-gcc-1' } + # [extension: msvc-attr] + { s: 'KW___DECLSPEC' c: '@extended-on' b: 1 + p: 'attribute_spec_msvc' g: 'spec-loop-more-attr-msvc' } + # Tagged-type heads must come before #SIMPLE_TYPE_HEAD here + # too (see open above for rationale). 
{ s: 'KW_STRUCT' b: 1 p: 'struct_specifier' g: 'spec-loop-more-struct' } { s: 'KW_UNION' b: 1 p: 'struct_specifier' g: 'spec-loop-more-union' } { s: 'KW_ENUM' b: 1 p: 'enum_specifier' g: 'spec-loop-more-enum' } + { s: 'KW__BITINT' a: '@absorb-spec-type' + p: 'bit_int_paren' g: 'spec-loop-more-bitint' } + { s: '#SIMPLE_TYPE_HEAD' b: 1 r: 'spec_loop' g: 'spec-loop-more' } { s: [] g: 'spec-loop-end' } ] } + # bit_int_paren: the `(N)` width argument of `_BitInt(N)`. + # The keyword has already been absorbed by spec_loop's KW__BITINT + # alt; this rule just takes `(`, then a width expression, then `)`. + bit_int_paren: { + open: [ + { s: 'PUNC_LPAREN' a: '@bip-take-lparen' g: 'bip-lparen' } + ] + close: [ + { s: 'PUNC_RPAREN' a: '@bip-take-rparen' g: 'bip-rparen' } + { p: 'val' a: '@bip-mark-val' g: 'bip-val' } + ] + } + # init_declarator: pointer* ID (= val)? # Each invocation builds its own init_declarator node. The # parent simple_declaration's bc pushes it onto the @@ -295,11 +356,26 @@ # only exercises ` ID ( … ) ;`. { s: 'PUNC_LPAREN' b: 1 p: 'function_postfix' r: 'init_declarator' g: 'idecl-fn' } - { s: 'PUNC_ASSIGN' p: 'val' a: '@idecl-take-eq' g: 'idecl-eq' } + { s: 'PUNC_ASSIGN' p: 'initializer' a: '@idecl-take-eq' g: 'idecl-eq' } { s: [] g: 'idecl-end' } ] } + # initializer (phase Q2.1): wrapper around the RHS of `=` in an + # init_declarator. Dispatches to initializer_list for brace-init + # forms (`= { 1, 2, 3 }`, `= { [0] = 1 }`) and to val for + # expression initializers (`= 5`, `= (int)x`, `= f()`). Mirrors + # the legacy parseInitializer wrapper in structure.ts. + initializer: { + open: [ + { s: 'PUNC_LBRACE' b: 1 p: 'initializer_list' g: 'init-brace' } + { p: 'val' g: 'init-expr' } + ] + close: [ + { s: [] g: 'init-end' } + ] + } + # paren_inner_declarator (phase P): builds an inner declarator # node for a parenthesised sub-declarator (function-pointer # form). 
Mirrors init_declarator's pointer + ID + postfix logic @@ -344,9 +420,16 @@ # pointer_list: absorbs one or more `*` tokens. Pushes a # pointer node per `*` onto the parent init_declarator's # declarator children. + # pointer_list: `*` qualifiers* (then another `*` ...). + # Type qualifiers (`const`/`volatile`/`restrict`/`_Atomic`) after + # `*` qualify the pointer (e.g. `int * const p` is a const pointer + # to int). The qualifiers between successive `*`s are consumed by + # the pointer_qualifier_loop sub-rule, called from open after the + # `*` is absorbed. pointer_list: { open: [ - { s: 'PUNC_STAR' a: '@absorb-pointer' g: 'ptr' } + { s: 'PUNC_STAR' a: '@absorb-pointer' + p: 'pointer_qualifier_loop' g: 'ptr' } ] close: [ { s: 'PUNC_STAR' b: 1 r: 'pointer_list' g: 'ptr-more' } @@ -354,9 +437,30 @@ ] } + # pointer_qualifier_loop: zero or more type qualifiers that bind + # to the parent pointer_list's most recently-pushed pointer node. + # Each qualifier r:-recurses for the next; falls through cleanly + # when no qualifier is present. + pointer_qualifier_loop: { + open: [ + { s: 'KW_CONST' a: '@absorb-pq-const' + r: 'pointer_qualifier_loop' g: 'pql-const' } + { s: 'KW_VOLATILE' a: '@absorb-pq-const' + r: 'pointer_qualifier_loop' g: 'pql-volatile' } + { s: 'KW_RESTRICT' a: '@absorb-pq-const' + r: 'pointer_qualifier_loop' g: 'pql-restrict' } + { s: 'KW__ATOMIC' a: '@absorb-pq-const' + r: 'pointer_qualifier_loop' g: 'pql-atomic' } + { s: [] g: 'pql-empty' } + ] + close: [ + { s: [] g: 'pql-end' } + ] + } + # function_postfix: `( )` after the declarator name. - # Phase B3.1 covers the simplest forms: empty `()`, explicit - # `(void)`, and one or more concrete parameter declarations. + # Covers empty `()`, explicit `(void)`, prototype parameters, + # variadic, K&R-style identifier lists. function_postfix: { open: [ { s: 'PUNC_LPAREN' a: '@fn-open' g: 'fn-open' } @@ -364,18 +468,48 @@ close: [ # Empty parameter list: `()`. 
{ s: 'PUNC_RPAREN' a: '@fn-close' g: 'fn-end-empty' } + # K&R-style prototype: `(ID , ID , ...)` or `(ID)` where the + # ID is NOT a registered typedef (else it would lex as + # TYPEDEF_NAME and route through parameter_type_list). The + # two-token lookahead disambiguates from a typed parameter. + { s: 'ID PUNC_COMMA' b: 2 p: 'identifier_list' g: 'fn-knr-comma' } + { s: 'ID PUNC_RPAREN' b: 2 p: 'identifier_list' g: 'fn-knr-end' } # Otherwise descend into the parameter list, then re-enter # close (where the matching `)` is consumed). { p: 'parameter_type_list' g: 'fn-params' } ] } - # parameter_type_list: 1+ comma-separated parameter_declarations. + # identifier_list (K&R-style function prototype parameter list). + # `int f(a, b);` — the parameters are bare identifiers without + # type specifiers; the actual types are declared between the + # parameter list and the function body in pre-ANSI C, or are + # left implicit. We consume the comma-separated IDs and attach + # the identifier_list node onto function_postfix. + identifier_list: { + open: [ + { s: 'ID' a: '@idlist-take' g: 'idl-first' } + ] + close: [ + { s: 'PUNC_COMMA' a: '@idlist-comma' + r: 'identifier_list' g: 'idl-comma' } + { s: 'ID' a: '@idlist-take' + r: 'identifier_list' g: 'idl-more' } + { s: 'PUNC_RPAREN' b: 1 a: '@idlist-attach' + g: 'idl-end' } + ] + } + + # parameter_type_list: 1+ comma-separated parameter_declarations, + # optionally terminated by `, ...` for variadic functions. parameter_type_list: { open: [ { p: 'parameter_declaration' g: 'ptl-first' } ] close: [ + # Variadic ellipsis: `, ...` ends the list. + { s: 'PUNC_COMMA PUNC_ELLIPSIS' a: '@ptl-take-ellipsis' + g: 'ptl-ellipsis' } { s: 'PUNC_COMMA' a: '@ptl-comma' p: 'parameter_declaration' g: 'ptl-more' } { s: 'PUNC_RPAREN' b: 1 a: '@ptl-attach-and-end' g: 'ptl-end' } @@ -401,11 +535,66 @@ # absorbing more `*` and finally the optional ID. 
{ s: 'PUNC_STAR' a: '@param-pointer' r: 'parameter_declaration' g: 'param-ptr' } - { s: 'ID' a: '@param-name' g: 'param-id' } + # Returning from param_paren_inner: take the matching `)` of + # the outer paren-form. Must come BEFORE the PUNC_LPAREN + # alts so the closing token is consumed by the right alt. + { s: 'PUNC_RPAREN' c: '@param-paren-pending' + a: '@param-paren-close' r: 'parameter_declaration' + g: 'param-paren-rparen' } + # Function postfix following a parenthesised pointer, e.g. + # the `(int)` in `int (*)(int)`. Higher priority than the + # paren-form alt so it fires once we've already absorbed a + # paren-form declarator. + { s: 'PUNC_LPAREN' c: '@param-need-fn-postfix' + b: 1 p: 'function_postfix' + r: 'parameter_declaration' g: 'param-fn-postfix' } + # Parenthesised abstract / named declarator: `int (*)(int)`, + # `int (*fn)(int)`. Open paren feeds into a sub-rule that + # handles `(...)` containing pointer + optional ID; the + # outer `)` is consumed by the param-paren-rparen alt above. + # Cond gates against re-firing once a paren-form has already + # been absorbed. + { s: 'PUNC_LPAREN' c: '@param-can-paren-form' + a: '@param-paren-open' p: 'param_paren_inner' + g: 'param-paren' } + # Array postfix, e.g. `int arr[10]` or `int (*)[10]`. + { s: 'PUNC_LBRACKET' b: 1 p: 'array_postfix' + r: 'parameter_declaration' g: 'param-arr' } + { s: 'ID' a: '@param-name' + r: 'parameter_declaration' g: 'param-id' } { s: [] g: 'param-end' } ] } + # param_paren_inner: the contents of `(` ... `)` inside a + # parameter declarator. Mirrors paren_inner_declarator's role + # for init_declarator. Accepts pointer prefix(es) + optional ID, + # then exits on `)` (which the outer parameter_declaration + # consumes). + param_paren_inner: { + open: [ + # Re-entry after we absorbed the inner ID — fall through to + # close so the outer `)` is left for parameter_declaration. + { c: '@ppi-named' s: [] g: 'ppi-reentry' } + # Pointer prefix. 
+ { s: 'PUNC_STAR' a: '@ppi-pointer' + r: 'param_paren_inner' g: 'ppi-ptr' } + # Bare ID with no pointer (rare, e.g. `int (fp)(…)`). + { s: 'ID' a: '@ppi-name' g: 'ppi-id' } + # Abstract declarator, e.g. `(*)`: after the pointer alt re-enters + # open there is no name to take, so fall through to close. + { s: [] g: 'ppi-empty' } + ] + close: [ + # After absorbing `*`, an optional ID may follow. + { s: 'ID' a: '@ppi-name' g: 'ppi-id-after-ptr' } + # Stop before `)` so the outer parameter_declaration's close + # consumes it. + { s: 'PUNC_RPAREN' b: 1 g: 'ppi-end' } + { s: [] g: 'ppi-fall' } + ] + } + # param_spec_loop: zero or more additional type specifiers in a # parameter's spec list. param_spec_loop: { @@ -496,12 +685,17 @@ { s: 'KW_BREAK' b: 1 p: 'jump_statement' g: 'stmt-break' } { s: 'KW_CONTINUE' b: 1 p: 'jump_statement' g: 'stmt-continue' } { s: 'KW_GOTO' b: 1 p: 'jump_statement' g: 'stmt-goto' } - # GCC inline asm - { s: 'KW_ASM' b: 1 p: 'asm_statement' g: 'stmt-asm' } - { s: 'KW___ASM' b: 1 p: 'asm_statement' g: 'stmt-asm-1' } - { s: 'KW___ASM__' b: 1 p: 'asm_statement' g: 'stmt-asm-2' } - # Preprocessor line inside a function body (rare but legal). - { s: 'PP_HASH' b: 1 p: 'preprocessor_line' g: 'stmt-pp' } + # [extension: gcc-asm] inline assembly inside a body + { s: 'KW_ASM' c: '@extended-on' b: 1 p: 'asm_statement' + g: 'stmt-asm' } + { s: 'KW___ASM' c: '@extended-on' b: 1 p: 'asm_statement' + g: 'stmt-asm-1' } + { s: 'KW___ASM__' c: '@extended-on' b: 1 p: 'asm_statement' + g: 'stmt-asm-2' } + # [extension: preprocessor] preprocessor line inside a body + # (rare but legal). 
+ { s: 'PP_HASH' c: '@extended-on' b: 1 p: 'preprocessor_line' + g: 'stmt-pp' } # Expression statement (default fallthrough) { p: 'expression_statement' g: 'stmt-expr' } ] @@ -1321,7 +1515,10 @@ { s: 'PUNC_RBRACE' a: '@el-take-rbrace' g: 'el-end' } { s: 'PUNC_COMMA' a: '@el-take-comma' r: 'enumerator_list' g: 'el-comma' } - { p: 'enumerator' r: 'enumerator_list' g: 'el-enum' } + # Push enumerator for the next item; on return, re-fire close + # (the parser stays in close state). No `r:` — that would + # fight with `p:` and is effectively dead code. + { p: 'enumerator' g: 'el-enum' } ] } @@ -1332,6 +1529,10 @@ { s: 'TYPEDEF_NAME' a: '@enr-take-name' g: 'enr-td' } ] close: [ + # C23 attribute spec on the enumerator: `A [[deprecated]] = 1`. + { s: 'PUNC_LBRACKET PUNC_LBRACKET' c: '@as23-adjacent-open' + b: 2 p: 'attribute_spec_c23' + r: 'enumerator' g: 'enr-attr' } { c: '@enr-need-eq' s: 'PUNC_ASSIGN' a: '@enr-take-eq' p: 'val' g: 'enr-eq' } { s: [] g: 'enr-end' } @@ -1421,16 +1622,26 @@ { s: 'ID' a: '@ai-take-name' g: 'ai-id' } { s: 'TYPEDEF_NAME' a: '@ai-take-name' g: 'ai-td' } { s: 'MACRO_NAME' a: '@ai-take-name' g: 'ai-macro' } + # GCC attribute names can be keywords: `__attribute__((const))`, + # `__attribute__((noreturn))` etc. Mirror legacy parseAttributeItem + # which accepts any KW_* token in the name slot. + { s: '#KW_TOKEN' a: '@ai-take-name' g: 'ai-kw' } ] close: [ - # C23 namespaced form: ` :: `. - { c: '@ai-need-colon-1' s: 'PUNC_COLON' + # C23 namespaced form: ` :: `. Use a 2-token + # `s:` so parse_alts force-fetches both `:` tokens (without + # this, only the first colon is in ctx.t and the cond's + # ctx.t[1] check sees NOTOKEN and rejects). 
+ { c: '@ai-need-colon-1' s: 'PUNC_COLON PUNC_COLON' b: 1 a: '@ai-take-colon-1' r: 'attribute_item' g: 'ai-c1' } { c: '@ai-need-colon-2' s: 'PUNC_COLON' a: '@ai-take-colon-2' r: 'attribute_item' g: 'ai-c2' } { c: '@ai-need-prefixed-name' s: 'ID' a: '@ai-take-prefixed-name' r: 'attribute_item' - g: 'ai-pname' } + g: 'ai-pname-id' } + { c: '@ai-need-prefixed-name' s: '#KW_TOKEN' + a: '@ai-take-prefixed-name' r: 'attribute_item' + g: 'ai-pname-kw' } # Optional argument list. { c: '@ai-need-args' s: 'PUNC_LPAREN' b: 1 p: 'attribute_argument_list' g: 'ai-args' } @@ -1459,15 +1670,21 @@ # unrecognised name, falls through to a flat-token fallback. preprocessor_directive: { open: [ - { c: '@ppd-is-define' s: 'PP_HASH' b: 1 + # Use a 2-token `s:` so parse_alts force-fetches both `#` + # and the directive-name token (an ID lexed from the body + # of the directive). The cond inspects rule.o1.src to + # distinguish #define / #undef / #include / #if family / + # #pragma|error|warning|line. b: 2 backsteps both so the + # sub-rule's open re-takes them in its own parse. + { c: '@ppd-is-define' s: 'PP_HASH #ANY_C_TOKEN' b: 2 p: 'define_directive' g: 'ppd-define' } - { c: '@ppd-is-undef' s: 'PP_HASH' b: 1 + { c: '@ppd-is-undef' s: 'PP_HASH #ANY_C_TOKEN' b: 2 p: 'undef_directive' g: 'ppd-undef' } - { c: '@ppd-is-include' s: 'PP_HASH' b: 1 + { c: '@ppd-is-include' s: 'PP_HASH #ANY_C_TOKEN' b: 2 p: 'include_directive' g: 'ppd-include' } - { c: '@ppd-is-conditional' s: 'PP_HASH' b: 1 + { c: '@ppd-is-conditional' s: 'PP_HASH #ANY_C_TOKEN' b: 2 p: 'conditional_directive' g: 'ppd-cond' } - { c: '@ppd-is-simple' s: 'PP_HASH' b: 1 + { c: '@ppd-is-simple' s: 'PP_HASH #ANY_C_TOKEN' b: 2 p: 'simple_directive' g: 'ppd-simple' } # Fallback: unknown directive name. 
{ s: 'PP_HASH' b: 1 p: 'simple_directive' g: 'ppd-unknown' } diff --git a/src/c.ts b/src/c.ts index e733258..cd530ef 100644 --- a/src/c.ts +++ b/src/c.ts @@ -101,11 +101,11 @@ const grammarText = ` external_declaration: { open: [ { s: '#ZZ' b: 1 g: 'extdecl-eof' } - # Phase H: PP_HASH dispatches to preprocessor_directive. - { s: 'PP_HASH PP_HASH' c: '@is-first-iter' b: 2 + # [extension: preprocessor] PP_HASH dispatches to preprocessor_directive. + { s: 'PP_HASH PP_HASH' c: '@ext-and-first-iter' b: 2 p: 'preprocessor_directive' a: '@mark-new-path' g: 'extdecl-pp-2' } - { s: 'PP_HASH #ANY_C_TOKEN' c: '@is-first-iter' b: 2 + { s: 'PP_HASH #ANY_C_TOKEN' c: '@ext-and-first-iter' b: 2 p: 'preprocessor_directive' a: '@mark-new-path' g: 'extdecl-pp' } # Phase O: top-level static_assert dispatches into the @@ -119,16 +119,37 @@ const grammarText = ` { s: 'KW__STATIC_ASSERT' c: '@is-first-iter' b: 1 p: 'static_assert_declaration' a: '@mark-new-path' g: 'extdecl-sa-1' } - # Phase I.2: top-level GCC __asm__ block. - { s: 'KW_ASM' c: '@is-first-iter' b: 1 + # [extension: gcc-asm] top-level inline assembly block. + { s: 'KW_ASM' c: '@ext-and-first-iter' b: 1 p: 'asm_statement' a: '@mark-new-path' g: 'extdecl-asm' } - { s: 'KW___ASM' c: '@is-first-iter' b: 1 + { s: 'KW___ASM' c: '@ext-and-first-iter' b: 1 p: 'asm_statement' a: '@mark-new-path' g: 'extdecl-asm-1' } - { s: 'KW___ASM__' c: '@is-first-iter' b: 1 + { s: 'KW___ASM__' c: '@ext-and-first-iter' b: 1 p: 'asm_statement' a: '@mark-new-path' g: 'extdecl-asm-2' } + # Plain-mode direct dispatches. When the head token clearly + # starts a declaration, push simple_declaration without a + # lookahead validator — the rule's own alts and per-rule k + # state disambiguate the actual shape. 
These run only when + # \`extended: false\` so we don't bypass the wildcard alts' + # @looks-simple-decl + isFunctionBodySupported gate that + # routes asm-body / pp-line function definitions to the + # legacy structuring path (those constructs only matter in + # extended mode anyway). + { s: '#SIMPLE_TYPE_HEAD' c: '@plain-and-first-iter' b: 1 + p: 'simple_declaration' a: '@mark-new-path' + g: 'extdecl-plain-type' } + { s: '#STORAGE_PREFIX' c: '@plain-and-first-iter' b: 1 + p: 'simple_declaration' a: '@mark-new-path' + g: 'extdecl-plain-storage' } + { s: 'KW__BITINT' c: '@plain-and-first-iter' b: 1 + p: 'simple_declaration' a: '@mark-new-path' + g: 'extdecl-plain-bitint' } + { s: 'PUNC_LBRACKET PUNC_LBRACKET' c: '@plain-as23-and-first' + b: 2 p: 'simple_declaration' a: '@mark-new-path' + g: 'extdecl-plain-c23-attr' } # Phase B2.3 dispatch: cascading wildcard-token alts. Each one # matches a fixed number of tokens to force lookahead, then the # @looks-simple-decl cond validates the actual shape — optional @@ -181,16 +202,16 @@ const grammarText = ` # has wired up for full C precedence). simple_declaration: { open: [ - # Phase G: leading attribute specs dispatch into spec_loop - # which then consumes the attribute-spec sub-rule plus any - # following storage / type / tagged specifiers. + # Leading C23 attribute spec — plain C23. { s: 'PUNC_LBRACKET PUNC_LBRACKET' c: '@as23-adjacent-open' b: 2 p: 'spec_loop' g: 'simple-decl-attr-c23' } - { s: 'KW___ATTRIBUTE__' b: 1 p: 'spec_loop' + # [extension: gcc-attr] leading GCC __attribute__((…)) spec. + { s: 'KW___ATTRIBUTE__' c: '@extended-on' b: 1 p: 'spec_loop' g: 'simple-decl-attr-gcc' } - { s: 'KW___ATTRIBUTE' b: 1 p: 'spec_loop' + { s: 'KW___ATTRIBUTE' c: '@extended-on' b: 1 p: 'spec_loop' g: 'simple-decl-attr-gcc-1' } - { s: 'KW___DECLSPEC' b: 1 p: 'spec_loop' + # [extension: msvc-attr] leading __declspec(…) spec. 
+ { s: 'KW___DECLSPEC' c: '@extended-on' b: 1 p: 'spec_loop' g: 'simple-decl-attr-msvc' } { s: '#STORAGE_PREFIX' a: '@absorb-spec-storage' p: 'spec_loop' g: 'simple-decl-storage' } @@ -203,6 +224,12 @@ const grammarText = ` g: 'simple-decl-union' } { s: 'KW_ENUM' b: 1 p: 'enum_specifier' g: 'simple-decl-enum' } + # C23 _BitInt(N) — leading. Back up over the keyword + # (b: 1) and push spec_loop, whose KW__BITINT alt absorbs it as + # a type spec and descends into bit_int_paren to capture the + # \`(N)\` width argument. + { s: 'KW__BITINT' b: 1 p: 'spec_loop' + g: 'simple-decl-bitint' } { s: '#SIMPLE_TYPE_HEAD' a: '@absorb-spec-type' p: 'spec_loop' g: 'simple-decl-type' } ] @@ -251,41 +278,75 @@ const grammarText = ` # owning declaration_specifiers list. spec_loop: { open: [ - # Phase G: attribute specs interleave freely with simple - # specifiers and tagged-type heads. + # Attribute specs interleave freely with simple specifiers and + # tagged-type heads. C23 [[…]] is plain; GCC __attribute__ / + # __attribute / MSVC __declspec are extensions. { s: 'PUNC_LBRACKET PUNC_LBRACKET' c: '@as23-adjacent-open' b: 2 p: 'attribute_spec_c23' g: 'spec-loop-attr-c23' } - { s: 'KW___ATTRIBUTE__' b: 1 p: 'attribute_spec_gcc' - g: 'spec-loop-attr-gcc' } - { s: 'KW___ATTRIBUTE' b: 1 p: 'attribute_spec_gcc' - g: 'spec-loop-attr-gcc-1' } - { s: 'KW___DECLSPEC' b: 1 p: 'attribute_spec_msvc' - g: 'spec-loop-attr-msvc' } - { s: '#SIMPLE_TYPE_HEAD' a: '@absorb-spec-type' g: 'spec-loop-type' } + # [extension: gcc-attr] + { s: 'KW___ATTRIBUTE__' c: '@extended-on' b: 1 + p: 'attribute_spec_gcc' g: 'spec-loop-attr-gcc' } + { s: 'KW___ATTRIBUTE' c: '@extended-on' b: 1 + p: 'attribute_spec_gcc' g: 'spec-loop-attr-gcc-1' } + # [extension: msvc-attr] + { s: 'KW___DECLSPEC' c: '@extended-on' b: 1 + p: 'attribute_spec_msvc' g: 'spec-loop-attr-msvc' } + # Tagged-type heads dispatch into struct_specifier / + # enum_specifier. 
These must come BEFORE #SIMPLE_TYPE_HEAD + # because KW_STRUCT/KW_UNION/KW_ENUM are members of that set + # too — without ordering, the generic alt would absorb them + # as plain tokens instead of producing a struct_specifier + # subtree. { s: 'KW_STRUCT' b: 1 p: 'struct_specifier' g: 'spec-loop-struct' } { s: 'KW_UNION' b: 1 p: 'struct_specifier' g: 'spec-loop-union' } { s: 'KW_ENUM' b: 1 p: 'enum_specifier' g: 'spec-loop-enum' } + # C23 _BitInt(N) — width-parameterised integer type. Absorb + # the keyword as a normal type spec, then descend into the + # bit_int_paren sub-rule which captures \`( )\`. + { s: 'KW__BITINT' a: '@absorb-spec-type' + p: 'bit_int_paren' g: 'spec-loop-bitint' } + { s: '#SIMPLE_TYPE_HEAD' a: '@absorb-spec-type' g: 'spec-loop-type' } # If the next token isn't a specifier, fall through without # consuming so the parent can pick up the declarator. { s: [] g: 'spec-loop-empty' } ] close: [ + # See open above for the plain-vs-extension split. { s: 'PUNC_LBRACKET PUNC_LBRACKET' c: '@as23-adjacent-open' b: 2 p: 'attribute_spec_c23' g: 'spec-loop-more-attr-c23' } - { s: 'KW___ATTRIBUTE__' b: 1 p: 'attribute_spec_gcc' - g: 'spec-loop-more-attr-gcc' } - { s: 'KW___ATTRIBUTE' b: 1 p: 'attribute_spec_gcc' - g: 'spec-loop-more-attr-gcc-1' } - { s: 'KW___DECLSPEC' b: 1 p: 'attribute_spec_msvc' - g: 'spec-loop-more-attr-msvc' } - { s: '#SIMPLE_TYPE_HEAD' b: 1 r: 'spec_loop' g: 'spec-loop-more' } + # [extension: gcc-attr] + { s: 'KW___ATTRIBUTE__' c: '@extended-on' b: 1 + p: 'attribute_spec_gcc' g: 'spec-loop-more-attr-gcc' } + { s: 'KW___ATTRIBUTE' c: '@extended-on' b: 1 + p: 'attribute_spec_gcc' g: 'spec-loop-more-attr-gcc-1' } + # [extension: msvc-attr] + { s: 'KW___DECLSPEC' c: '@extended-on' b: 1 + p: 'attribute_spec_msvc' g: 'spec-loop-more-attr-msvc' } + # Tagged-type heads must come before #SIMPLE_TYPE_HEAD here + # too (see open above for rationale). 
{ s: 'KW_STRUCT' b: 1 p: 'struct_specifier' g: 'spec-loop-more-struct' } { s: 'KW_UNION' b: 1 p: 'struct_specifier' g: 'spec-loop-more-union' } { s: 'KW_ENUM' b: 1 p: 'enum_specifier' g: 'spec-loop-more-enum' } + { s: 'KW__BITINT' a: '@absorb-spec-type' + p: 'bit_int_paren' g: 'spec-loop-more-bitint' } + { s: '#SIMPLE_TYPE_HEAD' b: 1 r: 'spec_loop' g: 'spec-loop-more' } { s: [] g: 'spec-loop-end' } ] } + # bit_int_paren: the \`(N)\` width argument of \`_BitInt(N)\`. + # The keyword has already been absorbed by spec_loop's KW__BITINT + # alt; this rule just takes \`(\`, then a width expression, then \`)\`. + bit_int_paren: { + open: [ + { s: 'PUNC_LPAREN' a: '@bip-take-lparen' g: 'bip-lparen' } + ] + close: [ + { s: 'PUNC_RPAREN' a: '@bip-take-rparen' g: 'bip-rparen' } + { p: 'val' a: '@bip-mark-val' g: 'bip-val' } + ] + } + # init_declarator: pointer* ID (= val)? # Each invocation builds its own init_declarator node. The # parent simple_declaration's bc pushes it onto the @@ -335,11 +396,26 @@ const grammarText = ` # only exercises \` ID ( … ) ;\`. { s: 'PUNC_LPAREN' b: 1 p: 'function_postfix' r: 'init_declarator' g: 'idecl-fn' } - { s: 'PUNC_ASSIGN' p: 'val' a: '@idecl-take-eq' g: 'idecl-eq' } + { s: 'PUNC_ASSIGN' p: 'initializer' a: '@idecl-take-eq' g: 'idecl-eq' } { s: [] g: 'idecl-end' } ] } + # initializer (phase Q2.1): wrapper around the RHS of \`=\` in an + # init_declarator. Dispatches to initializer_list for brace-init + # forms (\`= { 1, 2, 3 }\`, \`= { [0] = 1 }\`) and to val for + # expression initializers (\`= 5\`, \`= (int)x\`, \`= f()\`). Mirrors + # the legacy parseInitializer wrapper in structure.ts. + initializer: { + open: [ + { s: 'PUNC_LBRACE' b: 1 p: 'initializer_list' g: 'init-brace' } + { p: 'val' g: 'init-expr' } + ] + close: [ + { s: [] g: 'init-end' } + ] + } + # paren_inner_declarator (phase P): builds an inner declarator # node for a parenthesised sub-declarator (function-pointer # form). 
Mirrors init_declarator's pointer + ID + postfix logic @@ -384,9 +460,16 @@ const grammarText = ` # pointer_list: absorbs one or more \`*\` tokens. Pushes a # pointer node per \`*\` onto the parent init_declarator's # declarator children. + # pointer_list: \`*\` qualifiers* (then another \`*\` ...). + # Type qualifiers (\`const\`/\`volatile\`/\`restrict\`/\`_Atomic\`) after + # \`*\` qualify the pointer (e.g. \`int * const p\` is a const pointer + # to int). The qualifiers between successive \`*\`s are consumed by + # the pointer_qualifier_loop sub-rule, called from open after the + # \`*\` is absorbed. pointer_list: { open: [ - { s: 'PUNC_STAR' a: '@absorb-pointer' g: 'ptr' } + { s: 'PUNC_STAR' a: '@absorb-pointer' + p: 'pointer_qualifier_loop' g: 'ptr' } ] close: [ { s: 'PUNC_STAR' b: 1 r: 'pointer_list' g: 'ptr-more' } @@ -394,9 +477,30 @@ const grammarText = ` ] } + # pointer_qualifier_loop: zero or more type qualifiers that bind + # to the parent pointer_list's most recently-pushed pointer node. + # Each qualifier r:-recurses for the next; falls through cleanly + # when no qualifier is present. + pointer_qualifier_loop: { + open: [ + { s: 'KW_CONST' a: '@absorb-pq-const' + r: 'pointer_qualifier_loop' g: 'pql-const' } + { s: 'KW_VOLATILE' a: '@absorb-pq-const' + r: 'pointer_qualifier_loop' g: 'pql-volatile' } + { s: 'KW_RESTRICT' a: '@absorb-pq-const' + r: 'pointer_qualifier_loop' g: 'pql-restrict' } + { s: 'KW__ATOMIC' a: '@absorb-pq-const' + r: 'pointer_qualifier_loop' g: 'pql-atomic' } + { s: [] g: 'pql-empty' } + ] + close: [ + { s: [] g: 'pql-end' } + ] + } + # function_postfix: \`( )\` after the declarator name. - # Phase B3.1 covers the simplest forms: empty \`()\`, explicit - # \`(void)\`, and one or more concrete parameter declarations. + # Covers empty \`()\`, explicit \`(void)\`, prototype parameters, + # variadic, K&R-style identifier lists. 
function_postfix: { open: [ { s: 'PUNC_LPAREN' a: '@fn-open' g: 'fn-open' } @@ -404,18 +508,48 @@ const grammarText = ` close: [ # Empty parameter list: \`()\`. { s: 'PUNC_RPAREN' a: '@fn-close' g: 'fn-end-empty' } + # K&R-style prototype: \`(ID , ID , ...)\` or \`(ID)\` where the + # ID is NOT a registered typedef (else it would lex as + # TYPEDEF_NAME and route through parameter_type_list). The + # two-token lookahead disambiguates from a typed parameter. + { s: 'ID PUNC_COMMA' b: 2 p: 'identifier_list' g: 'fn-knr-comma' } + { s: 'ID PUNC_RPAREN' b: 2 p: 'identifier_list' g: 'fn-knr-end' } # Otherwise descend into the parameter list, then re-enter # close (where the matching \`)\` is consumed). { p: 'parameter_type_list' g: 'fn-params' } ] } - # parameter_type_list: 1+ comma-separated parameter_declarations. + # identifier_list (K&R-style function prototype parameter list). + # \`int f(a, b);\` — the parameters are bare identifiers without + # type specifiers; the actual types are declared between the + # parameter list and the function body in pre-ANSI C, or are + # left implicit. We consume the comma-separated IDs and attach + # the identifier_list node onto function_postfix. + identifier_list: { + open: [ + { s: 'ID' a: '@idlist-take' g: 'idl-first' } + ] + close: [ + { s: 'PUNC_COMMA' a: '@idlist-comma' + r: 'identifier_list' g: 'idl-comma' } + { s: 'ID' a: '@idlist-take' + r: 'identifier_list' g: 'idl-more' } + { s: 'PUNC_RPAREN' b: 1 a: '@idlist-attach' + g: 'idl-end' } + ] + } + + # parameter_type_list: 1+ comma-separated parameter_declarations, + # optionally terminated by \`, ...\` for variadic functions. parameter_type_list: { open: [ { p: 'parameter_declaration' g: 'ptl-first' } ] close: [ + # Variadic ellipsis: \`, ...\` ends the list. 
+ { s: 'PUNC_COMMA PUNC_ELLIPSIS' a: '@ptl-take-ellipsis' + g: 'ptl-ellipsis' } { s: 'PUNC_COMMA' a: '@ptl-comma' p: 'parameter_declaration' g: 'ptl-more' } { s: 'PUNC_RPAREN' b: 1 a: '@ptl-attach-and-end' g: 'ptl-end' } @@ -441,11 +575,66 @@ const grammarText = ` # absorbing more \`*\` and finally the optional ID. { s: 'PUNC_STAR' a: '@param-pointer' r: 'parameter_declaration' g: 'param-ptr' } - { s: 'ID' a: '@param-name' g: 'param-id' } + # Returning from param_paren_inner: take the matching \`)\` of + # the outer paren-form. Must come BEFORE the PUNC_LPAREN + # alts so the closing token is consumed by the right alt. + { s: 'PUNC_RPAREN' c: '@param-paren-pending' + a: '@param-paren-close' r: 'parameter_declaration' + g: 'param-paren-rparen' } + # Function postfix following a parenthesised pointer, e.g. + # the \`(int)\` in \`int (*)(int)\`. Higher priority than the + # paren-form alt so it fires once we've already absorbed a + # paren-form declarator. + { s: 'PUNC_LPAREN' c: '@param-need-fn-postfix' + b: 1 p: 'function_postfix' + r: 'parameter_declaration' g: 'param-fn-postfix' } + # Parenthesised abstract / named declarator: \`int (*)(int)\`, + # \`int (*fn)(int)\`. Open paren feeds into a sub-rule that + # handles \`(...)\` containing pointer + optional ID; the + # outer \`)\` is consumed by the param-paren-rparen alt above. + # Cond gates against re-firing once a paren-form has already + # been absorbed. + { s: 'PUNC_LPAREN' c: '@param-can-paren-form' + a: '@param-paren-open' p: 'param_paren_inner' + g: 'param-paren' } + # Array postfix, e.g. \`int arr[10]\` or \`int (*)[10]\`. + { s: 'PUNC_LBRACKET' b: 1 p: 'array_postfix' + r: 'parameter_declaration' g: 'param-arr' } + { s: 'ID' a: '@param-name' + r: 'parameter_declaration' g: 'param-id' } { s: [] g: 'param-end' } ] } + # param_paren_inner: the contents of \`(\` ... \`)\` inside a + # parameter declarator. Mirrors paren_inner_declarator's role + # for init_declarator. 
Accepts pointer prefix(es) + optional ID, + # then exits on \`)\` (which the outer parameter_declaration + # consumes). + param_paren_inner: { + open: [ + # Re-entry after we absorbed the inner ID — fall through to + # close so the outer \`)\` is left for parameter_declaration. + { c: '@ppi-named' s: [] g: 'ppi-reentry' } + # Pointer prefix. + { s: 'PUNC_STAR' a: '@ppi-pointer' + r: 'param_paren_inner' g: 'ppi-ptr' } + # Bare ID with no pointer (rare, e.g. \`int (fp)(…)\`). + { s: 'ID' a: '@ppi-name' g: 'ppi-id' } + # Abstract: nothing inside \`(*)\` already-handled by re-entry + # via the pointer alt; fallthrough to close on bare \`)\`. + { s: [] g: 'ppi-empty' } + ] + close: [ + # After absorbing \`*\`, an optional ID may follow. + { s: 'ID' a: '@ppi-name' g: 'ppi-id-after-ptr' } + # Stop before \`)\` so the outer parameter_declaration's close + # consumes it. + { s: 'PUNC_RPAREN' b: 1 g: 'ppi-end' } + { s: [] g: 'ppi-fall' } + ] + } + # param_spec_loop: zero or more additional type specifiers in a # parameter's spec list. param_spec_loop: { @@ -536,12 +725,17 @@ const grammarText = ` { s: 'KW_BREAK' b: 1 p: 'jump_statement' g: 'stmt-break' } { s: 'KW_CONTINUE' b: 1 p: 'jump_statement' g: 'stmt-continue' } { s: 'KW_GOTO' b: 1 p: 'jump_statement' g: 'stmt-goto' } - # GCC inline asm - { s: 'KW_ASM' b: 1 p: 'asm_statement' g: 'stmt-asm' } - { s: 'KW___ASM' b: 1 p: 'asm_statement' g: 'stmt-asm-1' } - { s: 'KW___ASM__' b: 1 p: 'asm_statement' g: 'stmt-asm-2' } - # Preprocessor line inside a function body (rare but legal). - { s: 'PP_HASH' b: 1 p: 'preprocessor_line' g: 'stmt-pp' } + # [extension: gcc-asm] inline assembly inside a body + { s: 'KW_ASM' c: '@extended-on' b: 1 p: 'asm_statement' + g: 'stmt-asm' } + { s: 'KW___ASM' c: '@extended-on' b: 1 p: 'asm_statement' + g: 'stmt-asm-1' } + { s: 'KW___ASM__' c: '@extended-on' b: 1 p: 'asm_statement' + g: 'stmt-asm-2' } + # [extension: preprocessor] preprocessor line inside a body + # (rare but legal). 
+ { s: 'PP_HASH' c: '@extended-on' b: 1 p: 'preprocessor_line' + g: 'stmt-pp' } # Expression statement (default fallthrough) { p: 'expression_statement' g: 'stmt-expr' } ] @@ -1361,7 +1555,10 @@ const grammarText = ` { s: 'PUNC_RBRACE' a: '@el-take-rbrace' g: 'el-end' } { s: 'PUNC_COMMA' a: '@el-take-comma' r: 'enumerator_list' g: 'el-comma' } - { p: 'enumerator' r: 'enumerator_list' g: 'el-enum' } + # Push enumerator for the next item; on return, re-fire close + # (the parser stays in close state). No \`r:\` — that would + # fight with \`p:\` and is effectively dead code. + { p: 'enumerator' g: 'el-enum' } ] } @@ -1372,6 +1569,10 @@ const grammarText = ` { s: 'TYPEDEF_NAME' a: '@enr-take-name' g: 'enr-td' } ] close: [ + # C23 attribute spec on the enumerator: \`A [[deprecated]] = 1\`. + { s: 'PUNC_LBRACKET PUNC_LBRACKET' c: '@as23-adjacent-open' + b: 2 p: 'attribute_spec_c23' + r: 'enumerator' g: 'enr-attr' } { c: '@enr-need-eq' s: 'PUNC_ASSIGN' a: '@enr-take-eq' p: 'val' g: 'enr-eq' } { s: [] g: 'enr-end' } @@ -1461,16 +1662,26 @@ const grammarText = ` { s: 'ID' a: '@ai-take-name' g: 'ai-id' } { s: 'TYPEDEF_NAME' a: '@ai-take-name' g: 'ai-td' } { s: 'MACRO_NAME' a: '@ai-take-name' g: 'ai-macro' } + # GCC attribute names can be keywords: \`__attribute__((const))\`, + # \`__attribute__((noreturn))\` etc. Mirror legacy parseAttributeItem + # which accepts any KW_* token in the name slot. + { s: '#KW_TOKEN' a: '@ai-take-name' g: 'ai-kw' } ] close: [ - # C23 namespaced form: \` :: \`. - { c: '@ai-need-colon-1' s: 'PUNC_COLON' + # C23 namespaced form: \` :: \`. Use a 2-token + # \`s:\` so parse_alts force-fetches both \`:\` tokens (without + # this, only the first colon is in ctx.t and the cond's + # ctx.t[1] check sees NOTOKEN and rejects). 
+ { c: '@ai-need-colon-1' s: 'PUNC_COLON PUNC_COLON' b: 1 a: '@ai-take-colon-1' r: 'attribute_item' g: 'ai-c1' } { c: '@ai-need-colon-2' s: 'PUNC_COLON' a: '@ai-take-colon-2' r: 'attribute_item' g: 'ai-c2' } { c: '@ai-need-prefixed-name' s: 'ID' a: '@ai-take-prefixed-name' r: 'attribute_item' - g: 'ai-pname' } + g: 'ai-pname-id' } + { c: '@ai-need-prefixed-name' s: '#KW_TOKEN' + a: '@ai-take-prefixed-name' r: 'attribute_item' + g: 'ai-pname-kw' } # Optional argument list. { c: '@ai-need-args' s: 'PUNC_LPAREN' b: 1 p: 'attribute_argument_list' g: 'ai-args' } @@ -1499,15 +1710,21 @@ const grammarText = ` # unrecognised name, falls through to a flat-token fallback. preprocessor_directive: { open: [ - { c: '@ppd-is-define' s: 'PP_HASH' b: 1 + # Use a 2-token \`s:\` so parse_alts force-fetches both \`#\` + # and the directive-name token (an ID lexed from the body + # of the directive). The cond inspects rule.o1.src to + # distinguish #define / #undef / #include / #if family / + # #pragma|error|warning|line. b: 2 backsteps both so the + # sub-rule's open re-takes them in its own parse. + { c: '@ppd-is-define' s: 'PP_HASH #ANY_C_TOKEN' b: 2 p: 'define_directive' g: 'ppd-define' } - { c: '@ppd-is-undef' s: 'PP_HASH' b: 1 + { c: '@ppd-is-undef' s: 'PP_HASH #ANY_C_TOKEN' b: 2 p: 'undef_directive' g: 'ppd-undef' } - { c: '@ppd-is-include' s: 'PP_HASH' b: 1 + { c: '@ppd-is-include' s: 'PP_HASH #ANY_C_TOKEN' b: 2 p: 'include_directive' g: 'ppd-include' } - { c: '@ppd-is-conditional' s: 'PP_HASH' b: 1 + { c: '@ppd-is-conditional' s: 'PP_HASH #ANY_C_TOKEN' b: 2 p: 'conditional_directive' g: 'ppd-cond' } - { c: '@ppd-is-simple' s: 'PP_HASH' b: 1 + { c: '@ppd-is-simple' s: 'PP_HASH #ANY_C_TOKEN' b: 2 p: 'simple_directive' g: 'ppd-simple' } # Fallback: unknown directive name. 
{ s: 'PP_HASH' b: 1 p: 'simple_directive' g: 'ppd-unknown' } @@ -1751,9 +1968,58 @@ function parseGrammar(text: string): any { } export interface COptions { - // Reserved for future flags (strict mode, dialect selection, etc.) + // Enable extension support: preprocessor (#include, #define, #if family, + // #pragma, #error, #warning, #undef, #line), GCC keywords/syntax + // (__attribute__, __asm__, __extension__, __inline__, __signed__, + // __volatile__, __const__, __restrict__, __typeof__, __alignof__), + // MSVC keywords (__declspec, __cdecl, __int8/16/32/64, __ptr32/64, + // etc.), Clang nullability annotations, and the structured CST + // shapes for inline assembly. + // + // When false (default), the parser only handles plain C23: keywords, + // punctuators, literals, declarations/definitions/statements/ + // expressions, C23 attributes [[...]], typedef tracking. Source that + // uses any extension construct will fail to parse cleanly. + // + // The default is `false` to keep the plain-C path the canonical + // reference. Real-world C source typically needs `{extended: true}`. + extended?: boolean +} + +// Resolve options with defaults. The plugin uses the resolved object +// throughout to gate extension-only code paths. +function resolveOptions(o?: COptions): Required { + return { + extended: o?.extended === true, + } } +// Extension-only grammar rule names. When `extended: false` the +// plugin strips these from the parsed grammar spec before passing it +// to jsonic.grammar(). This is the physical companion to the +// `c: '@extended-on'` dispatch-alt gating: the gates already make the +// rules unreachable; deleting them outright is housekeeping that +// makes plain-C mode self-evidently free of extension grammar. 
+// +// Plain C keeps: +// - C23 attribute_spec_c23 + attribute_item + attribute_argument_list +// - static_assert_declaration (C23 + _Static_assert) +// - statement_expression (GCC, kept in plain by user choice) +const EXTENSION_RULES: ReadonlyArray = [ + // GCC inline assembly + 'asm_statement', 'asm_template', 'asm_section', + 'asm_operand', 'asm_clobber', 'asm_label_ref', + // Preprocessor (in-body opaque + top-level structured) + 'preprocessor_line', + 'preprocessor_directive', + 'define_directive', 'macro_parameter_list', 'macro_body', + 'undef_directive', + 'include_directive', 'header_form', + 'conditional_directive', 'simple_directive', + // Compiler-specific attribute spec syntax + 'attribute_spec_gcc', 'attribute_spec_msvc', +] + // ---- AST types ------------------------------------------------------ export interface Span { start: number; end: number; line: number; col: number } @@ -1809,6 +2075,14 @@ function getCMeta(ctx: Context): CMeta { // own rule, deferred to phase B5). Everything else has a rule. const UNSUPPORTED_BODY_TOKENS = new Set([ 'KW_STATIC_ASSERT', 'KW__STATIC_ASSERT', + // Function bodies that contain extension constructs (inline + // assembly, preprocessor lines) currently go through the legacy + // structuring path which has full asm-operand / pp-line shapes. + // The grammar's asm_operand is opaque token-list — extending it to + // produce the structured shape is a separate task. Until then, + // gate any body that mentions asm/pp away from the grammar path. + 'KW_ASM', 'KW___ASM', 'KW___ASM__', + 'PP_HASH', ]) // Walk the function body starting at the token index of `{` and @@ -1828,29 +2102,32 @@ const UNSUPPORTED_BODY_TOKENS = new Set([ // at the keyword token. Consumes the optional tag ID, optional C23 // `: utype` for enums, and the optional balanced `{ … }` body. 
function skipTaggedSpec(ctx: Context, i: number): number { - const head = ctx.t[i] + const head = ctx.t[i] || fetchDeep(ctx, i) if (!head) return i if (head.name !== 'KW_STRUCT' && head.name !== 'KW_UNION' && head.name !== 'KW_ENUM') return i i++ // keyword - // Skip leading attribute specs that may follow the keyword. i = skipLeadingAttributes(ctx, i) - // Optional tag identifier. - const tagN = ctx.t[i]?.name + const tagN = fetchDeep(ctx, i)?.name if (tagN === 'ID' || tagN === 'TYPEDEF_NAME' || tagN === 'MACRO_NAME') { i++ } - // C23 enum fixed-underlying-type: `enum E : int { … }`. - if (head.name === 'KW_ENUM' && ctx.t[i]?.name === 'PUNC_COLON') { + if (head.name === 'KW_ENUM' && fetchDeep(ctx, i)?.name === 'PUNC_COLON') { i++ - while (simpleTypeHeadSet.has(ctx.t[i]?.name)) i++ + while (true) { + const n = fetchDeep(ctx, i)?.name + if (!n || !simpleTypeHeadSet.has(n)) break + i++ + } } - // Optional body. - if (ctx.t[i]?.name === 'PUNC_LBRACE') { + if (fetchDeep(ctx, i)?.name === 'PUNC_LBRACE') { let depth = 0 - while (ctx.t[i]) { - if (ctx.t[i].name === 'PUNC_LBRACE') depth++ - else if (ctx.t[i].name === 'PUNC_RBRACE') { + const start = i + while (i < start + 4096) { + const t = fetchDeep(ctx, i) + if (!t) break + if (t.name === 'PUNC_LBRACE') depth++ + else if (t.name === 'PUNC_RBRACE') { depth-- if (depth === 0) { i++; break } } @@ -1864,6 +2141,16 @@ function skipTaggedSpec(ctx: Context, i: number): number { // ctx.t[i]. Returns the new index. Recognises GCC `__attribute__` // / `__attribute` (with their `(( … ))` body), MSVC `__declspec` // (with `( … )`), and C23 `[[ … ]]` (two adjacent `[`s). +// +// Token access strategy: the leading-token check uses ctx.t directly +// (never fetchDeep) so a non-attribute lead returns without growing +// ctx.t — otherwise the side effect would extend the lookahead window +// for @looks-simple-decl's other ctx.t reads. 
fetchDeep is invoked +// only after we've confirmed an attribute keyword and need to scan +// past its body. The body scan is bounded to ~64 tokens past the +// keyword so a single call can't grow ctx.t unboundedly; bodies that +// overflow bail back to the original i, matching the original +// ctx.t-only walker's behaviour at the ctx.t boundary. function skipLeadingAttributes(ctx: Context, i: number): number { while (true) { const t = ctx.t[i] @@ -1871,41 +2158,62 @@ function skipLeadingAttributes(ctx: Context, i: number): number { if (t.name === 'KW___ATTRIBUTE__' || t.name === 'KW___ATTRIBUTE' || t.name === 'KW___DECLSPEC') { // Walk past the `(...)` (and inner `(...)` for GCC). - if (ctx.t[i + 1]?.name !== 'PUNC_LPAREN') return i + const fetchBound = i + 64 + const fetchAt = (idx: number) => + idx < ctx.t.length ? ctx.t[idx] : (idx <= fetchBound ? fetchDeep(ctx, idx) : undefined) + if (fetchAt(i + 1)?.name !== 'PUNC_LPAREN') return i let j = i + 1 let depth = 0 - while (ctx.t[j]) { - if (ctx.t[j].name === 'PUNC_LPAREN') depth++ - else if (ctx.t[j].name === 'PUNC_RPAREN') { + let sawClose = false + while (true) { + const tj = fetchAt(j) + if (!tj) break + if (tj.name === 'PUNC_LPAREN') depth++ + else if (tj.name === 'PUNC_RPAREN') { depth-- - if (depth === 0) { j++; break } + if (depth === 0) { j++; sawClose = true; break } } j++ } - if (!ctx.t[j - 1] || ctx.t[j - 1].name !== 'PUNC_RPAREN') return i + if (!sawClose) return i + // Extend ctx.t a bit past the attribute body so the caller's + // post-attribute lookups (e.g. @looks-simple-decl walking the + // declarator) don't fall off the dispatch window. 8 tokens + // covers the typical ` [postfix...] ` tail. 
+ for (let k = 0; k < 8; k++) fetchAt(j + k) i = j continue } - if (t.name === 'PUNC_LBRACKET' && - ctx.t[i + 1]?.name === 'PUNC_LBRACKET' && - (t as any).sI + (t as any).len === (ctx.t[i + 1] as any).sI) { + if (t.name === 'PUNC_LBRACKET') { + const tNext = ctx.t[i + 1] + if (!tNext || tNext.name !== 'PUNC_LBRACKET' || + (t as any).sI + (t as any).len !== (tNext as any).sI) return i // C23 [[ ... ]] — find matching ]] + const fetchBound = i + 64 + const fetchAt = (idx: number) => + idx < ctx.t.length ? ctx.t[idx] : (idx <= fetchBound ? fetchDeep(ctx, idx) : undefined) let j = i + 2 let depth = 0 - while (ctx.t[j]) { - if (ctx.t[j].name === 'PUNC_LBRACKET') depth++ - else if (ctx.t[j].name === 'PUNC_RBRACKET') { + let sawClose = false + while (true) { + const tj = fetchAt(j) + if (!tj) break + if (tj.name === 'PUNC_LBRACKET') depth++ + else if (tj.name === 'PUNC_RBRACKET') { + const tj1 = fetchAt(j + 1) if (depth === 0 && - ctx.t[j + 1]?.name === 'PUNC_RBRACKET' && - (ctx.t[j] as any).sI + (ctx.t[j] as any).len === - (ctx.t[j + 1] as any).sI) { + tj1?.name === 'PUNC_RBRACKET' && + (tj as any).sI + (tj as any).len === (tj1 as any).sI) { j += 2 + sawClose = true break } depth-- } j++ } + if (!sawClose) return i + for (let k = 0; k < 8; k++) fetchAt(j + k) i = j continue } @@ -1913,23 +2221,42 @@ function skipLeadingAttributes(ctx: Context, i: number): number { } } +// Maximum lookahead depth fetchDeep will search. The validator only +// needs to see past the current declaration; capping prevents the +// dispatch validator from lexing the entire translation unit when +// it accidentally walks into pathological input. +const FETCH_DEEP_CAP = 256 + function fetchDeep(ctx: Context, idx: number): Token | undefined { - if (idx < ctx.t.length && ctx.t[idx]) return ctx.t[idx] + // ctx.t is jsonic's lookahead ring buffer. 
Slots that haven't been + // lexed yet sit at indices >= length; slots the parser has + // explicitly cleared (after consume-and-shift) are filled with + // NOTOKEN — a sentinel with name='' and tin=-1. Both cases mean + // "no real token here yet" — but we never RETURN NOTOKEN, since + // callers using `t?.name === 'PUNC_LBRACE'` would treat NOTOKEN + // (name='') as a non-match and spin until the safety cap. + if (idx >= FETCH_DEEP_CAP) return undefined + const NOTOKEN: any = (ctx as any).NOTOKEN + const isReal = (t: any): boolean => !!t && t !== NOTOKEN && t.name !== '' + if (idx < ctx.t.length && isReal(ctx.t[idx])) return ctx.t[idx] const cfg: any = (ctx as any).cfg const IGNORE = cfg && cfg.tokenSetTins && cfg.tokenSetTins.IGNORE const lex: any = (ctx as any).lex if (!lex || typeof lex.next !== 'function' || !IGNORE) return undefined - let safety = 0 - while (ctx.t.length <= idx && safety++ < 4096) { + // Push at tail until ctx.t.length > idx. Filling NOTOKEN slots + // in place breaks the parser's lookahead invariant (re-consumed + // tokens) and inflates memory on csmith-style large inputs. + while (ctx.t.length <= idx) { let tkn: any do { tkn = lex.next((ctx as any).rule, undefined, undefined, ctx.t.length) } while (tkn && IGNORE[tkn.tin]) - if (!tkn) break + if (!tkn) return undefined ctx.t.push(tkn) if (tkn.name === '#ZZ') break } - return ctx.t[idx] + const result = idx < ctx.t.length ? ctx.t[idx] : undefined + return isReal(result) ? 
result : undefined } function isFunctionBodySupported(ctx: Context, lbraceI: number): boolean { @@ -1959,12 +2286,22 @@ function isFunctionBodySupported(ctx: Context, lbraceI: number): boolean { // ---- Plugin --------------------------------------------------------- -const C: any = function C(jsonic: Jsonic, _options: COptions): void { +const C: any = function C(jsonic: Jsonic, options: COptions): void { + const opts = resolveOptions(options) + // Stash resolved options on the jsonic instance so grammar-ref + // conditions and lex-time helpers can read them. + ;(jsonic as any).cOptions = opts // 1. Register punctuator token names with their fixed sources, and // keyword token names. We disable jsonic's built-in fixed-token // matcher so identifier boundaries (e.g. `int_value`) aren't broken // by a `int` keyword cut. + // + // Even when `extended` is false we register every keyword token name + // (cheap — just integer mapping) so ANY_C_TOKEN stays consistent. + // What changes under `!extended` is which lex matchers are installed, + // which grammar dispatch alts fire, and which @-action refs are + // registered. const fixedTokens: Record = {} for (const [name, src] of PUNCTUATORS) fixedTokens[name] = src for (const kw of [...C23_KEYWORDS, ...EXT_KEYWORDS]) { @@ -2032,6 +2369,7 @@ const C: any = function C(jsonic: Jsonic, _options: COptions): void { 'KW__THREAD_LOCAL', 'KW_THREAD_LOCAL', 'KW_CONSTEXPR', 'KW___THREAD', 'KW_INLINE', 'KW___INLINE__', 'KW___INLINE', + 'KW___EXTENSION__', ], // C-atom set used by val's paren-preval alt (call / subscript // detection). Distinct from jsonic's standard VAL set so the @@ -2041,6 +2379,13 @@ const C: any = function C(jsonic: Jsonic, _options: COptions): void { 'ID', 'MACRO_NAME', 'TYPEDEF_NAME', ], C_PAREN_OPEN: ['PUNC_LPAREN', 'PUNC_LBRACKET'], + // All keyword tokens. Used by attribute_item so a C keyword + // (e.g. 
`const` in `__attribute__((const))`) can stand in as + // the attribute name — mirroring legacy parseAttributeItem + // which accepts any `KW_*` token in that slot. + KW_TOKEN: [...C23_KEYWORDS, ...EXT_KEYWORDS] + .map((kw) => keywordTokenName(kw)!) + .filter((n) => !!n), // Phase C.2: sizeof / _Alignof / __alignof__ keyword set used // by val's open alt that disambiguates the type-form // (`sizeof ( int )`) from the expression-form @@ -2119,9 +2464,20 @@ const C: any = function C(jsonic: Jsonic, _options: COptions): void { // GrammarSpec via jsonic.grammar(). All conditions and actions are // bound to @-named refs defined in this file, keeping the grammar // file free of TypeScript noise. + // Grammar load. The full grammar text in c-grammar.jsonic defines + // both plain-C rules and extension rules (preprocessor, GCC asm, + // GCC/MSVC attribute specs). In plain-C mode (`extended: false`) + // the extension rules are stripped from the parsed spec — the + // dispatch alts that would have reached them are also gated with + // `c: '@extended-on'` so removing the rule definitions is purely + // belt-and-suspenders housekeeping. + const spec = parseGrammar(grammarText) + if (!opts.extended && spec && spec.rule) { + for (const name of EXTENSION_RULES) delete spec.rule[name] + } jsonic.grammar({ - ...parseGrammar(grammarText), - ref: grammarRefs, + ...spec, + ref: makeGrammarRefs(opts), }) // Phase A: install @jsonic/expr on the same jsonic instance with @@ -2147,7 +2503,27 @@ C.defaults = {} as any // entries auto-install as state actions on their rule (see jsonic // rules.js fnref handling); the rest are explicit alt actions / // conditions referenced via `a:` / `c:` clauses in the grammar. -const grammarRefs: Record = { +// grammarRefs is built by makeGrammarRefs(opts) so the @-action +// closures can capture the resolved plugin options. 
The exported +// `grammarRefs` const below is a default for reference / tests; the +// plugin builds a fresh refs map per install in C() above. +function makeGrammarRefs(opts: Required): Record { +return { + + // Extension gate ---- + // True when the plugin was installed with `extended: true`. Every + // extension dispatch alt (preprocessor, GCC/MSVC keywords, asm) is + // gated with `c: '@extended-on'` so the alt is dead under plain-C + // mode. The companion `@extended-off` is the negation, used by + // alts that should ONLY fire in plain-C mode (rare). + '@extended-on': (_rule: Rule): boolean => opts.extended === true, + '@extended-off': (_rule: Rule): boolean => opts.extended !== true, + // Combined: extension-on AND first iteration of the dispatching rule. + // Used by extdecl_loop's PP_HASH / KW_ASM extension dispatch alts + // which originally checked `@is-first-iter` alone. + '@ext-and-first-iter': (rule: Rule): boolean => + opts.extended === true && + (!rule.k.tokens || rule.k.tokens.length === 0), // translation_unit ---- '@translation_unit-bo': (rule: Rule): void => { @@ -2246,6 +2622,22 @@ const grammarRefs: Record = { '@is-first-iter': (rule: Rule): boolean => !rule.k.tokens || rule.k.tokens.length === 0, + // Plain-mode + first-iter combo. Used by external_declaration's + // direct-dispatch alts that route a clearly-shaped declaration + // straight to simple_declaration (no @looks-simple-decl lookahead + // walk). In extended mode the validator+wildcard cascade still + // handles dispatch so the asm/pp gate via isFunctionBodySupported + // continues to route those bodies to legacy structuring. 
+ '@plain-and-first-iter': (rule: Rule): boolean => + opts.extended !== true && + (!rule.k.tokens || rule.k.tokens.length === 0), + '@plain-as23-and-first': (rule: Rule, ctx: Context): boolean => { + if (opts.extended === true) return false + if (rule.k.tokens && rule.k.tokens.length > 0) return false + const a = ctx.t[0] as any, b = ctx.t[1] as any + return !!(a && b && a.sI + a.len === b.sI) + }, + // Phase B2.3: lookahead-based dispatch shape check. // Walks ctx.t and validates: optional STORAGE_PREFIX, 1+ // SIMPLE_TYPE_HEAD, then ID, then `;` or `=`. Combined with the @@ -2265,34 +2657,61 @@ const grammarRefs: Record = { // Walk through specifiers. Each iteration takes one specifier // and any tagged-type body (`struct S { … }` / `enum E { … }`) // so the post-spec terminator check looks at the right token. - while (i < 32) { - const n = ctx.t[i]?.name + while (i < 256) { + const n = fetchDeep(ctx, i)?.name if (!n) break if (n === 'KW_STRUCT' || n === 'KW_UNION' || n === 'KW_ENUM') { + const before = i i = skipTaggedSpec(ctx, i) + if (i === before) break // safety: no progress continue } if (simpleTypeHeadSet.has(n)) { i++ continue } + // C23 _BitInt(N) — width-parameterised type. Skip the keyword + // and its `(N)` parens as a single specifier so the validator + // sees the trailing ID/`;`. spec_loop dispatches into the + // bit_int_paren sub-rule at parse time. + if (n === 'KW__BITINT') { + i++ + if (fetchDeep(ctx, i)?.name !== 'PUNC_LPAREN') return false + let d = 1; i++ + while (i < FETCH_DEEP_CAP && d > 0) { + const m = fetchDeep(ctx, i)?.name + if (!m) return false + if (m === 'PUNC_LPAREN') d++ + else if (m === 'PUNC_RPAREN') d-- + i++ + } + if (d !== 0) return false + continue + } i = skipLeadingAttributes(ctx, i) - if (i !== typeStart && ctx.t[i]?.name === n) break - // No advance via attributes either — break. 
+ if (i !== typeStart && fetchDeep(ctx, i)?.name === n) break const beforeAttr = i i = skipLeadingAttributes(ctx, i) if (i === beforeAttr) break } if (i === typeStart) return false - // Standalone tagged-type definition: `struct S { … };`. The - // last specifier was a tag-with-body and the next token is the - // declaration terminator. + // Tag-body declarations route to legacy because the grammar's + // struct/enum body parsing produces much more memory than the + // legacy opaque-token absorption — for large csmith translation + // units (~400KB) this blows the heap. Standalone tag references + // (`struct S;`, `enum E;`) and pre-existing simple cases still + // flow via the SEMI check below using ctx.t (no fetchDeep). if (ctx.t[i]?.name === 'PUNC_SEMI') return true // Phase P: parenthesised sub-declarator (function pointer). // Shape: `+ ( * + ID ) ( ? ) ;`. No initializer // for now — initialised forms still flow through the legacy // chomp because val doesn't yet handle every initializer form. if (ctx.t[i]?.name === 'PUNC_LPAREN') { + // Parenthesised compound declarator. Three shapes: + // `int (*p)(int);` — function pointer + // `int (*p)[10];` — pointer to array + // `int (*arr[3])(int);` — array of fn-pointers (inner has + // its own array postfix) let p = i + 1 // Require at least one `*` inside the parens. if (fetchDeep(ctx, p)?.name !== 'PUNC_STAR') return false @@ -2301,58 +2720,91 @@ const grammarRefs: Record = { if (innerName !== 'ID' && innerName !== 'TYPEDEF_NAME' && innerName !== 'MACRO_NAME') return false p++ + // Optional inner array postfix(es): `(*arr[3])`. 
+ while (fetchDeep(ctx, p)?.name === 'PUNC_LBRACKET') { + let bd = 1; p++ + while (p < FETCH_DEEP_CAP && bd > 0) { + const n2 = fetchDeep(ctx, p)?.name + if (!n2) return false + if (n2 === 'PUNC_LBRACKET') bd++ + else if (n2 === 'PUNC_RBRACKET') bd-- + p++ + } + if (bd !== 0) return false + } if (fetchDeep(ctx, p)?.name !== 'PUNC_RPAREN') return false p++ - if (fetchDeep(ctx, p)?.name !== 'PUNC_LPAREN') return false - // Walk past balanced parens of the function postfix. - let depth = 0 - let closed = false - while (p < 64) { - const n2 = fetchDeep(ctx, p)?.name - if (!n2) return false - if (n2 === 'PUNC_LPAREN') depth++ - else if (n2 === 'PUNC_RPAREN') depth-- - if (depth === 0 && n2 !== 'PUNC_LPAREN') { closed = true; break } - p++ + // Trailing postfix: `(...)` (function postfix), `[...]` (array + // postfix), or chain of either. Walk balanced parens / brackets. + const post1 = fetchDeep(ctx, p)?.name + if (post1 !== 'PUNC_LPAREN' && post1 !== 'PUNC_LBRACKET') return false + while (true) { + const start = fetchDeep(ctx, p)?.name + if (start !== 'PUNC_LPAREN' && start !== 'PUNC_LBRACKET') break + const closer = start === 'PUNC_LPAREN' ? 'PUNC_RPAREN' : 'PUNC_RBRACKET' + let depth = 0 + let closed = false + while (p < FETCH_DEEP_CAP) { + const n2 = fetchDeep(ctx, p)?.name + if (!n2) return false + if (n2 === start) depth++ + else if (n2 === closer) depth-- + if (depth === 0 && n2 !== start) { closed = true; break } + p++ + } + if (!closed) return false + p++ // past the closing token } - if (!closed) return false - const post = fetchDeep(ctx, p + 1)?.name + const post = fetchDeep(ctx, p)?.name // Only the simple `;` terminator for now; initializers and - // function bodies (`{`) for fn-pointer types stay on the - // chomp path. + // function bodies for compound declarators stay on legacy. return post === 'PUNC_SEMI' } - // Optional pointer prefix on the first declarator: zero or more `*`. 
- const sawPointer = ctx.t[i]?.name === 'PUNC_STAR' - while (i < 10 && ctx.t[i]?.name === 'PUNC_STAR') i++ - if (ctx.t[i]?.name !== 'ID' && ctx.t[i]?.name !== 'TYPEDEF_NAME' && - ctx.t[i]?.name !== 'MACRO_NAME') return false + // Optional pointer prefix on the first declarator: zero or more + // `*`, each optionally followed by type qualifiers + // (`const`/`volatile`/`restrict`/`_Atomic`) that bind to the + // pointer (`int * const p` is a const pointer to int). + const sawPointer = fetchDeep(ctx, i)?.name === 'PUNC_STAR' + while (i < 64) { + const n = fetchDeep(ctx, i)?.name + if (n === 'PUNC_STAR') { i++; continue } + if (n === 'KW_CONST' || n === 'KW_VOLATILE' || + n === 'KW_RESTRICT' || n === 'KW__ATOMIC') { + if (!sawPointer) break + i++ + continue + } + break + } + const idName = fetchDeep(ctx, i)?.name + if (idName !== 'ID' && idName !== 'TYPEDEF_NAME' && + idName !== 'MACRO_NAME') return false i++ - const after = ctx.t[i]?.name + const after = fetchDeep(ctx, i)?.name if (after !== 'PUNC_SEMI' && after !== 'PUNC_ASSIGN' && after !== 'PUNC_COMMA' && after !== 'PUNC_LBRACKET' && after !== 'PUNC_LPAREN') return false - // Pointer-with-initializer and array-with-initializer expressions - // in csmith bodies routinely include cast expressions - // (`(void*)0`), brace-enclosed initializer lists, and subscript - // chains that val doesn't fully handle yet. Until phase C lifts - // those to val open-alts, dispatch declarator-postfix shapes only - // when there's no initializer. Plain forms flow through the new - // path; initialised forms stay on the chomp. - if (sawPointer && after === 'PUNC_ASSIGN') return false + // Pointer-with-initializer flows through grammar. The + // `initializer` rule dispatches to initializer_list for `{...}` + // and to val for expression initializers. The chained-subscript + // gap that previously kept these on the legacy path is fixed + // (vendored expr.ts: postfix paren-form chaining via p: 'expr' + // in val.close). 
+ void sawPointer if (after === 'PUNC_LBRACKET') { // Walk past consecutive balanced bracket pairs (e.g. `[2][2]`) - // to find what follows. If `=` or we run out of pre-fetched - // lookahead before the brackets close, bail and let the chomp - // path handle it. Plain forms like `int arr[10];` resolve here. + // to find what follows. Both plain `int arr[10];` and array- + // with-initializer `int arr[3] = {…};` / `char buf[] = "…";` + // flow through grammar (val handles string-literal / brace + // initializers and chained subscripts). let j = i while (true) { let depth = 0 let closed = false - while (j < ctx.t.length && j < 32) { - const n2 = ctx.t[j]?.name + while (j < 32) { + const n2 = fetchDeep(ctx, j)?.name if (!n2) return false if (n2 === 'PUNC_LBRACKET') depth++ else if (n2 === 'PUNC_RBRACKET') depth-- @@ -2360,38 +2812,35 @@ const grammarRefs: Record = { j++ } if (!closed) return false - const next = ctx.t[j + 1]?.name + const next = fetchDeep(ctx, j + 1)?.name if (!next) return false - if (next !== 'PUNC_LBRACKET') { - if (next === 'PUNC_ASSIGN') return false - break - } + if (next !== 'PUNC_LBRACKET') break j += 1 } } if (after === 'PUNC_LPAREN') { // Walk past balanced parens looking for the matching `)`. - // Accept `;` (function declaration, phase B3.1) or `{` - // (function definition, phase B3.3) — for the `{` form, - // additionally validate that the body contains only block - // items the new grammar can structure (expression-stmts, - // jump-stmts, simple declarations, nested blocks). Bodies - // with if/while/for/switch/labeled/asm/preprocessor lines - // fall back to the legacy chomp until phase B4.2.2+ extends - // statement coverage. + // Accept `;` (function declaration) or `{` (function + // definition) — for the `{` form, additionally validate that + // the body contains only block items the grammar can + // structure. 
Use fetchDeep so we can see beyond the dispatch + // lookahead window (which is only 6 tokens for the longest + // wildcard alt). let depth = 0 let j = i let closed = false - while (j < ctx.t.length) { - const n2 = ctx.t[j]?.name - if (!n2) return false + const SAFETY = 4096 + while (j < i + SAFETY) { + const t = fetchDeep(ctx, j) + const n2 = t?.name + if (!n2 || n2 === '#ZZ') return false if (n2 === 'PUNC_LPAREN') depth++ else if (n2 === 'PUNC_RPAREN') depth-- if (depth === 0 && n2 !== 'PUNC_LPAREN') { closed = true; break } j++ } if (!closed) return false - const post = ctx.t[j + 1]?.name + const post = fetchDeep(ctx, j + 1)?.name if (post !== 'PUNC_SEMI' && post !== 'PUNC_LBRACE') return false if (post === 'PUNC_LBRACE') { if (!isFunctionBodySupported(ctx, j + 1)) return false @@ -2452,6 +2901,25 @@ const grammarRefs: Record = { rule.node.declKind = 'declaration' rule.u.specs = makeNode('declaration_specifiers') rule.u.idl = makeNode('init_declarator_list') + // Strip stale per-rule k state that may have leaked into our k + // via the shallow-copy-on-push that jsonic does at every rule + // transition. Without this, e.g. enumerator_list's k.elNode + // from a prior declaration's enum-body bleeds into the current + // declaration's struct/enum and the re-entry guards misfire. + delete rule.k.ssNode; delete rule.k.ssKwTaken + delete rule.k.ssTagTaken; delete rule.k.ssBodyTaken + delete rule.k.esNode; delete rule.k.esKwTaken + delete rule.k.esTagTaken; delete rule.k.esUtypeTaken + delete rule.k.esBodyTaken; delete rule.k.esUtypeAttached + delete rule.k.elNode; delete rule.k.elOpened + delete rule.k.takenEnums + delete rule.k.mdlNode; delete rule.k.mdlOpened + delete rule.k.takenSecs; delete rule.k.takenItems + delete rule.k.ilNode; delete rule.k.ilOpened + delete rule.k.iiNode; delete rule.k.hasDesig; delete rule.k.tookEq + delete rule.k.declarator; delete rule.k.directDeclarator + delete rule.k.lastPointer + clearStmtState(rule) }, // Phase B2.3+B2.4 actions. 
simple_declaration's open descends into @@ -2470,6 +2938,36 @@ const grammarRefs: Record = { pushTokenWithTrivia(owner.u.specs, tkn) }, + // ---- bit_int_paren refs (C23 _BitInt(N)) ---- + // The keyword has already been pushed onto the parent spec_loop's + // u.specs by @absorb-spec-type. This rule appends the `(`, the + // width expression, and the `)` to the same specs node so the + // `_BitInt(8)` triple sits as adjacent children of the + // declaration_specifiers, matching the legacy CST shape. + '@bip-take-lparen': (rule: Rule): void => { + const owner = specOwner(rule) + if (owner && owner.u && owner.u.specs) { + pushTokenWithTrivia(owner.u.specs, rule.o0 as Token) + } + }, + '@bip-mark-val': (_rule: Rule): void => { /* see -bc */ }, + '@bit_int_paren-bc': (rule: Rule): void => { + if (rule.k.bipValAttached) return + if (rule.child && rule.child.name === 'val' && rule.child.node) { + const owner = specOwner(rule) + if (owner && owner.u && owner.u.specs) { + owner.u.specs.children.push(rule.child.node) + rule.k.bipValAttached = true + } + } + }, + '@bip-take-rparen': (rule: Rule): void => { + const owner = specOwner(rule) + if (owner && owner.u && owner.u.specs) { + pushTokenWithTrivia(owner.u.specs, rule.c0 as Token) + } + }, + // Capture the comma between declarators onto the init_declarator_list. 
'@simple-decl-take-comma': (rule: Rule): void => { pushTokenWithTrivia(rule.u.idl, rule.c0 as Token) @@ -2514,11 +3012,11 @@ const grammarRefs: Record = { pushTokenWithTrivia(rule.k.directDeclarator, lparen) rule.k.declarator.children.push(rule.k.directDeclarator) rule.node.children.push(rule.k.declarator) - rule.k.parenPending = true + rule.k.idclParenPending = true }, '@idecl-paren-pending': (rule: Rule): boolean => - rule.k.parenPending === true && rule.k.parenClosed !== true, + rule.k.idclParenPending === true && rule.k.parenClosed !== true, // Consume the matching `)` after paren_inner_declarator returns, // append it to the outer direct_declarator, and latch named so @@ -2586,6 +3084,20 @@ const grammarRefs: Record = { const ptr = makeNode('pointer') pushTokenWithTrivia(ptr, rule.o0 as Token) owner.k.declarator.children.push(ptr) + // Stash the just-built pointer node so the pointer_qualifier_loop + // sub-rule (descended via p: in this same alt) can append + // qualifier tokens to it. + rule.k.lastPointer = ptr + }, + + // Append a type-qualifier token to the parent pointer_list's most + // recently-pushed pointer node. C qualifiers following `*` qualify + // the pointer itself, not the pointee — `int * const p` is a const + // pointer to int. + '@absorb-pq-const': (rule: Rule): void => { + const owner = rule.parent as Rule // pointer_list + const ptr = owner?.k?.lastPointer + if (ptr) pushTokenWithTrivia(ptr, rule.o0 as Token) }, // ---- array_postfix refs ---- @@ -2602,11 +3114,15 @@ const grammarRefs: Record = { '@arr-close': (rule: Rule): void => { pushTokenWithTrivia(rule.node, rule.c0 as Token) - // Attach this postfix to the parent init_declarator's - // direct_declarator children. The init_declarator stores its - // scaffolding on k (not u) because r:-recursion preserves k. + // Attach the array_postfix node onto the parent's declarator + // shell. init_declarator uses k.directDeclarator; + // parameter_declaration uses k.declarator. 
Either way, append. const owner = rule.parent as Rule - owner.k.directDeclarator.children.push(rule.node) + if (owner.k.directDeclarator) { + owner.k.directDeclarator.children.push(rule.node) + } else if (owner.k.declarator) { + owner.k.declarator.children.push(rule.node) + } }, // bc: when val just produced a size expression, splice it into the @@ -2656,6 +3172,59 @@ const grammarRefs: Record = { pushTokenWithTrivia(rule.parent.k.ptl, rule.c0 as Token) }, + // ---- identifier_list refs (K&R-style prototype) ---- + + // bo: build the identifier_list CST scaffold. Guard against + // r:-recursion (each comma re-enters the rule); jsonic passes + // rule.node as the 3rd arg of makeRule so the existing children + // carry forward — we just must avoid replacing the node. + '@identifier_list-bo': (rule: Rule): void => { + if (rule.node && rule.node.kind === 'identifier_list') return + rule.node = makeNode('identifier_list') + }, + + // Take an ID token and append it as a child of identifier_list. + // Fires for both the first ID (open alt) and subsequent IDs after + // a comma (close alt's `r:`-recursion re-fires this on re-entry). + '@idlist-take': (rule: Rule): void => { + const tkn = (rule.state === 'o' ? rule.o0 : rule.c0) as Token + pushTokenWithTrivia(rule.node, tkn) + }, + + // Append the comma between identifier_list items as a token-ref + // child so the original source order is preserved. + '@idlist-comma': (rule: Rule): void => { + pushTokenWithTrivia(rule.node, rule.c0 as Token) + }, + + // Attach the completed identifier_list onto the parent + // function_postfix's CST. Mirrors @ptl-attach-and-end's role for + // typed parameter lists. + '@idlist-attach': (rule: Rule): void => { + const fn = rule.parent as Rule + if (fn.node && rule.node) { + fn.node.children.push(rule.node) + } + }, + + // Variadic terminator `, ...`. 
Build a parameter_variadic CST + // node, push it onto the parameter_type_list, tag the ptl as + // variadic, AND attach the ptl to the function_postfix — + // because this alt completes the rule without falling through + // to @ptl-attach-and-end, we need to do the attach here. + '@ptl-take-ellipsis': (rule: Rule): void => { + const fn = rule.parent as Rule + const ptl = fn.k.ptl + pushTokenWithTrivia(ptl, rule.c0 as Token) + const pv = makeNode('parameter_variadic') + pushTokenWithTrivia(pv, rule.c1 as Token) + ptl.children.push(pv) + ptl.variadic = true + if (ptl.children.length > 0) { + fn.node.children.push(ptl) + } + }, + // ---- parameter_declaration refs ---- '@parameter_declaration-bo': (rule: Rule): void => { @@ -2723,6 +3292,92 @@ const grammarRefs: Record = { rule.k.declarator.children.push(ptr) }, + // ---- parenthesised abstract / named parameter declarator ---- + + // Open `(` of a parenthesised parameter declarator, e.g. the + // outer `(` in `int (*)(int)` or `int (*name)(int)`. Capture + // the LPAREN onto the parameter's declarator, mark the rule so + // re-entry via `r:` knows we're inside a paren-pending state, + // and seed the inner declarator scaffold. + '@param-paren-open': (rule: Rule): void => { + if (!rule.k.declarator) { + rule.k.declarator = makeNode('declarator') + } + pushTokenWithTrivia(rule.k.declarator, rule.c0 as Token) + rule.k.paramParenPending = true + // The inner declarator is attached to rule.k.declarator by + // param_paren_inner's actions; the outer rule's close will + // append the closing `)` and drop the pending flag. + }, + + // True between @param-paren-open and the matching `)`. Lets the + // PUNC_RPAREN close-alt fire only when we actually opened a + // paren-form declarator (not the function_postfix closer). + '@param-paren-pending': (rule: Rule): boolean => + rule.k.paramParenPending === true, + + // True before any paren-form declarator has been absorbed. 
Once + // we've opened a paren-form, subsequent `(` should be treated as + // a function postfix, not as a fresh paren-form. + '@param-can-paren-form': (rule: Rule): boolean => + rule.k.paramParenDone !== true && rule.k.paramParenPending !== true, + + // Match the outer `)` of a paren-form parameter declarator and + // append it onto rule.k.declarator. The rule re-enters via `r:` + // so the next close-state can take a trailing function-postfix + // / array-postfix / nothing. + '@param-paren-close': (rule: Rule): void => { + if (!rule.k.declarator) return + pushTokenWithTrivia(rule.k.declarator, rule.c0 as Token) + rule.k.paramParenPending = false + rule.k.paramParenDone = true + }, + + // Cond for the function-postfix-after-paren alt: a `(` here is a + // function postfix only when we just closed a paren-form + // declarator AND haven't already absorbed a function postfix. + '@param-need-fn-postfix': (rule: Rule): boolean => + rule.k.paramParenDone === true && rule.k.paramFnPostfixDone !== true, + + // ---- param_paren_inner refs ---- + + // True when an inner ID has already been captured; gates the + // re-entry no-op alt at the top of param_paren_inner's open. + '@ppi-named': (rule: Rule): boolean => + rule.k.ppiNamed === true, + + // Pointer prefix INSIDE the parenthesised inner declarator. Each + // `*` becomes a pointer node attached to the OUTER (parameter + // declaration's) declarator, so the abstract `(*)` produces a + // pointer-on-declarator without needing a separate inner node. + '@ppi-pointer': (rule: Rule): void => { + const owner = rule.parent as Rule // parameter_declaration + if (!owner.k.declarator) { + owner.k.declarator = makeNode('declarator') + } + const ptr = makeNode('pointer') + pushTokenWithTrivia(ptr, rule.c0 as Token) + owner.k.declarator.children.push(ptr) + }, + + // Capture the optional ID inside the parenthesised inner + // declarator (e.g. the `fn` in `int (*fn)(int)`). 
The ID belongs + // to the OUTER parameter's declarator and tagged as declaredName. + '@ppi-name': (rule: Rule): void => { + const owner = rule.parent as Rule // parameter_declaration + const idTkn = (rule.state === 'o' ? rule.o0 : rule.c0) as Token + if (!owner.k.declarator) { + owner.k.declarator = makeNode('declarator') + } + const dd = makeNode('direct_declarator') + pushTokenWithTrivia(dd, idTkn) + dd.declaredName = idTkn.src + owner.k.declarator.children.push(dd) + owner.k.declarator.declaredName = idTkn.src + owner.node.declaredName = idTkn.src + rule.k.ppiNamed = true + }, + '@parameter_declaration-bc': (rule: Rule): void => { // Attach specs once (after param_spec_loop returned). Attach // declarator separately, after pointer-prefix loop and ID @@ -2757,18 +3412,37 @@ const grammarRefs: Record = { rule.u.hasInit = true }, - // bc on init_declarator: if val supplied an initializer, splice it - // into the node's children with the `=` token preceding it. + // bc on init_declarator: if the initializer rule supplied a child, + // splice it into the node's children with the `=` token preceding it. + // The initializer rule wraps both brace-init and val-expr forms in + // an `initializer` node so we just push it directly. '@init_declarator-bc': (rule: Rule): void => { if (rule.u.hasInit && rule.child && rule.child.node) { - const initNode = makeNode('initializer') - initNode.children.push(rule.child.node) + let initNode = rule.child.node + // Pre-Q2.1 fallthrough: if a non-initializer child still surfaces + // here (e.g. dispatched directly to val), wrap it for the legacy + // CST shape compatibility. 
+ if (initNode.kind !== 'initializer') { + const wrapped = makeNode('initializer') + wrapped.children.push(initNode) + initNode = wrapped + } for (const tr of rule.u.eqTrivia || []) rule.node.children.push(tr) rule.node.children.push(rule.u.eqTokenRef) rule.node.children.push(initNode) } }, + // ---- initializer wrapper rule (phase Q2.1) ---- + '@initializer-bo': (rule: Rule): void => { + rule.node = makeNode('initializer') + }, + '@initializer-bc': (rule: Rule): void => { + if (rule.child && rule.child.node) { + rule.node.children.push(rule.child.node) + } + }, + // bc on simple_declaration: when an init_declarator sub-rule has // just completed, push its node onto the declaration's idl list // and remember its declared name so the typedef finaliser can @@ -3016,8 +3690,20 @@ const grammarRefs: Record = { // ---- if_statement (phase B4.2.2) --------------------------------- '@if_statement-bo': (rule: Rule): void => { - if (rule.node && rule.node.kind === 'if_statement') return + // Check r:-recursion via prev.name (NOT rule.node.kind, which + // is the inherited parent node and would mis-trigger for nested + // statements). On a fresh instance, build a new node and strip + // stale took*/elseSeen state inherited from a parent control- + // flow rule (took* keys are shared across if/while/do/for/ + // switch and would otherwise misfire when an `if` is nested + // inside a `for` body or vice versa). 
+ if (rule.prev && rule.prev.name === rule.name && rule.k.ifNode) { + rule.node = rule.k.ifNode + return + } rule.node = makeNode('if_statement') + rule.k.ifNode = rule.node + clearStmtState(rule) }, '@if-take-keyword': (rule: Rule): void => { pushTokenWithTrivia(rule.node, rule.o0 as Token) @@ -3053,8 +3739,13 @@ const grammarRefs: Record = { // ---- while_statement (phase B4.2.2) ------------------------------ '@while_statement-bo': (rule: Rule): void => { - if (rule.node && rule.node.kind === 'while_statement') return + if (rule.prev && rule.prev.name === rule.name && rule.k.whileNode) { + rule.node = rule.k.whileNode + return + } rule.node = makeNode('while_statement') + rule.k.whileNode = rule.node + clearStmtState(rule) }, '@while-take-keyword': (rule: Rule): void => { pushTokenWithTrivia(rule.node, rule.o0 as Token) @@ -3077,8 +3768,13 @@ const grammarRefs: Record = { // ---- do_statement (phase B4.2.2) --------------------------------- '@do_statement-bo': (rule: Rule): void => { - if (rule.node && rule.node.kind === 'do_statement') return + if (rule.prev && rule.prev.name === rule.name && rule.k.doNode) { + rule.node = rule.k.doNode + return + } rule.node = makeNode('do_statement') + rule.k.doNode = rule.node + clearStmtState(rule) }, '@do-take-keyword': (rule: Rule): void => { pushTokenWithTrivia(rule.node, rule.o0 as Token) @@ -3113,8 +3809,13 @@ const grammarRefs: Record = { // ---- switch_statement (phase B4.2.2) ----------------------------- '@switch_statement-bo': (rule: Rule): void => { - if (rule.node && rule.node.kind === 'switch_statement') return + if (rule.prev && rule.prev.name === rule.name && rule.k.switchNode) { + rule.node = rule.k.switchNode + return + } rule.node = makeNode('switch_statement') + rule.k.switchNode = rule.node + clearStmtState(rule) }, '@switch-take-keyword': (rule: Rule): void => { pushTokenWithTrivia(rule.node, rule.o0 as Token) @@ -3137,8 +3838,13 @@ const grammarRefs: Record = { // ---- for_statement family (phase 
B4.2.3) ------------------------- '@for_statement-bo': (rule: Rule): void => { - if (rule.node && rule.node.kind === 'for_statement') return + if (rule.prev && rule.prev.name === rule.name && rule.k.forNode) { + rule.node = rule.k.forNode + return + } rule.node = makeNode('for_statement') + rule.k.forNode = rule.node + clearStmtState(rule) }, '@for-take-keyword': (rule: Rule): void => { pushTokenWithTrivia(rule.node, rule.o0 as Token) @@ -3160,8 +3866,13 @@ const grammarRefs: Record = { }, '@for_controls-bo': (rule: Rule): void => { - if (rule.node && rule.node.kind === 'for_controls') return + if (rule.prev && rule.prev.name === rule.name && rule.k.fcNode) { + rule.node = rule.k.fcNode + return + } rule.node = makeNode('for_controls') + rule.k.fcNode = rule.node + clearStmtState(rule) }, '@fc-open': (rule: Rule): void => { pushTokenWithTrivia(rule.node, rule.o0 as Token) @@ -4332,6 +5043,10 @@ const grammarRefs: Record = { // ---- enumerator_list (phase F.4) --------------------------------- '@enumerator_list-bo': (rule: Rule): void => { + if ((globalThis as any).Q22_DEBUG) { + const keys = Object.keys(rule.k).filter(k => k !== 'tokens') + console.error('EL-BO', { keys, parent: rule.parent?.name, prev: rule.prev?.name, hasElNode: 'elNode' in rule.k }) + } const prev = (rule as any).prev const isRecursion = prev && prev.name === rule.name if (isRecursion && rule.k.elNode) { @@ -4344,16 +5059,23 @@ const grammarRefs: Record = { }, '@el-reentered': (rule: Rule): boolean => rule.k.elOpened === true, '@el-take-lbrace': (rule: Rule): void => { + if ((globalThis as any).Q22_DEBUG) console.error('EL-LBRACE') pushTokenWithTrivia(rule.node, rule.o0 as Token) rule.k.elOpened = true }, '@el-take-rbrace': (rule: Rule): void => { + if ((globalThis as any).Q22_DEBUG) console.error('EL-RBRACE') pushTokenWithTrivia(rule.node, rule.c0 as Token) }, '@el-take-comma': (rule: Rule): void => { pushTokenWithTrivia(rule.node, rule.c0 as Token) }, - '@enumerator_list-bc': (rule: Rule): 
void => { + '@enumerator_list-bc': (rule: Rule, ctx: Context): void => { + if ((globalThis as any).Q22_DEBUG) { + const t0 = (ctx as any).t?.[0]?.name + const t1 = (ctx as any).t?.[1]?.name + console.error('EL-BC', { childName: rule.child?.name, hasNode: !!rule.child?.node, t0, t1 }) + } if (rule.child && rule.child.name === 'enumerator' && rule.child.node && !rule.k.takenEnums?.has(rule.child)) { rule.node.children.push(rule.child.node) @@ -4364,6 +5086,10 @@ const grammarRefs: Record = { // ---- enumerator (phase F.4) -------------------------------------- '@enumerator-bo': (rule: Rule): void => { + if ((globalThis as any).Q22_DEBUG) { + const ctx = (rule as any).ctx || (rule as any)._ctx + console.error('ENR-BO ctxT0:', ctx?.t0?.name) + } const prev = (rule as any).prev const isRecursion = prev && prev.name === rule.name if (isRecursion && rule.k.enrNode) { @@ -4396,6 +5122,18 @@ const grammarRefs: Record = { rule.node.children.push(init) rule.k.enrValueAttached = true } + // C23 attribute on enumerator (e.g. `A [[deprecated]] = 1`). + // Append the returned attribute_spec_c23 onto the enumerator + // node so consumers can find it via findKind. Track per-child + // to avoid double-attaching when r:'enumerator' re-fires bc. + if (rule.child && rule.child.name === 'attribute_spec_c23' && + rule.child.node) { + if (!rule.k.enrAttrTaken) rule.k.enrAttrTaken = new Set() + if (!rule.k.enrAttrTaken.has(rule.child)) { + rule.node.children.push(rule.child.node) + rule.k.enrAttrTaken.add(rule.child) + } + } }, // ---- attribute_spec_gcc (phase G.2) ------------------------------ @@ -4600,11 +5338,11 @@ const grammarRefs: Record = { rule.node.attributeName = tkn.src rule.k.aiNameTaken = true }, - '@ai-need-colon-1': (rule: Rule, ctx: Context): boolean => { - if (!rule.k.aiNameTaken || rule.k.aiPrefixed) return false - // Only enter the `::` namespace path when the next token is also `:`. 
- return ctx.t[0]?.name === 'PUNC_COLON' && - ctx.t[1]?.name === 'PUNC_COLON' + '@ai-need-colon-1': (rule: Rule): boolean => { + // The alt's `s: 'PUNC_COLON PUNC_COLON'` ensures both colons + // are physically present (parse_alts force-fetches both); the + // cond just gates by rule state. + return !!rule.k.aiNameTaken && !rule.k.aiPrefixed }, '@ai-take-colon-1': (rule: Rule): void => { pushTokenWithTrivia(rule.node, rule.c0 as Token) @@ -4693,22 +5431,28 @@ const grammarRefs: Record = { } } }, - '@ppd-is-define': (_rule: Rule, ctx: Context): boolean => - ctx.t[1]?.src === 'define', - '@ppd-is-undef': (_rule: Rule, ctx: Context): boolean => - ctx.t[1]?.src === 'undef', - '@ppd-is-include': (_rule: Rule, ctx: Context): boolean => { - const s = ctx.t[1]?.src + // The preprocessor_directive open alts use a 2-token `s:` pattern + // (`PP_HASH #ANY_C_TOKEN`); these conds inspect rule.o1 (the + // second matched token — the directive-name ID lexed from the + // line body). Reading rule.o* instead of ctx.t[*] avoids the + // NOTOKEN-in-place gap that bites callers using ctx.t for + // lookahead past the parser's consume-and-shift. 
+ '@ppd-is-define': (rule: Rule): boolean => + rule.o1?.src === 'define', + '@ppd-is-undef': (rule: Rule): boolean => + rule.o1?.src === 'undef', + '@ppd-is-include': (rule: Rule): boolean => { + const s = rule.o1?.src return s === 'include' || s === 'include_next' || s === 'embed' }, - '@ppd-is-conditional': (_rule: Rule, ctx: Context): boolean => { - const s = ctx.t[1]?.src + '@ppd-is-conditional': (rule: Rule): boolean => { + const s = rule.o1?.src return s === 'if' || s === 'ifdef' || s === 'ifndef' || s === 'elif' || s === 'elifdef' || s === 'elifndef' || s === 'else' || s === 'endif' }, - '@ppd-is-simple': (_rule: Rule, ctx: Context): boolean => { - const s = ctx.t[1]?.src + '@ppd-is-simple': (rule: Rule): boolean => { + const s = rule.o1?.src return s === 'pragma' || s === 'error' || s === 'warning' || s === 'line' }, @@ -5108,9 +5852,26 @@ const grammarRefs: Record = { } }, } +} // Push a token-ref onto `node`, prefixed with any preserved trivia // (comments, line continuations) the sub-lex hook stashed on +// Strip stale took*/elseSeen control-flow tracking keys from a +// rule's k. Each control-flow rule (if/while/do/for/switch/ +// for_controls) uses the SAME generic key names (tookCond, +// tookBody, tookThen, etc.), so the shallow-copy-on-push that +// jsonic does at every rule transition leaks state between nested +// statements — `if (1) for (;;) ;` would inherit if's tookCond +// into for_controls and bypass the cond-fetching alts. +function clearStmtState(rule: Rule): void { + delete rule.k.tookCond; delete rule.k.tookBody + delete rule.k.tookThen; delete rule.k.elseSeen + delete rule.k.tookElse + delete rule.k.tookWhile; delete rule.k.tookSemi + delete rule.k.tookInit; delete rule.k.tookIter + delete rule.k.tookControls +} + // tkn.use.leading. Mirrors the chomp's @absorb-token logic so the // new-path CST carries the same source-order trivia siblings. 
function pushTokenWithTrivia(node: CNode, tkn: Token): void { @@ -5132,7 +5893,9 @@ function attachFunctionPostfix(rule: Rule): void { // scaffolding, regardless of whether the action is firing on // simple_declaration itself or on its spec_loop child. function specOwner(rule: Rule): Rule { - return rule.name === 'simple_declaration' ? rule : (rule.parent as Rule) + if (rule.name === 'simple_declaration' || + rule.name === 'struct_declaration') return rule + return rule.parent as Rule } // Token-name sets used by @looks-simple-decl. Mirror the SIMPLE_TYPE_HEAD @@ -5159,6 +5922,7 @@ const storagePrefixSet = new Set([ 'KW__THREAD_LOCAL', 'KW_THREAD_LOCAL', 'KW_CONSTEXPR', 'KW___THREAD', 'KW_INLINE', 'KW___INLINE__', 'KW___INLINE', + 'KW___EXTENSION__', ]) function leadingTriviaRefs(tkn: Token): CTokenRef[] { diff --git a/test/c.test.ts b/test/c.test.ts index b1b9a00..3e34442 100644 --- a/test/c.test.ts +++ b/test/c.test.ts @@ -8,7 +8,11 @@ import { join } from 'node:path' import { Jsonic } from 'jsonic' import { C } from '../dist/c.js' -const j = Jsonic.make().use(C, {}) +// Most tests in this file exercise extension constructs (preprocessor, +// GCC __attribute__, MSVC __declspec, inline asm, etc.), so the shared +// instance enables them via { extended: true }. New plain-C-only tests +// should construct a separate instance with the default (no opt-in). +const j = Jsonic.make().use(C, { extended: true }) // Walk a CST node depth-first and yield its token-refs in source order. function walkTokens(node: any): any[] { @@ -1342,7 +1346,7 @@ describe('c parser smoke', () => { // typed (not as identifier names). // The parser does this via a typedef directive in the actual .h // file; for this synthetic test we inject it inline. 
- const j2 = (Jsonic.make().use(C, {}) as any) + const j2 = (Jsonic.make().use(C, { extended: true }) as any) const out = j2(`typedef struct { int x; int y; int z; } vec_t;\n${src}`) // After the synthetic typedef, our source begins with #include then // three function definitions. @@ -1436,7 +1440,10 @@ describe('path-dispatch spec', () => { ? `${row.path}: ${row.src} (${row.notes})` : `${row.path}: ${row.src}` test(tag, () => { - const parser = Jsonic.make().use(C) + // path-dispatch rows include extension shapes (preprocessor, + // GCC __attribute__, asm). Construct the parser with extensions + // enabled so those rows assert the correct dispatch path. + const parser = Jsonic.make().use(C, { extended: true }) const out = parser(row.src) const decls = (out.children || []).filter( (c: any) => c.kind === 'external_declaration', diff --git a/test/csmith-common.ts b/test/csmith-common.ts index f44c39a..4ec304c 100644 --- a/test/csmith-common.ts +++ b/test/csmith-common.ts @@ -41,8 +41,10 @@ export function fixturePath(seed: number): string { } // Parse a csmith-shaped source with stdint typedef-names pre-registered. +// csmith generates GCC-flavoured C with #include / __attribute__ etc., +// so the parser is constructed with `extended: true`. 
export function parseCsmithSource(src: string): any { - const j = Jsonic.make().use(C) + const j = Jsonic.make().use(C, { extended: true }) const cmeta = makeCMeta() for (const n of STDINT_TYPEDEFS) cmeta.symbols.bindTypedef(n) return j(src, { cmeta }) diff --git a/test/csmith-fixtures/seed-001.json.gz b/test/csmith-fixtures/seed-001.json.gz index dca1a77..5834c0e 100644 Binary files a/test/csmith-fixtures/seed-001.json.gz and b/test/csmith-fixtures/seed-001.json.gz differ diff --git a/test/csmith-fixtures/seed-002.json.gz b/test/csmith-fixtures/seed-002.json.gz index 8ea1458..48a7157 100644 Binary files a/test/csmith-fixtures/seed-002.json.gz and b/test/csmith-fixtures/seed-002.json.gz differ diff --git a/test/csmith-fixtures/seed-003.json.gz b/test/csmith-fixtures/seed-003.json.gz index cad1009..2350540 100644 Binary files a/test/csmith-fixtures/seed-003.json.gz and b/test/csmith-fixtures/seed-003.json.gz differ diff --git a/test/csmith-fixtures/seed-004.json.gz b/test/csmith-fixtures/seed-004.json.gz index 15dff5a..538e7b4 100644 Binary files a/test/csmith-fixtures/seed-004.json.gz and b/test/csmith-fixtures/seed-004.json.gz differ diff --git a/test/csmith-fixtures/seed-005.json.gz b/test/csmith-fixtures/seed-005.json.gz index fb23d3a..dc5cbae 100644 Binary files a/test/csmith-fixtures/seed-005.json.gz and b/test/csmith-fixtures/seed-005.json.gz differ diff --git a/test/csmith-fixtures/seed-006.json.gz b/test/csmith-fixtures/seed-006.json.gz index 7c6c58a..313f853 100644 Binary files a/test/csmith-fixtures/seed-006.json.gz and b/test/csmith-fixtures/seed-006.json.gz differ diff --git a/test/csmith-fixtures/seed-007.json.gz b/test/csmith-fixtures/seed-007.json.gz index 5172ccd..b57bdfe 100644 Binary files a/test/csmith-fixtures/seed-007.json.gz and b/test/csmith-fixtures/seed-007.json.gz differ diff --git a/test/csmith-fixtures/seed-008.json.gz b/test/csmith-fixtures/seed-008.json.gz index 30c47cd..683e838 100644 Binary files 
a/test/csmith-fixtures/seed-008.json.gz and b/test/csmith-fixtures/seed-008.json.gz differ diff --git a/test/csmith-fixtures/seed-009.json.gz b/test/csmith-fixtures/seed-009.json.gz index d4ec47f..55bb2b9 100644 Binary files a/test/csmith-fixtures/seed-009.json.gz and b/test/csmith-fixtures/seed-009.json.gz differ diff --git a/test/csmith-fixtures/seed-010.json.gz b/test/csmith-fixtures/seed-010.json.gz index 73d2d25..d87343a 100644 Binary files a/test/csmith-fixtures/seed-010.json.gz and b/test/csmith-fixtures/seed-010.json.gz differ diff --git a/test/csmith-fixtures/seed-011.json.gz b/test/csmith-fixtures/seed-011.json.gz index fa36afb..36e1a6d 100644 Binary files a/test/csmith-fixtures/seed-011.json.gz and b/test/csmith-fixtures/seed-011.json.gz differ diff --git a/test/csmith-fixtures/seed-012.json.gz b/test/csmith-fixtures/seed-012.json.gz index c91995a..39310b7 100644 Binary files a/test/csmith-fixtures/seed-012.json.gz and b/test/csmith-fixtures/seed-012.json.gz differ diff --git a/test/csmith-fixtures/seed-015.json.gz b/test/csmith-fixtures/seed-015.json.gz index f6e4ed0..c9918c8 100644 Binary files a/test/csmith-fixtures/seed-015.json.gz and b/test/csmith-fixtures/seed-015.json.gz differ diff --git a/test/csmith-fixtures/seed-016.json.gz b/test/csmith-fixtures/seed-016.json.gz index 58e7c2f..97fe948 100644 Binary files a/test/csmith-fixtures/seed-016.json.gz and b/test/csmith-fixtures/seed-016.json.gz differ diff --git a/test/csmith-fixtures/seed-018.json.gz b/test/csmith-fixtures/seed-018.json.gz index d301e8d..31e0d9f 100644 Binary files a/test/csmith-fixtures/seed-018.json.gz and b/test/csmith-fixtures/seed-018.json.gz differ diff --git a/test/csmith-fixtures/seed-019.json.gz b/test/csmith-fixtures/seed-019.json.gz index e277b86..8e1a793 100644 Binary files a/test/csmith-fixtures/seed-019.json.gz and b/test/csmith-fixtures/seed-019.json.gz differ diff --git a/test/csmith-fixtures/seed-020.json.gz b/test/csmith-fixtures/seed-020.json.gz index 
85111f1..2a29b70 100644 Binary files a/test/csmith-fixtures/seed-020.json.gz and b/test/csmith-fixtures/seed-020.json.gz differ diff --git a/test/csmith-fixtures/seed-022.json.gz b/test/csmith-fixtures/seed-022.json.gz index e2e10a5..d1cc3b9 100644 Binary files a/test/csmith-fixtures/seed-022.json.gz and b/test/csmith-fixtures/seed-022.json.gz differ diff --git a/test/csmith-fixtures/seed-023.json.gz b/test/csmith-fixtures/seed-023.json.gz index dd3e684..21f77cf 100644 Binary files a/test/csmith-fixtures/seed-023.json.gz and b/test/csmith-fixtures/seed-023.json.gz differ diff --git a/test/csmith-fixtures/seed-024.json.gz b/test/csmith-fixtures/seed-024.json.gz index 4dcc2fa..59c0501 100644 Binary files a/test/csmith-fixtures/seed-024.json.gz and b/test/csmith-fixtures/seed-024.json.gz differ diff --git a/test/csmith-fixtures/seed-025.json.gz b/test/csmith-fixtures/seed-025.json.gz index 949a23f..a078b56 100644 Binary files a/test/csmith-fixtures/seed-025.json.gz and b/test/csmith-fixtures/seed-025.json.gz differ diff --git a/test/csmith-fixtures/seed-026.json.gz b/test/csmith-fixtures/seed-026.json.gz index a218db6..ca94a17 100644 Binary files a/test/csmith-fixtures/seed-026.json.gz and b/test/csmith-fixtures/seed-026.json.gz differ diff --git a/test/csmith-fixtures/seed-027.json.gz b/test/csmith-fixtures/seed-027.json.gz index 23a4cef..721292b 100644 Binary files a/test/csmith-fixtures/seed-027.json.gz and b/test/csmith-fixtures/seed-027.json.gz differ diff --git a/test/csmith-fixtures/seed-028.json.gz b/test/csmith-fixtures/seed-028.json.gz index 6962eb9..3e6c136 100644 Binary files a/test/csmith-fixtures/seed-028.json.gz and b/test/csmith-fixtures/seed-028.json.gz differ diff --git a/test/csmith-fixtures/seed-029.json.gz b/test/csmith-fixtures/seed-029.json.gz index 7466f71..30d517f 100644 Binary files a/test/csmith-fixtures/seed-029.json.gz and b/test/csmith-fixtures/seed-029.json.gz differ diff --git a/test/csmith-fixtures/seed-030.json.gz 
b/test/csmith-fixtures/seed-030.json.gz index 7fc9a5a..fd45d1a 100644 Binary files a/test/csmith-fixtures/seed-030.json.gz and b/test/csmith-fixtures/seed-030.json.gz differ diff --git a/test/csmith-fixtures/seed-031.json.gz b/test/csmith-fixtures/seed-031.json.gz index 42abf44..964fcf0 100644 Binary files a/test/csmith-fixtures/seed-031.json.gz and b/test/csmith-fixtures/seed-031.json.gz differ diff --git a/test/csmith-fixtures/seed-032.json.gz b/test/csmith-fixtures/seed-032.json.gz index b8137f3..41718d6 100644 Binary files a/test/csmith-fixtures/seed-032.json.gz and b/test/csmith-fixtures/seed-032.json.gz differ diff --git a/test/csmith-fixtures/seed-033.json.gz b/test/csmith-fixtures/seed-033.json.gz index 5751f82..69a8cf5 100644 Binary files a/test/csmith-fixtures/seed-033.json.gz and b/test/csmith-fixtures/seed-033.json.gz differ diff --git a/test/csmith-fixtures/seed-035.json.gz b/test/csmith-fixtures/seed-035.json.gz index 06e7bed..4c7dbbb 100644 Binary files a/test/csmith-fixtures/seed-035.json.gz and b/test/csmith-fixtures/seed-035.json.gz differ diff --git a/test/csmith-fixtures/seed-036.json.gz b/test/csmith-fixtures/seed-036.json.gz index 8964a94..5809bae 100644 Binary files a/test/csmith-fixtures/seed-036.json.gz and b/test/csmith-fixtures/seed-036.json.gz differ diff --git a/test/csmith-fixtures/seed-037.json.gz b/test/csmith-fixtures/seed-037.json.gz index f83588a..b24d0c2 100644 Binary files a/test/csmith-fixtures/seed-037.json.gz and b/test/csmith-fixtures/seed-037.json.gz differ diff --git a/test/csmith-fixtures/seed-038.json.gz b/test/csmith-fixtures/seed-038.json.gz index 90942e8..c821ef2 100644 Binary files a/test/csmith-fixtures/seed-038.json.gz and b/test/csmith-fixtures/seed-038.json.gz differ diff --git a/test/csmith-fixtures/seed-039.json.gz b/test/csmith-fixtures/seed-039.json.gz index 620c29a..b7d0b7f 100644 Binary files a/test/csmith-fixtures/seed-039.json.gz and b/test/csmith-fixtures/seed-039.json.gz differ diff --git 
a/test/csmith-fixtures/seed-040.json.gz b/test/csmith-fixtures/seed-040.json.gz index 7f29f33..60c0844 100644 Binary files a/test/csmith-fixtures/seed-040.json.gz and b/test/csmith-fixtures/seed-040.json.gz differ diff --git a/test/csmith-fixtures/seed-041.json.gz b/test/csmith-fixtures/seed-041.json.gz index cf1a354..189eabf 100644 Binary files a/test/csmith-fixtures/seed-041.json.gz and b/test/csmith-fixtures/seed-041.json.gz differ diff --git a/test/csmith-fixtures/seed-042.json.gz b/test/csmith-fixtures/seed-042.json.gz index 62eea42..b9e3015 100644 Binary files a/test/csmith-fixtures/seed-042.json.gz and b/test/csmith-fixtures/seed-042.json.gz differ diff --git a/test/csmith-fixtures/seed-043.json.gz b/test/csmith-fixtures/seed-043.json.gz index ea7dee2..45beccf 100644 Binary files a/test/csmith-fixtures/seed-043.json.gz and b/test/csmith-fixtures/seed-043.json.gz differ diff --git a/test/csmith-fixtures/seed-044.json.gz b/test/csmith-fixtures/seed-044.json.gz index 8c6ddf4..f404cd3 100644 Binary files a/test/csmith-fixtures/seed-044.json.gz and b/test/csmith-fixtures/seed-044.json.gz differ diff --git a/test/csmith-fixtures/seed-045.json.gz b/test/csmith-fixtures/seed-045.json.gz index 587cbda..7efcd4a 100644 Binary files a/test/csmith-fixtures/seed-045.json.gz and b/test/csmith-fixtures/seed-045.json.gz differ diff --git a/test/csmith-fixtures/seed-046.json.gz b/test/csmith-fixtures/seed-046.json.gz index d4d9c66..8371f67 100644 Binary files a/test/csmith-fixtures/seed-046.json.gz and b/test/csmith-fixtures/seed-046.json.gz differ diff --git a/test/csmith-fixtures/seed-047.json.gz b/test/csmith-fixtures/seed-047.json.gz index d7995c0..18f2ae3 100644 Binary files a/test/csmith-fixtures/seed-047.json.gz and b/test/csmith-fixtures/seed-047.json.gz differ diff --git a/test/csmith-fixtures/seed-048.json.gz b/test/csmith-fixtures/seed-048.json.gz index 57851f1..1c12721 100644 Binary files a/test/csmith-fixtures/seed-048.json.gz and 
b/test/csmith-fixtures/seed-048.json.gz differ diff --git a/test/csmith-fixtures/seed-049.json.gz b/test/csmith-fixtures/seed-049.json.gz index 3bb62de..fa5b6ab 100644 Binary files a/test/csmith-fixtures/seed-049.json.gz and b/test/csmith-fixtures/seed-049.json.gz differ diff --git a/test/csmith-fixtures/seed-050.json.gz b/test/csmith-fixtures/seed-050.json.gz index 2591683..2355bd9 100644 Binary files a/test/csmith-fixtures/seed-050.json.gz and b/test/csmith-fixtures/seed-050.json.gz differ diff --git a/test/csmith-fixtures/seed-051.json.gz b/test/csmith-fixtures/seed-051.json.gz index c3c5984..1471e08 100644 Binary files a/test/csmith-fixtures/seed-051.json.gz and b/test/csmith-fixtures/seed-051.json.gz differ diff --git a/test/csmith-fixtures/seed-053.json.gz b/test/csmith-fixtures/seed-053.json.gz index 84e319a..4117406 100644 Binary files a/test/csmith-fixtures/seed-053.json.gz and b/test/csmith-fixtures/seed-053.json.gz differ diff --git a/test/csmith-fixtures/seed-055.json.gz b/test/csmith-fixtures/seed-055.json.gz index a0614a7..2c1b71f 100644 Binary files a/test/csmith-fixtures/seed-055.json.gz and b/test/csmith-fixtures/seed-055.json.gz differ diff --git a/test/csmith-fixtures/seed-056.json.gz b/test/csmith-fixtures/seed-056.json.gz index 18317a0..486dce2 100644 Binary files a/test/csmith-fixtures/seed-056.json.gz and b/test/csmith-fixtures/seed-056.json.gz differ diff --git a/test/csmith-fixtures/seed-057.json.gz b/test/csmith-fixtures/seed-057.json.gz index ae640a9..eb15a40 100644 Binary files a/test/csmith-fixtures/seed-057.json.gz and b/test/csmith-fixtures/seed-057.json.gz differ diff --git a/test/csmith-fixtures/seed-058.json.gz b/test/csmith-fixtures/seed-058.json.gz index be234c7..48d829f 100644 Binary files a/test/csmith-fixtures/seed-058.json.gz and b/test/csmith-fixtures/seed-058.json.gz differ diff --git a/test/csmith-fixtures/seed-059.json.gz b/test/csmith-fixtures/seed-059.json.gz index 0ef640d..f42d076 100644 Binary files 
a/test/csmith-fixtures/seed-059.json.gz and b/test/csmith-fixtures/seed-059.json.gz differ diff --git a/test/csmith-fixtures/seed-060.json.gz b/test/csmith-fixtures/seed-060.json.gz index e4fcad5..88412ab 100644 Binary files a/test/csmith-fixtures/seed-060.json.gz and b/test/csmith-fixtures/seed-060.json.gz differ diff --git a/test/csmith-fixtures/seed-061.json.gz b/test/csmith-fixtures/seed-061.json.gz index b954ba3..ea7863b 100644 Binary files a/test/csmith-fixtures/seed-061.json.gz and b/test/csmith-fixtures/seed-061.json.gz differ diff --git a/test/csmith-fixtures/seed-062.json.gz b/test/csmith-fixtures/seed-062.json.gz index 6d1f415..51bb961 100644 Binary files a/test/csmith-fixtures/seed-062.json.gz and b/test/csmith-fixtures/seed-062.json.gz differ diff --git a/test/csmith-fixtures/seed-063.json.gz b/test/csmith-fixtures/seed-063.json.gz index e0e1f49..9ab6195 100644 Binary files a/test/csmith-fixtures/seed-063.json.gz and b/test/csmith-fixtures/seed-063.json.gz differ diff --git a/test/csmith-fixtures/seed-064.json.gz b/test/csmith-fixtures/seed-064.json.gz index 9787243..504b870 100644 Binary files a/test/csmith-fixtures/seed-064.json.gz and b/test/csmith-fixtures/seed-064.json.gz differ diff --git a/test/csmith-fixtures/seed-065.json.gz b/test/csmith-fixtures/seed-065.json.gz index 0cb1c88..10261a8 100644 Binary files a/test/csmith-fixtures/seed-065.json.gz and b/test/csmith-fixtures/seed-065.json.gz differ diff --git a/test/csmith-fixtures/seed-066.json.gz b/test/csmith-fixtures/seed-066.json.gz index 99cbf2b..02ab0d4 100644 Binary files a/test/csmith-fixtures/seed-066.json.gz and b/test/csmith-fixtures/seed-066.json.gz differ diff --git a/test/csmith-fixtures/seed-067.json.gz b/test/csmith-fixtures/seed-067.json.gz index 8d7377d..c97dddd 100644 Binary files a/test/csmith-fixtures/seed-067.json.gz and b/test/csmith-fixtures/seed-067.json.gz differ diff --git a/test/csmith-fixtures/seed-068.json.gz b/test/csmith-fixtures/seed-068.json.gz index 
644e497..c4dfeb4 100644 Binary files a/test/csmith-fixtures/seed-068.json.gz and b/test/csmith-fixtures/seed-068.json.gz differ diff --git a/test/csmith-fixtures/seed-069.json.gz b/test/csmith-fixtures/seed-069.json.gz index 1a5d8a0..93e45e5 100644 Binary files a/test/csmith-fixtures/seed-069.json.gz and b/test/csmith-fixtures/seed-069.json.gz differ diff --git a/test/csmith-fixtures/seed-070.json.gz b/test/csmith-fixtures/seed-070.json.gz index 83983ef..be28c51 100644 Binary files a/test/csmith-fixtures/seed-070.json.gz and b/test/csmith-fixtures/seed-070.json.gz differ diff --git a/test/csmith-fixtures/seed-071.json.gz b/test/csmith-fixtures/seed-071.json.gz index 4194507..931b7f0 100644 Binary files a/test/csmith-fixtures/seed-071.json.gz and b/test/csmith-fixtures/seed-071.json.gz differ diff --git a/test/csmith-fixtures/seed-073.json.gz b/test/csmith-fixtures/seed-073.json.gz index 6706495..9724724 100644 Binary files a/test/csmith-fixtures/seed-073.json.gz and b/test/csmith-fixtures/seed-073.json.gz differ diff --git a/test/csmith-fixtures/seed-074.json.gz b/test/csmith-fixtures/seed-074.json.gz index 985d88a..281cd88 100644 Binary files a/test/csmith-fixtures/seed-074.json.gz and b/test/csmith-fixtures/seed-074.json.gz differ diff --git a/test/csmith-fixtures/seed-075.json.gz b/test/csmith-fixtures/seed-075.json.gz index 2da66a0..687704e 100644 Binary files a/test/csmith-fixtures/seed-075.json.gz and b/test/csmith-fixtures/seed-075.json.gz differ diff --git a/test/csmith-fixtures/seed-077.json.gz b/test/csmith-fixtures/seed-077.json.gz index d5d992b..e5fddb7 100644 Binary files a/test/csmith-fixtures/seed-077.json.gz and b/test/csmith-fixtures/seed-077.json.gz differ diff --git a/test/csmith-fixtures/seed-078.json.gz b/test/csmith-fixtures/seed-078.json.gz index 5cc79a9..a869a90 100644 Binary files a/test/csmith-fixtures/seed-078.json.gz and b/test/csmith-fixtures/seed-078.json.gz differ diff --git a/test/csmith-fixtures/seed-079.json.gz 
b/test/csmith-fixtures/seed-079.json.gz index b54cec9..186feb7 100644 Binary files a/test/csmith-fixtures/seed-079.json.gz and b/test/csmith-fixtures/seed-079.json.gz differ diff --git a/test/csmith-fixtures/seed-080.json.gz b/test/csmith-fixtures/seed-080.json.gz index d1f674b..cf0d234 100644 Binary files a/test/csmith-fixtures/seed-080.json.gz and b/test/csmith-fixtures/seed-080.json.gz differ diff --git a/test/csmith-fixtures/seed-081.json.gz b/test/csmith-fixtures/seed-081.json.gz index 29d4a02..b88ec66 100644 Binary files a/test/csmith-fixtures/seed-081.json.gz and b/test/csmith-fixtures/seed-081.json.gz differ diff --git a/test/csmith-fixtures/seed-082.json.gz b/test/csmith-fixtures/seed-082.json.gz index 1b29f20..a00a4aa 100644 Binary files a/test/csmith-fixtures/seed-082.json.gz and b/test/csmith-fixtures/seed-082.json.gz differ diff --git a/test/csmith-fixtures/seed-083.json.gz b/test/csmith-fixtures/seed-083.json.gz index 4b1ab16..d888d12 100644 Binary files a/test/csmith-fixtures/seed-083.json.gz and b/test/csmith-fixtures/seed-083.json.gz differ diff --git a/test/csmith-fixtures/seed-084.json.gz b/test/csmith-fixtures/seed-084.json.gz index 726e390..dd63c4f 100644 Binary files a/test/csmith-fixtures/seed-084.json.gz and b/test/csmith-fixtures/seed-084.json.gz differ diff --git a/test/csmith-fixtures/seed-085.json.gz b/test/csmith-fixtures/seed-085.json.gz index efee675..6c0f19c 100644 Binary files a/test/csmith-fixtures/seed-085.json.gz and b/test/csmith-fixtures/seed-085.json.gz differ diff --git a/test/csmith-fixtures/seed-086.json.gz b/test/csmith-fixtures/seed-086.json.gz index 0cb41be..ec8aac8 100644 Binary files a/test/csmith-fixtures/seed-086.json.gz and b/test/csmith-fixtures/seed-086.json.gz differ diff --git a/test/csmith-fixtures/seed-087.json.gz b/test/csmith-fixtures/seed-087.json.gz index b183b91..d24537b 100644 Binary files a/test/csmith-fixtures/seed-087.json.gz and b/test/csmith-fixtures/seed-087.json.gz differ diff --git 
a/test/csmith-fixtures/seed-088.json.gz b/test/csmith-fixtures/seed-088.json.gz index 3a2ea6d..0b95519 100644 Binary files a/test/csmith-fixtures/seed-088.json.gz and b/test/csmith-fixtures/seed-088.json.gz differ diff --git a/test/csmith-fixtures/seed-089.json.gz b/test/csmith-fixtures/seed-089.json.gz index ecfffde..6bf8980 100644 Binary files a/test/csmith-fixtures/seed-089.json.gz and b/test/csmith-fixtures/seed-089.json.gz differ diff --git a/test/csmith-fixtures/seed-090.json.gz b/test/csmith-fixtures/seed-090.json.gz index 0956b0b..0e94e11 100644 Binary files a/test/csmith-fixtures/seed-090.json.gz and b/test/csmith-fixtures/seed-090.json.gz differ diff --git a/test/csmith-fixtures/seed-091.json.gz b/test/csmith-fixtures/seed-091.json.gz index 3837008..0e95a8a 100644 Binary files a/test/csmith-fixtures/seed-091.json.gz and b/test/csmith-fixtures/seed-091.json.gz differ diff --git a/test/csmith-fixtures/seed-092.json.gz b/test/csmith-fixtures/seed-092.json.gz index 233b9e4..4a26f71 100644 Binary files a/test/csmith-fixtures/seed-092.json.gz and b/test/csmith-fixtures/seed-092.json.gz differ diff --git a/test/csmith-fixtures/seed-093.json.gz b/test/csmith-fixtures/seed-093.json.gz index 63da361..493ba70 100644 Binary files a/test/csmith-fixtures/seed-093.json.gz and b/test/csmith-fixtures/seed-093.json.gz differ diff --git a/test/csmith-fixtures/seed-094.json.gz b/test/csmith-fixtures/seed-094.json.gz index be1ea76..ff15e7d 100644 Binary files a/test/csmith-fixtures/seed-094.json.gz and b/test/csmith-fixtures/seed-094.json.gz differ diff --git a/test/csmith-fixtures/seed-095.json.gz b/test/csmith-fixtures/seed-095.json.gz index e50bd5c..64db947 100644 Binary files a/test/csmith-fixtures/seed-095.json.gz and b/test/csmith-fixtures/seed-095.json.gz differ diff --git a/test/csmith-fixtures/seed-097.json.gz b/test/csmith-fixtures/seed-097.json.gz index b74016d..d640c68 100644 Binary files a/test/csmith-fixtures/seed-097.json.gz and 
b/test/csmith-fixtures/seed-097.json.gz differ diff --git a/test/csmith-fixtures/seed-098.json.gz b/test/csmith-fixtures/seed-098.json.gz index 36d2aa7..036afac 100644 Binary files a/test/csmith-fixtures/seed-098.json.gz and b/test/csmith-fixtures/seed-098.json.gz differ diff --git a/test/csmith-fixtures/seed-099.json.gz b/test/csmith-fixtures/seed-099.json.gz index 98463ac..268147f 100644 Binary files a/test/csmith-fixtures/seed-099.json.gz and b/test/csmith-fixtures/seed-099.json.gz differ diff --git a/test/csmith-fixtures/seed-100.json.gz b/test/csmith-fixtures/seed-100.json.gz index 2bcb81f..2dc5c9d 100644 Binary files a/test/csmith-fixtures/seed-100.json.gz and b/test/csmith-fixtures/seed-100.json.gz differ diff --git a/test/spec/path-dispatch.tsv b/test/spec/path-dispatch.tsv index 07ec139..5f5b3ac 100644 --- a/test/spec/path-dispatch.tsv +++ b/test/spec/path-dispatch.tsv @@ -58,26 +58,26 @@ static_assert(sizeof(int) == 4, "msg"); grammar declaration Phase O sizeof in co # Bodies that block_item can structure flow through grammar; bodies # with control-flow keywords or asm/static_assert fall back to chomp. 
int f() { return 0; } grammar function_definition empty body via grammar -int f(int x) { return x; } legacy function_definition chomp (param-with-id case) -static int f(void) { return 0; } legacy function_definition chomp (void params) +int f(int x) { return x; } grammar function_definition param-with-id via grammar +static int f(void) { return 0; } grammar function_definition void params via grammar -# ---- legacy chomp path: tagged-type definitions with bodies ---- -struct S { int a; }; legacy declaration standalone struct definition -union U { int a; }; legacy declaration standalone union definition -enum E { A, B }; legacy declaration standalone enum definition -struct { int a; } s; legacy declaration anonymous struct variable +# ---- grammar path: tagged-type definitions with bodies ---- +struct S { int a; }; grammar declaration standalone struct definition +union U { int a; }; grammar declaration standalone union definition +enum E { A, B }; grammar declaration standalone enum definition +struct { int a; } s; grammar declaration anonymous struct variable -# ---- legacy chomp path: typedef shapes with postfix ---- +# ---- grammar path: typedef shapes with postfix ---- -typedef int Arr[10]; legacy declaration typedef of array +typedef int Arr[10]; grammar declaration typedef of array -# ---- legacy chomp path: complex declarators beyond Phase P ---- -int (*p)[10]; legacy declaration pointer to array -int (*arr[3])(int); legacy declaration array of fn-pointers +# ---- grammar path: complex compound declarators ---- +int (*p)[10]; grammar declaration pointer to array +int (*arr[3])(int); grammar declaration array of fn-pointers -# ---- legacy chomp path: brace initializers ---- +# ---- grammar path: brace initializers ---- -int a[3] = { 1, 2, 3 }; legacy declaration brace init -int a[3] = { [0] = 1 }; legacy declaration designated init +int a[3] = { 1, 2, 3 }; grammar declaration brace init +int a[3] = { [0] = 1 }; grammar declaration designated init -# ---- legacy chomp path: GCC extensions ---- +# ---- grammar path: GCC extensions ---- -__extension__ int x; legacy declaration __extension__ keyword -int
__attribute__((unused)) x; legacy declaration attribute between specs +__extension__ int x; grammar declaration __extension__ keyword +int __attribute__((unused)) x; grammar declaration attribute between specs diff --git a/vendor/jsonic-expr/src/expr.ts b/vendor/jsonic-expr/src/expr.ts index d7f74bf..53f0f61 100644 --- a/vendor/jsonic-expr/src/expr.ts +++ b/vendor/jsonic-expr/src/expr.ts @@ -390,19 +390,33 @@ let Expr: Plugin = function Expr(jsonic: Jsonic, options: ExprOptions) { } : NONE, - // WWW - // // The opening parenthesis of an expression with a preceding value. - // // foo(1) => ['(','foo',1] - // hasParen - // ? { - // s: [OP], - // b: 1, - // r: 'val', - // c: (r: Rule) => parenOTM[r.c0.tin].preval.active, - // u: { paren_preval: true }, - // g: 'expr,expr-paren,expr-paren-preval', - // } - // : NONE, + // VENDORED-PATCH: chain for postfix paren forms. + // When a val has just produced a value (e.g. `a[0]`, `f(0)`, + // or a parenthesised expression `(*p)`) and the next token is + // another preval-active paren-open, push expr (which descends + // into paren) so the new paren-form picks up this val's node + // as the preval. We use `p: 'expr'` (NOT `r: 'val'`) so the + // current val rule stays alive and `ctx.root().node` still + // reflects the chained result on parser return. + // `u: { paren_preval: true }` is set so makeCloseParen finds + // it on r.parent.parent (= this val) and pushes our node into + // the new paren CST node. + hasParen + ? { + s: [OP], + b: 1, + c: (r: Rule, _ctx: Context) => { + const pdef = parenOTM[r.c0.tin] + if (!pdef.preval.active) return false + if (undefined === r.node) return false + return null == pdef.preval.allow || + pdef.preval.allow.includes(r.node) + }, + p: 'expr', + u: { paren_preval: true }, + g: 'expr,expr-paren,expr-paren-preval-chain', + } + : NONE, hasTernary ? 
{ @@ -433,6 +447,7 @@ let Expr: Plugin = function Expr(jsonic: Jsonic, options: ExprOptions) { g: 'expr,list,val,imp,space,top', }, ]) + }) @@ -750,11 +765,6 @@ let Expr: Plugin = function Expr(jsonic: Jsonic, options: ExprOptions) { ]) .ac((r: Rule, ctx: Context) => { - // Only evaluate at root of expr (where r.n.expr === 0) - - // console.log('EXPR-AC', r.name, r.i, r.n, p(r.node), - // 'P', r.parent.name, r.parent.i, r.parent.n, p(r.parent.node)) - if (options.evaluate && 0 === r.n.expr) { // The parent node will contain the root of the expr tree @@ -832,9 +842,6 @@ let Expr: Plugin = function Expr(jsonic: Jsonic, options: ExprOptions) { ]) .ac((r: Rule, ctx: Context) => { - - // QQQ - // console.log('PAREN-AC', r.i, p(r.node), 'C', r.parent.i, p(r.parent.node)) r.parent.node = r.node r.parent.parent.node = r.node @@ -947,6 +954,48 @@ let Expr: Plugin = function Expr(jsonic: Jsonic, options: ExprOptions) { g: 'expr,expr-ternary,close', }, ]) + // VENDORED-PATCH: ensure ternary results get evaluated. + // Without this, ternaries that aren't wrapped in expr + // (e.g. val.close's TERN0 alt does `r: 'ternary'` + // directly, not via an expr intermediate) leave the + // result as a raw `[op_ternary, ...]` op-array. We fire + // on every ternary instance's after-close, but only + // act when the chain has reached its final step (the + // node has accumulated all 3 operands and r.next is + // not another ternary). Walk the prev-chain back to the + // original rule (the val that initiated `r: 'ternary'`) + // and write the evaluated CST so jsonic returns the + // structured conditional_expression instead of the + // op-array. + .ac((r: Rule, ctx: Context) => { + if (!options.evaluate) return + // Skip when the chain is still ongoing: r:'ternary' + // replaces the current rule with another ternary + // instance, and we want to evaluate only on the FINAL + // step. r.next is the rule the parser will process + // after this .ac returns. 
+ if (r.next && r.next.name === 'ternary') return + if (!Array.isArray(r.node)) return + if (!isOp(r.node)) return + // Op-array isn't fully populated until 3 operands + // (cond, then, else) sit at r.node[1..3]. Early steps + // have length 2 or 3. + if (r.node.length < 4) return + const out = evaluation(r, ctx, r.node, options.evaluate) + // Write the evaluated CST back to every rule along the + // r:-replacement chain (each successive ternary instance + // and the original val that started the chain). Their + // .node references all currently point at the same + // op-array; replace them all with the structured CST so + // ctx.root().node — whichever one jsonic returns — + // reflects the evaluated form. + let cur: any = r + while (cur) { + cur.node = out + cur = cur.prev + } + if (r.parent) r.parent.node = out + }) }) } }