diff --git a/.github/workflows/node.js.yml b/.github/workflows/node.js.yml index da51e31..47a1acf 100644 --- a/.github/workflows/node.js.yml +++ b/.github/workflows/node.js.yml @@ -28,4 +28,13 @@ jobs: cache: 'npm' - run: npm ci - run: npm run build --if-present - - run: npm test + - name: Test JS library code + run: npm test + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Install dependencies + run: python -m pip install lark + - name: Test parser generator (Python) + run: python -m test.test \ No newline at end of file diff --git a/examples/run_python_parser.js b/examples/run_python_parser.js index 8bba465..c1e5a04 100644 --- a/examples/run_python_parser.js +++ b/examples/run_python_parser.js @@ -37,7 +37,7 @@ function test_python_lib(base_dir) { if (require && require.main === module) { - test_python_lib('/Python38/lib/') /* <-- Edit this */ + test_python_lib('/Python310/lib/') /* <-- Edit this */ let text = ` (a, b, diff --git a/larkjs/lark.js b/larkjs/lark.js index 103da34..0172a52 100644 --- a/larkjs/lark.js +++ b/larkjs/lark.js @@ -202,7 +202,7 @@ function intersection(setA, setB) { } function set_subtract(a, b) { - return [...a].filter(e => !b.has(e)) + return [...a].filter((e) => !b.has(e)); } function dict(d) { @@ -401,6 +401,8 @@ class LexError extends LarkError { */ class UnexpectedInput extends LarkError { + pos_in_stream = null; + _terminals_by_name = null; /** Returns a pretty string pinpointing the error in the text, with span amount of context characters around it. 
@@ -698,8 +700,6 @@ function _deserialize(data, namespace, memo) { class Serialize { static deserialize(data, memo) { const cls = this; - let namespace = (cls && cls["__serialize_namespace__"]) || []; - namespace = Object.fromEntries(namespace.map((c) => [c.name, c])); let fields = cls && cls["__serialize_fields__"]; if ("@" in data) { return memo[data["@"]]; @@ -708,7 +708,7 @@ class Serialize { let inst = new_object(cls); for (const f of fields) { if (data && f in data) { - inst[f] = _deserialize(data[f], namespace, memo); + inst[f] = _deserialize(data[f], NAMESPACE, memo); } else { throw new KeyError("Cannot find key for class", cls, e); } @@ -800,20 +800,20 @@ class Tree { _pretty(level, indent_str) { if (this.children.length === 1 && !(this.children[0] instanceof Tree)) { return [ - indent_str * level, + list_repeat(indent_str, level).join(''), this._pretty_label(), "\t", - format("%s", this.children[0]), + format("%s", this.children[0].value), "\n", ]; } - let l = [indent_str * level, this._pretty_label(), "\n"]; + let l = [list_repeat(indent_str, level).join(''), this._pretty_label(), "\n"]; for (const n of this.children) { if (n instanceof Tree) { l.push(...n._pretty(level + 1, indent_str)); } else { - l.push(...[indent_str * (level + 1), format("%s", n), "\n"]); + l.push(...[list_repeat(indent_str, level+1).join(''), format("%s", n.value), "\n"]); } } @@ -924,8 +924,8 @@ class Tree { } yield node; - for (const n of [...node.children].reverse()) { - stack.push(n); + for (const child of [...node.children].reverse()) { + stack.push(child); } } } @@ -945,19 +945,24 @@ class Tree { // /** - Transformers visit each node of the tree, and run the appropriate method on it according to the node's data. + Transformers work bottom-up (or depth-first), starting with visiting the leaves and working + their way up until ending at the root of the tree. - Methods are provided by the user via inheritance, and called according to ``tree.data``. 
- The returned value from each method replaces the node in the tree structure. + For each node visited, the transformer will call the appropriate method (callbacks), according to the + node's ``data``, and use the returned value to replace the node, thereby creating a new tree structure. - Transformers work bottom-up (or depth-first), starting with the leaves and ending at the root of the tree. Transformers can be used to implement map & reduce patterns. Because nodes are reduced from leaf to root, at any point the callbacks may assume the children have already been transformed (if applicable). + If the transformer cannot find a method with the right name, it will instead call ``__default__``, which by + default creates a copy of the node. + + To discard a node, return Discard (``lark.visitors.Discard``). + ``Transformer`` can do anything ``Visitor`` can do, but because it reconstructs the tree, it is slightly less efficient. - To discard a node, return Discard (``lark.visitors.Discard``). + A transformer without methods essentially performs a non-memoized partial deepcopy. All these classes implement the transformer interface: @@ -970,7 +975,6 @@ class Tree { Setting this to ``False`` is slightly faster. Defaults to ``True``. (For processing ignored tokens, use the ``lexer_callbacks`` options) - NOTE: A transformer without methods essentially performs a non-memoized partial deepcopy. 
*/ @@ -1501,7 +1505,7 @@ class PatternStr extends Pattern { static get __serialize_fields__() { return ["value", "flags"]; } - type = "str"; + static get type() { return "str"; } to_regexp() { return this._get_flags(re.escape(this.value)); } @@ -1519,7 +1523,7 @@ class PatternRE extends Pattern { static get __serialize_fields__() { return ["value", "flags", "_width"]; } - type = "re"; + static get type() { return "re"; } to_regexp() { return this._get_flags(this.value); } @@ -1725,12 +1729,12 @@ class _CallChain { const CallChain = callable_class(_CallChain); function _create_unless(terminals, g_regex_flags, re_, use_bytes) { let s, unless; - let tokens_by_type = classify(terminals, (t) => t.pattern.constructor.name); + let tokens_by_type = classify(terminals, (t) => t.pattern.constructor.type); let embedded_strs = new Set(); let callback = {}; - for (const retok of tokens_by_type.get('PatternRE') || []) { + for (const retok of tokens_by_type.get('re') || []) { unless = []; - for (const strtok of tokens_by_type.get('PatternStr') || []) { + for (const strtok of tokens_by_type.get('str') || []) { if (strtok.priority !== retok.priority) { continue; } @@ -1782,10 +1786,16 @@ function _regexp_has_newline(r) { ); } +/** + Represents the current state of the lexer as it scans the text + (Lexer objects are only instanciated per grammar, not per text) + +*/ + class LexerState { - constructor(text, line_ctr, last_token = null) { + constructor(text, line_ctr = null, last_token = null) { this.text = text; - this.line_ctr = line_ctr; + this.line_ctr = line_ctr || new LineCounter("\n"); this.last_token = last_token; } @@ -1802,6 +1812,26 @@ class LexerState { } } +/** + A thread that ties a lexer instance and a lexer state, to be used by the parser + +*/ + +class LexerThread { + constructor(lexer, lexer_state) { + this.lexer = lexer; + this.state = lexer_state; + } + + static from_text(lexer, text) { + return new this(lexer, new LexerState(text)); + } + + lex(parser_state) { + 
return this.lexer.lex(this.state, parser_state); + } +} + /** Lexer interface @@ -1814,11 +1844,6 @@ class Lexer extends ABC { lex(lexer_state, parser_state) { return NotImplemented; } - - make_lexer_state(text) { - let line_ctr = new LineCounter("\n"); - return new LexerState(text, line_ctr); - } } function sort_by_key_tuple(arr, key) { @@ -1872,7 +1897,10 @@ class BasicLexer extends Lexer { throw new LexError( format( "Ignore terminals are not defined: %s", - new Set(conf.ignore) - new Set(terminals.map((t) => t.name)) + set_subtract( + new Set(conf.ignore), + new Set(terminals.map((t) => t.name)) + ) ) ); } @@ -2054,10 +2082,6 @@ class ContextualLexer extends Lexer { this.root_lexer = new BasicLexer(trad_conf); } - make_lexer_state(text) { - return this.root_lexer.make_lexer_state(text); - } - *lex(lexer_state, parser_state) { let last_token, lexer, token; try { @@ -2096,21 +2120,6 @@ class ContextualLexer extends Lexer { } } -/** - A thread that ties a lexer instance and a lexer state, to be used by the parser -*/ - -class LexerThread { - constructor(lexer, text) { - this.lexer = lexer; - this.state = lexer.make_lexer_state(text); - } - - lex(parser_state) { - return this.lexer.lex(this.state, parser_state); - } -} - // // Common // @@ -2619,7 +2628,7 @@ class LALR_Parser extends Serialize { while (true) { if (e instanceof UnexpectedCharacters) { - s = e.interactive_parser.lexer_state.state; + s = e.interactive_parser.lexer_thread.state; p = s.line_ctr.char_pos; } @@ -2642,7 +2651,7 @@ class LALR_Parser extends Serialize { e instanceof UnexpectedToken && e.token.type === e2.token.type && e2.token.type === "$END" && - e.interactive_parser === e2.interactive_parser + e.interactive_parser.eq(e2.interactive_parser) ) { // Prevent infinite loop throw e2; @@ -2791,7 +2800,7 @@ class _Parser { // Main LALR-parser loop try { token = null; - for (const token of state.lexer.lex(state)) { + for (token of state.lexer.lex(state)) { state.feed_token(token); } @@ -2849,10 
+2858,11 @@ class _Parser { */ class InteractiveParser { - constructor(parser, parser_state, lexer_state) { + constructor(parser, parser_state, lexer_thread) { this.parser = parser; this.parser_state = parser_state; - this.lexer_state = lexer_state; + this.lexer_thread = lexer_thread; + this.result = null; } /** @@ -2865,15 +2875,30 @@ class InteractiveParser { return this.parser_state.feed_token(token, token.type === "$END"); } + /** + Step through the different stages of the parse, by reading tokens from the lexer + and feeding them to the parser, one per iteration. + + Returns an iterator of the tokens it encounters. + + When the parse is over, the resulting tree can be found in ``InteractiveParser.result``. + + */ + *iter_parse() { + for (const token of this.lexer_thread.lex(this.parser_state)) { + yield token; + this.result = this.feed_token(token); + } + } + /** Try to feed the rest of the lexer state into the interactive parser. Note that this modifies the instance in place and does not feed an '$END' Token + */ exhaust_lexer() { - for (const token of this.lexer_state.lex(this.parser_state)) { - this.parser_state.feed_token(token); - } + return [...this.iter_parse()]; } /** @@ -2898,7 +2923,7 @@ class InteractiveParser { return ( this.parser_state === other.parser_state && - this.lexer_state === other.lexer_state + this.lexer_thread === other.lexer_thread ); } @@ -2910,7 +2935,7 @@ class InteractiveParser { return new ImmutableInteractiveParser( p.parser, p.parser_state, - p.lexer_state + p.lexer_thread ); } @@ -2986,12 +3011,7 @@ class InteractiveParser { */ class ImmutableInteractiveParser extends InteractiveParser { - static get result() { - return null; - } - get result() { - return this.constructor.result; - } + result = null; feed_token(token) { let c = copy(this); c.result = InteractiveParser.feed_token(c, token); @@ -3014,7 +3034,7 @@ class ImmutableInteractiveParser extends InteractiveParser { */ as_mutable() { let p = copy(this); - return new 
InteractiveParser(p.parser, p.parser_state, p.lexer_state); + return new InteractiveParser(p.parser, p.parser_state, p.lexer_thread); } } @@ -3043,7 +3063,6 @@ class ParseTable { serialize(memo) { let tokens = new Enumerator(); - let rules = new Enumerator(); let states = Object.fromEntries( dict_items(this.states).map(([state, actions]) => [ state, @@ -3167,6 +3186,26 @@ class MakeParsingFrontend { // ... Continued later in the module +function _deserialize_parsing_frontend( + data, + memo, + lexer_conf, + callbacks, + options +) { + let parser_conf = ParserConf.deserialize(data["parser_conf"], memo); + let parser = LALR_Parser.deserialize(data["parser"], memo, callbacks, options.debug); + parser_conf.callbacks = callbacks; + return new ParsingFrontend({ + lexer_conf: lexer_conf, + parser_conf: parser_conf, + options: options, + parser: parser, + }); +} + +var _parser_creators = {} + class ParsingFrontend extends Serialize { static get __serialize_fields__() { return ["lexer_conf", "parser_conf", "parser"]; @@ -3182,11 +3221,7 @@ class ParsingFrontend extends Serialize { // From cache this.parser = parser; } else { - create_parser = { - lalr: create_lalr_parser, - earley: create_earley_parser, - cyk: CYK_FrontEnd, - }[parser_conf.parser_type]; + create_parser = dict_get(_parser_creators, parser_conf.parser_type); this.parser = create_parser(lexer_conf, parser_conf, options); } // Set-up lexer @@ -3197,16 +3232,18 @@ class ParsingFrontend extends Serialize { return; } - if ( - { basic: create_basic_lexer, contextual: create_contextual_lexer } && - lexer_type in - { basic: create_basic_lexer, contextual: create_contextual_lexer } - ) { - create_lexer = { + const lexers = { basic: create_basic_lexer, - contextual: create_contextual_lexer, - }[lexer_type]; - this.lexer = create_lexer(lexer_conf, this.parser, lexer_conf.postlex); + contextual: create_contextual_lexer + } + if (lexer_type in lexers) { + create_lexer = lexers[lexer_type]; + this.lexer = create_lexer( + 
lexer_conf, + this.parser, + lexer_conf.postlex, + options + ); } else { this.lexer = _wrap_lexer(lexer_type)(lexer_conf); } @@ -3240,10 +3277,14 @@ class ParsingFrontend extends Serialize { return start; } + _make_lexer_thread(text) { + return this.skip_lexer ? text : LexerThread.from_text(this.lexer, text); + } + parse(text, start = null, on_error = null) { let chosen_start = this._verify_start(start); - let stream = this.skip_lexer ? text : new LexerThread(this.lexer, text); let kw = on_error === null ? {} : { on_error: on_error }; + let stream = this._make_lexer_thread(text); return this.parser.parse({ lexer: stream, start: chosen_start, @@ -3259,12 +3300,12 @@ class ParsingFrontend extends Serialize { ); } - let stream = this.skip_lexer ? text : new LexerThread(this.lexer, text); + let stream = this._make_lexer_thread(text); return this.parser.parse_interactive(stream, chosen_start); } } -function get_frontend(parser, lexer) { +function _validate_frontend_args(parser, lexer) { let expected; assert_config(parser, ["lalr", "earley", "cyk"]); if (!(typeof lexer === "object")) { @@ -3283,8 +3324,6 @@ function get_frontend(parser, lexer) { ) ); } - - return new MakeParsingFrontend(parser, lexer); } function _get_lexer_callbacks(transformer, terminals) { @@ -3306,21 +3345,17 @@ class PostLexConnector { this.postlexer = postlexer; } - make_lexer_state(text) { - return this.lexer.make_lexer_state(text); - } - lex(lexer_state, parser_state) { let i = this.lexer.lex(lexer_state, parser_state); return this.postlexer.process(i); } } -function create_basic_lexer(lexer_conf, parser, postlex) { +function create_basic_lexer(lexer_conf, parser, postlex, options) { return new BasicLexer(lexer_conf); } -function create_contextual_lexer(lexer_conf, parser, postlex) { +function create_contextual_lexer(lexer_conf, parser, postlex, options) { let states = Object.fromEntries( dict_items(parser._parse_table.states).map(([idx, t]) => [ idx, @@ -3340,8 +3375,7 @@ function 
create_lalr_parser(lexer_conf, parser_conf, options = null) { return new LALR_Parser({ parser_conf: parser_conf, debug: debug }); } -var create_earley_parser = NotImplemented; -var CYK_FrontEnd = NotImplemented; +_parser_creators["lalr"] = create_lalr_parser; // // Lark @@ -3352,6 +3386,7 @@ class PostLex extends ABC { return stream; } + always_accept = []; } /** @@ -3361,38 +3396,37 @@ class PostLex extends ABC { */ class LarkOptions extends Serialize { - static get OPTIONS_DOC() { - return ` + OPTIONS_DOC = ` **=== General Options ===** start The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start") debug - Display debug information and extra warnings. Use only when debugging (Default: ````False````) + Display debug information and extra warnings. Use only when debugging (Default: \`\`False\`\`) When used with Earley, it generates a forest graph as "sppf.png", if 'dot' is installed. transformer Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster) propagate_positions Propagates (line, column, end_line, end_column) attributes into all tree branches. - Accepts ````False````, ````True````, or a callable, which will filter which nodes to ignore when propagating. + Accepts \`\`False\`\`, \`\`True\`\`, or a callable, which will filter which nodes to ignore when propagating. maybe_placeholders - When ````True````, the ````[]```` operator returns ````None```` when not matched. - When ````False````, ````[]```` behaves like the ````?```` operator, and returns no value at all. - (default= ````True````) + When \`\`True\`\`, the \`\`[]\`\` operator returns \`\`None\`\` when not matched. + When \`\`False\`\`, \`\`[]\`\` behaves like the \`\`?\`\` operator, and returns no value at all. + (default= \`\`True\`\`) cache Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now. 
- - When ````False````, does nothing (default) - - When ````True````, caches to a temporary file in the local directory + - When \`\`False\`\`, does nothing (default) + - When \`\`True\`\`, caches to a temporary file in the local directory - When given a string, caches to the path pointed by the string regex - When True, uses the ````regex```` module instead of the stdlib ````re````. + When True, uses the \`\`regex\`\` module instead of the stdlib \`\`re\`\`. g_regex_flags Flags that are applied to all terminals (both regex and strings) keep_all_tokens - Prevent the tree builder from automagically removing "punctuation" tokens (Default: ````False````) + Prevent the tree builder from automagically removing "punctuation" tokens (Default: \`\`False\`\`) tree_class - Lark will produce trees comprised of instances of this class instead of the default ````lark.Tree````. + Lark will produce trees comprised of instances of this class instead of the default \`\`lark.Tree\`\`. **=== Algorithm Options ===** @@ -3418,13 +3452,13 @@ class LarkOptions extends Serialize { **=== Misc. / Domain Specific Options ===** postlex - Lexer post-processing (Default: ````None````) Only works with the basic and contextual lexers. + Lexer post-processing (Default: \`\`None\`\`) Only works with the basic and contextual lexers. priority - How priorities should be evaluated - "auto", ````None````, "normal", "invert" (Default: "auto") + How priorities should be evaluated - "auto", \`\`None\`\`, "normal", "invert" (Default: "auto") lexer_callbacks Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. use_bytes - Accept an input of type ````bytes```` instead of ````str````. + Accept an input of type \`\`bytes\`\` instead of \`\`str\`\`. edit_terminals A callback for editing the terminals before parse. import_paths @@ -3433,10 +3467,6 @@ class LarkOptions extends Serialize { Override the source of from where the grammar was loaded. 
Useful for relative imports and unconventional grammar loading **=== End of Options ===** `; - } - get OPTIONS_DOC() { - return this.constructor.OPTIONS_DOC; - } // Adding a new option needs to be done in multiple places: // - In the dictionary below. This is the primary truth of which options `Lark.__init__` accepts // - In the docstring above. It is used both for the docstring of `LarkOptions` and `Lark`, and in readthedocs @@ -3515,7 +3545,7 @@ class LarkOptions extends Serialize { } // Options that can be passed to the Lark parser, even when it was loaded from cache/standalone. -// These option are only used outside of `load_grammar`. +// These options are only used outside of `load_grammar`. var _LOAD_ALLOWED_OPTIONS = new Set([ "postlex", "transformer", @@ -3633,7 +3663,7 @@ class Lark extends Serialize { this.options = LarkOptions.deserialize(options, memo); this.rules = data["rules"].map((r) => Rule.deserialize(r, memo)); this.source_path = ""; - let parser_class = get_frontend(this.options.parser, this.options.lexer); + _validate_frontend_args(this.options.parser, this.options.lexer); this.lexer_conf = this._deserialize_lexer_conf( data["parser"], memo, @@ -3644,7 +3674,7 @@ class Lark extends Serialize { this._terminals_dict = Object.fromEntries( this.terminals.map((t) => [t.name, t]) ); - this.parser = parser_class.deserialize( + this.parser = _deserialize_parsing_frontend( data["parser"], memo, this.lexer_conf, @@ -3710,7 +3740,7 @@ class Lark extends Serialize { } else { lexer = this.lexer; } - let lexer_thread = new LexerThread(lexer, text); + let lexer_thread = LexerThread.from_text(lexer, text); let stream = lexer_thread.lex(null); if (this.options.postlex) { return this.options.postlex.process(stream); @@ -3911,6 +3941,16 @@ class PythonIndenter extends Indenter { return this.constructor.tab_len; } } + +const NAMESPACE = { + Terminal: Terminal, + NonTerminal: NonTerminal, + RuleOptions: RuleOptions, + PatternStr: PatternStr, + PatternRE: PatternRE, 
+ TerminalDef: TerminalDef +} + module.exports = { LarkError, ConfigurationError, diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/json.lark b/test/json.lark new file mode 100644 index 0000000..a32c907 --- /dev/null +++ b/test/json.lark @@ -0,0 +1,21 @@ +?start: value + +?value: object + | array + | string + | SIGNED_NUMBER -> number + | "true" -> true + | "false" -> false + | "null" -> null + +array : "[" [value ("," value)*] "]" +object : "{" [pair ("," pair)*] "}" +pair : string ":" value + +string : ESCAPED_STRING + +%import common.ESCAPED_STRING +%import common.SIGNED_NUMBER +%import common.WS + +%ignore WS diff --git a/test/test.py b/test/test.py new file mode 100644 index 0000000..6f9a87a --- /dev/null +++ b/test/test.py @@ -0,0 +1,72 @@ +from subprocess import Popen, PIPE + +from larkjs.__main__ import generate_js_standalone +from lark import Lark + +TEST_CODE = r""" + +const parser = get_parser({transformer}) + +console.log(JSON.stringify(parser.parse(input_text))) +""" + + +class JsParser: + def __init__(self, lark_instance): + self.code = generate_js_standalone(lark_instance) + + def parse(self, text, transformer=None): + js_code = self.code + js_code += 'const input_text = `' + text + '`;' + if transformer: + js_code += transformer + js_code += TEST_CODE + + p = Popen(["node", "-"], stdin=PIPE, stdout=PIPE, stderr=PIPE) + stdout, stderr = p.communicate(js_code.encode()) + + if stderr: + raise ValueError(stderr.decode()) + return stdout.decode() + + + + +def test_json_parser(): + parser = Lark.open('json.lark', rel_to=__file__, parser="lalr") + js_parser = JsParser(parser) + + transformer = """ + let transformer = { + number: ([n]) => parseFloat(n.value), + string: ([s]) => s.value.slice(1, -1), + array: Array.from, + pair: Array.from, + object: Object.fromEntries, + + null: () => null, + true: () => true, + false: () => false, + } + """ + + text = r""" + { + "empty_object" : {}, + 
"empty_array" : [], + "booleans" : { "YES" : true, "NO" : false }, + "numbers" : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ], + "strings" : [ "This", [ "And" , "That", "And a \\"b" ] ], + "nothing" : null + } + """ + + res = js_parser.parse(text, transformer) + expected = r"""{"empty_object":{},"empty_array":[],"booleans":{"YES":true,"NO":false},"numbers":[0,1,-2,3.3,440000,6.6e-7],"strings":["This",["And","That","And a \\\"b"]],"nothing":null}""" + assert res.strip() == expected, res + +def test(): + test_json_parser() + +if __name__ == '__main__': + test() \ No newline at end of file pFad - Phonifier reborn
