Skip to content

Commit fb00ddf

Browse files
authored
Serialize Lark.grammar (fixes issue #1472) (#1506)
* Serialize Lark.grammar
* Lark option: cache_grammar = False
* Add documentation and error message to Reconstructor
* Move parser.grammar check deeper into TreeMatcher
1 parent 87bb8ef commit fb00ddf

File tree

3 files changed

+34
-7
lines changed

3 files changed

+34
-7
lines changed

lark/lark.py

Lines changed: 19 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717

1818
from .exceptions import ConfigurationError, assert_config, UnexpectedInput
1919
from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice
20-
from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
20+
from .load_grammar import load_grammar, _deserialize_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
2121
from .tree import Tree
2222
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
2323

@@ -56,6 +56,7 @@ class LarkOptions(Serialize):
5656
propagate_positions: Union[bool, str]
5757
maybe_placeholders: bool
5858
cache: Union[bool, str]
59+
cache_grammar: bool
5960
regex: bool
6061
g_regex_flags: int
6162
keep_all_tokens: bool
@@ -99,6 +100,10 @@ class LarkOptions(Serialize):
99100
- When ``False``, does nothing (default)
100101
- When ``True``, caches to a temporary file in the local directory
101102
- When given a string, caches to the path pointed by the string
103+
cache_grammar
104+
For use with ``cache`` option. When ``True``, the unanalyzed grammar is also included in the cache.
105+
Useful for classes that require the ``Lark.grammar`` to be present (e.g. Reconstructor).
106+
(default= ``False``)
102107
regex
103108
When True, uses the ``regex`` module instead of the stdlib ``re``.
104109
g_regex_flags
@@ -165,6 +170,7 @@ class LarkOptions(Serialize):
165170
'keep_all_tokens': False,
166171
'tree_class': None,
167172
'cache': False,
173+
'cache_grammar': False,
168174
'postlex': None,
169175
'parser': 'earley',
170176
'lexer': 'auto',
@@ -211,6 +217,9 @@ def __init__(self, options_dict: Dict[str, Any]) -> None:
211217
raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
212218
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')
213219

220+
if self.cache_grammar and not self.cache:
221+
raise ConfigurationError('cache_grammar cannot be set when cache is disabled')
222+
214223
if o:
215224
raise ConfigurationError("Unknown options: %s" % o.keys())
216225

@@ -264,8 +273,12 @@ class Lark(Serialize):
264273
parser: 'ParsingFrontend'
265274
terminals: Collection[TerminalDef]
266275

276+
__serialize_fields__ = ['parser', 'rules', 'options']
277+
267278
def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
268279
self.options = LarkOptions(options)
280+
if self.options.cache_grammar:
281+
self.__serialize_fields__.append('grammar')
269282
re_module: types.ModuleType
270283

271284
# Set regex or re module
@@ -327,7 +340,9 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
327340
# specific reason - we just want a username.
328341
username = "unknown"
329342

330-
cache_fn = tempfile.gettempdir() + "/.lark_cache_%s_%s_%s_%s.tmp" % (username, cache_sha256, *sys.version_info[:2])
343+
344+
cache_fn = tempfile.gettempdir() + "/.lark_%s_%s_%s_%s_%s.tmp" % (
345+
"cache_grammar" if self.options.cache_grammar else "cache", username, cache_sha256, *sys.version_info[:2])
331346

332347
old_options = self.options
333348
try:
@@ -454,8 +469,6 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
454469
if __doc__:
455470
__doc__ += "\n\n" + LarkOptions.OPTIONS_DOC
456471

457-
__serialize_fields__ = 'parser', 'rules', 'options'
458-
459472
def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer:
460473
lexer_conf = self.lexer_conf
461474
if dont_ignore:
@@ -531,6 +544,8 @@ def _load(self: _T, f: Any, **kwargs) -> _T:
531544

532545
assert memo_json
533546
memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
547+
if 'grammar' in data:
548+
self.grammar = _deserialize_grammar(data['grammar'], memo)
534549
options = dict(data['options'])
535550
if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
536551
raise ConfigurationError("Some options are not allowed when loading a Parser: {}"

lark/load_grammar.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
from contextlib import suppress
1212
from typing import List, Tuple, Union, Callable, Dict, Optional, Sequence, Generator
1313

14-
from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet
14+
from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet, Serialize
1515
from .lexer import Token, TerminalDef, PatternStr, PatternRE, Pattern
1616

1717
from .parse_tree_builder import ParseTreeBuilder
@@ -676,7 +676,7 @@ def nr_deepcopy_tree(t):
676676
return Transformer_NonRecursive(False).transform(t)
677677

678678

679-
class Grammar:
679+
class Grammar(Serialize):
680680

681681
term_defs: List[Tuple[str, Tuple[Tree, int]]]
682682
rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]]
@@ -687,6 +687,8 @@ def __init__(self, rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions
687687
self.rule_defs = rule_defs
688688
self.ignore = ignore
689689

690+
__serialize_fields__ = 'term_defs', 'rule_defs', 'ignore'
691+
690692
def compile(self, start, terminals_to_keep) -> Tuple[List[TerminalDef], List[Rule], List[str]]:
691693
# We change the trees in-place (to support huge grammars)
692694
# So deepcopy allows calling compile more than once.
@@ -977,6 +979,8 @@ def _parse_grammar(text, name, start='start'):
977979

978980
return PrepareGrammar().transform(tree)
979981

982+
def _deserialize_grammar(data, memo) -> Grammar:
983+
return Grammar.deserialize(data, memo)
980984

981985
def _error_repr(error):
982986
if isinstance(error, UnexpectedToken):

lark/tree_matcher.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55

66
from . import Tree, Token
77
from .common import ParserConf
8+
from .exceptions import ConfigurationError
89
from .parsers import earley
910
from .grammar import Rule, Terminal, NonTerminal
1011

@@ -89,8 +90,15 @@ class TreeMatcher:
8990
def __init__(self, parser):
9091
# XXX TODO calling compile twice returns different results!
9192
assert not parser.options.maybe_placeholders
93+
9294
# XXX TODO: we just ignore the potential existence of a postlexer
93-
self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())
95+
if parser.options.postlex is None:
96+
self.tokens = parser.terminals.copy()
97+
rules = parser.rules.copy()
98+
else:
99+
if not hasattr(parser, 'grammar') and parser.options.cache:
100+
raise ConfigurationError('Unanalyzed grammar not available from cached parser, use cache_grammar=True')
101+
self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())
94102

95103
self.rules_for_root = defaultdict(list)
96104

0 commit comments

Comments (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy