Skip to content

Commit fb00ddf

Browse files
authored
Serialize Lark.grammar (fixes issue #1472) (#1506)
* Serialize Lark.grammar
* Lark option: cache_grammar = False
* Add documentation and error message to Reconstructor
* Move parser.grammar check deeper into TreeMatcher
1 parent 87bb8ef commit fb00ddf

File tree

3 files changed

+34
-7
lines changed

3 files changed

+34
-7
lines changed

lark/lark.py

Lines changed: 19 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717

1818
from .exceptions import ConfigurationError, assert_config, UnexpectedInput
1919
from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice
20-
from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
20+
from .load_grammar import load_grammar, _deserialize_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
2121
from .tree import Tree
2222
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
2323

@@ -56,6 +56,7 @@ class LarkOptions(Serialize):
5656
propagate_positions: Union[bool, str]
5757
maybe_placeholders: bool
5858
cache: Union[bool, str]
59+
cache_grammar: bool
5960
regex: bool
6061
g_regex_flags: int
6162
keep_all_tokens: bool
@@ -99,6 +100,10 @@ class LarkOptions(Serialize):
99100
- When ``False``, does nothing (default)
100101
- When ``True``, caches to a temporary file in the local directory
101102
- When given a string, caches to the path pointed by the string
103+
cache_grammar
104+
For use with ``cache`` option. When ``True``, the unanalyzed grammar is also included in the cache.
105+
Useful for classes that require the ``Lark.grammar`` to be present (e.g. Reconstructor).
106+
(default= ``False``)
102107
regex
103108
When True, uses the ``regex`` module instead of the stdlib ``re``.
104109
g_regex_flags
@@ -165,6 +170,7 @@ class LarkOptions(Serialize):
165170
'keep_all_tokens': False,
166171
'tree_class': None,
167172
'cache': False,
173+
'cache_grammar': False,
168174
'postlex': None,
169175
'parser': 'earley',
170176
'lexer': 'auto',
@@ -211,6 +217,9 @@ def __init__(self, options_dict: Dict[str, Any]) -> None:
211217
raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
212218
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')
213219

220+
if self.cache_grammar and not self.cache:
221+
raise ConfigurationError('cache_grammar cannot be set when cache is disabled')
222+
214223
if o:
215224
raise ConfigurationError("Unknown options: %s" % o.keys())
216225

@@ -264,8 +273,12 @@ class Lark(Serialize):
264273
parser: 'ParsingFrontend'
265274
terminals: Collection[TerminalDef]
266275

276+
__serialize_fields__ = ['parser', 'rules', 'options']
277+
267278
def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
268279
self.options = LarkOptions(options)
280+
if self.options.cache_grammar:
281+
self.__serialize_fields__.append('grammar')
269282
re_module: types.ModuleType
270283

271284
# Set regex or re module
@@ -327,7 +340,9 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
327340
# specific reason - we just want a username.
328341
username = "unknown"
329342

330-
cache_fn = tempfile.gettempdir() + "/.lark_cache_%s_%s_%s_%s.tmp" % (username, cache_sha256, *sys.version_info[:2])
343+
344+
cache_fn = tempfile.gettempdir() + "/.lark_%s_%s_%s_%s_%s.tmp" % (
345+
"cache_grammar" if self.options.cache_grammar else "cache", username, cache_sha256, *sys.version_info[:2])
331346

332347
old_options = self.options
333348
try:
@@ -454,8 +469,6 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
454469
if __doc__:
455470
__doc__ += "\n\n" + LarkOptions.OPTIONS_DOC
456471

457-
__serialize_fields__ = 'parser', 'rules', 'options'
458-
459472
def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer:
460473
lexer_conf = self.lexer_conf
461474
if dont_ignore:
@@ -531,6 +544,8 @@ def _load(self: _T, f: Any, **kwargs) -> _T:
531544

532545
assert memo_json
533546
memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
547+
if 'grammar' in data:
548+
self.grammar = _deserialize_grammar(data['grammar'], memo)
534549
options = dict(data['options'])
535550
if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
536551
raise ConfigurationError("Some options are not allowed when loading a Parser: {}"

lark/load_grammar.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
from contextlib import suppress
1212
from typing import List, Tuple, Union, Callable, Dict, Optional, Sequence, Generator
1313

14-
from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet
14+
from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet, Serialize
1515
from .lexer import Token, TerminalDef, PatternStr, PatternRE, Pattern
1616

1717
from .parse_tree_builder import ParseTreeBuilder
@@ -676,7 +676,7 @@ def nr_deepcopy_tree(t):
676676
return Transformer_NonRecursive(False).transform(t)
677677

678678

679-
class Grammar:
679+
class Grammar(Serialize):
680680

681681
term_defs: List[Tuple[str, Tuple[Tree, int]]]
682682
rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]]
@@ -687,6 +687,8 @@ def __init__(self, rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions
687687
self.rule_defs = rule_defs
688688
self.ignore = ignore
689689

690+
__serialize_fields__ = 'term_defs', 'rule_defs', 'ignore'
691+
690692
def compile(self, start, terminals_to_keep) -> Tuple[List[TerminalDef], List[Rule], List[str]]:
691693
# We change the trees in-place (to support huge grammars)
692694
# So deepcopy allows calling compile more than once.
@@ -977,6 +979,8 @@ def _parse_grammar(text, name, start='start'):
977979

978980
return PrepareGrammar().transform(tree)
979981

982+
def _deserialize_grammar(data, memo) -> Grammar:
983+
return Grammar.deserialize(data, memo)
980984

981985
def _error_repr(error):
982986
if isinstance(error, UnexpectedToken):

lark/tree_matcher.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55

66
from . import Tree, Token
77
from .common import ParserConf
8+
from .exceptions import ConfigurationError
89
from .parsers import earley
910
from .grammar import Rule, Terminal, NonTerminal
1011

@@ -89,8 +90,15 @@ class TreeMatcher:
8990
def __init__(self, parser):
9091
# XXX TODO calling compile twice returns different results!
9192
assert not parser.options.maybe_placeholders
93+
9294
# XXX TODO: we just ignore the potential existence of a postlexer
93-
self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())
95+
if parser.options.postlex is None:
96+
self.tokens = parser.terminals.copy()
97+
rules = parser.rules.copy()
98+
else:
99+
if not hasattr(parser, 'grammar') and parser.options.cache:
100+
raise ConfigurationError('Unanalyzed grammar not available from cached parser, use cache_grammar=True')
101+
self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())
94102

95103
self.rules_for_root = defaultdict(list)
96104

0 commit comments

Comments (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy