Commit 1640da5

Merge pull request #1452 from lark-parser/textslice

Added TextSlice; Lark can now parse/lex a text-slice

2 parents: 9a12577 + c3893d8
9 files changed (+254, -60 lines)

docs/classes.rst
Lines changed: 5 additions & 0 deletions

@@ -96,3 +96,8 @@ Indenter
 
 .. autoclass:: lark.indenter.Indenter
 .. autoclass:: lark.indenter.PythonIndenter
+
+TextSlice
+---------
+
+.. autoclass:: lark.utils.TextSlice

lark/__init__.py
Lines changed: 2 additions & 1 deletion

@@ -11,7 +11,7 @@
 from .lark import Lark
 from .lexer import Token
 from .tree import ParseTree, Tree
-from .utils import logger
+from .utils import logger, TextSlice
 from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args
 
 __version__: str = "1.2.2"

@@ -33,6 +33,7 @@
     "Discard",
     "Transformer",
     "Transformer_NonRecursive",
+    "TextSlice",
     "Visitor",
     "v_args",
 )
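For reference, a minimal sketch of the newly exported type, with the constructor and attributes inferred from their usage elsewhere in this diff (TextSlice(text, start, end); .text/.start/.end; TextSlice.cast_from) — treat the exact signature as an assumption, not documentation:

    from lark import TextSlice

    # A view over characters 7..14 of the string, without copying it:
    s = TextSlice("HEADER payload FOOTER", 7, 14)
    assert s.text[s.start:s.end] == "payload"

    # cast_from (as used by LexerThread.from_text below) wraps a plain
    # str/bytes as a slice covering the whole text, and passes an
    # existing TextSlice through unchanged:
    whole = TextSlice.cast_from("payload")
    assert whole.start == 0 and whole.end == len("payload")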

lark/lark.py
Lines changed: 10 additions & 8 deletions

@@ -16,7 +16,7 @@
 from .parser_frontends import ParsingFrontend
 
 from .exceptions import ConfigurationError, assert_config, UnexpectedInput
-from .utils import Serialize, SerializeMemoizer, FS, logger
+from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice
 from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
 from .tree import Tree
 from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType

@@ -598,7 +598,7 @@ def __repr__(self):
         return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)
 
 
-    def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
+    def lex(self, text: TextOrSlice, dont_ignore: bool=False) -> Iterator[Token]:
         """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'
 
         When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.

@@ -620,11 +620,11 @@ def get_terminal(self, name: str) -> TerminalDef:
         """Get information about a terminal"""
         return self._terminals_dict[name]
 
-    def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser':
-        """Start an interactive parsing session.
+    def parse_interactive(self, text: Optional[TextOrSlice]=None, start: Optional[str]=None) -> 'InteractiveParser':
+        """Start an interactive parsing session. Only works when parser='lalr'.
 
         Parameters:
-            text (str, optional): Text to be parsed. Required for ``resume_parse()``.
+            text (TextOrSlice, optional): Text to be parsed. Required for ``resume_parse()``.
             start (str, optional): Start symbol
 
         Returns:

@@ -634,13 +634,15 @@ def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None)
         """
         return self.parser.parse_interactive(text, start=start)
 
-    def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
+    def parse(self, text: TextOrSlice, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
         """Parse the given text, according to the options provided.
 
         Parameters:
-            text (str): Text to be parsed.
+            text (TextOrSlice): Text to be parsed, as `str` or `bytes`.
+                TextSlice may also be used, but only when lexer='basic' or 'contextual'.
             start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
-            on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing.
+            on_error (function, optional): if provided, will be called on UnexpectedInput error,
+                with the exception as its argument. Return true to resume parsing, or false to raise the exception.
                 LALR only. See examples/advanced/error_handling.py for an example of how to use on_error.
 
         Returns:
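A hedged usage sketch of the new parse() signature — the grammar and offsets here are invented for illustration. Per the docstring above, a TextSlice argument requires lexer='basic' or 'contextual':

    from lark import Lark, TextSlice

    parser = Lark(r"""
        start: WORD+
        WORD: /\w+/
        %ignore /\s+/
    """, parser='lalr', lexer='contextual')

    doc = "<<header>> hello world <<footer>>"
    # Lex/parse only doc[11:22] ("hello world") without copying it.
    # Token line/column info stays relative to the full document, because
    # LexerState advances its line counter over text[0:start] (see
    # lark/lexer.py below).
    tree = parser.parse(TextSlice(doc, 11, 22))
    print(tree.pretty())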

lark/lexer.py
Lines changed: 48 additions & 29 deletions

@@ -17,7 +17,7 @@
 from .common import LexerConf
 from .parsers.lalr_parser_state import ParserState
 
-from .utils import classify, get_regexp_width, Serialize, logger
+from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
 from .grammar import TOKEN_DEFAULT_PRIORITY
 

@@ -289,7 +289,7 @@ def __eq__(self, other):
 
         return self.char_pos == other.char_pos and self.newline_char == other.newline_char
 
-    def feed(self, token: Token, test_newline=True):
+    def feed(self, token: TextOrSlice, test_newline=True):
         """Consume a token and calculate the new line & column.
 
         As an optional optimization, set test_newline=False if token doesn't contain a newline.

@@ -305,13 +305,13 @@ def feed(self, token: Token, test_newline=True):
 
 
 class UnlessCallback:
-    def __init__(self, scanner):
+    def __init__(self, scanner: 'Scanner'):
         self.scanner = scanner
 
-    def __call__(self, t):
-        res = self.scanner.match(t.value, 0)
-        if res:
-            _value, t.type = res
+    def __call__(self, t: Token):
+        res = self.scanner.fullmatch(t.value)
+        if res is not None:
+            t.type = res
         return t
 
 

@@ -347,19 +347,18 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
+            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, use_bytes=use_bytes))
 
     new_terminals = [t for t in terminals if t not in embedded_strs]
     return new_terminals, callback
 
 
 class Scanner:
-    def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+    def __init__(self, terminals, g_regex_flags, re_, use_bytes):
         self.terminals = terminals
         self.g_regex_flags = g_regex_flags
         self.re_ = re_
         self.use_bytes = use_bytes
-        self.match_whole = match_whole
 
         self.allowed_types = {t.name for t in self.terminals}
 

@@ -369,10 +368,9 @@ def _build_mres(self, terminals, max_size):
         # Python sets an unreasonable group limit (currently 100) in its re module
         # Worse, the only way to know we reached it is by catching an AssertionError!
         # This function recursively tries less and less groups until it's successful.
-        postfix = '$' if self.match_whole else ''
         mres = []
         while terminals:
-            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()) for t in terminals[:max_size])
             if self.use_bytes:
                 pattern = pattern.encode('latin-1')
             try:

@@ -384,13 +382,20 @@ def _build_mres(self, terminals, max_size):
             terminals = terminals[max_size:]
         return mres
 
-    def match(self, text, pos):
+    def match(self, text: TextSlice, pos):
         for mre in self._mres:
-            m = mre.match(text, pos)
+            m = mre.match(text.text, pos, text.end)
             if m:
                 return m.group(0), m.lastgroup
 
 
+    def fullmatch(self, text: str) -> Optional[str]:
+        for mre in self._mres:
+            m = mre.fullmatch(text)
+            if m:
+                return m.lastgroup
+        return None
+
 def _regexp_has_newline(r: str):
     r"""Expressions that may indicate newlines in a regexp:
         - newlines (\n)

@@ -409,20 +414,31 @@ class LexerState:
 
     __slots__ = 'text', 'line_ctr', 'last_token'
 
-    text: str
+    text: TextSlice
     line_ctr: LineCounter
     last_token: Optional[Token]
 
-    def __init__(self, text: str, line_ctr: Optional[LineCounter]=None, last_token: Optional[Token]=None):
+    def __init__(self, text: TextSlice, line_ctr: Optional[LineCounter] = None, last_token: Optional[Token]=None):
+        if line_ctr is None:
+            line_ctr = LineCounter(b'\n' if isinstance(text.text, bytes) else '\n')
+
+            if text.start > 0:
+                # Advance the line-count until line_ctr.char_pos == text.start
+                line_ctr.feed(TextSlice(text.text, 0, text.start))
+
+        if not (text.start <= line_ctr.char_pos <= text.end):
+            raise ValueError("LineCounter.char_pos is out of bounds")
+
         self.text = text
-        self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n')
+        self.line_ctr = line_ctr
         self.last_token = last_token
 
+
     def __eq__(self, other):
         if not isinstance(other, LexerState):
             return NotImplemented
 
-        return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
+        return self.text == other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
 
     def __copy__(self):
         return type(self)(self.text, copy(self.line_ctr), self.last_token)

@@ -432,15 +448,18 @@ class LexerThread:
     """A thread that ties a lexer instance and a lexer state, to be used by the parser
     """
 
-    def __init__(self, lexer: 'Lexer', lexer_state: LexerState):
+    def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState]):
         self.lexer = lexer
         self.state = lexer_state
 
     @classmethod
-    def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread':
+    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice) -> 'LexerThread':
+        text = TextSlice.cast_from(text_or_slice)
         return cls(lexer, LexerState(text))
 
     def lex(self, parser_state):
+        if self.state is None:
+            raise TypeError("Cannot lex: No text assigned to lexer state")
         return self.lexer.lex(self.state, parser_state)
 
     def __copy__(self):

@@ -461,9 +480,9 @@ class Lexer(ABC):
     def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
         return NotImplemented
 
-    def make_lexer_state(self, text):
+    def make_lexer_state(self, text: str):
         "Deprecated"
-        return LexerState(text)
+        return LexerState(TextSlice.cast_from(text))
 
 

@@ -563,9 +582,9 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None:
         self.use_bytes = conf.use_bytes
         self.terminals_by_name = conf.terminals_by_name
 
-        self._scanner = None
+        self._scanner: Optional[Scanner] = None
 
-    def _build_scanner(self):
+    def _build_scanner(self) -> Scanner:
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
         assert all(self.callback.values())
 

@@ -576,26 +595,26 @@ def _build_scanner(self):
             else:
                 self.callback[type_] = f
 
-        self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
+        return Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
 
     @property
-    def scanner(self):
+    def scanner(self) -> Scanner:
         if self._scanner is None:
-            self._build_scanner()
+            self._scanner = self._build_scanner()
         return self._scanner
 
     def match(self, text, pos):
         return self.scanner.match(text, pos)
 
     def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
         line_ctr = lex_state.line_ctr
-        while line_ctr.char_pos < len(lex_state.text):
+        while line_ctr.char_pos < lex_state.text.end:
             res = self.match(lex_state.text, line_ctr.char_pos)
             if not res:
                 allowed = self.scanner.allowed_types - self.ignore_types
                 if not allowed:
                     allowed = {"<END-OF-FILE>"}
-                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
+                raise UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                            allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
                                            state=parser_state, terminals_by_name=self.terminals_by_name)
 
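A side note on the Scanner change above (an illustration in plain Python re, not Lark code): anchoring a pattern with '$' — the old match_whole behavior — also matches just before a trailing newline, while fullmatch must consume the entire string:

    import re

    print(re.match(r'abc$', 'abc\n'))     # matches: '$' tolerates the trailing newline
    print(re.fullmatch(r'abc', 'abc\n'))  # None: the whole string must match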

lark/parser_frontends.py
Lines changed: 31 additions & 12 deletions

@@ -1,7 +1,7 @@
 from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
 
 from .exceptions import ConfigurationError, GrammarError, assert_config
-from .utils import get_regexp_width, Serialize
+from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice
 from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser

@@ -15,16 +15,31 @@
 ###{standalone
 
 def _wrap_lexer(lexer_class):
-    future_interface = getattr(lexer_class, '__future_interface__', False)
-    if future_interface:
+    future_interface = getattr(lexer_class, '__future_interface__', 0)
+    if future_interface == 2:
         return lexer_class
-    else:
-        class CustomLexerWrapper(Lexer):
+    elif future_interface == 1:
+        class CustomLexerWrapper1(Lexer):
+            def __init__(self, lexer_conf):
+                self.lexer = lexer_class(lexer_conf)
+            def lex(self, lexer_state, parser_state):
+                if not lexer_state.text.is_complete_text():
+                    raise TypeError("Interface=1 Custom Lexer don't support TextSlice")
+                lexer_state.text = lexer_state.text
+                return self.lexer.lex(lexer_state, parser_state)
+        return CustomLexerWrapper1
+    elif future_interface == 0:
+        class CustomLexerWrapper0(Lexer):
             def __init__(self, lexer_conf):
                 self.lexer = lexer_class(lexer_conf)
+
             def lex(self, lexer_state, parser_state):
-                return self.lexer.lex(lexer_state.text)
-        return CustomLexerWrapper
+                if not lexer_state.text.is_complete_text():
+                    raise TypeError("Interface=0 Custom Lexer don't support TextSlice")
+                return self.lexer.lex(lexer_state.text.text)
+        return CustomLexerWrapper0
+    else:
+        raise ValueError(f"Unknown __future_interface__ value {future_interface}, integer 0-2 expected")

@@ -93,23 +108,27 @@ def _verify_start(self, start=None):
             raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
         return start
 
-    def _make_lexer_thread(self, text: str) -> Union[str, LexerThread]:
+    def _make_lexer_thread(self, text: Optional[TextOrSlice]) -> Union[TextOrSlice, LexerThread, None]:
         cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
-        return text if self.skip_lexer else cls.from_text(self.lexer, text)
+        return text if self.skip_lexer else cls(self.lexer, None) if text is None else cls.from_text(self.lexer, text)
+
+    def parse(self, text: Optional[TextOrSlice], start=None, on_error=None):
+        if self.lexer_conf.lexer_type in ("dynamic", "dynamic_complete"):
+            if isinstance(text, TextSlice) and not text.is_complete_text():
+                raise TypeError(f"Lexer {self.lexer_conf.lexer_type} does not support text slices.")
 
-    def parse(self, text: str, start=None, on_error=None):
         chosen_start = self._verify_start(start)
         kw = {} if on_error is None else {'on_error': on_error}
         stream = self._make_lexer_thread(text)
         return self.parser.parse(stream, chosen_start, **kw)
 
-    def parse_interactive(self, text: Optional[str]=None, start=None):
+    def parse_interactive(self, text: Optional[TextOrSlice]=None, start=None):
         # TODO BREAK - Change text from Optional[str] to text: str = ''.
         # Would break behavior of exhaust_lexer(), which currently raises TypeError, and after the change would just return []
         chosen_start = self._verify_start(start)
         if self.parser_conf.parser_type != 'lalr':
             raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
-        stream = self._make_lexer_thread(text)  # type: ignore[arg-type]
+        stream = self._make_lexer_thread(text)
         return self.parser.parse_interactive(stream, chosen_start)
 
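A sketch of what the __future_interface__ versioning in _wrap_lexer above means for custom lexers — the lexer and grammar here are invented for illustration. Interface 2 receives a LexerState whose .text is a TextSlice; interfaces 1 and 0 keep the older str-based contracts, and the wrappers reject slices for them:

    from lark import Lark, Token

    class WordLexer:
        __future_interface__ = 2  # opt in to the TextSlice-aware interface

        def __init__(self, lexer_conf):
            pass

        def lex(self, lexer_state, parser_state):
            ts = lexer_state.text  # a TextSlice
            for word in ts.text[ts.start:ts.end].split():
                yield Token('WORD', word)

    parser = Lark("start: WORD+\n%declare WORD", parser='lalr', lexer=WordLexer)
    print(parser.parse("hello world"))
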
lark/parsers/lalr_parser.py
Lines changed: 1 addition & 1 deletion

@@ -55,7 +55,7 @@ def parse(self, lexer, start, on_error=None):
                 if isinstance(e, UnexpectedCharacters):
                     # If user didn't change the character position, then we should
                     if p == s.line_ctr.char_pos:
-                        s.line_ctr.feed(s.text[p:p+1])
+                        s.line_ctr.feed(s.text.text[p:p+1])
 
                 try:
                     return e.interactive_parser.resume_parse()
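This one-line change keeps LALR error recovery working now that the lexer state holds a TextSlice: when an on_error callback leaves the character position unchanged, the parser feeds the offending character (read from the slice's underlying text) and resumes. A hedged usage sketch, loosely following examples/advanced/error_handling.py (grammar and input invented):

    from lark import Lark
    from lark.exceptions import UnexpectedInput

    parser = Lark('start: "a"+', parser='lalr')

    def skip_bad_input(e: UnexpectedInput) -> bool:
        return True  # resume parsing past the unexpected character

    # The stray 'X' triggers UnexpectedCharacters; the handler above
    # tells the parser to skip it and continue.
    print(parser.parse("aaXaa", on_error=skip_bad_input))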
