from .common import LexerConf
from .parsers.lalr_parser_state import ParserState

-from .utils import classify, get_regexp_width, Serialize, logger
+from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
from .grammar import TOKEN_DEFAULT_PRIORITY

@@ -289,7 +289,7 @@ def __eq__(self, other):
        return self.char_pos == other.char_pos and self.newline_char == other.newline_char

-    def feed(self, token: Token, test_newline=True):
+    def feed(self, token: TextOrSlice, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if token doesn't contain a newline.
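
A quick sketch of the contract, assuming the standard LineCounter semantics defined earlier in this file (1-based line/column, column measured from the last newline):

    lc = LineCounter('\n')
    lc.feed("ab\ncd")          # one newline consumed
    assert (lc.line, lc.column, lc.char_pos) == (2, 3, 5)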
@@ -305,13 +305,13 @@ def feed(self, token: Token, test_newline=True):
class UnlessCallback:
-    def __init__(self, scanner):
+    def __init__(self, scanner: 'Scanner'):
        self.scanner = scanner

-    def __call__(self, t):
-        res = self.scanner.match(t.value, 0)
-        if res:
-            _value, t.type = res
+    def __call__(self, t: Token):
+        res = self.scanner.fullmatch(t.value)
+        if res is not None:
+            t.type = res
        return t
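
The switch from a '$'-postfixed match to fullmatch is not purely cosmetic: in Python's re module, '$' also matches just before a trailing newline, while fullmatch must consume the entire string. A minimal illustration:

    import re
    assert re.match(r'abc$', 'abc\n')               # '$' tolerates a trailing newline
    assert re.fullmatch(r'abc', 'abc\n') is None    # fullmatch does not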
@@ -347,19 +347,18 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
-            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
+            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, use_bytes=use_bytes))

    new_terminals = [t for t in terminals if t not in embedded_strs]
    return new_terminals, callback


class Scanner:
-    def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+    def __init__(self, terminals, g_regex_flags, re_, use_bytes):
        self.terminals = terminals
        self.g_regex_flags = g_regex_flags
        self.re_ = re_
        self.use_bytes = use_bytes
-        self.match_whole = match_whole

        self.allowed_types = {t.name for t in self.terminals}
@@ -369,10 +368,9 @@ def _build_mres(self, terminals, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries fewer and fewer groups until it's successful.
-        postfix = '$' if self.match_whole else ''
        mres = []
        while terminals:
-            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()) for t in terminals[:max_size])
            if self.use_bytes:
                pattern = pattern.encode('latin-1')
            try:
@@ -384,13 +382,20 @@ def _build_mres(self, terminals, max_size):
384
382
terminals = terminals [max_size :]
385
383
return mres
386
384
387
- def match (self , text , pos ):
385
+ def match (self , text : TextSlice , pos ):
388
386
for mre in self ._mres :
389
- m = mre .match (text , pos )
387
+ m = mre .match (text . text , pos , text . end )
390
388
if m :
391
389
return m .group (0 ), m .lastgroup
392
390
393
391
392
+ def fullmatch (self , text : str ) -> Optional [str ]:
393
+ for mre in self ._mres :
394
+ m = mre .fullmatch (text )
395
+ if m :
396
+ return m .lastgroup
397
+ return None
398
+
394
399
def _regexp_has_newline (r : str ):
395
400
r"""Expressions that may indicate newlines in a regexp:
396
401
- newlines (\n)
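
Scanner.match now leans on re.Pattern.match's optional pos/endpos arguments, which bound the search to a window of the underlying string without copying it. A sketch, assuming TextSlice (from .utils) is essentially a text/start/end record:

    import re
    from dataclasses import dataclass

    @dataclass
    class TextSlice:          # assumed shape of the helper from .utils
        text: str
        start: int
        end: int

    mre = re.compile(r'(?P<WORD>\w+)')
    s = TextSlice("hello world", 6, 11)
    m = mre.match(s.text, s.start, s.end)    # matches only inside [start, end)
    assert m.group(0) == "world" and m.lastgroup == "WORD"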
@@ -409,20 +414,31 @@ class LexerState:
    __slots__ = 'text', 'line_ctr', 'last_token'

-    text: str
+    text: TextSlice
    line_ctr: LineCounter
    last_token: Optional[Token]

-    def __init__(self, text: str, line_ctr: Optional[LineCounter]=None, last_token: Optional[Token]=None):
+    def __init__(self, text: TextSlice, line_ctr: Optional[LineCounter] = None, last_token: Optional[Token]=None):
+        if line_ctr is None:
+            line_ctr = LineCounter(b'\n' if isinstance(text.text, bytes) else '\n')
+
+        if text.start > 0:
+            # Advance the line-count until line_ctr.char_pos == text.start
+            line_ctr.feed(TextSlice(text.text, 0, text.start))
+
+        if not (text.start <= line_ctr.char_pos <= text.end):
+            raise ValueError("LineCounter.char_pos is out of bounds")
+
        self.text = text
-        self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n')
+        self.line_ctr = line_ctr
        self.last_token = last_token

+
    def __eq__(self, other):
        if not isinstance(other, LexerState):
            return NotImplemented

-        return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
+        return self.text == other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token

    def __copy__(self):
        return type(self)(self.text, copy(self.line_ctr), self.last_token)
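
The pre-feeding step above is what keeps positions absolute when lexing starts mid-buffer: the prefix before text.start is run through the LineCounter once, so tokens from the slice report line/column relative to the whole document. A sketch, assuming TextSlice forwards count/rindex/len to its window as LineCounter.feed requires:

    buf = "line one\nline two"
    state = LexerState(TextSlice(buf, 9, len(buf)))   # lex only "line two"
    assert (state.line_ctr.line, state.line_ctr.column) == (2, 1)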
@@ -432,15 +448,18 @@ class LexerThread:
    """A thread that ties a lexer instance and a lexer state, to be used by the parser
    """

-    def __init__(self, lexer: 'Lexer', lexer_state: LexerState):
+    def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState]):
        self.lexer = lexer
        self.state = lexer_state

    @classmethod
-    def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread':
+    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice) -> 'LexerThread':
+        text = TextSlice.cast_from(text_or_slice)
        return cls(lexer, LexerState(text))

    def lex(self, parser_state):
+        if self.state is None:
+            raise TypeError("Cannot lex: No text assigned to lexer state")
        return self.lexer.lex(self.state, parser_state)

    def __copy__(self):
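
With cast_from accepting either form, callers don't need to care about slices. A hypothetical usage (the lexer object stands in for any concrete Lexer instance):

    thread_a = LexerThread.from_text(lexer, "a + b")                        # whole string
    thread_b = LexerThread.from_text(lexer, TextSlice("x = a + b", 4, 9))   # window only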
@@ -461,9 +480,9 @@ class Lexer(ABC):
    def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
        return NotImplemented

-    def make_lexer_state(self, text):
+    def make_lexer_state(self, text: str):
        "Deprecated"
-        return LexerState(text)
+        return LexerState(TextSlice.cast_from(text))


def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8):
@@ -563,9 +582,9 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None:
        self.use_bytes = conf.use_bytes
        self.terminals_by_name = conf.terminals_by_name

-        self._scanner = None
+        self._scanner: Optional[Scanner] = None

-    def _build_scanner(self):
+    def _build_scanner(self) -> Scanner:
        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
        assert all(self.callback.values())
@@ -576,26 +595,26 @@ def _build_scanner(self):
            else:
                self.callback[type_] = f

-        self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
+        return Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)

    @property
-    def scanner(self):
+    def scanner(self) -> Scanner:
        if self._scanner is None:
-            self._build_scanner()
+            self._scanner = self._build_scanner()
        return self._scanner

    def match(self, text, pos):
        return self.scanner.match(text, pos)

    def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        line_ctr = lex_state.line_ctr
-        while line_ctr.char_pos < len(lex_state.text):
+        while line_ctr.char_pos < lex_state.text.end:
            res = self.match(lex_state.text, line_ctr.char_pos)
            if not res:
                allowed = self.scanner.allowed_types - self.ignore_types
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
-                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
+                raise UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
                                           state=parser_state, terminals_by_name=self.terminals_by_name)
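
Taken together, these changes let the whole pipeline operate on a window of a larger buffer. A hypothetical end-to-end use, assuming the public Lark.parse entry point forwards a TextSlice unchanged to LexerThread.from_text as wired above:

    from lark import Lark
    from lark.utils import TextSlice

    parser = Lark("start: WORD\n%import common.WORD", parser="lalr")
    buf = "### hello ###"
    tree = parser.parse(TextSlice(buf, 4, 9))   # lexes and parses only "hello"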