@@ -360,8 +360,9 @@ STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
360
360
return MP_OBJ_NULL ; // op not supported
361
361
}
362
362
363
- // Convert an index into a pointer to its lead byte, or raise IndexError if out of bounds
364
- STATIC const char * str_index_to_ptr (const char * self_data , uint self_len , mp_obj_t index ) {
363
+ // Convert an index into a pointer to its lead byte. An out-of-bounds index raises IndexError when
364
+ // is_slice is false, or is capped to the first/last character of the string when is_slice is true.
365
+ STATIC const char * str_index_to_ptr (const char * self_data , uint self_len , mp_obj_t index , bool is_slice ) {
365
366
machine_int_t i ;
366
367
// Copied from mp_get_index; I don't want bounds checking, just give me
367
368
// the integer as-is. (I can't bounds-check without scanning the whole
@@ -377,29 +378,37 @@ STATIC const char *str_index_to_ptr(const char *self_data, uint self_len, mp_obj
377
378
// Negative indexing is performed by counting from the end of the string.
378
379
for (s = top - 1 ; i ; -- s ) {
379
380
if (s < self_data ) {
381
+ if (is_slice ) {
382
+ return self_data ;
383
+ }
380
384
nlr_raise (mp_obj_new_exception_msg_varg (& mp_type_IndexError , "string index out of range" ));
381
385
}
382
386
if (!UTF8_IS_CONT (* s )) {
383
387
++ i ;
384
388
}
385
389
}
386
390
++ s ;
391
+ } else if (!i ) {
392
+ return self_data ; // Shortcut - str[0] is its base pointer
387
393
} else {
388
394
// Positive indexing, correspondingly, counts from the start of the string.
389
395
// It's assumed that negative indexing will generally be used with small
390
396
// absolute values (eg str[-1], not str[-1000000]), which means it'll be
391
397
// more efficient this way.
392
- for (s = self_data ; i ; ++ s ) {
398
+ for (s = self_data ; true ; ++ s ) {
393
399
if (s >= top ) {
400
+ if (is_slice ) {
401
+ while (UTF8_IS_CONT (* -- s ));
402
+ return s ;
403
+ }
394
404
nlr_raise (mp_obj_new_exception_msg_varg (& mp_type_IndexError , "string index out of range" ));
395
405
}
396
- if (!UTF8_IS_CONT (* s )) {
397
- -- i ;
406
+ while (UTF8_IS_CONT (* s )) {
407
+ ++ s ;
408
+ }
409
+ if (!i -- ) {
410
+ return s ;
398
411
}
399
- }
400
- // Skip continuation bytes after the last lead byte
401
- while (UTF8_IS_CONT (* s )) {
402
- ++ s ;
403
412
}
404
413
}
405
414
return s ;
@@ -424,7 +433,7 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
424
433
uint index_val = mp_get_index (type , self_len , index , false);
425
434
return MP_OBJ_NEW_SMALL_INT ((mp_small_int_t )self_data [index_val ]);
426
435
}
427
- const char * s = str_index_to_ptr ((const char * )self_data , self_len , index );
436
+ const char * s = str_index_to_ptr ((const char * )self_data , self_len , index , false );
428
437
int len = 1 ;
429
438
if (UTF8_IS_NONASCII (* s )) {
430
439
// Count the number of 1 bits (after the first)
0 commit comments