Skip to content

Commit 5473e1a

Browse files
committed
Remove the charlen field from strings, calculating it when required
1 parent 5c1658e commit 5473e1a

File tree

5 files changed

+15
-59
lines changed

5 files changed

+15
-59
lines changed

py/makeqstrdata.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,7 @@ def do_work(infiles):
5959
for order, ident, qstr in sorted(qstrs.values(), key=lambda x: x[0]):
6060
qhash = compute_hash(qstr)
6161
qlen = len(qstr)
62-
qchlen = len(qstr.decode("utf-8"))
63-
print('Q({}, (const byte*)"\\x{:02x}\\x{:02x}\\x{:02x}\\x{:02x}\\x{:02x}\\x{:02x}" "{}")'.format(ident, qhash & 0xff, (qhash >> 8) & 0xff, qlen & 0xff, (qlen >> 8) & 0xff, qchlen & 0xff, (qchlen >> 8) & 0xff, qstr))
62+
print('Q({}, (const byte*)"\\x{:02x}\\x{:02x}\\x{:02x}\\x{:02x}" "{}")'.format(ident, qhash & 0xff, (qhash >> 8) & 0xff, qlen & 0xff, (qlen >> 8) & 0xff, qstr))
6463

6564
return True
6665

py/objstr.c

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,6 @@ const mp_obj_t mp_const_empty_bytes;
5252
// use this macro to extract the string data and length
5353
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; }
5454

55-
// use this macro to extract the string data and both lengths
56-
#define GET_STR_INFO(str_obj_in, str_data, str_len, str_charlen) const byte *str_data; uint str_len, str_charlen; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); str_charlen = qstr_charlen(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_charlen = ((mp_obj_str_t*)str_obj_in)->charlen; str_data = ((mp_obj_str_t*)str_obj_in)->data; }
57-
5855
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
5956
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
6057
STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in);
@@ -365,7 +362,7 @@ STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
365362

366363
STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
367364
mp_obj_type_t *type = mp_obj_get_type(self_in);
368-
GET_STR_INFO(self_in, self_data, self_len, self_charlen);
365+
GET_STR_DATA_LEN(self_in, self_data, self_len);
369366
if (value == MP_OBJ_SENTINEL) {
370367
// load
371368
#if MICROPY_PY_BUILTINS_SLICE
@@ -378,7 +375,8 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
378375
return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start);
379376
}
380377
#endif
381-
uint index_val = mp_get_index(type, self_charlen, index, false);
378+
// TODO: Don't use mp_get_index() here
379+
uint index_val = mp_get_index(type, unichar_charlen((const char *)self_data, self_len), index, false);
382380
if (type == &mp_type_bytes) {
383381
return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]);
384382
} else {
@@ -1734,7 +1732,7 @@ const mp_obj_type_t mp_type_bytes = {
17341732
};
17351733

17361734
// the zero-length bytes
1737-
STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, 0, NULL};
1735+
STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, NULL};
17381736
const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;
17391737

17401738
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
@@ -1761,20 +1759,6 @@ mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uin
17611759
o->base.type = type;
17621760
o->len = len;
17631761
if (data) {
1764-
if (MP_OBJ_IS_STR(o)) {
1765-
// Count non-continuation bytes so we know how long the string is in characters.
1766-
const byte *endptr, *top = data + len;
1767-
uint charlen = 0;
1768-
for (endptr = data; endptr < top; ++endptr) {
1769-
if (!UTF8_IS_CONT(*endptr)) {
1770-
++charlen;
1771-
}
1772-
}
1773-
o->charlen = charlen;
1774-
} else {
1775-
// For byte strings, the 'character' length (really the "exposed length" or "Python length") equals the byte length.
1776-
o->charlen = len;
1777-
}
17781762
o->hash = qstr_compute_hash(data, len);
17791763
byte *p = m_new(byte, len + 1);
17801764
o->data = p;
@@ -1844,8 +1828,8 @@ uint mp_obj_str_get_hash(mp_obj_t self_in) {
18441828
uint mp_obj_str_get_len(mp_obj_t self_in) {
18451829
// TODO This has a double check for the type, one in obj.c and one here
18461830
if (MP_OBJ_IS_STR(self_in) || MP_OBJ_IS_TYPE(self_in, &mp_type_bytes)) {
1847-
GET_STR_INFO(self_in, self_data, self_len, self_charlen); (void)self_data;
1848-
return self_charlen;
1831+
GET_STR_DATA_LEN(self_in, self_data, self_len);
1832+
return unichar_charlen((const char *)self_data, self_len);
18491833
} else {
18501834
bad_implicit_conversion(self_in);
18511835
}

py/objstr.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,10 @@ typedef struct _mp_obj_str_t {
3030
machine_uint_t hash : 16;
3131
// len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
3232
machine_uint_t len : 16;
33-
// charlen == number of characters in the string - charlen <= len - 1, and is the value returned by len() in Python
34-
machine_uint_t charlen : 16;
3533
const void *data; //Character data is encoded UTF-8 and should not be blindly indexed.
3634
} mp_obj_str_t;
3735

38-
// This is valid ONLY for pure-ASCII strings!
39-
#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, sizeof(str) - 1, (const byte*)str};
36+
#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, (const byte*)str};
4037

4138
mp_obj_t mp_obj_str_format(uint n_args, const mp_obj_t *args);
4239
mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uint len);

py/qstr.c

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -46,15 +46,13 @@
4646
// For now we use very simple encoding, just to get the framework correct:
4747
// - hash is 2 bytes (see function below)
4848
// - length is 2 bytes
49-
// - character length is 2 bytes
5049
// - data follows
5150
// - \0 terminated (for now, so they can be printed using printf)
5251

5352
#define Q_GET_HASH(q) ((q)[0] | ((q)[1] << 8))
54-
#define Q_GET_ALLOC(q) (6 + Q_GET_LENGTH(q) + 1)
53+
#define Q_GET_ALLOC(q) (4 + Q_GET_LENGTH(q) + 1)
5554
#define Q_GET_LENGTH(q) ((q)[2] | ((q)[3] << 8))
56-
#define Q_GET_CHARLEN(q) ((q)[4] | ((q)[5] << 8))
57-
#define Q_GET_DATA(q) ((q) + 6)
55+
#define Q_GET_DATA(q) ((q) + 4)
5856

5957
// this must match the equivalent function in makeqstrdata.py
6058
// Note that this hashes the UTF-8 encoded data bytes.
@@ -158,29 +156,21 @@ qstr qstr_from_strn(const char *str, uint len) {
158156
qstr q = qstr_find_strn(str, len);
159157
if (q == 0) {
160158
machine_uint_t hash = qstr_compute_hash((const byte*)str, len);
161-
byte *q_ptr = m_new(byte, 6 + len + 1);
162-
uint charlen = 0;
163-
for (const char *s = str; s < str + len; ++s) {
164-
if (!UTF8_IS_CONT(*s)) {
165-
++charlen;
166-
}
167-
}
159+
byte *q_ptr = m_new(byte, 4 + len + 1);
168160
q_ptr[0] = hash;
169161
q_ptr[1] = hash >> 8;
170162
q_ptr[2] = len;
171163
q_ptr[3] = len >> 8;
172-
q_ptr[4] = charlen;
173-
q_ptr[5] = charlen >> 8;
174-
memcpy(q_ptr + 6, str, len);
175-
q_ptr[6 + len] = '\0';
164+
memcpy(q_ptr + 4, str, len);
165+
q_ptr[4 + len] = '\0';
176166
q = qstr_add(q_ptr);
177167
}
178168
return q;
179169
}
180170

181171
byte *qstr_build_start(uint len, byte **q_ptr) {
182172
assert(len <= 65535);
183-
*q_ptr = m_new(byte, 7 + len + 1);
173+
*q_ptr = m_new(byte, 4 + len + 1);
184174
(*q_ptr)[2] = len;
185175
(*q_ptr)[3] = len >> 8;
186176
return Q_GET_DATA(*q_ptr);
@@ -194,15 +184,7 @@ qstr qstr_build_end(byte *q_ptr) {
194184
machine_uint_t hash = qstr_compute_hash(str, len);
195185
q_ptr[0] = hash;
196186
q_ptr[1] = hash >> 8;
197-
uint charlen = 0;
198-
for (const byte *s = str; s < str + len; ++s) {
199-
if (!UTF8_IS_CONT(*s)) {
200-
++charlen;
201-
}
202-
}
203-
q_ptr[4] = charlen;
204-
q_ptr[5] = charlen >> 8;
205-
q_ptr[6 + len] = '\0';
187+
q_ptr[4 + len] = '\0';
206188
q = qstr_add(q_ptr);
207189
} else {
208190
m_del(byte, q_ptr, Q_GET_ALLOC(q_ptr));
@@ -219,11 +201,6 @@ uint qstr_len(qstr q) {
219201
return Q_GET_LENGTH(qd);
220202
}
221203

222-
uint qstr_charlen(qstr q) {
223-
const byte *qd = find_qstr(q);
224-
return Q_GET_CHARLEN(qd);
225-
}
226-
227204
// XXX to remove!
228205
const char *qstr_str(qstr q) {
229206
const byte *qd = find_qstr(q);

py/qstr.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ qstr qstr_build_end(byte *q_ptr);
5959
machine_uint_t qstr_hash(qstr q);
6060
const char* qstr_str(qstr q);
6161
uint qstr_len(qstr q);
62-
uint qstr_charlen(qstr q);
6362
const byte* qstr_data(qstr q, uint *len);
6463

6564
void qstr_pool_info(uint *n_pool, uint *n_qstr, uint *n_str_data_bytes, uint *n_total_bytes);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy