From 9423603478f23e5f423771da405d70180a590855 Mon Sep 17 00:00:00 2001 From: Neradoc Date: Thu, 6 Mar 2025 03:27:51 +0100 Subject: [PATCH 1/4] fix reading strings that contain quotes or open/closed brackets --- adafruit_json_stream.py | 52 ++++++++++---- tests/test_json_stream.py | 142 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+), 14 deletions(-) diff --git a/adafruit_json_stream.py b/adafruit_json_stream.py index 0c08ae5..4b9da92 100644 --- a/adafruit_json_stream.py +++ b/adafruit_json_stream.py @@ -40,7 +40,9 @@ def read(self): self.i += 1 return char - def fast_forward(self, closer, *, return_object=False): + def fast_forward( + self, closer, *, return_object=False + ): # pylint: disable=too-many-branches """ Read through the stream until the character is ``closer``, ``]`` (ending a list) or ``}`` (ending an object.) Intermediate lists and @@ -62,6 +64,7 @@ def fast_forward(self, closer, *, return_object=False): # } = 125, { = 123 buffer[0] = closer - 2 + ignore_next = False while close_stack: char = self.read() count += 1 @@ -71,8 +74,14 @@ def fast_forward(self, closer, *, return_object=False): new_buffer[: len(buffer)] = buffer buffer = new_buffer buffer[count] = char - if char == close_stack[-1]: + if ignore_next: + # that character was escaped, skip it + ignore_next = False + elif char == close_stack[-1]: close_stack.pop() + elif char == ord("\\") and close_stack[-1] == ord('"'): + # if backslash, ignore the next character + ignore_next = True elif char == ord('"'): close_stack.append(ord('"')) elif close_stack[-1] == ord('"'): @@ -96,26 +105,41 @@ def next_value(self, endswith=None): if isinstance(endswith, str): endswith = ord(endswith) in_string = False + ignore_next = False while True: try: char = self.read() except EOFError: char = endswith - if not in_string and (char == endswith or char in (ord("]"), ord("}"))): - self.last_char = char - if len(buf) == 0: - return None - value_string = bytes(buf).decode("utf-8") - return json.loads(value_string) - if char == ord("{"): - return TransientObject(self) - if char == ord("["): - return TransientList(self) + in_string = False + ignore_next = False if not in_string: - in_string = char == ord('"') + # end character or object/list end + if char == endswith or char in (ord("]"), ord("}")): + self.last_char = char + if len(buf) == 0: + return None + value_string = bytes(buf).decode("utf-8") + return json.loads(value_string) + # string or sub object + if char == ord("{"): + return TransientObject(self) + if char == ord("["): + return TransientList(self) + # start a string + if char == ord('"'): + in_string = True else: - in_string = char != ord('"') + # skipping any closing or opening character if in a string + # also skipping escaped characters (like quotes in string) + if ignore_next: + ignore_next = False + elif char == ord("\\"): + ignore_next = True + elif char == ord('"'): + in_string = False + buf.append(char) diff --git a/tests/test_json_stream.py b/tests/test_json_stream.py index b8197fe..04f4faa 100644 --- a/tests/test_json_stream.py +++ b/tests/test_json_stream.py @@ -66,6 +66,38 @@ def dict_with_all_types(): """ +@pytest.fixture +def list_with_bad_strings(): + return r""" + [ + "\"}\"", + "{\"a\": 1, \"b\": [2,3]}", + "\"", + "\\\"", + "\\\\\"", + "\\x40\"", + "[[[{{{", + "]]]}}}" + ] + """ + + +@pytest.fixture +def dict_with_bad_strings(): + return r""" + { + "1": "\"}\"", + "2": "{\"a\": 1, \"b\": [2,3]}", + "3": "\"", + "4": "\\\"", + "5": "\\\\\"", + "6": "\\x40\"", + "7": "[[[{{{", + "8": "]]]}}}" + } + """ + + @pytest.fixture def list_with_values(): return """ @@ -308,6 +340,116 @@ def test_complex_dict(complex_dict): assert sub_counter == 12 +def test_bad_strings_in_list(list_with_bad_strings): + """Test loading different strings that can confuse the parser.""" + + bad_strings = [ + '"}"', + '{"a": 1, "b": [2,3]}', + '"', + '\\"', + '\\\\"', + '\\x40"', + "[[[{{{", + "]]]}}}", + ] + + assert json.loads(list_with_bad_strings) + + # get each separately + stream = adafruit_json_stream.load(BytesChunkIO(list_with_bad_strings.encode())) + for i, item in enumerate(stream): + assert item == bad_strings[i] + + +def test_bad_strings_in_list_iter(list_with_bad_strings): + """Test loading different strings that can confuse the parser.""" + + bad_strings = [ + '"}"', + '{"a": 1, "b": [2,3]}', + '"', + '\\"', + '\\\\"', + '\\x40"', + "[[[{{{", + "]]]}}}", + ] + + assert json.loads(list_with_bad_strings) + + # get each separately + stream = adafruit_json_stream.load(BytesChunkIO(list_with_bad_strings.encode())) + for i, item in enumerate(stream): + assert item == bad_strings[i] + + +def test_bad_strings_in_dict_as_object(dict_with_bad_strings): + """Test loading different strings that can confuse the parser.""" + + bad_strings = { + "1": '"}"', + "2": '{"a": 1, "b": [2,3]}', + "3": '"', + "4": '\\"', + "5": '\\\\"', + "6": '\\x40"', + "7": "[[[{{{", + "8": "]]]}}}", + } + + # read all at once + stream = adafruit_json_stream.load(BytesChunkIO(dict_with_bad_strings.encode())) + assert stream.as_object() == bad_strings + + +def test_bad_strings_in_dict_all_keys(dict_with_bad_strings): + """Test loading different strings that can confuse the parser.""" + + bad_strings = { + "1": '"}"', + "2": '{"a": 1, "b": [2,3]}', + "3": '"', + "4": '\\"', + "5": '\\\\"', + "6": '\\x40"', + "7": "[[[{{{", + "8": "]]]}}}", + } + + # read one after the other with keys + stream = adafruit_json_stream.load(BytesChunkIO(dict_with_bad_strings.encode())) + assert stream["1"] == bad_strings["1"] + assert stream["2"] == bad_strings["2"] + assert stream["3"] == bad_strings["3"] + assert stream["4"] == bad_strings["4"] + assert stream["5"] == bad_strings["5"] + assert stream["6"] == bad_strings["6"] + assert stream["7"] == bad_strings["7"] + assert stream["8"] == bad_strings["8"] + + +def test_bad_strings_in_dict_skip_some(dict_with_bad_strings): + """Test loading different strings that can confuse the parser.""" + + bad_strings = { + "1": '"}"', + "2": '{"a": 1, "b": [2,3]}', + "3": '"', + "4": '\\"', + "5": '\\\\"', + "6": '\\x40"', + "7": "[[[{{{", + "8": "]]]}}}", + } + + # read some, skip some + stream = adafruit_json_stream.load(BytesChunkIO(dict_with_bad_strings.encode())) + assert stream["2"] == bad_strings["2"] + assert stream["5"] == bad_strings["5"] + assert stream["8"] == bad_strings["8"] + + def test_complex_dict_grabbing(complex_dict): """Test loading a complex dict and grabbing specific keys.""" From 3fdd10e06683e5300b1efbf9b9f230d6adb9eef5 Mon Sep 17 00:00:00 2001 From: Neradoc Date: Wed, 5 Mar 2025 21:02:29 +0100 Subject: [PATCH 2/4] iterator on objects --- adafruit_json_stream.py | 35 +++++++++++++++++++++++++++++++++++ tests/test_json_stream.py | 19 +++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/adafruit_json_stream.py b/adafruit_json_stream.py index 4b9da92..fe8d8ef 100644 --- a/adafruit_json_stream.py +++ b/adafruit_json_stream.py @@ -239,6 +239,41 @@ def __getitem__(self, key): self.done = self.data.fast_forward(",") raise KeyError(key) + def __iter__(self): + return self + + def _next_item(self): + """Return the next item as a (key, value) pair, regardless of key.""" + if self.active_child: + self.active_child.finish() + self.done = self.data.fast_forward(",") + self.active_child = None + if self.done: + raise StopIteration() + + current_key = self.data.next_value(":") + if current_key is None: + self.done = True + raise StopIteration() + + next_value = self.data.next_value(",") + if self.data.last_char == ord("}"): + self.done = True + if isinstance(next_value, Transient): + self.active_child = next_value + return (current_key, next_value) + + def __next__(self): + return self._next_item()[0] + + def items(self): + """Return iterator ine the dictionary’s items ((key, value) pairs).""" + try: + while not self.done: + yield self._next_item() + except StopIteration: + return + def load(data_iter): """Returns an object to represent the top level of the given JSON stream.""" diff --git a/tests/test_json_stream.py b/tests/test_json_stream.py index 04f4faa..1a5697c 100644 --- a/tests/test_json_stream.py +++ b/tests/test_json_stream.py @@ -685,3 +685,22 @@ def test_as_object_grabbing_multiple_subscriptable_levels_again_after_passed_rai assert next(dict_1["sub_list"]) == "a" with pytest.raises(KeyError, match="sub_dict"): dict_1["sub_dict"]["sub_dict_name"] + + +def test_iterating_keys(dict_with_keys): + """Iterate through keys of a simple object""" + + bytes_io_chunk = BytesChunkIO(dict_with_keys.encode()) + stream = adafruit_json_stream.load(bytes_io_chunk) + output = list(stream) + assert output == ["field_1", "field_2", "field_3"] + + +def test_iterating_items(dict_with_keys): + """Iterate through items of a simple object""" + + bytes_io_chunk = BytesChunkIO(dict_with_keys.encode()) + stream = adafruit_json_stream.load(bytes_io_chunk) + output = list(stream.items()) + assert output == [("field_1", 1), ("field_2", 2), ("field_3", 3)] + From f2ed1a1ce7bc6cc81ba1e1317819c9b0ed8783ca Mon Sep 17 00:00:00 2001 From: Neradoc Date: Tue, 18 Mar 2025 01:49:49 +0100 Subject: [PATCH 3/4] add example that uses .items() and tests for iterating after already accessing an item --- examples/json_stream_advanced.py | 63 ++++++++++++++++++++++++++++++++ tests/test_json_stream.py | 23 +++++++++++- 2 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 examples/json_stream_advanced.py diff --git a/examples/json_stream_advanced.py b/examples/json_stream_advanced.py new file mode 100644 index 0000000..39e0448 --- /dev/null +++ b/examples/json_stream_advanced.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 Scott Shawcroft for Adafruit Industries +# +# SPDX-License-Identifier: Unlicense + +import sys +import time + +import adafruit_json_stream as json_stream + +# import json_stream + + +class FakeResponse: + def __init__(self, file): + self.file = file + + def iter_content(self, chunk_size): + while True: + yield self.file.read(chunk_size) + + +f = open(sys.argv[1], "rb") # pylint: disable=consider-using-with +obj = json_stream.load(FakeResponse(f).iter_content(32)) + + +def find_keys(obj, keys): + """If we don't know the order in which the keys are, + go through all of them and pick the ones we want""" + out = dict() + # iterate on the items of an object + for key, value in obj.items(): + if key in keys: + # if it's a sub object, get it all + if isinstance(value, json_stream.Transient): + value = value.as_object() + out[key] = value + return out + +def time_to_date(stamp): + tt = time.localtime(stamp) + month = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"][tt.tm_mon] + return f"{tt.tm_mday:2d}th of {month}" + +def ftoc(temp): + return (temp - 32) * 5 / 9 + +currently = obj["currently"] +print("Currently:") +print(" ", time_to_date(currently["time"])) +print(" ", currently["icon"]) + +# iterate on the content of a list +for i, day in enumerate(obj["daily"]["data"]): + day_items = find_keys(day, ("time", "summary", "temperatureHigh")) + date = time_to_date(day_items["time"]) + print( + f'On {date}: {day_items["summary"]},', + f'Max: {int(day_items["temperatureHigh"])}F', + f'({int(ftoc(day_items["temperatureHigh"]))}C)' + ) + + if i > 4: + break diff --git a/tests/test_json_stream.py b/tests/test_json_stream.py index 1a5697c..1afc2d7 100644 --- a/tests/test_json_stream.py +++ b/tests/test_json_stream.py @@ -688,7 +688,7 @@ def test_as_object_grabbing_multiple_subscriptable_levels_again_after_passed_rai def test_iterating_keys(dict_with_keys): - """Iterate through keys of a simple object""" + """Iterate through keys of a simple object.""" bytes_io_chunk = BytesChunkIO(dict_with_keys.encode()) stream = adafruit_json_stream.load(bytes_io_chunk) @@ -697,10 +697,29 @@ def test_iterating_keys(dict_with_keys): def test_iterating_items(dict_with_keys): - """Iterate through items of a simple object""" + """Iterate through items of a simple object.""" bytes_io_chunk = BytesChunkIO(dict_with_keys.encode()) stream = adafruit_json_stream.load(bytes_io_chunk) output = list(stream.items()) assert output == [("field_1", 1), ("field_2", 2), ("field_3", 3)] + +def test_iterating_keys_after_get(dict_with_keys): + """Iterate through keys of a simple object after an item has already been read.""" + + bytes_io_chunk = BytesChunkIO(dict_with_keys.encode()) + stream = adafruit_json_stream.load(bytes_io_chunk) + assert stream["field_1"] == 1 + output = list(stream) + assert output == ["field_2", "field_3"] + + +def test_iterating_items_after_get(dict_with_keys): + """Iterate through items of a simple object after an item has already been read.""" + + bytes_io_chunk = BytesChunkIO(dict_with_keys.encode()) + stream = adafruit_json_stream.load(bytes_io_chunk) + assert stream["field_1"] == 1 + output = list(stream.items()) + assert output == [("field_2", 2), ("field_3", 3)] From a2739d70aa007640c7de73e30ced9eb1f1af256e Mon Sep 17 00:00:00 2001 From: Neradoc Date: Tue, 18 Mar 2025 21:11:14 +0100 Subject: [PATCH 4/4] Rework iteration to only retrieve the key, enable getting the value of the current key. Use common active_key, fix finish(), etc. Rename example and use the key iteration. Additional tests. --- adafruit_json_stream.py | 55 ++++++++++++------- ....py => json_stream_local_file_advanced.py} | 37 ++++++++++--- tests/test_json_stream.py | 37 +++++++++++++ 3 files changed, 100 insertions(+), 29 deletions(-) rename examples/{json_stream_advanced.py => json_stream_local_file_advanced.py} (71%) diff --git a/adafruit_json_stream.py b/adafruit_json_stream.py index fe8d8ef..b5172d7 100644 --- a/adafruit_json_stream.py +++ b/adafruit_json_stream.py @@ -154,7 +154,7 @@ def __init__(self, stream): self.finish_char = "" def finish(self): - """Consume all of the characters for this list from the stream.""" + """Consume all of the characters for this container from the stream.""" if not self.done: if self.active_child: self.active_child.finish() @@ -163,7 +163,8 @@ def finish(self): self.done = True def as_object(self): - """Consume all of the characters for this list from the stream and return as an object.""" + """Consume all of the characters for this container from the stream + and return as an object.""" if self.has_read: raise BufferError("Object has already been partly read.") @@ -207,10 +208,17 @@ class TransientObject(Transient): def __init__(self, stream): super().__init__(stream) self.finish_char = "}" - self.active_child_key = None + self.active_key = None + + def finish(self): + """Consume all of the characters for this container from the stream.""" + if self.active_key and not self.active_child: + self.done = self.data.fast_forward(",") + self.active_key = None + super().finish() def __getitem__(self, key): - if self.active_child and self.active_child_key == key: + if self.active_child and self.active_key == key: return self.active_child self.has_read = True @@ -219,12 +227,16 @@ def __getitem__(self, key): self.active_child.finish() self.done = self.data.fast_forward(",") self.active_child = None - self.active_child_key = None + self.active_key = None if self.done: raise KeyError(key) while not self.done: - current_key = self.data.next_value(":") + if self.active_key: + current_key = self.active_key + self.active_key = None + else: + current_key = self.data.next_value(":") if current_key is None: self.done = True break @@ -234,7 +246,7 @@ def __getitem__(self, key): self.done = True if isinstance(next_value, Transient): self.active_child = next_value - self.active_child_key = key + self.active_key = key return next_value self.done = self.data.fast_forward(",") raise KeyError(key) @@ -242,35 +254,36 @@ def __getitem__(self, key): def __iter__(self): return self - def _next_item(self): - """Return the next item as a (key, value) pair, regardless of key.""" - if self.active_child: - self.active_child.finish() + def _next_key(self): + """Return the next item's key, without consuming the value.""" + if self.active_key: + if self.active_child: + self.active_child.finish() + self.active_child = None self.done = self.data.fast_forward(",") - self.active_child = None + self.active_key = None if self.done: raise StopIteration() + self.has_read = True + current_key = self.data.next_value(":") if current_key is None: self.done = True raise StopIteration() - next_value = self.data.next_value(",") - if self.data.last_char == ord("}"): - self.done = True - if isinstance(next_value, Transient): - self.active_child = next_value - return (current_key, next_value) + self.active_key = current_key + return current_key def __next__(self): - return self._next_item()[0] + return self._next_key() def items(self): - """Return iterator ine the dictionary’s items ((key, value) pairs).""" + """Return iterator in the dictionary’s items ((key, value) pairs).""" try: while not self.done: - yield self._next_item() + key = self._next_key() + yield (key, self[key]) except StopIteration: return diff --git a/examples/json_stream_advanced.py b/examples/json_stream_local_file_advanced.py similarity index 71% rename from examples/json_stream_advanced.py rename to examples/json_stream_local_file_advanced.py index 39e0448..2920619 100644 --- a/examples/json_stream_advanced.py +++ b/examples/json_stream_local_file_advanced.py @@ -23,31 +23,52 @@ def iter_content(self, chunk_size): obj = json_stream.load(FakeResponse(f).iter_content(32)) -def find_keys(obj, keys): +def find_keys(haystack, keys): """If we don't know the order in which the keys are, go through all of them and pick the ones we want""" - out = dict() + out = {} # iterate on the items of an object - for key, value in obj.items(): + for key in haystack: if key in keys: + # retrieve the value only if needed + value = haystack[key] # if it's a sub object, get it all - if isinstance(value, json_stream.Transient): + if hasattr(value, "as_object"): value = value.as_object() out[key] = value return out + +months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", +] + + def time_to_date(stamp): tt = time.localtime(stamp) - month = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"][tt.tm_mon] + month = months[tt.tm_mon] return f"{tt.tm_mday:2d}th of {month}" + def ftoc(temp): return (temp - 32) * 5 / 9 + currently = obj["currently"] print("Currently:") -print(" ", time_to_date(currently["time"])) -print(" ", currently["icon"]) +print(" ", time_to_date(currently["time"])) +print(" ", currently["icon"]) # iterate on the content of a list for i, day in enumerate(obj["daily"]["data"]): @@ -56,7 +77,7 @@ def ftoc(temp): print( f'On {date}: {day_items["summary"]},', f'Max: {int(day_items["temperatureHigh"])}F', - f'({int(ftoc(day_items["temperatureHigh"]))}C)' + f'({int(ftoc(day_items["temperatureHigh"]))}C)', ) if i > 4: diff --git a/tests/test_json_stream.py b/tests/test_json_stream.py index 1afc2d7..7ed05c9 100644 --- a/tests/test_json_stream.py +++ b/tests/test_json_stream.py @@ -696,6 +696,18 @@ def test_iterating_keys(dict_with_keys): assert output == ["field_1", "field_2", "field_3"] +def test_iterating_keys_get(dict_with_keys): + """Iterate through keys of a simple object and get values.""" + + the_dict = json.loads(dict_with_keys) + + bytes_io_chunk = BytesChunkIO(dict_with_keys.encode()) + stream = adafruit_json_stream.load(bytes_io_chunk) + for key in stream: + value = stream[key] + assert value == the_dict[key] + + def test_iterating_items(dict_with_keys): """Iterate through items of a simple object.""" @@ -723,3 +735,28 @@ def test_iterating_items_after_get(dict_with_keys): assert stream["field_1"] == 1 output = list(stream.items()) assert output == [("field_2", 2), ("field_3", 3)] + + +def test_iterating_complex_dict(complex_dict): + """Mix iterating over items of objects in objects in arrays.""" + + names = ["one", "two", "three", "four"] + sub_values = [None, "two point one", "three point one", None] + + stream = adafruit_json_stream.load(BytesChunkIO(complex_dict.encode())) + + thing_num = 0 + for (index, item) in enumerate(stream.items()): + key, a_list = item + assert key == f"list_{index+1}" + for thing in a_list: + assert thing["dict_name"] == names[thing_num] + for sub_key in thing["sub_dict"]: + # break after getting a key with or without the value + # (testing finish() called from the parent list) + if sub_key == "sub_dict_name": + if thing_num in {1, 2}: + value = thing["sub_dict"][sub_key] + assert value == sub_values[thing_num] + break + thing_num += 1 pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy