Skip to content

Commit b77df16

Browse files
samsjaJoan FontanalsJohannesMessner
authored
fix: allow doclist to have nested optional document (docarray#1472)
Signed-off-by: samsja <sami.jaghouar@hotmail.fr> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> Co-authored-by: Joan Fontanals <joan.martinez@jina.ai> Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com>
1 parent c9d0f71 commit b77df16

File tree

12 files changed

+505
-139
lines changed

12 files changed

+505
-139
lines changed

docarray/array/any_array.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def __getattr__(self, item: str):
106106
def _get_data_column(
107107
self: T,
108108
field: str,
109-
) -> Union[MutableSequence, T, 'AbstractTensor']:
109+
) -> Union[MutableSequence, T, 'AbstractTensor', None]:
110110
"""Return all values of the fields from all docs this array contains
111111
112112
:param field: name of the fields to extract

docarray/array/doc_list/doc_list.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,13 @@ class Image(BaseDoc):
102102
del docs[0:5] # remove elements for 0 to 5 from DocList
103103
```
104104
105+
!!! note
106+
If the DocList is homogeneous and its schema contains nested BaseDoc
107+
(i.e., a BaseDoc inside a BaseDoc) where the nested document is `Optional`, calling
108+
`docs.nested_doc` will return a list of the nested BaseDoc instead of a DocList.
109+
This is because the nested field could be None and therefore could not fit into
110+
a DocList.
111+
105112
:param docs: iterable of Document
106113
107114
"""
@@ -200,6 +207,7 @@ def __class_getitem__(cls, item: Union[Type[BaseDoc], TypeVar, str]):
200207

201208
if (
202209
not is_union_type(field_type)
210+
and self.__class__.doc_type.__fields__[field].required
203211
and isinstance(field_type, type)
204212
and issubclass(field_type, BaseDoc)
205213
):

docarray/array/doc_vec/column_storage.py

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
Dict,
66
Iterable,
77
MutableMapping,
8+
Optional,
89
Type,
910
TypeVar,
1011
Union,
@@ -37,9 +38,9 @@ class ColumnStorage:
3738

3839
def __init__(
3940
self,
40-
tensor_columns: Dict[str, AbstractTensor],
41-
doc_columns: Dict[str, 'DocVec'],
42-
docs_vec_columns: Dict[str, ListAdvancedIndexing['DocVec']],
41+
tensor_columns: Dict[str, Optional[AbstractTensor]],
42+
doc_columns: Dict[str, Optional['DocVec']],
43+
docs_vec_columns: Dict[str, Optional[ListAdvancedIndexing['DocVec']]],
4344
any_columns: Dict[str, ListAdvancedIndexing],
4445
tensor_type: Type[AbstractTensor] = NdArray,
4546
):
@@ -63,12 +64,22 @@ def __len__(self) -> int:
6364
def __getitem__(self: T, item: IndexIterType) -> T:
6465
if isinstance(item, tuple):
6566
item = list(item)
66-
tensor_columns = {key: col[item] for key, col in self.tensor_columns.items()}
67-
doc_columns = {key: col[item] for key, col in self.doc_columns.items()}
67+
tensor_columns = {
68+
key: col[item] if col is not None else None
69+
for key, col in self.tensor_columns.items()
70+
}
71+
doc_columns = {
72+
key: col[item] if col is not None else None
73+
for key, col in self.doc_columns.items()
74+
}
6875
docs_vec_columns = {
69-
key: col[item] for key, col in self.docs_vec_columns.items()
76+
key: col[item] if col is not None else None
77+
for key, col in self.docs_vec_columns.items()
78+
}
79+
any_columns = {
80+
key: col[item] if col is not None else None
81+
for key, col in self.any_columns.items()
7082
}
71-
any_columns = {key: col[item] for key, col in self.any_columns.items()}
7283

7384
return self.__class__(
7485
tensor_columns,
@@ -91,15 +102,34 @@ def __init__(self, index: int, storage: ColumnStorage):
91102
def __getitem__(self, name: str) -> Any:
92103
if name in self.storage.tensor_columns.keys():
93104
tensor = self.storage.tensor_columns[name]
105+
if tensor is None:
106+
return None
94107
if tensor.get_comp_backend().n_dim(tensor) == 1:
95108
# to ensure consistency between numpy and pytorch
96109
# we wrap the scalar in a tensor of ndim = 1
97110
# otherwise numpy passes by value whereas torch passes by reference
98-
return self.storage.tensor_columns[name][self.index : self.index + 1]
111+
col = self.storage.tensor_columns[name]
99112

100-
return self.storage.columns[name][self.index]
113+
if col is not None:
114+
return col[self.index : self.index + 1]
115+
else:
116+
return None
117+
118+
col = self.storage.columns[name]
119+
120+
if col is None:
121+
return None
122+
return col[self.index]
101123

102124
def __setitem__(self, name, value) -> None:
125+
if self.storage.columns[name] is None:
126+
raise ValueError(
127+
f'Cannot set an item to a None column. This mean that '
128+
f'the DocVec that encapsulate this doc has the field '
129+
f'{name} set to None. If you want to modify that you need to do it at the'
130+
f'DocVec level. `docs.field = np.zeros(10)`'
131+
)
132+
103133
self.storage.columns[name][self.index] = value
104134

105135
def __delitem__(self, key):

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy