Skip to content

feat: i/o for DocVec #1562

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Jun 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
d43057b
feat: json and dict for docvec
JohannesMessner May 22, 2023
c45bfca
test: add tests
JohannesMessner May 22, 2023
564d144
test: add docvec to dict test
JohannesMessner May 22, 2023
76f9c8e
feat: to from dataframe for docvec
JohannesMessner May 22, 2023
73a1ac7
test: dataframe docvec tests
JohannesMessner May 22, 2023
f83fb4f
feat: to from csv for docvec
JohannesMessner May 22, 2023
ca8dc12
test: test csv with docvec
JohannesMessner May 22, 2023
2b52b1e
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 14, 2023
b115637
feat: pickle serialization for docvec
JohannesMessner Jun 14, 2023
bd86985
feat: protobuf array serialization for docvec
JohannesMessner Jun 14, 2023
c280ff2
test: test base64 deser for docvec
JohannesMessner Jun 14, 2023
ad881cf
test: test save and load for docvec
JohannesMessner Jun 14, 2023
4b1b533
feat: docvec json column wise
JohannesMessner Jun 19, 2023
60e651e
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 19, 2023
f9c97ec
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 20, 2023
0603fc5
test: add test for docvec json
JohannesMessner Jun 20, 2023
c6ace8e
test: add tensor type arg
JohannesMessner Jun 20, 2023
51719b2
fix: mypy stuff
JohannesMessner Jun 26, 2023
ad5f5bd
fix: raising of error when needed
JohannesMessner Jun 26, 2023
200dbac
fix: more exception raising
JohannesMessner Jun 26, 2023
8d1f446
fix: mypy
JohannesMessner Jun 26, 2023
6815720
refactor: don't expose to/from csv for docvec
JohannesMessner Jun 26, 2023
6b5ddc7
test: adjust tests
JohannesMessner Jun 26, 2023
587c20a
docs: add documentation for docvec io
JohannesMessner Jun 27, 2023
663f17d
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 27, 2023
7d035fb
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 28, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docarray/array/doc_list/doc_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,12 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T:
"""
return super().from_protobuf(pb_msg)

@classmethod
def _get_proto_class(cls: Type[T]):
from docarray.proto import DocListProto

return DocListProto

@overload
def __getitem__(self, item: SupportsIndex) -> T_doc:
...
Expand Down
47 changes: 26 additions & 21 deletions docarray/array/doc_list/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
if TYPE_CHECKING:
import pandas as pd

from docarray import DocList
from docarray.proto import DocListProto

T = TypeVar('T', bound='IOMixinArray')
Expand Down Expand Up @@ -332,11 +331,11 @@ def to_json(self) -> bytes:

@classmethod
def from_csv(
cls,
cls: Type['T'],
file_path: str,
encoding: str = 'utf-8',
dialect: Union[str, csv.Dialect] = 'excel',
) -> 'DocList':
) -> 'T':
"""
Load a DocList from a csv file following the schema defined in the
[`.doc_type`][docarray.DocList] attribute.
Expand All @@ -358,10 +357,10 @@ def from_csv(

:return: `DocList` object
"""
if cls.doc_type == AnyDoc:
if cls.doc_type == AnyDoc or cls.doc_type == BaseDoc:
raise TypeError(
'There is no document schema defined. '
'Please specify the DocList\'s Document type using `DocList[MyDoc]`.'
f'Please specify the {cls}\'s Document type using `{cls}[MyDoc]`.'
)

if file_path.startswith('http'):
Expand All @@ -376,14 +375,15 @@ def from_csv(

@classmethod
def _from_csv_file(
cls, file: Union[StringIO, TextIOWrapper], dialect: Union[str, csv.Dialect]
) -> 'DocList':
from docarray import DocList
cls: Type['T'],
file: Union[StringIO, TextIOWrapper],
dialect: Union[str, csv.Dialect],
) -> 'T':

rows = csv.DictReader(file, dialect=dialect)

doc_type = cls.doc_type
docs = DocList.__class_getitem__(doc_type)()
docs = []

field_names: List[str] = (
[] if rows.fieldnames is None else [str(f) for f in rows.fieldnames]
Expand All @@ -405,7 +405,7 @@ def _from_csv_file(
doc_dict: Dict[Any, Any] = _access_path_dict_to_nested_dict(access_path2val)
docs.append(doc_type.parse_obj(doc_dict))

return docs
return cls(docs)

def to_csv(
self, file_path: str, dialect: Union[str, csv.Dialect] = 'excel'
Expand All @@ -426,11 +426,11 @@ def to_csv(
`'unix'` (for csv file generated on UNIX systems).

"""
if self.doc_type == AnyDoc:
if self.doc_type == AnyDoc or self.doc_type == BaseDoc:
raise TypeError(
'DocList must be homogeneous to be converted to a csv.'
f'{type(self)} must be homogeneous to be converted to a csv.'
'There is no document schema defined. '
'Please specify the DocList\'s Document type using `DocList[MyDoc]`.'
f'Please specify the {type(self)}\'s Document type using `{type(self)}[MyDoc]`.'
)
fields = self.doc_type._get_access_paths()

Expand All @@ -443,7 +443,7 @@ def to_csv(
writer.writerow(doc_dict)

@classmethod
def from_dataframe(cls, df: 'pd.DataFrame') -> 'DocList':
def from_dataframe(cls: Type['T'], df: 'pd.DataFrame') -> 'T':
"""
Load a `DocList` from a `pandas.DataFrame` following the schema
defined in the [`.doc_type`][docarray.DocList] attribute.
Expand Down Expand Up @@ -486,10 +486,10 @@ class Person(BaseDoc):
"""
from docarray import DocList

if cls.doc_type == AnyDoc:
if cls.doc_type == AnyDoc or cls.doc_type == BaseDoc:
raise TypeError(
'There is no document schema defined. '
'Please specify the DocList\'s Document type using `DocList[MyDoc]`.'
f'Please specify the {cls}\'s Document type using `{cls}[MyDoc]`.'
)

doc_type = cls.doc_type
Expand All @@ -515,6 +515,8 @@ class Person(BaseDoc):
doc_dict = _access_path_dict_to_nested_dict(access_path2val)
docs.append(doc_type.parse_obj(doc_dict))

if not isinstance(docs, cls):
return cls(docs)
return docs

def to_dataframe(self) -> 'pd.DataFrame':
Expand Down Expand Up @@ -563,6 +565,11 @@ def _stream_header(self) -> bytes:
num_docs_as_bytes = len(self).to_bytes(8, 'big', signed=False)
return version_byte + num_docs_as_bytes

@classmethod
@abstractmethod
def _get_proto_class(cls: Type[T]):
...

@classmethod
def _load_binary_all(
cls: Type[T],
Expand Down Expand Up @@ -593,12 +600,10 @@ def _load_binary_all(
compress = None

if protocol is not None and protocol == 'protobuf-array':
from docarray.proto import DocListProto

dap = DocListProto()
dap.ParseFromString(d)
proto = cls._get_proto_class()()
proto.ParseFromString(d)

return cls.from_protobuf(dap)
return cls.from_protobuf(proto)
elif protocol is not None and protocol == 'pickle-array':
return pickle.loads(d)

Expand Down
32 changes: 32 additions & 0 deletions docarray/array/doc_vec/column_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
ItemsView,
Iterable,
MutableMapping,
NamedTuple,
Optional,
Type,
TypeVar,
Expand All @@ -26,6 +27,13 @@
T = TypeVar('T', bound='ColumnStorage')


class ColumnsJsonCompatible(NamedTuple):
tensor_columns: Dict[str, Any]
doc_columns: Dict[str, Any]
docs_vec_columns: Dict[str, Any]
any_columns: Dict[str, Any]


class ColumnStorage:
"""
ColumnStorage is a container to store the columns of the
Expand Down Expand Up @@ -91,6 +99,25 @@ def __getitem__(self: T, item: IndexIterType) -> T:
self.tensor_type,
)

def columns_json_compatible(self) -> ColumnsJsonCompatible:
tens_cols = {
key: value._docarray_to_json_compatible() if value is not None else value
for key, value in self.tensor_columns.items()
}
doc_cols = {
key: value._docarray_to_json_compatible() if value is not None else value
for key, value in self.doc_columns.items()
}
doc_vec_cols = {
key: [vec._docarray_to_json_compatible() for vec in value]
if value is not None
else value
for key, value in self.docs_vec_columns.items()
}
return ColumnsJsonCompatible(
tens_cols, doc_cols, doc_vec_cols, self.any_columns
)

def __eq__(self, other: Any) -> bool:
if not isinstance(other, ColumnStorage):
return False
Expand Down Expand Up @@ -146,6 +173,11 @@ def __getitem__(self, name: str) -> Any:
return None
return col[self.index]

def __reduce__(self):
# implementing __reduce__ to solve a pickle issue when subclassing dict
# see here: https://stackoverflow.com/questions/21144845/how-can-i-unpickle-a-subclass-of-dict-that-validates-with-setitem-in-pytho
return (ColumnStorageView, (self.index, self.storage))

def __setitem__(self, name, value) -> None:
if self.storage.columns[name] is None:
raise ValueError(
Expand Down
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy