docarray · JohannesMessner · Jun 28, 2023 · May 22, 2023 · May 22, 2023 · May 22, 2023
diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py
@@ -306,6 +306,18 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T:
         """
         return super().from_protobuf(pb_msg)
 
+    @classmethod
+    def _get_proto_class(cls: Type[T]) -> Type['DocListProto']:
+        """
+        Return the protobuf message class that represents this array type.
+
+        :return: the `DocListProto` class itself, not an instance of it
+        """
+        # NOTE(review): local import presumably defers loading protobuf — confirm
+        from docarray.proto import DocListProto
+
+        return DocListProto
+
     @overload
     def __getitem__(self, item: SupportsIndex) -> T_doc:
         ...

diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py
@@ -40,7 +40,6 @@
 if TYPE_CHECKING:
     import pandas as pd
 
-    from docarray import DocList
     from docarray.proto import DocListProto
 
 T = TypeVar('T', bound='IOMixinArray')
@@ -332,11 +331,11 @@ def to_json(self) -> bytes:
 
     @classmethod
     def from_csv(
-        cls,
+        cls: Type['T'],
         file_path: str,
         encoding: str = 'utf-8',
         dialect: Union[str, csv.Dialect] = 'excel',
-    ) -> 'DocList':
+    ) -> 'T':
         """
         Load a DocList from a csv file following the schema defined in the
         [`.doc_type`][docarray.DocList] attribute.
@@ -358,10 +357,10 @@ def from_csv(
 
         :return: `DocList` object
         """
-        if cls.doc_type == AnyDoc:
+        if cls.doc_type in (AnyDoc, BaseDoc):
             raise TypeError(
                 'There is no document schema defined. '
-                'Please specify the DocList\'s Document type using `DocList[MyDoc]`.'
+                f'Please specify the {cls.__name__}\'s Document type using `{cls.__name__}[MyDoc]`.'
             )
 
         if file_path.startswith('http'):
@@ -376,14 +375,15 @@ def from_csv(
 
     @classmethod
     def _from_csv_file(
-        cls, file: Union[StringIO, TextIOWrapper], dialect: Union[str, csv.Dialect]
-    ) -> 'DocList':
-        from docarray import DocList
+        cls: Type['T'],
+        file: Union[StringIO, TextIOWrapper],
+        dialect: Union[str, csv.Dialect],
+    ) -> 'T':
 
         rows = csv.DictReader(file, dialect=dialect)
 
         doc_type = cls.doc_type
-        docs = DocList.__class_getitem__(doc_type)()
+        docs = []
 
         field_names: List[str] = (
             [] if rows.fieldnames is None else [str(f) for f in rows.fieldnames]
@@ -405,7 +405,7 @@ def _from_csv_file(
             doc_dict: Dict[Any, Any] = _access_path_dict_to_nested_dict(access_path2val)
             docs.append(doc_type.parse_obj(doc_dict))
 
-        return docs
+        return cls(docs)
 
     def to_csv(
         self, file_path: str, dialect: Union[str, csv.Dialect] = 'excel'
@@ -426,11 +426,11 @@ def to_csv(
             `'unix'` (for csv file generated on UNIX systems).
 
         """
-        if self.doc_type == AnyDoc:
+        if self.doc_type in (AnyDoc, BaseDoc):
             raise TypeError(
-                'DocList must be homogeneous to be converted to a csv.'
+                f'{type(self).__name__} must be homogeneous to be converted to a csv. '
                 'There is no document schema defined. '
-                'Please specify the DocList\'s Document type using `DocList[MyDoc]`.'
+                f'Please specify the {type(self).__name__}\'s Document type using `{type(self).__name__}[MyDoc]`.'
             )
         fields = self.doc_type._get_access_paths()
 
@@ -443,7 +443,7 @@ def to_csv(
                 writer.writerow(doc_dict)
 
     @classmethod
-    def from_dataframe(cls, df: 'pd.DataFrame') -> 'DocList':
+    def from_dataframe(cls: Type['T'], df: 'pd.DataFrame') -> 'T':
         """
         Load a `DocList` from a `pandas.DataFrame` following the schema
         defined in the [`.doc_type`][docarray.DocList] attribute.
@@ -486,10 +486,10 @@ class Person(BaseDoc):
         """
         from docarray import DocList
 
-        if cls.doc_type == AnyDoc:
+        if cls.doc_type in (AnyDoc, BaseDoc):
             raise TypeError(
                 'There is no document schema defined. '
-                'Please specify the DocList\'s Document type using `DocList[MyDoc]`.'
+                f'Please specify the {cls.__name__}\'s Document type using `{cls.__name__}[MyDoc]`.'
             )
 
         doc_type = cls.doc_type
@@ -515,6 +515,8 @@ class Person(BaseDoc):
             doc_dict = _access_path_dict_to_nested_dict(access_path2val)
             docs.append(doc_type.parse_obj(doc_dict))
 
+        if not isinstance(docs, cls):
+            return cls(docs)
         return docs
 
     def to_dataframe(self) -> 'pd.DataFrame':
@@ -563,6 +565,19 @@ def _stream_header(self) -> bytes:
         num_docs_as_bytes = len(self).to_bytes(8, 'big', signed=False)
         return version_byte + num_docs_as_bytes
 
+    @classmethod
+    @abstractmethod
+    def _get_proto_class(cls: Type[T]):
+        """
+        Return the protobuf message class used to deserialize this array type.
+
+        `_load_binary_all` instantiates the returned class without arguments,
+        parses the raw bytes into it, and passes the result to `from_protobuf`.
+
+        :return: a protobuf message class (not an instance)
+        """
+        ...
+
     @classmethod
     def _load_binary_all(
         cls: Type[T],
@@ -593,12 +600,10 @@ def _load_binary_all(
                 compress = None
 
         if protocol is not None and protocol == 'protobuf-array':
-            from docarray.proto import DocListProto
-
-            dap = DocListProto()
-            dap.ParseFromString(d)
+            proto = cls._get_proto_class()()
+            proto.ParseFromString(d)
 
-            return cls.from_protobuf(dap)
+            return cls.from_protobuf(proto)
         elif protocol is not None and protocol == 'pickle-array':
             return pickle.loads(d)
 

diff --git a/docarray/array/doc_vec/column_storage.py b/docarray/array/doc_vec/column_storage.py
@@ -6,6 +6,7 @@
     ItemsView,
     Iterable,
     MutableMapping,
+    NamedTuple,
     Optional,
     Type,
     TypeVar,
@@ -26,6 +27,22 @@
 T = TypeVar('T', bound='ColumnStorage')
 
 
+class ColumnsJsonCompatible(NamedTuple):
+    """
+    Container for the JSON-compatible representation of the four column
+    groups, as returned by `columns_json_compatible()`.
+    """
+
+    # column name -> JSON-compatible form of the tensor column (or None)
+    tensor_columns: Dict[str, Any]
+    # column name -> JSON-compatible form of the document column (or None)
+    doc_columns: Dict[str, Any]
+    # column name -> list with the JSON-compatible form of each element (or None)
+    docs_vec_columns: Dict[str, Any]
+    # column name -> value, handed over without any conversion
+    any_columns: Dict[str, Any]
+
+
 class ColumnStorage:
     """
     ColumnStorage is a container to store the columns of the
@@ -91,6 +99,45 @@ def __getitem__(self: T, item: IndexIterType) -> T:
             self.tensor_type,
         )
 
+    def columns_json_compatible(self) -> ColumnsJsonCompatible:
+        """
+        Build the JSON-compatible representation of every column group.
+
+        Tensor and document columns are converted with their own
+        `_docarray_to_json_compatible()` method, every `docs_vec_columns` entry is
+        converted element by element, and `any_columns` is passed on unchanged.
+        Columns that are `None` stay `None`.
+
+        :return: a `ColumnsJsonCompatible` tuple with the four column groups
+        """
+
+        def _convert(column: Any) -> Any:
+            # a column that is None has nothing to convert
+            if column is None:
+                return column
+            return column._docarray_to_json_compatible()
+
+        json_tensors = {
+            name: _convert(column) for name, column in self.tensor_columns.items()
+        }
+        json_docs = {
+            name: _convert(column) for name, column in self.doc_columns.items()
+        }
+        json_docs_vecs: Dict[str, Any] = {}
+        for name, column in self.docs_vec_columns.items():
+            if column is None:
+                json_docs_vecs[name] = column
+            else:
+                json_docs_vecs[name] = [
+                    element._docarray_to_json_compatible() for element in column
+                ]
+        return ColumnsJsonCompatible(
+            tensor_columns=json_tensors,
+            doc_columns=json_docs,
+            docs_vec_columns=json_docs_vecs,
+            any_columns=self.any_columns,
+        )
+
     def __eq__(self, other: Any) -> bool:
         if not isinstance(other, ColumnStorage):
             return False
@@ -146,6 +173,17 @@ def __getitem__(self, name: str) -> Any:
             return None
         return col[self.index]
 
+    def __reduce__(self):
+        """
+        Return the pickle recipe for this object.
+
+        :return: a `(callable, args)` pair; unpickling calls
+            `ColumnStorageView(self.index, self.storage)`
+        """
+        # implementing __reduce__ to solve a pickle issue when subclassing dict
+        # see here: https://stackoverflow.com/questions/21144845/how-can-i-unpickle-a-subclass-of-dict-that-validates-with-setitem-in-pytho
+        return (ColumnStorageView, (self.index, self.storage))
+
     def __setitem__(self, name, value) -> None:
         if self.storage.columns[name] is None:
             raise ValueError(