diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 377e2311215..449f4492e97 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -64,7 +64,7 @@ jobs:
        python -m pip install --upgrade pip
        python -m pip install poetry
        poetry install --without dev
-        poetry run pip install tensorflow==2.11.0
+        poetry run pip install tensorflow==2.12.0
        poetry run pip install jax
    - name: Test basic import
      run: poetry run python -c 'from docarray import DocList, BaseDoc'
@@ -198,7 +198,7 @@ jobs:
        python -m pip install --upgrade pip
        python -m pip install poetry
        poetry install --all-extras
-        poetry run pip install protobuf==3.19.0 # we check that we support 3.19
+        poetry run pip install protobuf==3.20.0 # we check that we support 3.20
        sudo apt-get update
        sudo apt-get install --no-install-recommends ffmpeg
    - name: Test
@@ -230,7 +230,7 @@ jobs:
      fail-fast: false
      matrix:
        python-version: [3.8]
-        db_test_folder: [base_classes, elastic, hnswlib, qdrant, weaviate, redis]
+        db_test_folder: [base_classes, elastic, hnswlib, qdrant, weaviate, redis, milvus]
    steps:
    - uses: actions/checkout@v2.5.0
    - name: Set up Python ${{ matrix.python-version }}
@@ -242,8 +242,8 @@ jobs:
        python -m pip install --upgrade pip
        python -m pip install poetry
        poetry install --all-extras
-        poetry run pip install protobuf==3.19.0
-        poetry run pip install tensorflow==2.11.0
+        poetry run pip install protobuf==3.20.0
+        poetry run pip install tensorflow==2.12.0
        sudo apt-get update
        sudo apt-get install --no-install-recommends ffmpeg
@@ -287,8 +287,8 @@ jobs:
        python -m pip install --upgrade pip
        python -m pip install poetry
        poetry install --all-extras
-        poetry run pip install protobuf==3.19.0
-        poetry run pip install tensorflow==2.11.0
+        poetry run pip install protobuf==3.20.0
+        poetry run pip install tensorflow==2.12.0
        poetry run pip install elasticsearch==8.6.2
        sudo apt-get update
        sudo apt-get install --no-install-recommends ffmpeg
@@ -332,8 +332,8 @@ jobs:
        python -m pip install --upgrade pip
        python -m pip install poetry
        poetry install --all-extras
-        poetry run pip install protobuf==3.19.0
-        poetry run pip install tensorflow==2.11.0
+        poetry run pip install protobuf==3.20.0
+        poetry run pip install tensorflow==2.12.0
        sudo apt-get update
        sudo apt-get install --no-install-recommends ffmpeg
diff --git a/README.md b/README.md
index e1a74d5a79b..5caeddb0e60 100644
--- a/README.md
+++ b/README.md
@@ -624,7 +624,7 @@ Like the [PyTorch approach](#coming-from-pytorch), you can also use DocArray wit

First off, to use DocArray with TensorFlow we first need to install it as follows:

```
-pip install tensorflow==2.11.0
+pip install tensorflow==2.12.0
 pip install protobuf==3.19.0
```

@@ -879,3 +879,4 @@ Both are user-friendly and are best suited to small to medium-sized datasets.
> DocArray is a trademark of LF AI Projects, LLC +> diff --git a/docarray/index/__init__.py b/docarray/index/__init__.py index b24877526a2..dfd0d52f7c0 100644 --- a/docarray/index/__init__.py +++ b/docarray/index/__init__.py @@ -14,8 +14,17 @@ from docarray.index.backends.qdrant import QdrantDocumentIndex # noqa: F401 from docarray.index.backends.weaviate import WeaviateDocumentIndex # noqa: F401 from docarray.index.backends.redis import RedisDocumentIndex # noqa: F401 + from docarray.index.backends.milvus import MilvusDocumentIndex # noqa: F401 -__all__ = ['InMemoryExactNNIndex'] +__all__ = [ + 'InMemoryExactNNIndex', + 'ElasticDocIndex', + 'ElasticV7DocIndex', + 'QdrantDocumentIndex', + 'WeaviateDocumentIndex', + 'RedisDocumentIndex', + 'MilvusDocumentIndex', +] def __getattr__(name: str): @@ -35,6 +44,9 @@ def __getattr__(name: str): elif name == 'WeaviateDocumentIndex': import_library('weaviate', raise_error=True) import docarray.index.backends.weaviate as lib + elif name == 'MilvusDocumentIndex': + import_library('pymilvus', raise_error=True) + import docarray.index.backends.milvus as lib elif name == 'RedisDocumentIndex': import_library('redis', raise_error=True) import docarray.index.backends.redis as lib diff --git a/docarray/index/backends/milvus.py b/docarray/index/backends/milvus.py new file mode 100644 index 00000000000..405ecf9e1f4 --- /dev/null +++ b/docarray/index/backends/milvus.py @@ -0,0 +1,841 @@ +from collections import defaultdict +from dataclasses import dataclass, field +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Generator, + Generic, + List, + Optional, + Sequence, + Type, + TypeVar, + Union, + cast, + Tuple, +) + +import numpy as np + +from docarray import BaseDoc, DocList +from docarray.index.abstract import ( + BaseDocIndex, + _raise_not_supported, + _raise_not_composable, +) +from docarray.index.backends.helper import _collect_query_args +from docarray.typing import AnyTensor, NdArray +from docarray.typing.id import ID +from docarray.typing.tensor.abstract_tensor import AbstractTensor +from docarray.utils._internal._typing import safe_issubclass +from docarray.utils.find import ( + _FindResult, + _FindResultBatched, + FindResult, + FindResultBatched, +) +from docarray.array.any_array import AnyDocArray + +if TYPE_CHECKING: + from pymilvus import ( # type: ignore[import] + Collection, + CollectionSchema, + DataType, + FieldSchema, + connections, + utility, + Hits, + ) +else: + from pymilvus import ( + Collection, + CollectionSchema, + DataType, + FieldSchema, + connections, + utility, + Hits, + ) + +MAX_LEN = 65_535 # Maximum length that Milvus allows for a VARCHAR field +VALID_METRICS = ['L2', 'IP'] +VALID_INDEX_TYPES = [ + 'FLAT', + 'IVF_FLAT', + 'IVF_SQ8', + 'IVF_PQ', + 'HNSW', + 'ANNOY', + 'DISKANN', +] + +TSchema = TypeVar('TSchema', bound=BaseDoc) + + +class MilvusDocumentIndex(BaseDocIndex, Generic[TSchema]): + def __init__(self, db_config=None, **kwargs): + """Initialize MilvusDocumentIndex""" + super().__init__(db_config=db_config, **kwargs) + self._db_config: MilvusDocumentIndex.DBConfig = cast( + MilvusDocumentIndex.DBConfig, self._db_config + ) + self._runtime_config: MilvusDocumentIndex.RuntimeConfig = cast( + MilvusDocumentIndex.RuntimeConfig, self._runtime_config + ) + + self._client = connections.connect( + db_name="default", + host=self._db_config.host, + user=self._db_config.user, + password=self._db_config.password, + token=self._db_config.token, + ) + + self._validate_columns() + self._field_name = self._get_vector_field_name() + 
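+        # Remaining setup: create or load the Milvus collection for this
+        # schema, build an ANN index on the single vector field, and load
+        # the collection into memory so that it can serve searches.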
+        self._collection = self._create_or_load_collection()
+        self._build_index()
+        self._collection.load()
+        self._logger.info(f'{self.__class__.__name__} has been initialized')
+
+    @dataclass
+    class DBConfig(BaseDocIndex.DBConfig):
+        """Dataclass that contains all "static" configurations of MilvusDocumentIndex.
+
+        :param index_name: The name of the index in the Milvus database. If not provided, a default index name (the lowercased schema name) will be used.
+        :param collection_description: Description of the collection in the database.
+        :param host: Hostname of the server where the database resides. Default is 'localhost'.
+        :param port: Port number used to connect to the database. Default is 19530.
+        :param user: User for the database. Can be an empty string if no user is required.
+        :param password: Password for the specified user. Can be an empty string if no password is required.
+        :param token: Token for secure connection. Can be an empty string if no token is required.
+        :param consistency_level: The level of consistency for the database session. Default is 'Session'.
+        :param search_params: Dictionary containing parameters for search operations,
+            default has a single key 'params' with 'nprobe' set to 10.
+        :param serialize_config: Dictionary containing configuration for serialization,
+            default is {'protocol': 'protobuf'}.
+        :param default_column_config: Dictionary that defines the default configuration
+            for each data type column.
+        """
+
+        index_name: Optional[str] = None
+        collection_description: str = ""
+        host: str = "localhost"
+        port: int = 19530
+        user: Optional[str] = ""
+        password: Optional[str] = ""
+        token: Optional[str] = ""
+        consistency_level: str = 'Session'
+        search_params: Dict = field(
+            default_factory=lambda: {
+                "params": {"nprobe": 10},
+            }
+        )
+        serialize_config: Dict = field(default_factory=lambda: {"protocol": "protobuf"})
+        default_column_config: Dict[Type, Dict[str, Any]] = field(
+            default_factory=lambda: defaultdict(
+                dict,
+                {
+                    DataType.FLOAT_VECTOR: {
+                        'index_type': 'IVF_FLAT',
+                        'metric_type': 'L2',
+                        'params': {"nlist": 1024},
+                    },
+                },
+            )
+        )
+
+    @dataclass
+    class RuntimeConfig(BaseDocIndex.RuntimeConfig):
+        """Dataclass that contains all "dynamic" configurations of MilvusDocumentIndex.
+
+        :param batch_size: Batch size for index/get/del.
+        """
+
+        batch_size: int = 100
+
+    class QueryBuilder(BaseDocIndex.QueryBuilder):
+        def __init__(self, query: Optional[List[Tuple[str, Dict]]] = None):
+            super().__init__()
+            # list of tuples (method name, kwargs)
+            self._queries: List[Tuple[str, Dict]] = query or []
+
+        def build(self, *args, **kwargs) -> Any:
+            """Build the query object."""
+            return self._queries
+
+        find = _collect_query_args('find')
+        filter = _collect_query_args('filter')
+        text_search = _raise_not_supported('text_search')
+        find_batched = _raise_not_composable('find_batched')
+        filter_batched = _raise_not_composable('filter_batched')
+        text_search_batched = _raise_not_supported('text_search_batched')
+
+    def python_type_to_db_type(self, python_type: Type) -> Any:
+        """Map python type to database type.
+        Takes any python type and returns the corresponding database column type.
+
+        :param python_type: a python type.
+        :return: the corresponding database column type.
+        :raises ValueError: if ``python_type`` is not supported.
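+
+        Example (illustrative): ``str`` maps to ``DataType.VARCHAR`` and
+        ``np.ndarray`` maps to ``DataType.FLOAT_VECTOR`` (see ``type_map``
+        in the body below).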
+ """ + type_map = { + int: DataType.INT64, + float: DataType.FLOAT, + str: DataType.VARCHAR, + bytes: DataType.VARCHAR, + np.ndarray: DataType.FLOAT_VECTOR, + list: DataType.FLOAT_VECTOR, + AnyTensor: DataType.FLOAT_VECTOR, + AbstractTensor: DataType.FLOAT_VECTOR, + } + + if issubclass(python_type, ID): + return DataType.VARCHAR + + for py_type, db_type in type_map.items(): + if safe_issubclass(python_type, py_type): + return db_type + + raise ValueError(f'Unsupported column type for {type(self)}: {python_type}') + + def _create_or_load_collection(self) -> Collection: + """ + This function initializes or retrieves a Milvus collection with a specified schema, + storing documents as serialized data and using the document's ID as the collection's ID + , while inheriting other schema properties from the indexer's schema. + + !!! note + Milvus framework currently only supports a single vector column, and only one vector + column can store in the schema (others are stored in the serialized data) + """ + + if not utility.has_collection(self.index_name): + fields = [ + FieldSchema( + name="serialized", + dtype=DataType.VARCHAR, + max_length=MAX_LEN, + ), + FieldSchema( + name="id", + dtype=DataType.VARCHAR, + is_primary=True, + max_length=MAX_LEN, + ), + ] + for column_name, info in self._column_infos.items(): + if ( + column_name != 'id' + and not ( + info.db_type == DataType.FLOAT_VECTOR + and column_name + != self._field_name # Only store one vector field as a column + ) + and not safe_issubclass(info.docarray_type, AnyDocArray) + ): + field_dict: Dict[str, Any] = {} + if info.db_type == DataType.VARCHAR: + field_dict = {'max_length': MAX_LEN} + elif info.db_type == DataType.FLOAT_VECTOR: + field_dict = {'dim': info.n_dim or info.config.get('dim')} + + fields.append( + FieldSchema( + name=column_name, + dtype=info.db_type, + is_primary=False, + **field_dict, + ) + ) + + self._logger.info("Collection has been created") + return Collection( + name=self.index_name, + schema=CollectionSchema( + fields=fields, + description=self._db_config.collection_description, + ), + using='default', + ) + + return Collection(self.index_name) + + def _validate_columns(self): + """ + Validates whether the data schema includes at least one vector column used + for embedding (as required by Milvus), and ensures that dimension information + is specified for that column. + """ + vector_columns = sum( + safe_issubclass(info.docarray_type, AbstractTensor) + and info.config.get('is_embedding', False) + for info in self._column_infos.values() + ) + if vector_columns == 0: + raise ValueError( + "Unable to find any vector columns. Please make sure that at least one " + "column is of a vector type with the is_embedding=True attribute specified." + ) + elif vector_columns > 1: + raise ValueError("Specifying multiple vector fields is not supported.") + + for column, info in self._column_infos.items(): + if info.config.get('is_embedding') and ( + not info.n_dim and not info.config.get('dim') + ): + raise ValueError( + f"The dimension information is missing for the column '{column}', which is of vector type." + ) + + @property + def index_name(self): + default_index_name = ( + self._schema.__name__.lower() if self._schema is not None else None + ) + if default_index_name is None: + err_msg = ( + 'A MilvusDocumentIndex must be typed with a Document type. 
+                'To do so, use the syntax: MilvusDocumentIndex[DocumentType]'
+            )
+
+            self._logger.error(err_msg)
+            raise ValueError(err_msg)
+        index_name = self._db_config.index_name or default_index_name
+        self._logger.debug(f'Retrieved index name: {index_name}')
+        return index_name
+
+    @property
+    def out_schema(self) -> Type[BaseDoc]:
+        """Return the real schema of the index."""
+        if self._is_subindex:
+            return self._ori_schema
+        return cast(Type[BaseDoc], self._schema)
+
+    def _build_index(self):
+        """
+        Sets up an index configuration for a specific column index, which is
+        required by the Milvus backend.
+        """
+
+        existing_indices = [index.field_name for index in self._collection.indexes]
+        if self._field_name in existing_indices:
+            return
+
+        index_type = self._column_infos[self._field_name].config['index_type'].upper()
+        if index_type not in VALID_INDEX_TYPES:
+            raise ValueError(
+                f"Invalid index type '{index_type}' provided. "
+                f"Must be one of: {', '.join(VALID_INDEX_TYPES)}"
+            )
+        metric_type = (
+            self._column_infos[self._field_name].config.get('space', '').upper()
+        )
+        if metric_type not in VALID_METRICS:
+            self._logger.warning(
+                f"Invalid or no distance metric '{metric_type}' was provided. "
+                f"Should be one of: {', '.join(VALID_METRICS)}. "
+                f"Default distance metric will be used."
+            )
+            metric_type = self._column_infos[self._field_name].config['metric_type']
+
+        index = {
+            "index_type": index_type,
+            "metric_type": metric_type,
+            "params": self._column_infos[self._field_name].config['params'],
+        }
+
+        self._collection.create_index(self._field_name, index)
+        self._logger.info(
+            f"Index for the field '{self._field_name}' has been successfully created"
+        )
+
+    def _get_vector_field_name(self):
+        for column, info in self._column_infos.items():
+            if info.db_type == DataType.FLOAT_VECTOR and info.config.get(
+                'is_embedding'
+            ):
+                return column
+        return ''
+
+    @staticmethod
+    def _get_batches(docs, batch_size):
+        """Yield successive batch_size batches from docs."""
+        for i in range(0, len(docs), batch_size):
+            yield docs[i : i + batch_size]
+
+    def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs):
+        """Index Documents into the index.
+
+        !!! note
+            Passing a sequence of Documents that is not a DocList
+            (such as a List of Docs) comes at a performance penalty.
+            This is because the Index needs to check compatibility between itself and
+            the data. With a DocList as input this is a single check; for other inputs
+            compatibility needs to be checked for every Document individually.
+
+        :param docs: Documents to index.
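+
+        Example (an illustrative sketch; ``MyDoc`` is a hypothetical schema
+        with a vector field ``tens`` declared with ``is_embedding=True``)::
+
+            index = MilvusDocumentIndex[MyDoc]()
+            index.index(DocList[MyDoc]([MyDoc(tens=np.zeros(10)) for _ in range(5)]))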
+ """ + + n_docs = 1 if isinstance(docs, BaseDoc) else len(docs) + self._logger.debug(f'Indexing {n_docs} documents') + docs = self._validate_docs(docs) + self._update_subindex_data(docs) + data_by_columns = self._get_col_value_dict(docs) + self._index_subindex(data_by_columns) + + positions: Dict[str, int] = { + info.name: num for num, info in enumerate(self._collection.schema.fields) + } + + for batch in self._get_batches( + docs, batch_size=self._runtime_config.batch_size + ): + entities: List[List[Any]] = [ + [] for _ in range(len(self._collection.schema)) + ] + for doc in batch: + # "serialized" will always be in the first position + entities[0].append(doc.to_base64(**self._db_config.serialize_config)) + for schema_field in self._collection.schema.fields: + if schema_field.name == 'serialized': + continue + column_value = self._get_values_by_column([doc], schema_field.name)[ + 0 + ] + if schema_field.dtype == DataType.FLOAT_VECTOR: + column_value = self._map_embedding(column_value) + + entities[positions[schema_field.name]].append(column_value) + self._collection.insert(entities) + + self._collection.flush() + self._logger.info(f"{len(docs)} documents has been indexed") + + def _filter_by_parent_id(self, id: str) -> Optional[List[str]]: + """Filter the ids of the subindex documents given id of root document. + + :param id: the root document id to filter by + :return: a list of ids of the subindex documents + """ + docs = self._filter(filter_query=f"parent_id == '{id}'", limit=self.num_docs()) + return [doc.id for doc in docs] # type: ignore[union-attr] + + def num_docs(self) -> int: + """ + Get the number of documents. + + !!! note + Cannot use Milvus' num_entities method because it's not precise + especially after delete ops (#15201 issue in Milvus) + """ + + self._collection.load() + + result = self._collection.query( + expr=self._always_true_expr("id"), + offset=0, + output_fields=["serialized"], + ) + + return len(result) + + def _get_items( + self, doc_ids: Sequence[str] + ) -> Union[Sequence[TSchema], Sequence[Dict[str, Any]]]: + """Get Documents from the index, by `id`. + If no document is found, a KeyError is raised. + + :param doc_ids: ids to get from the Document index + :param raw: if raw, output the new_schema type (with parent id) + :return: Sequence of Documents, sorted corresponding to the order of `doc_ids`. + Duplicate `doc_ids` can be omitted in the output. + """ + + self._collection.load() + results: List[Dict] = [] + for batch in self._get_batches( + doc_ids, batch_size=self._runtime_config.batch_size + ): + results.extend( + self._collection.query( + expr="id in " + str([id for id in batch]), + offset=0, + output_fields=["serialized"], + consistency_level=self._db_config.consistency_level, + ) + ) + + self._collection.release() + + return self._docs_from_query_response(results) + + def _del_items(self, doc_ids: Sequence[str]): + """Delete Documents from the index. + + :param doc_ids: ids to delete from the Document Store + """ + self._collection.load() + for batch in self._get_batches( + doc_ids, batch_size=self._runtime_config.batch_size + ): + self._collection.delete( + expr="id in " + str([id for id in batch]), + consistency_level=self._db_config.consistency_level, + ) + self._logger.info(f"{len(doc_ids)} documents has been deleted") + + def _filter( + self, + filter_query: Any, + limit: int, + ) -> Union[DocList, List[Dict]]: + """ + Filters the index based on the given filter query. + + :param filter_query: The filter condition. 
+        :param limit: The maximum number of results to return.
+        :return: Filter results.
+        """
+
+        self._collection.load()
+
+        result = self._collection.query(
+            expr=filter_query,
+            offset=0,
+            limit=min(limit, self.num_docs()),
+            output_fields=["serialized"],
+        )
+
+        self._collection.release()
+
+        return self._docs_from_query_response(result)
+
+    def _filter_batched(
+        self,
+        filter_queries: Any,
+        limit: int,
+    ) -> Union[List[DocList], List[List[Dict]]]:
+        """
+        Filters the index based on the given batch of filter queries.
+
+        :param filter_queries: The filter conditions.
+        :param limit: The maximum number of results to return for each filter query.
+        :return: Filter results.
+        """
+        return [
+            self._filter(filter_query=query, limit=limit) for query in filter_queries
+        ]
+
+    def _text_search(
+        self,
+        query: str,
+        limit: int,
+        search_field: str = '',
+    ) -> _FindResult:
+        raise NotImplementedError(f'{type(self)} does not support text search.')
+
+    def _text_search_batched(
+        self,
+        queries: Sequence[str],
+        limit: int,
+        search_field: str = '',
+    ) -> _FindResultBatched:
+        raise NotImplementedError(f'{type(self)} does not support text search.')
+
+    def _index(self, column_to_data: Dict[str, Generator[Any, None, None]]):
+        """index a document into the store"""
+        raise NotImplementedError()
+
+    def find(
+        self,
+        query: Union[AnyTensor, BaseDoc],
+        search_field: str = '',
+        limit: int = 10,
+        **kwargs,
+    ) -> FindResult:
+        """Find documents in the index using nearest neighbor search.
+
+        :param query: query vector for KNN/ANN search.
+            Can be either a tensor-like (np.array, torch.Tensor, etc.)
+            with a single axis, or a Document
+        :param search_field: name of the field to search on.
+            Documents in the index are retrieved based on the similarity
+            of this field to the query.
+        :param limit: maximum number of documents to return
+        :return: a named tuple containing `documents` and `scores`
+        """
+        self._logger.debug(f'Executing `find` for search field {search_field}')
+        if search_field != '':
+            raise ValueError(
+                'Argument search_field is not supported for MilvusDocumentIndex. '
+                'Set search_field to an empty string to proceed.'
+            )
+
+        search_field = self._field_name
+        if isinstance(query, BaseDoc):
+            query_vec = self._get_values_by_column([query], search_field)[0]
+        else:
+            query_vec = query
+        query_vec_np = self._to_numpy(query_vec)
+        docs, scores = self._find(
+            query_vec_np, search_field=search_field, limit=limit, **kwargs
+        )
+
+        if isinstance(docs, List) and not isinstance(docs, DocList):
+            docs = self._dict_list_to_docarray(docs)
+
+        return FindResult(documents=docs, scores=scores)
+
+    def _find(
+        self,
+        query: np.ndarray,
+        limit: int,
+        search_field: str = '',
+    ) -> _FindResult:
+        """
+        Conducts a search on the index.
+
+        :param query: The vector query to search.
+        :param limit: The maximum number of results to return.
+        :param search_field: The field to search the query.
+        :return: Search results.
+        """
+
+        return self._hybrid_search(query=query, limit=limit, search_field=search_field)
+
+    def _hybrid_search(
+        self,
+        query: np.ndarray,
+        limit: int,
+        search_field: str = '',
+        expr: Optional[str] = None,
+    ):
+        """
+        Conducts a hybrid search on the index, i.e. a vector search that can be
+        restricted by a boolean filter expression.
+
+        :param query: The vector query to search.
+        :param limit: The maximum number of results to return.
+        :param search_field: The field to search the query.
+        :param expr: Boolean expression used for filtering.
+        :return: Search results.
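+
+        Example (illustrative): passing ``expr="parent_id == 'some_id'"``
+        restricts the vector search to entities matching that expression;
+        with ``expr=None`` this reduces to a plain ANN search.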
+ """ + self._collection.load() + + results = self._collection.search( + data=[query], + anns_field=search_field, + param=self._db_config.search_params, + limit=limit, + offset=0, + expr=expr, + output_fields=["serialized"], + consistency_level=self._db_config.consistency_level, + ) + + self._collection.release() + + results = next(iter(results), None) # Only consider the first element + + return self._docs_from_find_response(results) + + def find_batched( + self, + queries: Union[AnyTensor, DocList], + search_field: str = '', + limit: int = 10, + **kwargs, + ) -> FindResultBatched: + """Find documents in the index using nearest neighbor search. + + :param queries: query vector for KNN/ANN search. + Can be either a tensor-like (np.array, torch.Tensor, etc.) with a, + or a DocList. + If a tensor-like is passed, it should have shape (batch_size, vector_dim) + :param search_field: name of the field to search on. + Documents in the index are retrieved based on this similarity + of this field to the query. + :param limit: maximum number of documents to return per query + :return: a named tuple containing `documents` and `scores` + """ + self._logger.debug(f'Executing `find_batched` for search field {search_field}') + + if search_field: + if '__' in search_field: + fields = search_field.split('__') + if issubclass(self._schema._get_field_type(fields[0]), AnyDocArray): # type: ignore + return self._subindices[fields[0]].find_batched( + queries, + search_field='__'.join(fields[1:]), + limit=limit, + **kwargs, + ) + if search_field != '': + raise ValueError( + 'Argument search_field is not supported for MilvusDocumentIndex.' + 'Set search_field to an empty string to proceed.' + ) + search_field = self._field_name + if isinstance(queries, Sequence): + query_vec_list = self._get_values_by_column(queries, search_field) + query_vec_np = np.stack( + tuple(self._to_numpy(query_vec) for query_vec in query_vec_list) + ) + else: + query_vec_np = self._to_numpy(queries) + + da_list, scores = self._find_batched( + query_vec_np, search_field=search_field, limit=limit, **kwargs + ) + if ( + len(da_list) > 0 + and isinstance(da_list[0], List) + and not isinstance(da_list[0], DocList) + ): + da_list = [self._dict_list_to_docarray(docs) for docs in da_list] + + return FindResultBatched(documents=da_list, scores=scores) # type: ignore + + def _find_batched( + self, + queries: np.ndarray, + limit: int, + search_field: str = '', + ) -> _FindResultBatched: + """ + Conducts a batched search on the index. + + :param queries: The queries to search. + :param limit: The maximum number of results to return for each query. + :param search_field: The field to search the queries. + :return: Search results. + """ + + self._collection.load() + + results = self._collection.search( + data=queries, + anns_field=self._field_name, + param=self._db_config.search_params, + limit=limit, + expr=None, + output_fields=["serialized"], + consistency_level=self._db_config.consistency_level, + ) + + self._collection.release() + + documents, scores = zip( + *[self._docs_from_find_response(result) for result in results] + ) + + return _FindResultBatched( + documents=list(documents), + scores=list(scores), + ) + + def execute_query(self, query: Any, *args, **kwargs) -> Any: + """ + Executes a hybrid query on the index. + + :param query: Query to execute on the index. + :return: Query results. 
+ """ + components: Dict[str, List[Dict[str, Any]]] = {} + for component, value in query: + if component not in components: + components[component] = [] + components[component].append(value) + + if ( + len(components) != 2 + or len(components.get('find', [])) != 1 + or len(components.get('filter', [])) != 1 + ): + raise ValueError( + 'The query must contain exactly one "find" and "filter" components.' + ) + + expr = components['filter'][0]['filter_query'] + query = components['find'][0]['query'] + limit = ( + components['find'][0].get('limit') + or components['filter'][0].get('limit') + or 10 + ) + docs, scores = self._hybrid_search( + query=query, + expr=expr, + search_field=self._field_name, + limit=limit, + ) + if isinstance(docs, List) and not isinstance(docs, DocList): + docs = self._dict_list_to_docarray(docs) + + return FindResult(documents=docs, scores=scores) + + def _docs_from_query_response(self, result: Sequence[Dict]) -> DocList[Any]: + return DocList[self._schema]( # type: ignore + [ + self._schema.from_base64( # type: ignore + result[i]["serialized"], **self._db_config.serialize_config + ) + for i in range(len(result)) + ] + ) + + def _docs_from_find_response(self, result: Hits) -> _FindResult: + scores: NdArray = NdArray._docarray_from_native( + np.array([hit.score for hit in result]) + ) + + return _FindResult( + documents=DocList[self.out_schema]( # type: ignore + [ + self.out_schema.from_base64( + hit.entity.get('serialized'), **self._db_config.serialize_config + ) + for hit in result + ] + ), + scores=scores, + ) + + def _always_true_expr(self, primary_key: str) -> str: + """ + Returns a Milvus expression that is always true, thus allowing for the retrieval of all entries in a Collection. + Assumes that the primary key is of type DataType.VARCHAR + + :param primary_key: the name of the primary key + :return: a Milvus expression that is always true for that primary key + """ + return f'({primary_key} in ["1"]) or ({primary_key} not in ["1"])' + + def _map_embedding(self, embedding: AnyTensor) -> np.ndarray: + """ + Milvus exclusively supports one-dimensional vectors. If multi-dimensional + vectors are provided, they will be automatically flattened to ensure compatibility. + + :param embedding: The original raw embedding, which can be in the form of a TensorFlow or PyTorch tensor. + :return embedding: A one-dimensional numpy array representing the flattened version of the original embedding. + """ + if embedding is None: + raise ValueError( + "Embedding is None. Each document must have a valid embedding." 
+            )
+
+        embedding = self._to_numpy(embedding)
+        if embedding.ndim > 1:
+            embedding = np.asarray(embedding).squeeze()  # type: ignore
+
+        return embedding
+
+    def _doc_exists(self, doc_id: str) -> bool:
+        result = self._collection.query(
+            expr="id in " + str([doc_id]),
+            offset=0,
+            output_fields=["serialized"],
+        )
+
+        return len(result) > 0
diff --git a/docarray/index/backends/redis.py b/docarray/index/backends/redis.py
index 937c77efdaa..2a338b424aa 100644
--- a/docarray/index/backends/redis.py
+++ b/docarray/index/backends/redis.py
@@ -1,4 +1,3 @@
-import uuid
 from collections import defaultdict
 from typing import (
     TypeVar,
@@ -94,11 +93,6 @@ def __init__(self, db_config=None, **kwargs):
         self._create_index()
         self._logger.info(f'{self.__class__.__name__} has been initialized')

-    @staticmethod
-    def _random_name() -> str:
-        """Generate a random index name."""
-        return uuid.uuid4().hex
-
     def _create_index(self) -> None:
         """Create a new index in the Redis database if it doesn't already exist."""
         if not self._check_index_exists(self.index_name):
@@ -220,7 +214,7 @@ class DBConfig(BaseDocIndex.DBConfig):
         :param host: The host address for the Redis server. Default is 'localhost'.
         :param port: The port number for the Redis server. Default is 6379.
         :param index_name: The name of the index in the Redis database.
-            In case it's not provided, a random index name will be generated.
+            If not provided, a default index name will be used.
         :param username: The username for the Redis server. Default is None.
         :param password: The password for the Redis server. Default is None.
         :param text_scorer: The method for scoring text during text search.
diff --git a/docarray/utils/_internal/misc.py b/docarray/utils/_internal/misc.py
index ad0d28d9c9e..c753ce303ea 100644
--- a/docarray/utils/_internal/misc.py
+++ b/docarray/utils/_internal/misc.py
@@ -50,6 +50,7 @@
     'boto3': '"docarray[aws]"',
     'botocore': '"docarray[aws]"',
     'redis': '"docarray[redis]"',
+    'pymilvus': '"docarray[milvus]"',
 }
diff --git a/poetry.lock b/poetry.lock
index dd8e1b1ef3f..ee685ca38b3 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
[[package]] name = "aiohttp" @@ -943,6 +943,28 @@ files = [ {file = "entrypoints-0.4.tar.gz", hash = "sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4"}, ] +[[package]] +name = "environs" +version = "9.5.0" +description = "simplified environment variable parsing" +category = "main" +optional = true +python-versions = ">=3.6" +files = [ + {file = "environs-9.5.0-py2.py3-none-any.whl", hash = "sha256:1e549569a3de49c05f856f40bce86979e7d5ffbbc4398e7f338574c220189124"}, + {file = "environs-9.5.0.tar.gz", hash = "sha256:a76307b36fbe856bdca7ee9161e6c466fd7fcffc297109a118c59b54e27e30c9"}, +] + +[package.dependencies] +marshmallow = ">=3.0.0" +python-dotenv = "*" + +[package.extras] +dev = ["dj-database-url", "dj-email-url", "django-cache-url", "flake8 (==4.0.1)", "flake8-bugbear (==21.9.2)", "mypy (==0.910)", "pre-commit (>=2.4,<3.0)", "pytest", "tox"] +django = ["dj-database-url", "dj-email-url", "django-cache-url"] +lint = ["flake8 (==4.0.1)", "flake8-bugbear (==21.9.2)", "mypy (==0.910)", "pre-commit (>=2.4,<3.0)"] +tests = ["dj-database-url", "dj-email-url", "django-cache-url", "pytest"] + [[package]] name = "exceptiongroup" version = "1.1.0" @@ -2109,6 +2131,27 @@ files = [ {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, ] +[[package]] +name = "marshmallow" +version = "3.19.0" +description = "A lightweight library for converting complex datatypes to and from native Python datatypes." +category = "main" +optional = true +python-versions = ">=3.7" +files = [ + {file = "marshmallow-3.19.0-py3-none-any.whl", hash = "sha256:93f0958568da045b0021ec6aeb7ac37c81bfcccbb9a0e7ed8559885070b3a19b"}, + {file = "marshmallow-3.19.0.tar.gz", hash = "sha256:90032c0fd650ce94b6ec6dc8dfeb0e3ff50c144586462c389b81a07205bedb78"}, +] + +[package.dependencies] +packaging = ">=17.0" + +[package.extras] +dev = ["flake8 (==5.0.4)", "flake8-bugbear (==22.10.25)", "mypy (==0.990)", "pre-commit (>=2.4,<3.0)", "pytest", "pytz", "simplejson", "tox"] +docs = ["alabaster (==0.7.12)", "autodocsumm (==0.2.9)", "sphinx (==5.3.0)", "sphinx-issues (==3.0.1)", "sphinx-version-warning (==1.1.2)"] +lint = ["flake8 (==5.0.4)", "flake8-bugbear (==22.10.25)", "mypy (==0.990)", "pre-commit (>=2.4,<3.0)"] +tests = ["pytest", "pytz", "simplejson"] + [[package]] name = "matplotlib-inline" version = "0.1.6" @@ -2348,8 +2391,8 @@ files = [ [package.dependencies] numpy = [ {version = ">1.20", markers = "python_version <= \"3.9\""}, - {version = ">=1.23.3", markers = "python_version > \"3.10\""}, {version = ">=1.21.2", markers = "python_version > \"3.9\""}, + {version = ">=1.23.3", markers = "python_version > \"3.10\""}, ] [package.extras] @@ -2749,44 +2792,6 @@ jupyter-server = ">=1.8,<3" [package.extras] test = ["pytest", "pytest-console-scripts", "pytest-tornasync"] -[[package]] -name = "numpy" -version = "1.21.1" -description = "NumPy is the fundamental package for array computing with Python." 
-category = "main" -optional = false -python-versions = ">=3.7" -files = [ - {file = "numpy-1.21.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:38e8648f9449a549a7dfe8d8755a5979b45b3538520d1e735637ef28e8c2dc50"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fd7d7409fa643a91d0a05c7554dd68aa9c9bb16e186f6ccfe40d6e003156e33a"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a75b4498b1e93d8b700282dc8e655b8bd559c0904b3910b144646dbbbc03e062"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1412aa0aec3e00bc23fbb8664d76552b4efde98fb71f60737c83efbac24112f1"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e46ceaff65609b5399163de5893d8f2a82d3c77d5e56d976c8b5fb01faa6b671"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:c6a2324085dd52f96498419ba95b5777e40b6bcbc20088fddb9e8cbb58885e8e"}, - {file = "numpy-1.21.1-cp37-cp37m-win32.whl", hash = "sha256:73101b2a1fef16602696d133db402a7e7586654682244344b8329cdcbbb82172"}, - {file = "numpy-1.21.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7a708a79c9a9d26904d1cca8d383bf869edf6f8e7650d85dbc77b041e8c5a0f8"}, - {file = "numpy-1.21.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95b995d0c413f5d0428b3f880e8fe1660ff9396dcd1f9eedbc311f37b5652e16"}, - {file = "numpy-1.21.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:635e6bd31c9fb3d475c8f44a089569070d10a9ef18ed13738b03049280281267"}, - {file = "numpy-1.21.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4a3d5fb89bfe21be2ef47c0614b9c9c707b7362386c9a3ff1feae63e0267ccb6"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8a326af80e86d0e9ce92bcc1e65c8ff88297de4fa14ee936cb2293d414c9ec63"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:791492091744b0fe390a6ce85cc1bf5149968ac7d5f0477288f78c89b385d9af"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0318c465786c1f63ac05d7c4dbcecd4d2d7e13f0959b01b534ea1e92202235c5"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a513bd9c1551894ee3d31369f9b07460ef223694098cf27d399513415855b68"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:91c6f5fc58df1e0a3cc0c3a717bb3308ff850abdaa6d2d802573ee2b11f674a8"}, - {file = "numpy-1.21.1-cp38-cp38-win32.whl", hash = "sha256:978010b68e17150db8765355d1ccdd450f9fc916824e8c4e35ee620590e234cd"}, - {file = "numpy-1.21.1-cp38-cp38-win_amd64.whl", hash = "sha256:9749a40a5b22333467f02fe11edc98f022133ee1bfa8ab99bda5e5437b831214"}, - {file = "numpy-1.21.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d7a4aeac3b94af92a9373d6e77b37691b86411f9745190d2c351f410ab3a791f"}, - {file = "numpy-1.21.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d9e7912a56108aba9b31df688a4c4f5cb0d9d3787386b87d504762b6754fbb1b"}, - {file = "numpy-1.21.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:25b40b98ebdd272bc3020935427a4530b7d60dfbe1ab9381a39147834e985eac"}, - {file = "numpy-1.21.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8a92c5aea763d14ba9d6475803fc7904bda7decc2a0a68153f587ad82941fec1"}, - {file = "numpy-1.21.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:05a0f648eb28bae4bcb204e6fd14603de2908de982e761a2fc78efe0f19e96e1"}, - {file = "numpy-1.21.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f01f28075a92eede918b965e86e8f0ba7b7797a95aa8d35e1cc8821f5fc3ad6a"}, - {file = "numpy-1.21.1-cp39-cp39-win32.whl", hash = "sha256:88c0b89ad1cc24a5efbb99ff9ab5db0f9a86e9cc50240177a571fbe9c2860ac2"}, - {file = "numpy-1.21.1-cp39-cp39-win_amd64.whl", hash = "sha256:01721eefe70544d548425a07c80be8377096a54118070b8a62476866d5208e33"}, - {file = "numpy-1.21.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2d4d1de6e6fb3d28781c73fbde702ac97f03d79e4ffd6598b880b2d95d62ead4"}, - {file = "numpy-1.21.1.zip", hash = "sha256:dff4af63638afcc57a3dfb9e4b26d434a7a602d225b42d746ea7fe2edf1342fd"}, -] - [[package]] name = "numpy" version = "1.24.4" @@ -2985,37 +2990,71 @@ pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" [[package]] name = "pandas" -version = "1.1.0" +version = "2.0.3" description = "Powerful data structures for data analysis, time series, and statistics" category = "main" optional = true -python-versions = ">=3.6.1" +python-versions = ">=3.8" files = [ - {file = "pandas-1.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:47a03bfef80d6812c91ed6fae43f04f2fa80a4e1b82b35aa4d9002e39529e0b8"}, - {file = "pandas-1.1.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:0210f8fe19c2667a3817adb6de2c4fd92b1b78e1975ca60c0efa908e0985cbdb"}, - {file = "pandas-1.1.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:35db623487f00d9392d8af44a24516d6cb9f274afaf73cfcfe180b9c54e007d2"}, - {file = "pandas-1.1.0-cp36-cp36m-win32.whl", hash = "sha256:4d1a806252001c5db7caecbe1a26e49a6c23421d85a700960f6ba093112f54a1"}, - {file = "pandas-1.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:9f61cca5262840ff46ef857d4f5f65679b82188709d0e5e086a9123791f721c8"}, - {file = "pandas-1.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:182a5aeae319df391c3df4740bb17d5300dcd78034b17732c12e62e6dd79e4a4"}, - {file = "pandas-1.1.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:40ec0a7f611a3d00d3c666c4cceb9aa3f5bf9fbd81392948a93663064f527203"}, - {file = "pandas-1.1.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:16504f915f1ae424052f1e9b7cd2d01786f098fbb00fa4e0f69d42b22952d798"}, - {file = "pandas-1.1.0-cp37-cp37m-win32.whl", hash = "sha256:fc714895b6de6803ac9f661abb316853d0cd657f5d23985222255ad76ccedc25"}, - {file = "pandas-1.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a15835c8409d5edc50b4af93be3377b5dd3eb53517e7f785060df1f06f6da0e2"}, - {file = "pandas-1.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0bc440493cf9dc5b36d5d46bbd5508f6547ba68b02a28234cd8e81fdce42744d"}, - {file = "pandas-1.1.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:4b21d46728f8a6be537716035b445e7ef3a75dbd30bd31aa1b251323219d853e"}, - {file = "pandas-1.1.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0227e3a6e3a22c0e283a5041f1e3064d78fbde811217668bb966ed05386d8a7e"}, - {file = "pandas-1.1.0-cp38-cp38-win32.whl", hash = "sha256:ed60848caadeacecefd0b1de81b91beff23960032cded0ac1449242b506a3b3f"}, - {file = "pandas-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:60e20a4ab4d4fec253557d0fc9a4e4095c37b664f78c72af24860c8adcd07088"}, - {file = "pandas-1.1.0.tar.gz", hash = "sha256:b39508562ad0bb3f384b0db24da7d68a2608b9ddc85b1d931ccaaa92d5e45273"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, + {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, + {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, + {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, + {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, + {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, + {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, + {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, + {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, + {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, ] [package.dependencies] -numpy = ">=1.15.4" -python-dateutil = ">=2.7.3" -pytz = ">=2017.2" +numpy = [ + 
{version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" [package.extras] -test = ["hypothesis (>=3.58)", "pytest (>=4.0.2)", "pytest-xdist"] +all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] +aws = ["s3fs (>=2021.08.0)"] +clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] +compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] +computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2021.07.0)"] +gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] +hdf5 = ["tables (>=3.6.1)"] +html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] +mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] +spss = ["pyreadstat (>=1.1.2)"] +sql-other = ["SQLAlchemy (>=1.4.16)"] +test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.6.3)"] [[package]] name = "pandocfilters" @@ -3465,6 +3504,26 @@ files = [ markdown = ">=3.2" pyyaml = "*" +[[package]] +name = "pymilvus" +version = "2.2.13" +description = "Python Sdk for Milvus" +category = "main" +optional = true +python-versions = ">=3.7" +files = [ + {file = "pymilvus-2.2.13-py3-none-any.whl", hash = "sha256:ac991863bd63e860c1210d096695297175c6ed09f4de762cf42394cb5aecd1f6"}, + {file = "pymilvus-2.2.13.tar.gz", hash = "sha256:72da36cb5f4f84d7a8307202fcaa9a7fc4497d28d2d2235045ba93a430691ef1"}, +] + +[package.dependencies] +environs = "<=9.5.0" +grpcio = ">=1.49.1,<=1.56.0" +numpy = {version = "<1.25.0", markers = "python_version <= \"3.8\""} +pandas = ">=1.2.4" +protobuf = ">=3.20.0" +ujson = ">=2.0.0" + [[package]] name = "pyparsing" version = "3.0.9" @@ -3588,6 +3647,21 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-dotenv" +version = "1.0.0" +description = "Read key-value pairs from a .env file and set them as environment variables" +category = "main" +optional = true +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, + {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = 
"sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "python-jose" version = "3.3.0" @@ -4674,6 +4748,89 @@ files = [ mypy-extensions = ">=0.3.0" typing-extensions = ">=3.7.4" +[[package]] +name = "tzdata" +version = "2023.3" +description = "Provider of IANA time zone data" +category = "main" +optional = true +python-versions = ">=2" +files = [ + {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, + {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, +] + +[[package]] +name = "ujson" +version = "5.8.0" +description = "Ultra fast JSON encoder and decoder for Python" +category = "main" +optional = true +python-versions = ">=3.8" +files = [ + {file = "ujson-5.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f4511560d75b15ecb367eef561554959b9d49b6ec3b8d5634212f9fed74a6df1"}, + {file = "ujson-5.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9399eaa5d1931a0ead49dce3ffacbea63f3177978588b956036bfe53cdf6af75"}, + {file = "ujson-5.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4e7bb7eba0e1963f8b768f9c458ecb193e5bf6977090182e2b4f4408f35ac76"}, + {file = "ujson-5.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40931d7c08c4ce99adc4b409ddb1bbb01635a950e81239c2382cfe24251b127a"}, + {file = "ujson-5.8.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d53039d39de65360e924b511c7ca1a67b0975c34c015dd468fca492b11caa8f7"}, + {file = "ujson-5.8.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bdf04c6af3852161be9613e458a1fb67327910391de8ffedb8332e60800147a2"}, + {file = "ujson-5.8.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a70f776bda2e5072a086c02792c7863ba5833d565189e09fabbd04c8b4c3abba"}, + {file = "ujson-5.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f26629ac531d712f93192c233a74888bc8b8212558bd7d04c349125f10199fcf"}, + {file = "ujson-5.8.0-cp310-cp310-win32.whl", hash = "sha256:7ecc33b107ae88405aebdb8d82c13d6944be2331ebb04399134c03171509371a"}, + {file = "ujson-5.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:3b27a8da7a080add559a3b73ec9ebd52e82cc4419f7c6fb7266e62439a055ed0"}, + {file = "ujson-5.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:193349a998cd821483a25f5df30b44e8f495423840ee11b3b28df092ddfd0f7f"}, + {file = "ujson-5.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ddeabbc78b2aed531f167d1e70387b151900bc856d61e9325fcdfefb2a51ad8"}, + {file = "ujson-5.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ce24909a9c25062e60653073dd6d5e6ec9d6ad7ed6e0069450d5b673c854405"}, + {file = "ujson-5.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27a2a3c7620ebe43641e926a1062bc04e92dbe90d3501687957d71b4bdddaec4"}, + {file = "ujson-5.8.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b852bdf920fe9f84e2a2c210cc45f1b64f763b4f7d01468b33f7791698e455e"}, + {file = "ujson-5.8.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:20768961a6a706170497129960762ded9c89fb1c10db2989c56956b162e2a8a3"}, + {file = "ujson-5.8.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e0147d41e9fb5cd174207c4a2895c5e24813204499fd0839951d4c8784a23bf5"}, + {file = 
"ujson-5.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e3673053b036fd161ae7a5a33358ccae6793ee89fd499000204676baafd7b3aa"}, + {file = "ujson-5.8.0-cp311-cp311-win32.whl", hash = "sha256:a89cf3cd8bf33a37600431b7024a7ccf499db25f9f0b332947fbc79043aad879"}, + {file = "ujson-5.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:3659deec9ab9eb19e8646932bfe6fe22730757c4addbe9d7d5544e879dc1b721"}, + {file = "ujson-5.8.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:102bf31c56f59538cccdfec45649780ae00657e86247c07edac434cb14d5388c"}, + {file = "ujson-5.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:299a312c3e85edee1178cb6453645217ba23b4e3186412677fa48e9a7f986de6"}, + {file = "ujson-5.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2e385a7679b9088d7bc43a64811a7713cc7c33d032d020f757c54e7d41931ae"}, + {file = "ujson-5.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad24ec130855d4430a682c7a60ca0bc158f8253ec81feed4073801f6b6cb681b"}, + {file = "ujson-5.8.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16fde596d5e45bdf0d7de615346a102510ac8c405098e5595625015b0d4b5296"}, + {file = "ujson-5.8.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:6d230d870d1ce03df915e694dcfa3f4e8714369cce2346686dbe0bc8e3f135e7"}, + {file = "ujson-5.8.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:9571de0c53db5cbc265945e08f093f093af2c5a11e14772c72d8e37fceeedd08"}, + {file = "ujson-5.8.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7cba16b26efe774c096a5e822e4f27097b7c81ed6fb5264a2b3f5fd8784bab30"}, + {file = "ujson-5.8.0-cp312-cp312-win32.whl", hash = "sha256:48c7d373ff22366eecfa36a52b9b55b0ee5bd44c2b50e16084aa88b9de038916"}, + {file = "ujson-5.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:5ac97b1e182d81cf395ded620528c59f4177eee024b4b39a50cdd7b720fdeec6"}, + {file = "ujson-5.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2a64cc32bb4a436e5813b83f5aab0889927e5ea1788bf99b930fad853c5625cb"}, + {file = "ujson-5.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e54578fa8838ddc722539a752adfce9372474114f8c127bb316db5392d942f8b"}, + {file = "ujson-5.8.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9721cd112b5e4687cb4ade12a7b8af8b048d4991227ae8066d9c4b3a6642a582"}, + {file = "ujson-5.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d9707e5aacf63fb919f6237d6490c4e0244c7f8d3dc2a0f84d7dec5db7cb54c"}, + {file = "ujson-5.8.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0be81bae295f65a6896b0c9030b55a106fb2dec69ef877253a87bc7c9c5308f7"}, + {file = "ujson-5.8.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ae7f4725c344bf437e9b881019c558416fe84ad9c6b67426416c131ad577df67"}, + {file = "ujson-5.8.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:9ab282d67ef3097105552bf151438b551cc4bedb3f24d80fada830f2e132aeb9"}, + {file = "ujson-5.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:94c7bd9880fa33fcf7f6d7f4cc032e2371adee3c5dba2922b918987141d1bf07"}, + {file = "ujson-5.8.0-cp38-cp38-win32.whl", hash = "sha256:bf5737dbcfe0fa0ac8fa599eceafae86b376492c8f1e4b84e3adf765f03fb564"}, + {file = "ujson-5.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:11da6bed916f9bfacf13f4fc6a9594abd62b2bb115acfb17a77b0f03bee4cfd5"}, + {file = "ujson-5.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:69b3104a2603bab510497ceabc186ba40fef38ec731c0ccaa662e01ff94a985c"}, 
+ {file = "ujson-5.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9249fdefeb021e00b46025e77feed89cd91ffe9b3a49415239103fc1d5d9c29a"}, + {file = "ujson-5.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2873d196725a8193f56dde527b322c4bc79ed97cd60f1d087826ac3290cf9207"}, + {file = "ujson-5.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a4dafa9010c366589f55afb0fd67084acd8added1a51251008f9ff2c3e44042"}, + {file = "ujson-5.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a42baa647a50fa8bed53d4e242be61023bd37b93577f27f90ffe521ac9dc7a3"}, + {file = "ujson-5.8.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f3554eaadffe416c6f543af442066afa6549edbc34fe6a7719818c3e72ebfe95"}, + {file = "ujson-5.8.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:fb87decf38cc82bcdea1d7511e73629e651bdec3a43ab40985167ab8449b769c"}, + {file = "ujson-5.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:407d60eb942c318482bbfb1e66be093308bb11617d41c613e33b4ce5be789adc"}, + {file = "ujson-5.8.0-cp39-cp39-win32.whl", hash = "sha256:0fe1b7edaf560ca6ab023f81cbeaf9946a240876a993b8c5a21a1c539171d903"}, + {file = "ujson-5.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:3f9b63530a5392eb687baff3989d0fb5f45194ae5b1ca8276282fb647f8dcdb3"}, + {file = "ujson-5.8.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:efeddf950fb15a832376c0c01d8d7713479fbeceaed1eaecb2665aa62c305aec"}, + {file = "ujson-5.8.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d8283ac5d03e65f488530c43d6610134309085b71db4f675e9cf5dff96a8282"}, + {file = "ujson-5.8.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb0142f6f10f57598655340a3b2c70ed4646cbe674191da195eb0985a9813b83"}, + {file = "ujson-5.8.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07d459aca895eb17eb463b00441986b021b9312c6c8cc1d06880925c7f51009c"}, + {file = "ujson-5.8.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:d524a8c15cfc863705991d70bbec998456a42c405c291d0f84a74ad7f35c5109"}, + {file = "ujson-5.8.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:d6f84a7a175c75beecde53a624881ff618e9433045a69fcfb5e154b73cdaa377"}, + {file = "ujson-5.8.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b748797131ac7b29826d1524db1cc366d2722ab7afacc2ce1287cdafccddbf1f"}, + {file = "ujson-5.8.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e72ba76313d48a1a3a42e7dc9d1db32ea93fac782ad8dde6f8b13e35c229130"}, + {file = "ujson-5.8.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f504117a39cb98abba4153bf0b46b4954cc5d62f6351a14660201500ba31fe7f"}, + {file = "ujson-5.8.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a8c91b6f4bf23f274af9002b128d133b735141e867109487d17e344d38b87d94"}, + {file = "ujson-5.8.0.tar.gz", hash = "sha256:78e318def4ade898a461b3d92a79f9441e7e0e4d2ad5419abed4336d702c7425"}, +] + [[package]] name = "urllib3" version = "1.26.14" @@ -5103,6 +5260,7 @@ image = ["pillow", "types-pillow"] jac = ["jina-hubble-sdk"] jax = ["jax"] mesh = ["trimesh"] +milvus = ["pymilvus"] pandas = ["pandas"] proto = ["lz4", "protobuf"] qdrant = ["qdrant-client"] @@ -5115,4 +5273,4 @@ web = ["fastapi"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = 
"7b92f58355832b250432c909539267349a32496c47e7ee5fa5fddfc59b843d90" +content-hash = "3f67aa7266b35860429a1911f5acdbef0db4b0ec2b5151ae2f563030c177c19e" diff --git a/pyproject.toml b/pyproject.toml index 3e0e2ee40a9..e23e21d4ada 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ classifiers = [ python = ">=3.8,<4.0" pydantic = ">=1.10.2,<2.0.0" numpy = ">=1.17.3" -protobuf = { version = ">=3.19.0", optional = true } +protobuf = { version = ">=3.20.0", optional = true } torch = { version = ">=1.0.0", optional = true } orjson = ">=3.8.2" pillow = {version = ">=9.3.0", optional = true } @@ -58,6 +58,7 @@ smart-open = {version = ">=6.3.0", extras = ["s3"], optional = true} jina-hubble-sdk = {version = ">=0.34.0", optional = true} elastic-transport = {version ="^8.4.0", optional = true } qdrant-client = {version = ">=1.1.4", python = "<3.12", optional = true } +pymilvus = {version = "^2.2.12", optional = true } redis = {version = "^4.6.0", optional = true} jax = {version = ">=0.4.10", optional = true} @@ -76,6 +77,7 @@ torch = ["torch"] web = ["fastapi"] qdrant = ["qdrant-client"] weaviate = ["weaviate-client"] +milvus = ["pymilvus"] redis = ['redis'] jax = ["jaxlib","jax"] @@ -157,4 +159,4 @@ markers = [ "index: marks test using a document index", "benchmark: marks slow benchmarking tests", "elasticv8: marks test that run with ElasticSearch v8", -] +] \ No newline at end of file diff --git a/tests/index/milvus/__init__.py b/tests/index/milvus/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/index/milvus/docker-compose.yml b/tests/index/milvus/docker-compose.yml new file mode 100644 index 00000000000..ea5367c4188 --- /dev/null +++ b/tests/index/milvus/docker-compose.yml @@ -0,0 +1,49 @@ +version: '3.5' + +services: + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.5.5 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2023-03-20T20-16-18Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + standalone: + container_name: milvus-standalone + image: milvusdb/milvus:v2.2.11 + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + +networks: + default: + name: milvus \ No newline at end of file diff --git a/tests/index/milvus/fixtures.py b/tests/index/milvus/fixtures.py new file mode 100644 index 00000000000..7a1ffe3dd1c --- /dev/null +++ b/tests/index/milvus/fixtures.py @@ -0,0 +1,26 @@ +import string +import random + +import pytest +import time +import os + + +cur_dir = os.path.dirname(os.path.abspath(__file__)) +milvus_yml = os.path.abspath(os.path.join(cur_dir, 'docker-compose.yml')) + + +@pytest.fixture(scope='session', autouse=True) +def start_storage(): + os.system(f"docker compose -f 
{milvus_yml} up -d --remove-orphans") + time.sleep(2) + + yield + os.system(f"docker compose -f {milvus_yml} down --remove-orphans") + + +@pytest.fixture(scope='function') +def tmp_index_name(): + letters = string.ascii_lowercase + random_string = ''.join(random.choice(letters) for _ in range(15)) + return random_string diff --git a/tests/index/milvus/test_configuration.py b/tests/index/milvus/test_configuration.py new file mode 100644 index 00000000000..ada12fcaa99 --- /dev/null +++ b/tests/index/milvus/test_configuration.py @@ -0,0 +1,67 @@ +import numpy as np +import pytest +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import MilvusDocumentIndex +from docarray.typing import NdArray +from tests.index.milvus.fixtures import start_storage, tmp_index_name # noqa: F401 + + +pytestmark = [pytest.mark.slow, pytest.mark.index] + + +def test_configure_dim(): + class Schema1(BaseDoc): + tens: NdArray[10] = Field(is_embedding=True) + + index = MilvusDocumentIndex[Schema1]() + + docs = [Schema1(tens=np.random.random((10,))) for _ in range(10)] + index.index(docs) + + assert index.num_docs() == 10 + + class Schema2(BaseDoc): + tens: NdArray = Field(is_embedding=True, dim=10) + + index = MilvusDocumentIndex[Schema2]() + + docs = [Schema2(tens=np.random.random((10,))) for _ in range(10)] + index.index(docs) + + assert index.num_docs() == 10 + + class Schema3(BaseDoc): + tens: NdArray = Field(is_embedding=True) + + with pytest.raises(ValueError, match='The dimension information is missing'): + MilvusDocumentIndex[Schema3]() + + +def test_incorrect_vector_field(): + class Schema1(BaseDoc): + tens: NdArray[10] + + with pytest.raises(ValueError, match='Unable to find any vector columns'): + MilvusDocumentIndex[Schema1]() + + class Schema2(BaseDoc): + tens1: NdArray[10] = Field(is_embedding=True) + tens2: NdArray[20] = Field(is_embedding=True) + + with pytest.raises( + ValueError, match='Specifying multiple vector fields is not supported' + ): + MilvusDocumentIndex[Schema2]() + + +def test_runtime_config(): + class Schema(BaseDoc): + tens: NdArray = Field(dim=10, is_embedding=True) + + index = MilvusDocumentIndex[Schema]() + assert index._runtime_config.batch_size == 100 + + index.configure(batch_size=10) + assert index._runtime_config.batch_size == 10 diff --git a/tests/index/milvus/test_find.py b/tests/index/milvus/test_find.py new file mode 100644 index 00000000000..1fcc8f1ec1c --- /dev/null +++ b/tests/index/milvus/test_find.py @@ -0,0 +1,288 @@ +import numpy as np +import pytest +from pydantic import Field + +from docarray import BaseDoc, DocList +from docarray.index import MilvusDocumentIndex +from docarray.typing import NdArray, TorchTensor +from tests.index.milvus.fixtures import start_storage, tmp_index_name # noqa: F401 + +pytestmark = [pytest.mark.slow, pytest.mark.index] + + +class SimpleDoc(BaseDoc): + tens: NdArray[10] = Field(is_embedding=True, dim=1000) # type: ignore[valid-type] + + +class FlatDoc(BaseDoc): + tens_one: NdArray = Field(is_embedding=True, dim=10) + tens_two: NdArray = Field(dim=50) + + +class TorchDoc(BaseDoc): + tens: TorchTensor[10] = Field(is_embedding=True) # type: ignore[valid-type] + + +@pytest.mark.parametrize('space', ['l2', 'ip']) +def test_find_simple_schema(space, tmp_index_name): + class SimpleSchema(BaseDoc): + tens: NdArray[10] = Field(is_embedding=True, space=space) # type: ignore[valid-type] + + index = MilvusDocumentIndex[SimpleSchema](index_name=tmp_index_name) + + index_docs = [SimpleDoc(tens=np.zeros(10)) for _ in 
range(10)] + index_docs.append(SimpleDoc(tens=np.ones(10))) + index.index(index_docs) + + query = SimpleDoc(tens=np.ones(10)) + + docs, scores = index.find(query, limit=5) + + assert len(docs) == 5 + assert len(scores) == 5 + + +def test_find_torch(tmp_index_name): + index = MilvusDocumentIndex[TorchDoc](index_name=tmp_index_name) + + index_docs = [TorchDoc(tens=np.zeros(10)) for _ in range(10)] + index_docs.append(TorchDoc(tens=np.ones(10))) + index.index(index_docs) + + for doc in index_docs: + assert isinstance(doc.tens, TorchTensor) + + query = TorchDoc(tens=np.ones(10)) + + result_docs, scores = index.find(query, limit=5) + + assert len(result_docs) == 5 + assert len(scores) == 5 + for doc in result_docs: + assert isinstance(doc.tens, TorchTensor) + + +@pytest.mark.tensorflow +def test_find_tensorflow(): + from docarray.typing import TensorFlowTensor + + class TfDoc(BaseDoc): + tens: TensorFlowTensor[10] = Field(is_embedding=True) # type: ignore[valid-type] + + index = MilvusDocumentIndex[TfDoc]() + + index_docs = [TfDoc(tens=np.random.rand(10)) for _ in range(10)] + index.index(index_docs) + + for doc in index_docs: + assert isinstance(doc.tens, TensorFlowTensor) + + query = index_docs[-1] + docs, scores = index.find(query, limit=5) + + assert len(docs) == 5 + assert len(scores) == 5 + for doc in docs: + assert isinstance(doc.tens, TensorFlowTensor) + + +def test_find_batched(tmp_index_name): # noqa: F811 + class SimpleSchema(BaseDoc): + tens: NdArray[10] = Field(is_embedding=True) + + index = MilvusDocumentIndex[SimpleSchema](index_name=tmp_index_name) + + index_docs = [SimpleDoc(tens=vector) for vector in np.identity(10)] + index.index(index_docs) + + queries = DocList[SimpleDoc]( + [ + SimpleDoc( + tens=np.array([0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + ), + SimpleDoc( + tens=np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1]) + ), + ] + ) + + docs, scores = index.find_batched(queries, limit=1) + + assert len(docs) == 2 + assert len(docs[0]) == 1 + assert len(docs[1]) == 1 + assert len(scores) == 2 + assert len(scores[0]) == 1 + assert len(scores[1]) == 1 + + +def test_contain(tmp_index_name): + class SimpleDoc(BaseDoc): + tens: NdArray[10] = Field(is_embedding=True) + + class SimpleSchema(BaseDoc): + tens: NdArray[10] = Field(is_embedding=True) + + index = MilvusDocumentIndex[SimpleSchema](index_name=tmp_index_name) + index_docs = [SimpleDoc(tens=np.zeros(10)) for _ in range(10)] + + assert (index_docs[0] in index) is False + + index.index(index_docs) + + for doc in index_docs: + assert (doc in index) is True + + index_docs_new = [SimpleDoc(tens=np.zeros(10)) for _ in range(10)] + for doc in index_docs_new: + assert (doc in index) is False + + +@pytest.mark.parametrize('space', ['l2', 'ip']) +def test_find_flat_schema(space, tmp_index_name): + class FlatSchema(BaseDoc): + tens_one: NdArray[10] = Field(space=space, is_embedding=True) + tens_two: NdArray[50] = Field(space=space) + + index = MilvusDocumentIndex[FlatSchema](index_name=tmp_index_name) + + index_docs = [ + FlatDoc(tens_one=np.zeros(10), tens_two=np.zeros(50)) for _ in range(10) + ] + index_docs.append(FlatDoc(tens_one=np.zeros(10), tens_two=np.ones(50))) + index_docs.append(FlatDoc(tens_one=np.ones(10), tens_two=np.zeros(50))) + index.index(index_docs) + + query = FlatDoc(tens_one=np.ones(10), tens_two=np.ones(50)) + + # find on tens_one + docs, scores = index.find(query, limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + + +def test_find_nested_schema(tmp_index_name): + class 
SimpleDoc(BaseDoc): + tens: NdArray[10] # type: ignore[valid-type] + + class NestedDoc(BaseDoc): + d: SimpleDoc + tens: NdArray[10] # type: ignore[valid-type] + + class DeepNestedDoc(BaseDoc): + d: NestedDoc + tens: NdArray[10] = Field(is_embedding=True) + + index = MilvusDocumentIndex[DeepNestedDoc](index_name=tmp_index_name) + + index_docs = [ + DeepNestedDoc( + d=NestedDoc(d=SimpleDoc(tens=np.zeros(10)), tens=np.zeros(10)), + tens=np.zeros(10), + ) + for _ in range(10) + ] + index_docs.append( + DeepNestedDoc( + d=NestedDoc(d=SimpleDoc(tens=np.ones(10)), tens=np.zeros(10)), + tens=np.zeros(10), + ) + ) + index_docs.append( + DeepNestedDoc( + d=NestedDoc(d=SimpleDoc(tens=np.zeros(10)), tens=np.ones(10)), + tens=np.zeros(10), + ) + ) + index_docs.append( + DeepNestedDoc( + d=NestedDoc(d=SimpleDoc(tens=np.zeros(10)), tens=np.zeros(10)), + tens=np.ones(10), + ) + ) + index.index(index_docs) + + query = DeepNestedDoc( + d=NestedDoc(d=SimpleDoc(tens=np.ones(10)), tens=np.ones(10)), tens=np.ones(10) + ) + + # find on root level (only support one level now) + docs, scores = index.find(query, limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + + +def test_find_empty_index(tmp_index_name): + empty_index = MilvusDocumentIndex[SimpleDoc](index_name=tmp_index_name) + query = SimpleDoc(tens=np.random.rand(10)) + + docs, scores = empty_index.find(query, limit=5) + assert len(docs) == 0 + assert len(scores) == 0 + + +def test_simple_usage(tmp_index_name): + class MyDoc(BaseDoc): + text: str + embedding: NdArray[128] = Field(is_embedding=True) + + docs = [MyDoc(text='hey', embedding=np.random.rand(128)) for _ in range(200)] + queries = docs[0:3] + index = MilvusDocumentIndex[MyDoc](index_name=tmp_index_name) + index.index(docs=DocList[MyDoc](docs)) + resp = index.find_batched(queries=queries, limit=5) + docs_responses = resp.documents + assert len(docs_responses) == 3 + for q, matches in zip(queries, docs_responses): + assert len(matches) == 5 + assert q.id == matches[0].id + + +def test_filter_range(tmp_index_name): # noqa: F811 + class SimpleSchema(BaseDoc): + embedding: NdArray[10] = Field(space='l2', is_embedding=True) # type: ignore[valid-type] + number: int + + index = MilvusDocumentIndex[SimpleSchema](index_name=tmp_index_name) + + index_docs = [ + SimpleSchema( + embedding=np.zeros(10), + number=i, + ) + for i in range(10) + ] + index.index(index_docs) + + docs = index.filter("number > 8", limit=5) + + assert len(docs) == 1 + + docs = index.filter(f"id == '{index_docs[0].id}'", limit=5) + assert docs[0].id == index_docs[0].id + + +def test_query_builder(tmp_index_name): + class SimpleSchema(BaseDoc): + tensor: NdArray[10] = Field(is_embedding=True) + price: int + + db = MilvusDocumentIndex[SimpleSchema](index_name=tmp_index_name) + + index_docs = [ + SimpleSchema(tensor=np.array([i + 1] * 10), price=i + 1) for i in range(10) + ] + db.index(index_docs) + + q = ( + db.build_query() + .find(query=np.ones(10), limit=5) + .filter(filter_query='price <= 3') + .build() + ) + + docs, scores = db.execute_query(q) + + assert len(docs) == 3 + for doc in docs: + assert doc.price <= 3 diff --git a/tests/index/milvus/test_index_get_del.py b/tests/index/milvus/test_index_get_del.py new file mode 100644 index 00000000000..b10c5843a15 --- /dev/null +++ b/tests/index/milvus/test_index_get_del.py @@ -0,0 +1,147 @@ +import numpy as np +import pytest +import torch +from pydantic import Field + +from docarray import BaseDoc, DocList +from docarray.index import MilvusDocumentIndex +from docarray.typing import 
NdArray, TorchTensor +from tests.index.milvus.fixtures import start_storage, tmp_index_name # noqa: F401 + +pytestmark = [pytest.mark.slow, pytest.mark.index] + + +class SimpleDoc(BaseDoc): + tens: NdArray[10] = Field(is_embedding=True) + + +class FlatDoc(BaseDoc): + tens_one: NdArray[10] = Field(is_embedding=True) + tens_two: NdArray[50] + + +class NestedDoc(BaseDoc): + d: SimpleDoc + + +class DeepNestedDoc(BaseDoc): + d: NestedDoc + + +class TorchDoc(BaseDoc): + tens: TorchTensor[10] = Field(is_embedding=True) # type: ignore[valid-type] + + +@pytest.fixture +def ten_simple_docs(): + return [SimpleDoc(tens=np.random.randn(10)) for _ in range(10)] + + +@pytest.fixture +def ten_flat_docs(): + return [ + FlatDoc(tens_one=np.random.randn(10), tens_two=np.random.randn(50)) + for _ in range(10) + ] + + +@pytest.fixture +def ten_nested_docs(): + return [NestedDoc(d=SimpleDoc(tens=np.random.randn(10))) for _ in range(10)] + + +@pytest.mark.parametrize('use_docarray', [True, False]) +def test_index_simple_schema( + ten_simple_docs, use_docarray, tmp_index_name +): # noqa: F811 + index = MilvusDocumentIndex[SimpleDoc](index_name=tmp_index_name) + if use_docarray: + ten_simple_docs = DocList[SimpleDoc](ten_simple_docs) + + index.index(ten_simple_docs) + assert index.num_docs() == 10 + + +@pytest.mark.parametrize('use_docarray', [True, False]) +def test_index_flat_schema(ten_flat_docs, use_docarray, tmp_index_name): # noqa: F811 + index = MilvusDocumentIndex[FlatDoc](index_name=tmp_index_name) + if use_docarray: + ten_flat_docs = DocList[FlatDoc](ten_flat_docs) + + index.index(ten_flat_docs) + assert index.num_docs() == 10 + + +def test_index_torch(tmp_index_name): + docs = [TorchDoc(tens=np.random.randn(10)) for _ in range(10)] + assert isinstance(docs[0].tens, torch.Tensor) + assert isinstance(docs[0].tens, TorchTensor) + + index = MilvusDocumentIndex[TorchDoc](index_name=tmp_index_name) + + index.index(docs) + assert index.num_docs() == 10 + + +def test_del_single(ten_simple_docs, tmp_index_name): # noqa: F811 + index = MilvusDocumentIndex[SimpleDoc](index_name=tmp_index_name) + index.index(ten_simple_docs) + # delete once + assert index.num_docs() == 10 + del index[ten_simple_docs[0].id] + assert index.num_docs() == 9 + for i, d in enumerate(ten_simple_docs): + id_ = d.id + if i == 0: # deleted + with pytest.raises(KeyError): + index[id_] + else: + assert index[id_].id == id_ + # delete again + del index[ten_simple_docs[3].id] + assert index.num_docs() == 8 + for i, d in enumerate(ten_simple_docs): + id_ = d.id + if i in (0, 3): # deleted + with pytest.raises(KeyError): + index[id_] + else: + assert index[id_].id == id_ + + +def test_del_multiple(ten_simple_docs, tmp_index_name): + docs_to_del_idx = [0, 2, 4, 6, 8] + + index = MilvusDocumentIndex[SimpleDoc](index_name=tmp_index_name) + index.index(ten_simple_docs) + + assert index.num_docs() == 10 + docs_to_del = [ten_simple_docs[i] for i in docs_to_del_idx] + ids_to_del = [d.id for d in docs_to_del] + del index[ids_to_del] + for i, doc in enumerate(ten_simple_docs): + if i in docs_to_del_idx: + with pytest.raises(KeyError): + index[doc.id] + else: + assert index[doc.id].id == doc.id + + +def test_num_docs(ten_simple_docs, tmp_index_name): # noqa: F811 + index = MilvusDocumentIndex[SimpleDoc](index_name=tmp_index_name) + index.index(ten_simple_docs) + + assert index.num_docs() == 10 + + del index[ten_simple_docs[0].id] + assert index.num_docs() == 9 + + del index[ten_simple_docs[3].id, ten_simple_docs[5].id] + assert index.num_docs() == 7 + + 
more_docs = [SimpleDoc(tens=np.random.rand(10)) for _ in range(5)] + index.index(more_docs) + assert index.num_docs() == 12 + + del index[more_docs[2].id, ten_simple_docs[7].id] # type: ignore[arg-type] + assert index.num_docs() == 10 diff --git a/tests/index/milvus/test_persist_data.py b/tests/index/milvus/test_persist_data.py new file mode 100644 index 00000000000..b1ac4984fc5 --- /dev/null +++ b/tests/index/milvus/test_persist_data.py @@ -0,0 +1,42 @@ +import numpy as np +import pytest +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import MilvusDocumentIndex +from docarray.typing import NdArray +from tests.index.milvus.fixtures import start_storage, tmp_index_name # noqa: F401 + + +pytestmark = [pytest.mark.slow, pytest.mark.index] + + +class SimpleDoc(BaseDoc): + tens: NdArray[10] = Field(is_embedding=True) + + +def test_persist(tmp_index_name): + query = SimpleDoc(tens=np.random.random((10,))) + + # create index + index = MilvusDocumentIndex[SimpleDoc](index_name=tmp_index_name) + + index_name = index.index_name + + assert index.num_docs() == 0 + + index.index([SimpleDoc(tens=np.random.random((10,))) for _ in range(10)]) + assert index.num_docs() == 10 + find_results_before = index.find(query, limit=5) + + # load existing index + index = MilvusDocumentIndex[SimpleDoc](index_name=index_name) + assert index.num_docs() == 10 + find_results_after = index.find(query, limit=5) + for doc_before, doc_after in zip(find_results_before[0], find_results_after[0]): + assert doc_before.id == doc_after.id + assert (doc_before.tens == doc_after.tens).all() + + # add new data + index.index([SimpleDoc(tens=np.random.random((10,))) for _ in range(5)]) + assert index.num_docs() == 15 diff --git a/tests/index/milvus/test_subindex.py b/tests/index/milvus/test_subindex.py new file mode 100644 index 00000000000..ccf89c8d6b8 --- /dev/null +++ b/tests/index/milvus/test_subindex.py @@ -0,0 +1,183 @@ +import numpy as np +import pytest +from pydantic import Field + +from docarray import BaseDoc, DocList +from docarray.index import MilvusDocumentIndex +from docarray.typing import NdArray +from tests.index.milvus.fixtures import start_storage # noqa: F401 + + +pytestmark = [pytest.mark.slow, pytest.mark.index] + + +class SimpleDoc(BaseDoc): + simple_tens: NdArray[10] = Field(space='l2', is_embedding=True) + simple_text: str + + +class ListDoc(BaseDoc): + docs: DocList[SimpleDoc] + list_tens: NdArray[20] = Field(space='l2', is_embedding=True) + + +class NestedDoc(BaseDoc): + docs: DocList[SimpleDoc] + list_docs: DocList[ListDoc] + my_tens: NdArray[30] = Field(space='l2', is_embedding=True) + + +@pytest.fixture(scope='session') +def index(): + index = MilvusDocumentIndex[NestedDoc]() + return index + + +@pytest.fixture(scope='session') +def data(): + my_docs = [ + NestedDoc( + id=f'{i}', + docs=DocList[SimpleDoc]( + [ + SimpleDoc( + id=f'docs_{i}_{j}', + simple_tens=np.ones(10) * (j + 1), + simple_text=f'hello {j}', + ) + for j in range(5) + ] + ), + list_docs=DocList[ListDoc]( + [ + ListDoc( + id=f'list_docs_{i}_{j}', + docs=DocList[SimpleDoc]( + [ + SimpleDoc( + id=f'list_docs_docs_{i}_{j}_{k}', + simple_tens=np.ones(10) * (k + 1), + simple_text=f'hello {k}', + ) + for k in range(5) + ] + ), + list_tens=np.ones(20) * (j + 1), + ) + for j in range(5) + ] + ), + my_tens=np.ones((30,)) * (i + 1), + ) + for i in range(5) + ] + return my_docs + + +def test_subindex_init(index): + assert isinstance(index._subindices['docs'], MilvusDocumentIndex) + assert 
isinstance(index._subindices['list_docs'], MilvusDocumentIndex) + assert isinstance( + index._subindices['list_docs']._subindices['docs'], MilvusDocumentIndex + ) + + +def test_subindex_index(index, data): + index.index(data) + assert index.num_docs() == 5 + assert index._subindices['docs'].num_docs() == 25 + assert index._subindices['list_docs'].num_docs() == 25 + assert index._subindices['list_docs']._subindices['docs'].num_docs() == 125 + + +def test_subindex_get(index, data): + index.index(data) + doc = index['1'] + assert type(doc) == NestedDoc + assert doc.id == '1' + assert len(doc.docs) == 5 + assert type(doc.docs[0]) == SimpleDoc + assert doc.docs[0].id == 'docs_1_0' + assert np.allclose(doc.docs[0].simple_tens, np.ones(10)) + + assert len(doc.list_docs) == 5 + assert type(doc.list_docs[0]) == ListDoc + assert doc.list_docs[0].id == 'list_docs_1_0' + assert len(doc.list_docs[0].docs) == 5 + assert type(doc.list_docs[0].docs[0]) == SimpleDoc + assert doc.list_docs[0].docs[0].id == 'list_docs_docs_1_0_0' + assert np.allclose(doc.list_docs[0].docs[0].simple_tens, np.ones(10)) + assert doc.list_docs[0].docs[0].simple_text == 'hello 0' + assert np.allclose(doc.list_docs[0].list_tens, np.ones(20)) + + assert np.allclose(doc.my_tens, np.ones(30) * 2) + + +def test_subindex_del(index, data): + index.index(data) + del index['0'] + assert index.num_docs() == 4 + assert index._subindices['docs'].num_docs() == 20 + assert index._subindices['list_docs'].num_docs() == 20 + assert index._subindices['list_docs']._subindices['docs'].num_docs() == 100 + + +def test_subindex_contain(index, data): + index.index(data) + # Checks for individual simple_docs within list_docs + for i in range(4): + doc = index[f'{i + 1}'] + for simple_doc in doc.list_docs: + assert index.subindex_contains(simple_doc) + for nested_doc in simple_doc.docs: + assert index.subindex_contains(nested_doc) + + invalid_doc = SimpleDoc( + id='non_existent', + simple_tens=np.zeros(10), + simple_text='invalid', + ) + assert not index.subindex_contains(invalid_doc) + + # Checks for an empty doc + empty_doc = SimpleDoc( + id='', + simple_tens=np.zeros(10), + simple_text='', + ) + assert not index.subindex_contains(empty_doc) + + # Empty index + empty_index = MilvusDocumentIndex[NestedDoc]() + assert empty_doc not in empty_index + + +def test_find_subindex(index, data): + index.index(data) + # root level + query = np.ones((30,)) + with pytest.raises(ValueError): + _, _ = index.find_subindex(query, subindex='', limit=5) + + # sub level + query = np.ones((10,)) + root_docs, docs, scores = index.find_subindex(query, subindex='docs', limit=5) + assert type(root_docs[0]) == NestedDoc + assert type(docs[0]) == SimpleDoc + assert len(scores) == 5 + for root_doc, doc in zip(root_docs, docs): + assert np.allclose(doc.simple_tens, np.ones(10)) + assert root_doc.id == f'{doc.id.split("_")[-2]}' + + # sub sub level + query = np.ones((10,)) + root_docs, docs, scores = index.find_subindex( + query, subindex='list_docs__docs', limit=5 + ) + assert len(docs) == 5 + assert len(scores) == 5 + assert type(root_docs[0]) == NestedDoc + assert type(docs[0]) == SimpleDoc + for root_doc, doc in zip(root_docs, docs): + assert np.allclose(doc.simple_tens, np.ones(10)) + assert root_doc.id == f'{doc.id.split("_")[-3]}' pFad - Phonifier reborn
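
Taken together, the tests above exercise one end-to-end flow against the new backend. The sketch below is illustrative rather than part of the patch: it assumes a Milvus standalone instance reachable at the docker-compose defaults and uses a hypothetical `ProductDoc` schema; every call shown (`index`, `find`, `filter`, `build_query`/`execute_query`) appears in the tests above.

```python
import numpy as np
from pydantic import Field

from docarray import BaseDoc, DocList
from docarray.index import MilvusDocumentIndex
from docarray.typing import NdArray


# Hypothetical schema; exactly one field is marked as the vector column.
class ProductDoc(BaseDoc):
    embedding: NdArray[128] = Field(is_embedding=True)
    price: int


# Connects to Milvus and creates (or re-opens) the named collection.
index = MilvusDocumentIndex[ProductDoc](index_name='products')

# Index some documents.
docs = DocList[ProductDoc](
    ProductDoc(embedding=np.random.rand(128), price=i) for i in range(100)
)
index.index(docs)

# Plain vector search: returns matched docs and their scores.
query = ProductDoc(embedding=np.random.rand(128), price=0)
matches, scores = index.find(query, limit=5)

# Metadata-only filtering with a Milvus boolean expression.
cheap = index.filter('price <= 3', limit=10)

# Vector search combined with a filter via the query builder.
q = (
    index.build_query()
    .find(query=np.ones(128), limit=5)
    .filter(filter_query='price <= 3')
    .build()
)
filtered_matches, filtered_scores = index.execute_query(q)
```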