Skip to content

refactor: hnswlib performance #1727

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jul 31, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
chore: apply optim
Signed-off-by: Joan Fontanals Martinez <joan.martinez@jina.ai>
  • Loading branch information
Joan Fontanals Martinez committed Jul 28, 2023
commit 8c0bf6cf536c99b0e7dee665822cc210018ffd19
31 changes: 21 additions & 10 deletions docarray/index/backends/hnswlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,11 @@ def __init__(self, db_config=None, **kwargs):
if col.config
}
self._hnsw_indices = {}
sub_docs_exist = False
cosine_metric_index_exist = False
for col_name, col in self._column_infos.items():
if '__' in col_name:
sub_docs_exist = True
if safe_issubclass(col.docarray_type, AnyDocArray):
continue
if not col.config:
Expand All @@ -128,7 +132,12 @@ def __init__(self, db_config=None, **kwargs):
else:
self._hnsw_indices[col_name] = self._create_index(col_name, col)
self._logger.info(f'Created a new index for column `{col_name}`')
if self._hnsw_indices[col_name].space == 'cosine':
cosine_metric_index_exist = True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we care about this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because for cosine, HNSWLib normalizes the vectors, so when we retrieve them they have changed, and no consistent API can be provided.


self._apply_optim_no_embedding_in_sqlite = (
not sub_docs_exist and not cosine_metric_index_exist
) # optimization consisting in not serializing embeddings to SQLite because they are expensive to send and they can be reconstructed from the HNSW index itself.
# SQLite setup
self._sqlite_db_path = os.path.join(self._work_dir, 'docs_sqlite.db')
self._logger.debug(f'DB path set to {self._sqlite_db_path}')
Expand Down Expand Up @@ -565,9 +574,10 @@ def _get_num_docs_sqlite(self) -> int:
# serialization helpers
def _doc_to_bytes(self, doc: BaseDoc) -> bytes:
pb = doc.to_protobuf()
for col_name in self._hnsw_indices.keys():
pb.data[col_name].Clear()
pb.data[col_name].Clear()
if self._apply_optim_no_embedding_in_sqlite:
for col_name in self._hnsw_indices.keys():
pb.data[col_name].Clear()
pb.data[col_name].Clear()
return pb.SerializeToString()

def _doc_from_bytes(
Expand All @@ -578,13 +588,14 @@ def _doc_from_bytes(
pb = DocProto.FromString(
data
) # I cannot reconstruct directly the DA object because it may fail at validation because embedding may not be Optional
for k, v in reconstruct_embeddings.items():
node_proto = (
self.out_schema.__fields__[k]
.type_._docarray_from_ndarray(np.array(v))
._to_node_protobuf()
)
pb.data[k].MergeFrom(node_proto)
if self._apply_optim_no_embedding_in_sqlite:
for k, v in reconstruct_embeddings.items():
node_proto = (
schema_cls._get_field_type(k)
._docarray_from_ndarray(np.array(v))
._to_node_protobuf()
)
pb.data[k].MergeFrom(node_proto)

doc = schema_cls.from_protobuf(pb)
return doc
Expand Down
2 changes: 1 addition & 1 deletion tests/index/hnswlib/test_index_get_del.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,5 +410,5 @@ class TextSimpleDoc(SimpleDoc):
for doc in res.documents:
if doc.id == docs[0].id:
found = True
assert (doc.tens == new_tensor).all()
assert np.allclose(doc.tens, new_tensor)
assert found
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy