Commit f218202

HNSW and Migrations Done (#988)
1 parent 3dafbdb commit f218202

21 files changed: +706 −236 lines

pgml-sdks/pgml/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

pgml-sdks/pgml/Cargo.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -20,7 +20,7 @@ serde_json = "1.0.9"
 anyhow = "1.0.9"
 tokio = { version = "1.28.2", features = [ "macros" ] }
 chrono = "0.4.9"
-pyo3 = { version = "0.18.3", optional = true, features = ["extension-module"] }
+pyo3 = { version = "0.18.3", optional = true, features = ["extension-module", "anyhow"] }
 pyo3-asyncio = { version = "0.18", features = ["attributes", "tokio-runtime"], optional = true }
 neon = { version = "0.10", optional = true, default-features = false, features = ["napi-6", "promise-api", "channel-api"] }
 itertools = "0.10.5"
```
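The newly enabled `anyhow` feature on pyo3 provides a built-in conversion from `anyhow::Error` into a Python exception (`PyErr`), so fallible Rust functions exposed to Python can propagate errors with `?` instead of mapping them manually. A minimal sketch of what the feature enables, under the assumption that this is why it was added here (the function below is illustrative, not from this repo):

```rust
use pyo3::prelude::*;

// With pyo3's "anyhow" feature enabled, anyhow::Error implements
// Into<PyErr>, so a #[pyfunction] may return anyhow::Result directly.
// Any error bubbling up through `?` surfaces in Python as a RuntimeError.
#[pyfunction]
fn parse_and_double(input: &str) -> anyhow::Result<i64> {
    let value: i64 = input.parse()?; // ParseIntError -> anyhow::Error -> PyErr
    Ok(value * 2)
}
```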

pgml-sdks/pgml/build.rs

Lines changed: 4 additions & 2 deletions
```diff
@@ -3,14 +3,16 @@ use std::fs::OpenOptions;
 use std::io::Write;
 
 const ADDITIONAL_DEFAULTS_FOR_PYTHON: &[u8] = br#"
-def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+async def migrate() -> None
 
 Json = Any
 DateTime = int
 "#;
 
 const ADDITIONAL_DEFAULTS_FOR_JAVASCRIPT: &[u8] = br#"
-export function js_init_logger(level?: string, format?: string): void;
+export function init_logger(level?: string, format?: string): void;
+export function migrate(): Promise<void>;
 
 export type Json = { [key: string]: any };
 export type DateTime = Date;
```

pgml-sdks/pgml/javascript/README.md

Lines changed: 18 additions & 0 deletions
````diff
@@ -519,6 +519,24 @@ const pipeline = pgml.newPipeline("test_pipeline", model, splitter, {
 await collection.add_pipeline(pipeline)
 ```
 
+### Configuring HNSW Indexing Parameters
+
+Our SDK utilizes [pgvector](https://github.com/pgvector/pgvector) for storing vectors and performing recall. We use HNSW indexing as it offers the best balance of speed and recall.
+
+Our SDK allows configuring `m` (the maximum number of connections per layer, 16 by default) and `ef_construction` (the size of the dynamic candidate list used when constructing the graph, 64 by default) per pipeline.
+
+```javascript
+const model = pgml.newModel()
+const splitter = pgml.newSplitter()
+const pipeline = pgml.newPipeline("test_pipeline", model, splitter, {
+  hnsw: {
+    m: 100,
+    ef_construction: 200
+  }
+})
+await collection.add_pipeline(pipeline)
+```
+
 ### Searching with Pipelines
 
 Pipelines are a required argument when performing vector search. After a Pipeline has been added to a Collection, the Model and Splitter can be omitted when instantiating it.
````
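The TypeScript tests added in this commit additionally exercise a query-time `ef_search` override through the query builder's `filter` method. A minimal sketch combining it with the pipeline above (the collection name and values are illustrative):

```javascript
const pgml = require("pgml");

const main = async () => {
  // Assumes "test_pipeline" was added to this collection with the custom
  // HNSW parameters shown in the README snippet above.
  const collection = pgml.newCollection("my_collection");
  const pipeline = pgml.newPipeline("test_pipeline");
  const results = await collection
    .query()
    .vector_recall("Here is some query", pipeline)
    .filter({
      hnsw: {
        ef_search: 100, // candidate list size at query time; larger = better recall, slower search
      },
    })
    .limit(10)
    .fetch_all();
  console.log(results);
};

main();
```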

pgml-sdks/pgml/javascript/examples/extractive_question_answering.js

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,7 +1,6 @@
 const pgml = require("pgml");
 require("dotenv").config();
 
-pgml.js_init_logger();
 
 const main = async () => {
   // Initialize the collection
```

pgml-sdks/pgml/javascript/examples/summarizing_question_answering.js

Lines changed: 0 additions & 2 deletions
```diff
@@ -1,8 +1,6 @@
 const pgml = require("pgml");
 require("dotenv").config();
 
-pgml.js_init_logger();
-
 const main = async () => {
   // Initialize the collection
   const collection = pgml.newCollection("my_javascript_sqa_collection");
```

pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts

Lines changed: 55 additions & 1 deletion
```diff
@@ -10,7 +10,7 @@ import pgml from "../../index.js";
 ////////////////////////////////////////////////////////////////////////////////////
 
 const LOG_LEVEL = process.env.LOG_LEVEL ? process.env.LOG_LEVEL : "ERROR";
-pgml.js_init_logger(LOG_LEVEL);
+pgml.init_logger(LOG_LEVEL);
 
 const generate_dummy_documents = (count: number) => {
   let docs = [];
@@ -143,6 +143,52 @@ it("can vector search with query builder and metadata filtering", async () => {
   await collection.archive();
 });
 
+it("can vector search with query builder and custom hnsw ef_search value", async () => {
+  let model = pgml.newModel();
+  let splitter = pgml.newSplitter();
+  let pipeline = pgml.newPipeline("test_j_p_cvswqbachesv_0", model, splitter);
+  let collection = pgml.newCollection("test_j_c_cvswqbachesv_0");
+  await collection.upsert_documents(generate_dummy_documents(3));
+  await collection.add_pipeline(pipeline);
+  let results = await collection
+    .query()
+    .vector_recall("Here is some query", pipeline)
+    .filter({
+      hnsw: {
+        ef_search: 2,
+      },
+    })
+    .limit(10)
+    .fetch_all();
+  expect(results).toHaveLength(3);
+  await collection.archive();
+});
+
+it("can vector search with query builder and custom hnsw ef_search value and remote embeddings", async () => {
+  let model = pgml.newModel("text-embedding-ada-002", "openai");
+  let splitter = pgml.newSplitter();
+  let pipeline = pgml.newPipeline(
+    "test_j_p_cvswqbachesvare_0",
+    model,
+    splitter,
+  );
+  let collection = pgml.newCollection("test_j_c_cvswqbachesvare_0");
+  await collection.upsert_documents(generate_dummy_documents(3));
+  await collection.add_pipeline(pipeline);
+  let results = await collection
+    .query()
+    .vector_recall("Here is some query", pipeline)
+    .filter({
+      hnsw: {
+        ef_search: 2,
+      },
+    })
+    .limit(10)
+    .fetch_all();
+  expect(results).toHaveLength(3);
+  await collection.archive();
+});
+
 ///////////////////////////////////////////////////
 // Test user output facing functions //////////////
 ///////////////////////////////////////////////////
@@ -220,3 +266,11 @@ it("can delete documents", async () => {
 
   await collection.archive();
 });
+
+///////////////////////////////////////////////////
+// Test migrations ////////////////////////////////
+///////////////////////////////////////////////////
+
+it("can migrate", async () => {
+  await pgml.migrate();
+});
```

pgml-sdks/pgml/python/README.md

Lines changed: 18 additions & 0 deletions
````diff
@@ -530,6 +530,24 @@ pipeline = Pipeline("test_pipeline", model, splitter, {
 await collection.add_pipeline(pipeline)
 ```
 
+### Configuring HNSW Indexing Parameters
+
+Our SDK utilizes [pgvector](https://github.com/pgvector/pgvector) for storing vectors and performing recall. We use HNSW indexing as it offers the best balance of speed and recall.
+
+Our SDK allows configuring `m` (the maximum number of connections per layer, 16 by default) and `ef_construction` (the size of the dynamic candidate list used when constructing the graph, 64 by default) per pipeline.
+
+```python
+model = Model()
+splitter = Splitter()
+pipeline = Pipeline("test_pipeline", model, splitter, {
+    "hnsw": {
+        "m": 100,
+        "ef_construction": 200
+    }
+})
+await collection.add_pipeline(pipeline)
+```
+
 ### Searching with Pipelines
 
 Pipelines are a required argument when performing vector search. After a Pipeline has been added to a Collection, the Model and Splitter can be omitted when instantiating it.
````
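The query-time `ef_search` override exercised by this commit's TypeScript tests should be reachable from Python as well: the `QueryBuilder` stubs in pgml.pyi below expose the same `vector_recall`, `filter`, `limit`, and `fetch_all` methods. A minimal sketch under that assumption (the collection name and values are illustrative):

```python
import asyncio
from pgml import Collection, Pipeline

async def main():
    # Assumes "test_pipeline" was added to this collection with the custom
    # HNSW parameters shown in the README snippet above.
    collection = Collection("my_collection")
    pipeline = Pipeline("test_pipeline")
    results = await (
        collection.query()
        .vector_recall("Here is some query", pipeline)
        .filter({"hnsw": {"ef_search": 100}})  # candidate list size at query time
        .limit(10)
        .fetch_all()
    )
    print(results)

asyncio.run(main())
```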

pgml-sdks/pgml/python/examples/summarizing_question_answering.py

Lines changed: 1 addition & 4 deletions
```diff
@@ -1,4 +1,4 @@
-from pgml import Collection, Model, Splitter, Pipeline, Builtins, py_init_logger
+from pgml import Collection, Model, Splitter, Pipeline, Builtins
 import json
 from datasets import load_dataset
 from time import time
@@ -7,9 +7,6 @@
 import asyncio
 
 
-py_init_logger()
-
-
 async def main():
     load_dotenv()
     console = Console()
```

pgml-sdks/pgml/python/pgml/pgml.pyi

Lines changed: 2 additions & 87 deletions
```diff
@@ -1,91 +1,6 @@
 
-def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+async def migrate() -> None
 
 Json = Any
 DateTime = int
-
-# Top of file key: A12BECOD!
-from typing import List, Dict, Optional, Self, Any
-
-
-class Builtins:
-    def __init__(self, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
-        ...
-    def query(self, query: str) -> QueryRunner
-        ...
-    async def transform(self, task: Json, inputs: List[str], args: Optional[Json] = Any) -> Json
-        ...
-
-class Collection:
-    def __init__(self, name: str, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
-        ...
-    async def add_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def remove_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def enable_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def disable_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def upsert_documents(self, documents: List[Json]) -> None
-        ...
-    async def get_documents(self, args: Optional[Json] = Any) -> List[Json]
-        ...
-    async def delete_documents(self, filter: Json) -> None
-        ...
-    async def vector_search(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any, top_k: Optional[int] = 1) -> List[tuple[float, str, Json]]
-        ...
-    async def archive(self) -> None
-        ...
-    def query(self) -> QueryBuilder
-        ...
-    async def get_pipelines(self) -> List[Pipeline]
-        ...
-    async def get_pipeline(self, name: str) -> Pipeline
-        ...
-    async def exists(self) -> bool
-        ...
-
-class Model:
-    def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", source: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any) -> Self
-        ...
-
-class Pipeline:
-    def __init__(self, name: str, model: Optional[Model] = Any, splitter: Optional[Splitter] = Any, parameters: Optional[Json] = Any) -> Self
-        ...
-    async def get_status(self) -> PipelineSyncData
-        ...
-    async def to_dict(self) -> Json
-        ...
-
-class QueryBuilder:
-    def limit(self, limit: int) -> Self
-        ...
-    def filter(self, filter: Json) -> Self
-        ...
-    def vector_recall(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any) -> Self
-        ...
-    async def fetch_all(self) -> List[tuple[float, str, Json]]
-        ...
-    def to_full_string(self) -> str
-        ...
-
-class QueryRunner:
-    async def fetch_all(self) -> Json
-        ...
-    async def execute(self) -> None
-        ...
-    def bind_string(self, bind_value: str) -> Self
-        ...
-    def bind_int(self, bind_value: int) -> Self
-        ...
-    def bind_float(self, bind_value: float) -> Self
-        ...
-    def bind_bool(self, bind_value: bool) -> Self
-        ...
-    def bind_json(self, bind_value: Json) -> Self
-        ...
-
-class Splitter:
-    def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any) -> Self
-        ...
```
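Taken together, the two surviving stubs document the renamed logger entry point and the new migration hook for Python callers. A minimal usage sketch based only on the signatures above (the log level string is illustrative):

```python
import asyncio
from pgml import init_logger, migrate

init_logger("DEBUG")    # replaces the old py_init_logger
asyncio.run(migrate())  # migrate() is async per the stub
```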
