
Commit f218202: HNSW and Migrations Done (#988)

1 parent: 3dafbdb

21 files changed: +706 additions, -236 deletions

pgml-sdks/pgml/Cargo.lock

Lines changed: 1 addition & 0 deletions
(Generated file; diff not rendered by default.)

pgml-sdks/pgml/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ serde_json = "1.0.9"
 anyhow = "1.0.9"
 tokio = { version = "1.28.2", features = [ "macros" ] }
 chrono = "0.4.9"
-pyo3 = { version = "0.18.3", optional = true, features = ["extension-module"] }
+pyo3 = { version = "0.18.3", optional = true, features = ["extension-module", "anyhow"] }
 pyo3-asyncio = { version = "0.18", features = ["attributes", "tokio-runtime"], optional = true }
 neon = { version = "0.10", optional = true, default-features = false, features = ["napi-6", "promise-api", "channel-api"] }
 itertools = "0.10.5"

pgml-sdks/pgml/build.rs

Lines changed: 4 additions & 2 deletions
@@ -3,14 +3,16 @@ use std::fs::OpenOptions;
 use std::io::Write;

 const ADDITIONAL_DEFAULTS_FOR_PYTHON: &[u8] = br#"
-def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+async def migrate() -> None

 Json = Any
 DateTime = int
 "#;

 const ADDITIONAL_DEFAULTS_FOR_JAVASCRIPT: &[u8] = br#"
-export function js_init_logger(level?: string, format?: string): void;
+export function init_logger(level?: string, format?: string): void;
+export function migrate(): Promise<void>;

 export type Json = { [key: string]: any };
 export type DateTime = Date;
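
A note on the pattern above: the bindings generator cannot see these hand-written host-language declarations, so the build script injects the `init_logger` and `migrate` signatures into the generated Python and TypeScript stubs itself. A minimal Python sketch of that injection step (the stub path and the write position are assumptions; the real script uses Rust's `OpenOptions`):

```python
# Sketch only: inject extra declarations into a generated .pyi stub.
# The path and the append position are assumptions, not the real build logic.
ADDITIONAL_DEFAULTS_FOR_PYTHON = (
    'def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None\n'
    "async def migrate() -> None\n"
)

with open("python/pgml/pgml.pyi", "a") as stub:  # "a" = append to the stub
    stub.write(ADDITIONAL_DEFAULTS_FOR_PYTHON)
```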

pgml-sdks/pgml/javascript/README.md

Lines changed: 18 additions & 0 deletions
@@ -519,6 +519,24 @@ const pipeline = pgml.newPipeline("test_pipeline", model, splitter, {
 await collection.add_pipeline(pipeline)
 ```

+### Configuring HNSW Indexing Parameters
+
+Our SDK utilizes [pgvector](https://github.com/pgvector/pgvector) for storing vectors and performing recall. We use HNSW indexing as it offers the best balance of speed and recall.
+
+Our SDK allows per-pipeline configuration of `m` (the maximum number of connections per layer; 16 by default) and `ef_construction` (the size of the dynamic candidate list used to construct the graph; 64 by default).
+
+```javascript
+const model = pgml.newModel()
+const splitter = pgml.newSplitter()
+const pipeline = pgml.newPipeline("test_pipeline", model, splitter, {
+  hnsw: {
+    m: 100,
+    ef_construction: 200
+  }
+})
+await collection.add_pipeline(pipeline)
+```
+
 ### Searching with Pipelines

 Pipelines are a required argument when performing vector search. After a Pipeline has been added to a Collection, the Model and Splitter can be omitted when instantiating it.
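
For readers mapping these knobs to the underlying index: `m` and `ef_construction` are standard pgvector HNSW build parameters, and higher values generally buy recall at the cost of index build time and memory. A hedged sketch of the raw DDL they correspond to, using hypothetical table and column names (the SDK manages its own schema internally):

```python
# Hypothetical illustration only: "chunks" and "embedding" are stand-ins,
# not the SDK's real schema.
import psycopg

with psycopg.connect("postgresql://localhost:5432/pgml") as conn:
    conn.execute(
        """
        CREATE INDEX ON chunks
        USING hnsw (embedding vector_cosine_ops)
        WITH (m = 100, ef_construction = 200)
        """
    )
```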

pgml-sdks/pgml/javascript/examples/extractive_question_answering.js

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 const pgml = require("pgml");
 require("dotenv").config();

-pgml.js_init_logger();

 const main = async () => {
   // Initialize the collection

pgml-sdks/pgml/javascript/examples/summarizing_question_answering.js

Lines changed: 0 additions & 2 deletions
@@ -1,8 +1,6 @@
 const pgml = require("pgml");
 require("dotenv").config();

-pgml.js_init_logger();
-
 const main = async () => {
   // Initialize the collection
   const collection = pgml.newCollection("my_javascript_sqa_collection");

pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts

Lines changed: 55 additions & 1 deletion
@@ -10,7 +10,7 @@ import pgml from "../../index.js";
 ////////////////////////////////////////////////////////////////////////////////

 const LOG_LEVEL = process.env.LOG_LEVEL ? process.env.LOG_LEVEL : "ERROR";
-pgml.js_init_logger(LOG_LEVEL);
+pgml.init_logger(LOG_LEVEL);

 const generate_dummy_documents = (count: number) => {
   let docs = [];
@@ -143,6 +143,52 @@ it("can vector search with query builder and metadata filtering", async () => {
   await collection.archive();
 });

+it("can vector search with query builder and custom hnsw ef_search value", async () => {
+  let model = pgml.newModel();
+  let splitter = pgml.newSplitter();
+  let pipeline = pgml.newPipeline("test_j_p_cvswqbachesv_0", model, splitter);
+  let collection = pgml.newCollection("test_j_c_cvswqbachesv_0");
+  await collection.upsert_documents(generate_dummy_documents(3));
+  await collection.add_pipeline(pipeline);
+  let results = await collection
+    .query()
+    .vector_recall("Here is some query", pipeline)
+    .filter({
+      hnsw: {
+        ef_search: 2,
+      },
+    })
+    .limit(10)
+    .fetch_all();
+  expect(results).toHaveLength(3);
+  await collection.archive();
+});
+
+it("can vector search with query builder and custom hnsw ef_search value and remote embeddings", async () => {
+  let model = pgml.newModel("text-embedding-ada-002", "openai");
+  let splitter = pgml.newSplitter();
+  let pipeline = pgml.newPipeline(
+    "test_j_p_cvswqbachesvare_0",
+    model,
+    splitter,
+  );
+  let collection = pgml.newCollection("test_j_c_cvswqbachesvare_0");
+  await collection.upsert_documents(generate_dummy_documents(3));
+  await collection.add_pipeline(pipeline);
+  let results = await collection
+    .query()
+    .vector_recall("Here is some query", pipeline)
+    .filter({
+      hnsw: {
+        ef_search: 2,
+      },
+    })
+    .limit(10)
+    .fetch_all();
+  expect(results).toHaveLength(3);
+  await collection.archive();
+});
+
 ////////////////////////////////////////////////////////////////////////////////
 // Test user output facing functions //////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
@@ -220,3 +266,11 @@ it("can delete documents", async () => {

   await collection.archive();
 });
+
+////////////////////////////////////////////////////////////////////////////////
+// Test migrations /////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+it("can migrate", async () => {
+  await pgml.migrate();
+});
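
The `hnsw: { ef_search: ... }` filter exercised in these tests is a query-time knob rather than a build-time one: in pgvector, `hnsw.ef_search` sets the size of the dynamic candidate list used during search (40 by default), so larger values raise recall and latency. A hedged sketch of the equivalent raw SQL, with stand-in connection details, table, and column names:

```python
# Stand-in names throughout; the SDK issues its own SQL internally.
import psycopg

with psycopg.connect("postgresql://localhost:5432/pgml") as conn:
    # Per-session query-time parameter from pgvector.
    conn.execute("SET hnsw.ef_search = 2")
    rows = conn.execute(
        "SELECT id FROM chunks ORDER BY embedding <=> %s::vector LIMIT 10",
        ("[0.1, 0.2, 0.3]",),  # query embedding as a pgvector literal
    ).fetchall()
    print(rows)
```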

pgml-sdks/pgml/python/README.md

Lines changed: 18 additions & 0 deletions
@@ -530,6 +530,24 @@ pipeline = Pipeline("test_pipeline", model, splitter, {
 await collection.add_pipeline(pipeline)
 ```

+### Configuring HNSW Indexing Parameters
+
+Our SDK utilizes [pgvector](https://github.com/pgvector/pgvector) for storing vectors and performing recall. We use HNSW indexing as it offers the best balance of speed and recall.
+
+Our SDK allows per-pipeline configuration of `m` (the maximum number of connections per layer; 16 by default) and `ef_construction` (the size of the dynamic candidate list used to construct the graph; 64 by default).
+
+```python
+model = Model()
+splitter = Splitter()
+pipeline = Pipeline("test_pipeline", model, splitter, {
+    "hnsw": {
+        "m": 100,
+        "ef_construction": 200
+    }
+})
+await collection.add_pipeline(pipeline)
+```
+
 ### Searching with Pipelines

 Pipelines are a required argument when performing vector search. After a Pipeline has been added to a Collection, the Model and Splitter can be omitted when instantiating it.
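
Putting the two kinds of parameters together: `m` and `ef_construction` are fixed when the pipeline's index is built, while `ef_search` can be tuned per query. The sketch below assumes the Python query builder accepts the same `hnsw` filter as the JavaScript tests in this commit; collection, pipeline, and document contents are placeholders:

```python
import asyncio
from pgml import Collection, Model, Pipeline, Splitter

async def main():
    collection = Collection("hnsw_demo_collection")  # placeholder name
    pipeline = Pipeline("hnsw_demo_pipeline", Model(), Splitter(), {
        "hnsw": {"m": 100, "ef_construction": 200},  # build-time parameters
    })
    await collection.upsert_documents([{"id": "doc_1", "text": "Some example text"}])
    await collection.add_pipeline(pipeline)
    results = (
        await collection.query()
        .vector_recall("Here is some query", pipeline)
        .filter({"hnsw": {"ef_search": 2}})  # query-time recall knob
        .limit(10)
        .fetch_all()
    )
    print(results)

asyncio.run(main())
```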

pgml-sdks/pgml/python/examples/summarizing_question_answering.py

Lines changed: 1 addition & 4 deletions
@@ -1,4 +1,4 @@
-from pgml import Collection, Model, Splitter, Pipeline, Builtins, py_init_logger
+from pgml import Collection, Model, Splitter, Pipeline, Builtins
 import json
 from datasets import load_dataset
 from time import time
@@ -7,9 +7,6 @@
 import asyncio


-py_init_logger()
-
-
 async def main():
     load_dotenv()
     console = Console()

pgml-sdks/pgml/python/pgml/pgml.pyi

Lines changed: 2 additions & 87 deletions
@@ -1,91 +1,6 @@

-def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+async def migrate() -> None

 Json = Any
 DateTime = int
-
-# Top of file key: A12BECOD!
-from typing import List, Dict, Optional, Self, Any
-
-
-class Builtins:
-    def __init__(self, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
-        ...
-    def query(self, query: str) -> QueryRunner
-        ...
-    async def transform(self, task: Json, inputs: List[str], args: Optional[Json] = Any) -> Json
-        ...
-
-class Collection:
-    def __init__(self, name: str, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
-        ...
-    async def add_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def remove_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def enable_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def disable_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def upsert_documents(self, documents: List[Json]) -> None
-        ...
-    async def get_documents(self, args: Optional[Json] = Any) -> List[Json]
-        ...
-    async def delete_documents(self, filter: Json) -> None
-        ...
-    async def vector_search(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any, top_k: Optional[int] = 1) -> List[tuple[float, str, Json]]
-        ...
-    async def archive(self) -> None
-        ...
-    def query(self) -> QueryBuilder
-        ...
-    async def get_pipelines(self) -> List[Pipeline]
-        ...
-    async def get_pipeline(self, name: str) -> Pipeline
-        ...
-    async def exists(self) -> bool
-        ...
-
-class Model:
-    def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", source: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any) -> Self
-        ...
-
-class Pipeline:
-    def __init__(self, name: str, model: Optional[Model] = Any, splitter: Optional[Splitter] = Any, parameters: Optional[Json] = Any) -> Self
-        ...
-    async def get_status(self) -> PipelineSyncData
-        ...
-    async def to_dict(self) -> Json
-        ...
-
-class QueryBuilder:
-    def limit(self, limit: int) -> Self
-        ...
-    def filter(self, filter: Json) -> Self
-        ...
-    def vector_recall(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any) -> Self
-        ...
-    async def fetch_all(self) -> List[tuple[float, str, Json]]
-        ...
-    def to_full_string(self) -> str
-        ...
-
-class QueryRunner:
-    async def fetch_all(self) -> Json
-        ...
-    async def execute(self) -> None
-        ...
-    def bind_string(self, bind_value: str) -> Self
-        ...
-    def bind_int(self, bind_value: int) -> Self
-        ...
-    def bind_float(self, bind_value: float) -> Self
-        ...
-    def bind_bool(self, bind_value: bool) -> Self
-        ...
-    def bind_json(self, bind_value: Json) -> Self
-        ...
-
-class Splitter:
-    def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any) -> Self
-        ...