Commit f218202

HNSW and Migrations Done (#988)
1 parent 3dafbdb commit f218202

21 files changed: +706 −236 lines

pgml-sdks/pgml/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

pgml-sdks/pgml/Cargo.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -20,7 +20,7 @@ serde_json = "1.0.9"
 anyhow = "1.0.9"
 tokio = { version = "1.28.2", features = [ "macros" ] }
 chrono = "0.4.9"
-pyo3 = { version = "0.18.3", optional = true, features = ["extension-module"] }
+pyo3 = { version = "0.18.3", optional = true, features = ["extension-module", "anyhow"] }
 pyo3-asyncio = { version = "0.18", features = ["attributes", "tokio-runtime"], optional = true }
 neon = { version = "0.10", optional = true, default-features = false, features = ["napi-6", "promise-api", "channel-api"] }
 itertools = "0.10.5"
```
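The newly enabled `anyhow` feature on pyo3 provides a built-in conversion from `anyhow::Error` into a Python exception (`PyErr`), so fallible Rust functions exposed to Python can propagate errors with `?` instead of mapping them manually. A minimal sketch of what the feature enables, under the assumption that this is why it was added here (the function below is illustrative, not from this repo):

```rust
use pyo3::prelude::*;

// With pyo3's "anyhow" feature enabled, anyhow::Error implements
// Into<PyErr>, so a #[pyfunction] may return anyhow::Result directly.
// Any error bubbling up through `?` surfaces in Python as a RuntimeError.
#[pyfunction]
fn parse_and_double(input: &str) -> anyhow::Result<i64> {
    let value: i64 = input.parse()?; // ParseIntError -> anyhow::Error -> PyErr
    Ok(value * 2)
}
```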

pgml-sdks/pgml/build.rs

Lines changed: 4 additions & 2 deletions
```diff
@@ -3,14 +3,16 @@ use std::fs::OpenOptions;
 use std::io::Write;
 
 const ADDITIONAL_DEFAULTS_FOR_PYTHON: &[u8] = br#"
-def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+async def migrate() -> None
 
 Json = Any
 DateTime = int
 "#;
 
 const ADDITIONAL_DEFAULTS_FOR_JAVASCRIPT: &[u8] = br#"
-export function js_init_logger(level?: string, format?: string): void;
+export function init_logger(level?: string, format?: string): void;
+export function migrate(): Promise<void>;
 
 export type Json = { [key: string]: any };
 export type DateTime = Date;
```

pgml-sdks/pgml/javascript/README.md

Lines changed: 18 additions & 0 deletions
````diff
@@ -519,6 +519,24 @@ const pipeline = pgml.newPipeline("test_pipeline", model, splitter, {
 await collection.add_pipeline(pipeline)
 ```
 
+### Configuring HNSW Indexing Parameters
+
+Our SDK utilizes [pgvector](https://github.com/pgvector/pgvector) for storing vectors and performing recall. We use HNSW indexing as it offers the best balance of speed and recall.
+
+Our SDK allows configuring `m` (the maximum number of connections per layer, 16 by default) and `ef_construction` (the size of the dynamic candidate list used when constructing the graph, 64 by default) per pipeline.
+
+```javascript
+const model = pgml.newModel()
+const splitter = pgml.newSplitter()
+const pipeline = pgml.newPipeline("test_pipeline", model, splitter, {
+  hnsw: {
+    m: 100,
+    ef_construction: 200
+  }
+})
+await collection.add_pipeline(pipeline)
+```
+
 ### Searching with Pipelines
 
 Pipelines are a required argument when performing vector search. After a Pipeline has been added to a Collection, the Model and Splitter can be omitted when instantiating it.
````
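The TypeScript tests added in this commit additionally exercise a query-time `ef_search` override through the query builder's `filter` method. A minimal sketch combining it with the pipeline above (the collection name and values are illustrative):

```javascript
const pgml = require("pgml");

const main = async () => {
  // Assumes "test_pipeline" was added to this collection with the custom
  // HNSW parameters shown in the README snippet above.
  const collection = pgml.newCollection("my_collection");
  const pipeline = pgml.newPipeline("test_pipeline");
  const results = await collection
    .query()
    .vector_recall("Here is some query", pipeline)
    .filter({
      hnsw: {
        ef_search: 100, // candidate list size at query time; larger = better recall, slower search
      },
    })
    .limit(10)
    .fetch_all();
  console.log(results);
};

main();
```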

pgml-sdks/pgml/javascript/examples/extractive_question_answering.js

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,7 +1,6 @@
 const pgml = require("pgml");
 require("dotenv").config();
 
-pgml.js_init_logger();
 
 const main = async () => {
   // Initialize the collection
```

pgml-sdks/pgml/javascript/examples/summarizing_question_answering.js

Lines changed: 0 additions & 2 deletions
```diff
@@ -1,8 +1,6 @@
 const pgml = require("pgml");
 require("dotenv").config();
 
-pgml.js_init_logger();
-
 const main = async () => {
   // Initialize the collection
   const collection = pgml.newCollection("my_javascript_sqa_collection");
```

pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts

Lines changed: 55 additions & 1 deletion
```diff
@@ -10,7 +10,7 @@ import pgml from "../../index.js";
 ////////////////////////////////////////////////////////////////////////////////////
 
 const LOG_LEVEL = process.env.LOG_LEVEL ? process.env.LOG_LEVEL : "ERROR";
-pgml.js_init_logger(LOG_LEVEL);
+pgml.init_logger(LOG_LEVEL);
 
 const generate_dummy_documents = (count: number) => {
   let docs = [];
@@ -143,6 +143,52 @@ it("can vector search with query builder and metadata filtering", async () => {
   await collection.archive();
 });
 
+it("can vector search with query builder and custom hnsw ef_search value", async () => {
+  let model = pgml.newModel();
+  let splitter = pgml.newSplitter();
+  let pipeline = pgml.newPipeline("test_j_p_cvswqbachesv_0", model, splitter);
+  let collection = pgml.newCollection("test_j_c_cvswqbachesv_0");
+  await collection.upsert_documents(generate_dummy_documents(3));
+  await collection.add_pipeline(pipeline);
+  let results = await collection
+    .query()
+    .vector_recall("Here is some query", pipeline)
+    .filter({
+      hnsw: {
+        ef_search: 2,
+      },
+    })
+    .limit(10)
+    .fetch_all();
+  expect(results).toHaveLength(3);
+  await collection.archive();
+});
+
+it("can vector search with query builder and custom hnsw ef_search value and remote embeddings", async () => {
+  let model = pgml.newModel("text-embedding-ada-002", "openai");
+  let splitter = pgml.newSplitter();
+  let pipeline = pgml.newPipeline(
+    "test_j_p_cvswqbachesvare_0",
+    model,
+    splitter,
+  );
+  let collection = pgml.newCollection("test_j_c_cvswqbachesvare_0");
+  await collection.upsert_documents(generate_dummy_documents(3));
+  await collection.add_pipeline(pipeline);
+  let results = await collection
+    .query()
+    .vector_recall("Here is some query", pipeline)
+    .filter({
+      hnsw: {
+        ef_search: 2,
+      },
+    })
+    .limit(10)
+    .fetch_all();
+  expect(results).toHaveLength(3);
+  await collection.archive();
+});
+
 ///////////////////////////////////////////////////
 // Test user output facing functions //////////////
 ///////////////////////////////////////////////////
@@ -220,3 +266,11 @@ it("can delete documents", async () => {
 
   await collection.archive();
 });
+
+///////////////////////////////////////////////////
+// Test migrations ////////////////////////////////
+///////////////////////////////////////////////////
+
+it("can migrate", async () => {
+  await pgml.migrate();
+});
```

pgml-sdks/pgml/python/README.md

Lines changed: 18 additions & 0 deletions
````diff
@@ -530,6 +530,24 @@ pipeline = Pipeline("test_pipeline", model, splitter, {
 await collection.add_pipeline(pipeline)
 ```
 
+### Configuring HNSW Indexing Parameters
+
+Our SDK utilizes [pgvector](https://github.com/pgvector/pgvector) for storing vectors and performing recall. We use HNSW indexing as it offers the best balance of speed and recall.
+
+Our SDK allows configuring `m` (the maximum number of connections per layer, 16 by default) and `ef_construction` (the size of the dynamic candidate list used when constructing the graph, 64 by default) per pipeline.
+
+```python
+model = Model()
+splitter = Splitter()
+pipeline = Pipeline("test_pipeline", model, splitter, {
+    "hnsw": {
+        "m": 100,
+        "ef_construction": 200
+    }
+})
+await collection.add_pipeline(pipeline)
+```
+
 ### Searching with Pipelines
 
 Pipelines are a required argument when performing vector search. After a Pipeline has been added to a Collection, the Model and Splitter can be omitted when instantiating it.
````
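The query-time `ef_search` override exercised by this commit's TypeScript tests should be reachable from Python as well: the `QueryBuilder` stubs in pgml.pyi below expose the same `vector_recall`, `filter`, `limit`, and `fetch_all` methods. A minimal sketch under that assumption (the collection name and values are illustrative):

```python
import asyncio
from pgml import Collection, Pipeline

async def main():
    # Assumes "test_pipeline" was added to this collection with the custom
    # HNSW parameters shown in the README snippet above.
    collection = Collection("my_collection")
    pipeline = Pipeline("test_pipeline")
    results = await (
        collection.query()
        .vector_recall("Here is some query", pipeline)
        .filter({"hnsw": {"ef_search": 100}})  # candidate list size at query time
        .limit(10)
        .fetch_all()
    )
    print(results)

asyncio.run(main())
```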

pgml-sdks/pgml/python/examples/summarizing_question_answering.py

Lines changed: 1 addition & 4 deletions
```diff
@@ -1,4 +1,4 @@
-from pgml import Collection, Model, Splitter, Pipeline, Builtins, py_init_logger
+from pgml import Collection, Model, Splitter, Pipeline, Builtins
 import json
 from datasets import load_dataset
 from time import time
@@ -7,9 +7,6 @@
 import asyncio
 
 
-py_init_logger()
-
-
 async def main():
     load_dotenv()
     console = Console()
```

pgml-sdks/pgml/python/pgml/pgml.pyi

Lines changed: 2 additions & 87 deletions
```diff
@@ -1,91 +1,6 @@
 
-def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+async def migrate() -> None
 
 Json = Any
 DateTime = int
-
-# Top of file key: A12BECOD!
-from typing import List, Dict, Optional, Self, Any
-
-
-class Builtins:
-    def __init__(self, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
-        ...
-    def query(self, query: str) -> QueryRunner
-        ...
-    async def transform(self, task: Json, inputs: List[str], args: Optional[Json] = Any) -> Json
-        ...
-
-class Collection:
-    def __init__(self, name: str, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
-        ...
-    async def add_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def remove_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def enable_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def disable_pipeline(self, pipeline: Pipeline) -> None
-        ...
-    async def upsert_documents(self, documents: List[Json]) -> None
-        ...
-    async def get_documents(self, args: Optional[Json] = Any) -> List[Json]
-        ...
-    async def delete_documents(self, filter: Json) -> None
-        ...
-    async def vector_search(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any, top_k: Optional[int] = 1) -> List[tuple[float, str, Json]]
-        ...
-    async def archive(self) -> None
-        ...
-    def query(self) -> QueryBuilder
-        ...
-    async def get_pipelines(self) -> List[Pipeline]
-        ...
-    async def get_pipeline(self, name: str) -> Pipeline
-        ...
-    async def exists(self) -> bool
-        ...
-
-class Model:
-    def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", source: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any) -> Self
-        ...
-
-class Pipeline:
-    def __init__(self, name: str, model: Optional[Model] = Any, splitter: Optional[Splitter] = Any, parameters: Optional[Json] = Any) -> Self
-        ...
-    async def get_status(self) -> PipelineSyncData
-        ...
-    async def to_dict(self) -> Json
-        ...
-
-class QueryBuilder:
-    def limit(self, limit: int) -> Self
-        ...
-    def filter(self, filter: Json) -> Self
-        ...
-    def vector_recall(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any) -> Self
-        ...
-    async def fetch_all(self) -> List[tuple[float, str, Json]]
-        ...
-    def to_full_string(self) -> str
-        ...
-
-class QueryRunner:
-    async def fetch_all(self) -> Json
-        ...
-    async def execute(self) -> None
-        ...
-    def bind_string(self, bind_value: str) -> Self
-        ...
-    def bind_int(self, bind_value: int) -> Self
-        ...
-    def bind_float(self, bind_value: float) -> Self
-        ...
-    def bind_bool(self, bind_value: bool) -> Self
-        ...
-    def bind_json(self, bind_value: Json) -> Self
-        ...
-
-class Splitter:
-    def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any) -> Self
-        ...
```
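Taken together, the two surviving stubs document the renamed logger entry point and the new migration hook for Python callers. A minimal usage sketch based only on the signatures above (the log level string is illustrative):

```python
import asyncio
from pgml import init_logger, migrate

init_logger("DEBUG")    # replaces the old py_init_logger
asyncio.run(migrate())  # migrate() is async per the stub
```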
