diff --git a/pgml-sdks/pgml/Cargo.toml b/pgml-sdks/pgml/Cargo.toml
index 7404acc8d..5db2bbefd 100644
--- a/pgml-sdks/pgml/Cargo.toml
+++ b/pgml-sdks/pgml/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "pgml"
-version = "0.9.5"
+version = "0.9.6"
 edition = "2021"
 authors = ["PostgresML <team@postgresml.org>"]
 homepage = "https://postgresml.org/"
diff --git a/pgml-sdks/pgml/javascript/package.json b/pgml-sdks/pgml/javascript/package.json
index 1126b1782..93d41f9ab 100644
--- a/pgml-sdks/pgml/javascript/package.json
+++ b/pgml-sdks/pgml/javascript/package.json
@@ -1,6 +1,6 @@
 {
   "name": "pgml",
-  "version": "0.9.5",
+  "version": "0.9.6",
   "description": "Open Source Alternative for Building End-to-End Vector Search Applications without OpenAI & Pinecone",
   "keywords": [
     "postgres",
diff --git a/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts b/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts
index 07ce62093..affb314fa 100644
--- a/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts
+++ b/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts
@@ -280,6 +280,28 @@ it("can order documents", async () => {
   await collection.archive();
 });
 
+///////////////////////////////////////////////////
+// Transformer Pipeline Tests /////////////////////
+///////////////////////////////////////////////////
+
+it("can transformer pipeline", async () => {
+  const t = pgml.newTransformerPipeline("text-generation");
+  const it = await t.transform(["AI is going to"], {max_new_tokens: 5});
+  expect(it.length).toBeGreaterThan(0)
+});
+
+it("can transformer pipeline stream", async () => {
+  const t = pgml.newTransformerPipeline("text-generation");
+  const it = await t.transform_stream("AI is going to", {max_new_tokens: 5});
+  let result = await it.next();
+  let output = [];
+  while (!result.done) {
+    output.push(result.value);
+    result = await it.next();
+  }
+  expect(output.length).toBeGreaterThan(0)
+});
+
 ///////////////////////////////////////////////////
 // Test migrations ////////////////////////////////
 ///////////////////////////////////////////////////
diff --git a/pgml-sdks/pgml/pyproject.toml b/pgml-sdks/pgml/pyproject.toml
index ffd3b959d..df80ecb74 100644
--- a/pgml-sdks/pgml/pyproject.toml
+++ b/pgml-sdks/pgml/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "maturin"
 [project]
 name = "pgml"
 requires-python = ">=3.7"
-version = "0.9.5"
+version = "0.9.6"
 description = "Python SDK is designed to facilitate the development of scalable vector search applications on PostgreSQL databases."
 authors = [
     {name = "PostgresML", email = "team@postgresml.org"},
]
diff --git a/pgml-sdks/pgml/python/pgml/pgml.pyi b/pgml-sdks/pgml/python/pgml/pgml.pyi
deleted file mode 100644
index 5352132a9..000000000
--- a/pgml-sdks/pgml/python/pgml/pgml.pyi
+++ /dev/null
@@ -1,96 +0,0 @@
-
-def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
-async def migrate() -> None
-
-Json = Any
-DateTime = int
-
-# Top of file key: A12BECOD!
-from typing import List, Dict, Optional, Self, Any
-
-
-class Builtins:
-    def __init__(self, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
-    ...
-    def query(self, query: str) -> QueryRunner
-    ...
-    async def transform(self, task: Json, inputs: List[str], args: Optional[Json] = Any) -> Json
-    ...
-
-class Collection:
-    def __init__(self, name: str, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
-    ...
-    async def add_pipeline(self, pipeline: Pipeline) -> None
-    ...
-    async def remove_pipeline(self, pipeline: Pipeline) -> None
-    ...
-    async def enable_pipeline(self, pipeline: Pipeline) -> None
-    ...
-    async def disable_pipeline(self, pipeline: Pipeline) -> None
-    ...
-    async def upsert_documents(self, documents: List[Json], args: Optional[Json] = Any) -> None
-    ...
-    async def get_documents(self, args: Optional[Json] = Any) -> List[Json]
-    ...
-    async def delete_documents(self, filter: Json) -> None
-    ...
-    async def vector_search(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any, top_k: Optional[int] = 1) -> List[tuple[float, str, Json]]
-    ...
-    async def archive(self) -> None
-    ...
-    def query(self) -> QueryBuilder
-    ...
-    async def get_pipelines(self) -> List[Pipeline]
-    ...
-    async def get_pipeline(self, name: str) -> Pipeline
-    ...
-    async def exists(self) -> bool
-    ...
-    async def upsert_directory(self, path: str, args: Json) -> None
-    ...
-    async def upsert_file(self, path: str) -> None
-    ...
-
-class Model:
-    def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", source: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
-    ...
-
-class Pipeline:
-    def __init__(self, name: str, model: Optional[Model] = Any, splitter: Optional[Splitter] = Any, parameters: Optional[Json] = Any) -> Self
-    ...
-    async def get_status(self) -> PipelineSyncData
-    ...
-    async def to_dict(self) -> Json
-    ...
-
-class QueryBuilder:
-    def limit(self, limit: int) -> Self
-    ...
-    def filter(self, filter: Json) -> Self
-    ...
-    def vector_recall(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any) -> Self
-    ...
-    async def fetch_all(self) -> List[tuple[float, str, Json]]
-    ...
-    def to_full_string(self) -> str
-    ...
-
-class QueryRunner:
-    async def fetch_all(self) -> Json
-    ...
-    async def execute(self) -> None
-    ...
-    def bind_string(self, bind_value: str) -> Self
-    ...
-    def bind_int(self, bind_value: int) -> Self
-    ...
-    def bind_float(self, bind_value: float) -> Self
-    ...
-    def bind_bool(self, bind_value: bool) -> Self
-    ...
-    def bind_json(self, bind_value: Json) -> Self
-    ...
-
-class Splitter:
-    def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any) -> Self
-    ...
diff --git a/pgml-sdks/pgml/python/tests/test.py b/pgml-sdks/pgml/python/tests/test.py
index 673b2b876..97ca155f5 100644
--- a/pgml-sdks/pgml/python/tests/test.py
+++ b/pgml-sdks/pgml/python/tests/test.py
@@ -298,6 +298,27 @@ async def test_order_documents():
     await collection.archive()
 
 
+###################################################
+## Transformer Pipeline Tests #####################
+###################################################
+
+
+@pytest.mark.asyncio
+async def test_transformer_pipeline():
+    t = pgml.TransformerPipeline("text-generation")
+    it = await t.transform(["AI is going to"], {"max_new_tokens": 5})
+    assert (len(it)) > 0
+
+@pytest.mark.asyncio
+async def test_transformer_pipeline_stream():
+    t = pgml.TransformerPipeline("text-generation")
+    it = await t.transform_stream("AI is going to", {"max_new_tokens": 5})
+    total = []
+    async for c in it:
+        total.append(c)
+    assert (len(total)) > 0
+
+
 ###################################################
 ## Migration tests ################################
 ###################################################
diff --git a/pgml-sdks/pgml/src/languages/javascript.rs b/pgml-sdks/pgml/src/languages/javascript.rs
index 2830ff8a1..1aafd654b 100644
--- a/pgml-sdks/pgml/src/languages/javascript.rs
+++ b/pgml-sdks/pgml/src/languages/javascript.rs
@@ -1,8 +1,11 @@
+use futures::StreamExt;
 use neon::prelude::*;
 use rust_bridge::javascript::{FromJsType, IntoJsResult};
+use std::sync::Arc;
 
 use crate::{
     pipeline::PipelineSyncData,
+    transformer_pipeline::TransformerStream,
     types::{DateTime, Json},
 };
 
@@ -16,8 +19,9 @@ impl IntoJsResult for DateTime {
         self,
         cx: &mut C,
     ) -> JsResult<'b, Self::Output> {
-        let date = neon::types::JsDate::new(cx, self.0.assume_utc().unix_timestamp() as f64 * 1000.0)
-            .expect("Error converting to JS Date");
+        let date =
+            neon::types::JsDate::new(cx, self.0.assume_utc().unix_timestamp() as f64 * 1000.0)
+                .expect("Error converting to JS Date");
         Ok(date)
     }
 }
@@ -69,6 +73,64 @@ impl IntoJsResult for PipelineSyncData {
     }
 }
 
+#[derive(Clone)]
+struct TransformerStreamArcMutex(Arc<tokio::sync::Mutex<TransformerStream>>);
+
+impl Finalize for TransformerStreamArcMutex {}
+
+fn transform_stream_iterate_next(mut cx: FunctionContext) -> JsResult<JsPromise> {
+    let this = cx.this();
+    let s: Handle<JsBox<TransformerStreamArcMutex>> = this
+        .get(&mut cx, "s")
+        .expect("Error getting self in transformer_stream_iterate_next");
+    let ts: &TransformerStreamArcMutex = &s;
+    let ts: TransformerStreamArcMutex = ts.clone();
+
+    let channel = cx.channel();
+    let (deferred, promise) = cx.promise();
+    crate::get_or_set_runtime().spawn(async move {
+        let mut ts = ts.0.lock().await;
+        let v = ts.next().await;
+        deferred
+            .try_settle_with(&channel, move |mut cx| {
+                let o = cx.empty_object();
+                if let Some(v) = v {
+                    let v: String = v.expect("Error calling next on TransformerStream");
+                    let v = cx.string(v);
+                    let d = cx.boolean(false);
+                    o.set(&mut cx, "value", v)
+                        .expect("Error setting object value in transformer_sream_iterate_next");
+                    o.set(&mut cx, "done", d)
+                        .expect("Error setting object value in transformer_sream_iterate_next");
+                } else {
+                    let d = cx.boolean(true);
+                    o.set(&mut cx, "done", d)
+                        .expect("Error setting object value in transformer_sream_iterate_next");
+                }
+                Ok(o)
+            })
+            .expect("Error sending js");
+    });
+    Ok(promise)
+}
+
+impl IntoJsResult for TransformerStream {
+    type Output = JsObject;
+    fn into_js_result<'a, 'b, 'c: 'b, C: Context<'c>>(
+        self,
+        cx: &mut C,
+    ) -> JsResult<'b, Self::Output> {
+        let o = cx.empty_object();
+        let f: Handle<JsFunction> = JsFunction::new(cx, transform_stream_iterate_next)?;
+        o.set(cx, "next", f)?;
+        let s = cx.boxed(TransformerStreamArcMutex(Arc::new(
+            tokio::sync::Mutex::new(self),
+        )));
+        o.set(cx, "s", s)?;
+        Ok(o)
+    }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // JS To Rust //////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
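The neon glue above hands JavaScript a plain object whose `next` method settles a promise with `{value, done}` pairs, which is exactly the JS async-iterator protocol the new `transform_stream` test consumes. The stream itself is stored on that object behind `Arc<tokio::sync::Mutex<..>>` so that each `next` call, spawned onto the Tokio runtime, takes its turn pulling one item. A minimal, self-contained sketch of that sharing pattern, with `futures::stream::iter` standing in for the real `TransformerStream` (the `main` function and token values here are illustrative only, not SDK code):

```rust
use futures::StreamExt;
use std::sync::Arc;

#[tokio::main]
async fn main() {
    // Stand-in for TransformerStream: any futures::Stream works here.
    let stream = futures::stream::iter(vec!["AI", " is", " going", " to"]);
    let shared = Arc::new(tokio::sync::Mutex::new(stream));

    // Each simulated `next()` call clones the Arc and holds the lock only
    // long enough to pull one item -- the same discipline the bridge uses,
    // so overlapping calls from JS can never poll the stream concurrently.
    for _ in 0..5 {
        let item = shared.clone().lock().await.next().await;
        match item {
            Some(v) => println!("{{ value: {v:?}, done: false }}"),
            None => println!("{{ done: true }}"),
        }
    }
}
```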
diff --git a/pgml-sdks/pgml/src/languages/python.rs b/pgml-sdks/pgml/src/languages/python.rs
index 3d81c9377..2cf1bcf9c 100644
--- a/pgml-sdks/pgml/src/languages/python.rs
+++ b/pgml-sdks/pgml/src/languages/python.rs
@@ -1,65 +1,99 @@
+use futures::StreamExt;
 use pyo3::conversion::IntoPy;
 use pyo3::types::{PyDict, PyFloat, PyInt, PyList, PyString};
 use pyo3::{prelude::*, types::PyBool};
+use std::sync::Arc;
 
 use rust_bridge::python::CustomInto;
 
-use crate::{pipeline::PipelineSyncData, types::Json};
+use crate::{pipeline::PipelineSyncData, transformer_pipeline::TransformerStream, types::Json};
 
 ////////////////////////////////////////////////////////////////////////////////
 // Rust to PY //////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
 
-impl ToPyObject for Json {
-    fn to_object(&self, py: Python) -> PyObject {
+impl IntoPy<PyObject> for Json {
+    fn into_py(self, py: Python) -> PyObject {
         match &self.0 {
-            serde_json::Value::Bool(x) => x.to_object(py),
+            serde_json::Value::Bool(x) => x.into_py(py),
             serde_json::Value::Number(x) => {
                 if x.is_f64() {
                     x.as_f64()
                         .expect("Error converting to f64 in impl ToPyObject for Json")
-                        .to_object(py)
+                        .into_py(py)
                 } else {
                     x.as_i64()
                         .expect("Error converting to i64 in impl ToPyObject for Json")
-                        .to_object(py)
+                        .into_py(py)
                 }
             }
-            serde_json::Value::String(x) => x.to_object(py),
+            serde_json::Value::String(x) => x.into_py(py),
             serde_json::Value::Array(x) => {
                 let list = PyList::empty(py);
                 for v in x.iter() {
-                    list.append(Json(v.clone()).to_object(py)).unwrap();
+                    list.append(Json(v.clone()).into_py(py)).unwrap();
                 }
-                list.to_object(py)
+                list.into_py(py)
             }
             serde_json::Value::Object(x) => {
                 let dict = PyDict::new(py);
                 for (k, v) in x.iter() {
-                    dict.set_item(k, Json(v.clone()).to_object(py)).unwrap();
+                    dict.set_item(k, Json(v.clone()).into_py(py)).unwrap();
                }
-                dict.to_object(py)
+                dict.into_py(py)
             }
             serde_json::Value::Null => py.None(),
         }
     }
 }
 
-impl IntoPy<PyObject> for Json {
+impl IntoPy<PyObject> for PipelineSyncData {
     fn into_py(self, py: Python) -> PyObject {
-        self.to_object(py)
+        Json::from(self).into_py(py)
     }
 }
 
-impl ToPyObject for PipelineSyncData {
-    fn to_object(&self, py: Python) -> PyObject {
-        Json::from(self.clone()).to_object(py)
+#[pyclass]
+#[derive(Clone)]
+struct TransformerStreamPython {
+    wrapped: Arc<tokio::sync::Mutex<TransformerStream>>,
+}
+
+#[pymethods]
+impl TransformerStreamPython {
+    fn __aiter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
+        slf
+    }
+
+    fn __anext__<'p>(slf: PyRefMut<'_, Self>, py: Python<'p>) -> PyResult<Option<PyObject>> {
+        let ts = slf.wrapped.clone();
+        let fut = pyo3_asyncio::tokio::future_into_py(py, async move {
+            let mut ts = ts.lock().await;
+            if let Some(o) = ts.next().await {
+                Ok(Some(Python::with_gil(|py| {
+                    o.expect("Error calling next on TransformerStream")
+                        .to_object(py)
+                })))
+            } else {
+                Err(pyo3::exceptions::PyStopAsyncIteration::new_err(
+                    "stream exhausted",
+                ))
+            }
+        })?;
+        Ok(Some(fut.into()))
     }
 }
 
-impl IntoPy<PyObject> for PipelineSyncData {
+impl IntoPy<PyObject> for TransformerStream {
     fn into_py(self, py: Python) -> PyObject {
-        self.to_object(py)
+        let f: Py<TransformerStreamPython> = Py::new(
+            py,
+            TransformerStreamPython {
+                wrapped: Arc::new(tokio::sync::Mutex::new(self)),
+            },
+        )
+        .expect("Error converting TransformerStream to TransformerStreamPython");
+        f.to_object(py)
     }
 }
 
@@ -115,6 +149,12 @@ impl FromPyObject<'_> for PipelineSyncData {
     }
 }
 
+impl FromPyObject<'_> for TransformerStream {
+    fn extract(_ob: &PyAny) -> PyResult<Self> {
+        panic!("We must implement this, but this is impossible to be reached")
+    }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Rust to Rust ////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
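On the Python side, the same stream surfaces as a class implementing `__aiter__`/`__anext__`, so `async for` works directly: each `__anext__` returns an awaitable that yields the next chunk, or raises `StopAsyncIteration` once the stream is exhausted. A stripped-down sketch of that contract, iterating an in-memory queue instead of a database-backed stream (the `Words` type is hypothetical, not part of the SDK, and assumes the same `pyo3-asyncio` Tokio runtime this crate already uses):

```rust
use pyo3::prelude::*;
use std::collections::VecDeque;
use std::sync::Arc;

// Hypothetical async iterator: yields one queued word per __anext__ call.
#[pyclass]
struct Words {
    items: Arc<tokio::sync::Mutex<VecDeque<String>>>,
}

#[pymethods]
impl Words {
    // `async for` first calls __aiter__; this object is its own iterator.
    fn __aiter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
        slf
    }

    // Each call returns an awaitable resolving to the next item, or raising
    // StopAsyncIteration to terminate the `async for` loop.
    fn __anext__<'p>(slf: PyRefMut<'_, Self>, py: Python<'p>) -> PyResult<Option<PyObject>> {
        let items = slf.items.clone();
        let fut = pyo3_asyncio::tokio::future_into_py(py, async move {
            match items.lock().await.pop_front() {
                Some(w) => Ok(w),
                None => Err(pyo3::exceptions::PyStopAsyncIteration::new_err(
                    "stream exhausted",
                )),
            }
        })?;
        Ok(Some(fut.into()))
    }
}
```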
diff --git a/pgml-sdks/pgml/src/transformer_pipeline.rs b/pgml-sdks/pgml/src/transformer_pipeline.rs
index f28e3106b..f7b5f417f 100644
--- a/pgml-sdks/pgml/src/transformer_pipeline.rs
+++ b/pgml-sdks/pgml/src/transformer_pipeline.rs
@@ -1,5 +1,11 @@
-use rust_bridge::{alias, alias_methods};
-use sqlx::Row;
+use futures::Stream;
+use rust_bridge::{alias, alias_manual, alias_methods};
+use sqlx::{postgres::PgRow, Row};
+use sqlx::{Postgres, Transaction};
+use std::collections::VecDeque;
+use std::future::Future;
+use std::pin::Pin;
+use std::task::Poll;
 use tracing::instrument;
 
 /// Provides access to builtin database methods
@@ -14,7 +20,100 @@ use crate::{get_or_initialize_pool, types::Json};
 #[cfg(feature = "python")]
 use crate::types::JsonPython;
 
-#[alias_methods(new, transform)]
+#[derive(alias_manual)]
+pub struct TransformerStream {
+    transaction: Option<Transaction<'static, Postgres>>,
+    future: Option<Pin<Box<dyn Future<Output = Result<Vec<PgRow>, sqlx::Error>> + Send + 'static>>>,
+    commit: Option<Pin<Box<dyn Future<Output = Result<(), sqlx::Error>> + Send + 'static>>>,
+    done: bool,
+    query: String,
+    db_batch_size: i32,
+    results: VecDeque<PgRow>,
+}
+
+impl std::fmt::Debug for TransformerStream {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("TransformerStream").finish()
+    }
+}
+
+impl TransformerStream {
+    fn new(transaction: Transaction<'static, Postgres>, db_batch_size: i32) -> Self {
+        let query = format!("FETCH {} FROM c", db_batch_size);
+        Self {
+            transaction: Some(transaction),
+            future: None,
+            commit: None,
+            done: false,
+            query,
+            db_batch_size,
+            results: VecDeque::new(),
+        }
+    }
+}
+
+impl Stream for TransformerStream {
+    type Item = Result<String, sqlx::Error>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        if self.done {
+            if let Some(c) = self.commit.as_mut() {
+                if let Poll::Ready(_) = c.as_mut().poll(cx) {
+                    self.commit = None;
+                }
+            }
+        } else {
+            if self.future.is_none() {
+                unsafe {
+                    let s = self.as_mut().get_unchecked_mut();
+                    let s: *mut Self = s;
+                    let s = Box::leak(Box::from_raw(s));
+                    s.future = Some(Box::pin(
+                        sqlx::query(&s.query).fetch_all(s.transaction.as_mut().unwrap()),
+                    ));
+                }
+            }
+
+            if let Poll::Ready(o) = self.as_mut().future.as_mut().unwrap().as_mut().poll(cx) {
+                let rows = o?;
+                if rows.len() < self.db_batch_size as usize {
+                    self.done = true;
+                    unsafe {
+                        let s = self.as_mut().get_unchecked_mut();
+                        let transaction = std::mem::take(&mut s.transaction).unwrap();
+                        s.commit = Some(Box::pin(transaction.commit()));
+                    }
+                } else {
+                    unsafe {
+                        let s = self.as_mut().get_unchecked_mut();
+                        let s: *mut Self = s;
+                        let s = Box::leak(Box::from_raw(s));
+                        s.future = Some(Box::pin(
+                            sqlx::query(&s.query).fetch_all(s.transaction.as_mut().unwrap()),
+                        ));
+                    }
+                }
+                for r in rows.into_iter() {
+                    self.results.push_back(r)
+                }
+            }
+        }
+
+        if !self.results.is_empty() {
+            let r = self.results.pop_front().unwrap();
+            Poll::Ready(Some(Ok(r.get::<String, _>(0))))
+        } else if self.done {
+            Poll::Ready(None)
+        } else {
+            Poll::Pending
+        }
+    }
+}
+
+#[alias_methods(new, transform, transform_stream)]
 impl TransformerPipeline {
     pub fn new(
         task: &str,
@@ -54,12 +153,37 @@ impl TransformerPipeline {
         let results = results.get(0).unwrap().get::<serde_json::Value, _>(0);
         Ok(Json(results))
     }
+
+    #[instrument(skip(self))]
+    pub async fn transform_stream(
+        &self,
+        input: &str,
+        args: Option<Json>,
+        batch_size: Option<i32>,
+    ) -> anyhow::Result<TransformerStream> {
+        let pool = get_or_initialize_pool(&self.database_url).await?;
+        let args = args.unwrap_or_default();
+        let batch_size = batch_size.unwrap_or(10);
+
+        let mut transaction = pool.begin().await?;
+        sqlx::query(
+            "DECLARE c CURSOR FOR SELECT pgml.transform_stream(task => $1, input => $2, args => $3)",
+        )
+        .bind(&self.task)
+        .bind(input)
+        .bind(&args)
+        .execute(&mut *transaction)
+        .await?;
+
+        Ok(TransformerStream::new(transaction, batch_size))
+    }
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
     use crate::internal_init_logger;
+    use futures::StreamExt;
 
     #[sqlx::test]
     async fn transformer_pipeline_can_transform() -> anyhow::Result<()> {
@@ -99,4 +223,36 @@ mod tests {
         assert!(results.as_array().is_some());
         Ok(())
     }
+
+    #[sqlx::test]
+    async fn transformer_can_transform_stream() -> anyhow::Result<()> {
+        internal_init_logger(None, None).ok();
+        let t = TransformerPipeline::new(
+            "text-generation",
+            Some("TheBloke/zephyr-7B-beta-GGUF".to_string()),
+            Some(
+                serde_json::json!({
+                    "model_file": "zephyr-7b-beta.Q5_K_M.gguf", "model_type": "mistral"
+                })
+                .into(),
+            ),
+            None,
+        );
+        let mut stream = t
+            .transform_stream(
+                "AI is going to",
+                Some(
+                    serde_json::json!({
+                        "max_new_tokens": 10
+                    })
+                    .into(),
+                ),
+                None,
+            )
+            .await?;
+        while let Some(o) = stream.next().await {
+            o?;
+        }
+        Ok(())
+    }
 }
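`transform_stream` works by declaring a server-side cursor over `pgml.transform_stream` inside a transaction; the `Stream` implementation above then issues one `FETCH` per batch, buffers the returned rows in its `VecDeque`, and commits the transaction once a short batch signals exhaustion. From Rust the handle is an ordinary `futures::Stream`, consumed just as the new test does; a sketch of a caller (the function itself is illustrative, not SDK API):

```rust
use futures::StreamExt;

// Illustrative consumer: prints each generated chunk as it arrives.
async fn print_generation(t: &pgml::TransformerPipeline) -> anyhow::Result<()> {
    let mut stream = t
        .transform_stream(
            "AI is going to",
            Some(serde_json::json!({ "max_new_tokens": 10 }).into()),
            None, // defaults to fetching 10 rows per FETCH round trip
        )
        .await?;
    while let Some(chunk) = stream.next().await {
        print!("{}", chunk?); // each item is one streamed text chunk
    }
    Ok(())
}
```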
diff --git a/pgml-sdks/rust-bridge/rust-bridge-macros/src/javascript.rs b/pgml-sdks/rust-bridge/rust-bridge-macros/src/javascript.rs
index 5d7f76b02..b38fe2dfc 100644
--- a/pgml-sdks/rust-bridge/rust-bridge-macros/src/javascript.rs
+++ b/pgml-sdks/rust-bridge/rust-bridge-macros/src/javascript.rs
@@ -235,10 +235,7 @@ pub fn generate_javascript_methods(
     let middle = if method.is_async {
         quote! {
-            // let runtime = crate::get_or_set_runtime();
-            // let x = runtime.block_on(#wrapped_call);
             let x = #wrapped_call.await;
-
         }
     } else {
         quote! {
diff --git a/pgml-sdks/rust-bridge/rust-bridge-macros/src/python.rs b/pgml-sdks/rust-bridge/rust-bridge-macros/src/python.rs
index 3060656d0..b0df89c51 100644
--- a/pgml-sdks/rust-bridge/rust-bridge-macros/src/python.rs
+++ b/pgml-sdks/rust-bridge/rust-bridge-macros/src/python.rs
@@ -14,7 +14,6 @@ from typing import List, Dict, Optional, Self, Any
 
 /// This function assumes the user has already impliemented:
 /// - `FromPyObject` for the wrapped type
-/// - `ToPyObject` for the wrapped type
 /// - `IntoPy` for the wrapped type
 pub fn generate_alias_manual(parsed: DeriveInput) -> proc_macro::TokenStream {
     let name_ident = format_ident!("{}Python", parsed.ident);
@@ -22,7 +21,6 @@ pub fn generate_alias_manual(parsed: DeriveInput) -> proc_macro::TokenStream {
     let expanded = quote! {
         #[cfg(feature = "python")]
-        #[derive(Clone, Debug)]
         pub struct #name_ident {
             pub wrapped: #wrapped_type_ident
         }
@@ -55,17 +53,10 @@ pub fn generate_alias_manual(parsed: DeriveInput) -> proc_macro::TokenStream {
 
         // From Rust to Python
         #[cfg(feature = "python")]
-        impl pyo3::conversion::ToPyObject for #name_ident {
-            fn to_object(&self, py: pyo3::Python) -> pyo3::PyObject {
-                use pyo3::conversion::ToPyObject;
-                self.wrapped.to_object(py)
-            }
-        }
-
-        #[cfg(feature = "python")]
         impl pyo3::conversion::IntoPy<pyo3::PyObject> for #name_ident {
             fn into_py(self, py: pyo3::Python) -> pyo3::PyObject {
                 use pyo3::conversion::ToPyObject;
-                self.wrapped.to_object(py)
+                self.wrapped.into_py(py)
             }
         }
     };
