
add PCA as first decomposition method #1441

Merged: 5 commits merged on May 6, 2024
Changes from 1 commit
add PCA as first decomposition method
montanalow committed May 5, 2024
commit 3d5d6d575385cf9916e4bc226a8322039cec0926
File renamed without changes.
2 changes: 1 addition & 1 deletion pgml-extension/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pgml-extension/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "pgml"
version = "2.8.3"
version = "2.8.4"
edition = "2021"

[lib]
60 changes: 60 additions & 0 deletions pgml-extension/examples/decomposition.sql
@@ -0,0 +1,60 @@
-- This example reduces the dimensionality of images in the sklearn digits dataset
-- which is a copy of the test set of the UCI ML hand-written digits datasets
-- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
--
-- This demonstrates using a table with a single array feature column
-- for decomposition to reduce dimensionality.
--
-- Exit on error (psql)
-- \set ON_ERROR_STOP true
\timing on

SELECT pgml.load_dataset('digits');

-- view the dataset
SELECT left(image::text, 40) || ',...}', target FROM pgml.digits LIMIT 10;

-- create a view of just the vectors for decomposition, without any labels
CREATE VIEW digit_vectors AS
SELECT image FROM pgml.digits;

SELECT * FROM pgml.train('Handwritten Digits Reduction', 'decomposition', 'digit_vectors');

-- check out the decomposed vectors
SELECT target, pgml.decompose('Handwritten Digits Reduction', image) AS pca
FROM pgml.digits
LIMIT 10;

--
-- After a project has been trained, omitted parameters will be reused from previous training runs
-- In these examples we'll reuse the training data snapshots from the initial call.
--

-- We can reduce the image vectors from 64 dimensions to 3 components
SELECT * FROM pgml.train('Handwritten Digits Reduction', hyperparams => '{"n_components": 3}');

-- check out the reduced vectors
SELECT target, pgml.decompose('Handwritten Digits Reduction', image) AS pca
FROM pgml.digits
LIMIT 10;

-- check out all that hard work
SELECT trained_models.* FROM pgml.trained_models
JOIN pgml.models on models.id = trained_models.id
ORDER BY models.metrics->>'cumulative_explained_variance' DESC LIMIT 5;

-- deploy the PCA model for prediction use
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'most_recent', 'pca');
-- check out that throughput
SELECT * FROM pgml.deployed_models ORDER BY deployed_at DESC LIMIT 5;

-- deploy the "best" model for prediction use
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'best_score');
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'most_recent');
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'rollback');
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'best_score', 'pca');

-- check out the improved predictions
SELECT target, pgml.predict('Handwritten Digits Reduction', image) AS prediction
FROM pgml.digits
LIMIT 10;
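For reference, the following is a rough standalone scikit-learn sketch of what this example computes outside of Postgres. The digits dataset, the 3-component reduction, and the cumulative_explained_variance metric come from the diff; everything else is illustrative and is not the pgml implementation.

# Rough scikit-learn equivalent of the SQL example above (illustrative only, not pgml internals).
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA

X, y = load_digits(return_X_y=True)          # 1797 flattened 8x8 images, 64 features each

pca = PCA(n_components=3)                    # reduce 64 dimensions to 3 components
components = pca.fit_transform(X)            # roughly what pgml.decompose() returns per row

print(components[:10])                       # compare with the LIMIT 10 query above
print(sum(pca.explained_variance_ratio_))    # the 'cumulative_explained_variance' metric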
5 changes: 2 additions & 3 deletions pgml-extension/examples/image_classification.sql
@@ -5,9 +5,8 @@
-- This demonstrates using a table with a single array feature column
-- for classification.
--
-- The final result after a few seconds of training is not terrible. Maybe not perfect
-- enough for mission critical applications, but it's telling how quickly "off the shelf"
-- solutions can solve problems these days.
-- Some algorithms converge on this trivial dataset in under a second, demonstrating the
-- speed with which modern machines can "learn" from example data.

-- Exit on error (psql)
-- \set ON_ERROR_STOP true
2 changes: 1 addition & 1 deletion pgml-extension/examples/regression.sql
@@ -1,4 +1,4 @@
-- This example trains models on the sklean diabetes dataset
-- This example trains models on the sklearn diabetes dataset
-- Source URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
-- For more information see:
-- Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004)
13 changes: 13 additions & 0 deletions pgml-extension/sql/pgml--2.8.3--2.8.4.sql
@@ -0,0 +1,13 @@
ALTER TYPE pgml.task ADD VALUE IF NOT EXISTS 'decomposition';

ALTER TYPE pgml.algorithm ADD VALUE IF NOT EXISTS 'pca';
ALTER TYPE pgml.task RENAME VALUE 'cluster' TO 'clustering';

-- pgml::api::decompose
CREATE FUNCTION pgml."decompose"(
    "project_name" TEXT, /* alloc::string::String */
    "vector" FLOAT4[] /* Vec<f32> */
) RETURNS FLOAT4[] /* Vec<f32> */
IMMUTABLE STRICT PARALLEL SAFE
LANGUAGE c /* Rust */
AS 'MODULE_PATHNAME', 'decompose_wrapper';
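Once this migration runs, the new pgml.decompose() function is callable from any SQL client. A minimal client-side sketch, assuming psycopg2 and a local pgml_development database (both assumptions, not part of the PR):

# Hypothetical call to the new pgml.decompose() function from Python.
# The DSN and the zero vector are placeholders, not values from this PR.
import psycopg2

conn = psycopg2.connect("dbname=pgml_development")  # assumed local database
with conn.cursor() as cur:
    vector = [0.0] * 64  # one flattened 8x8 digit image
    cur.execute(
        "SELECT pgml.decompose(%s, %s::real[])",
        ("Handwritten Digits Reduction", vector),
    )
    reduced = cur.fetchone()[0]  # FLOAT4[] arrives as a Python list
    print(reduced)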
11 changes: 10 additions & 1 deletion pgml-extension/src/api.rs
Expand Up @@ -225,8 +225,10 @@ fn train_joint(
};

    // fix up default algorithm for clustering and decomposition
    let algorithm = if algorithm == Algorithm::linear && project.task == Task::cluster {
    let algorithm = if algorithm == Algorithm::linear && project.task == Task::clustering {
        Algorithm::kmeans
    } else if algorithm == Algorithm::linear && project.task == Task::decomposition {
        Algorithm::pca
    } else {
        algorithm
    };
@@ -482,6 +484,13 @@ fn predict_batch(project_name: &str, features: Vec<f32>) -> SetOfIterator<'stati
))
}

#[pg_extern(immutable, parallel_safe, strict, name = "decompose")]
fn decompose(project_name: &str, vector: Vec<f32>) -> Vec<f32> {
let model_id = Project::get_deployed_model_id(project_name);
let model = unwrap_or_error!(Model::find_cached(model_id));
unwrap_or_error!(model.decompose(&vector))
}

#[pg_extern(immutable, parallel_safe, strict, name = "predict")]
fn predict_row(project_name: &str, row: pgrx::datum::AnyElement) -> f32 {
predict_model_row(Project::get_deployed_model_id(project_name), row)
16 changes: 14 additions & 2 deletions pgml-extension/src/bindings/mod.rs
@@ -78,12 +78,24 @@ pub mod xgboost;

pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Result<Box<dyn Bindings>>;

use std::any::Any;

pub trait AToAny: 'static {
    fn as_any(&self) -> &dyn Any;
}

impl<T: 'static> AToAny for T {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

/// The Bindings trait that has to be implemented by all algorithm
/// providers we use in PostgresML. We don't rely on Serde serialization,
/// since scikit-learn estimators were originally serialized in pure Python as
/// pickled objects, and neither xgboost or linfa estimators completely
/// pickled objects, and neither xgboost nor linfa estimators completely
/// implement serde.
pub trait Bindings: Send + Sync + Debug {
pub trait Bindings: Send + Sync + Debug + AToAny {
/// Predict a set of datapoints.
fn predict(&self, features: &[f32], num_features: usize, num_classes: usize) -> Result<Vec<f32>>;

30 changes: 25 additions & 5 deletions pgml-extension/src/bindings/sklearn/mod.rs
@@ -14,7 +14,8 @@ use anyhow::Result;
use pyo3::prelude::*;
use pyo3::types::PyTuple;

use crate::{bindings::Bindings, create_pymodule, orm::*};
use crate::{bindings::{Bindings, TracebackError}, create_pymodule, orm::*};


create_pymodule!("/src/bindings/sklearn/sklearn.py");

@@ -35,8 +36,8 @@ wrap_fit!(random_forest_regression, "random_forest_regression");
wrap_fit!(xgboost_regression, "xgboost_regression");
wrap_fit!(xgboost_random_forest_regression, "xgboost_random_forest_regression");
wrap_fit!(
orthogonal_matching_persuit_regression,
"orthogonal_matching_persuit_regression"
orthogonal_matching_pursuit_regression,
"orthogonal_matching_pursuit_regression"
);
wrap_fit!(bayesian_ridge_regression, "bayesian_ridge_regression");
wrap_fit!(
@@ -109,6 +110,8 @@ wrap_fit!(spectral, "spectral_clustering");
wrap_fit!(spectral_bi, "spectral_biclustering");
wrap_fit!(spectral_co, "spectral_coclustering");

wrap_fit!(pca, "pca_decomposition");

fn fit(dataset: &Dataset, hyperparams: &Hyperparams, algorithm_task: &'static str) -> Result<Box<dyn Bindings>> {
let hyperparams = serde_json::to_string(hyperparams).unwrap();

@@ -293,9 +296,9 @@ pub fn classification_metrics(ground_truth: &[f32], y_hat: &[f32], num_classes:
    Ok(scores)
}

pub fn cluster_metrics(num_features: usize, inputs: &[f32], labels: &[f32]) -> Result<HashMap<String, f32>> {
pub fn clustering_metrics(num_features: usize, inputs: &[f32], labels: &[f32]) -> Result<HashMap<String, f32>> {
    Python::with_gil(|py| {
        let calculate_metric = get_module!(PY_MODULE).getattr(py, "cluster_metrics")?;
        let calculate_metric = get_module!(PY_MODULE).getattr(py, "clustering_metrics")?;

        let scores: HashMap<String, f32> = calculate_metric
            .call1(py, (num_features, PyTuple::new(py, [inputs, labels])))?
@@ -304,3 +307,20 @@ pub fn cluster_metrics(num_features: usize, inputs: &[f32], labels: &[f32]) -> R
        Ok(scores)
    })
}

pub fn decomposition_metrics(bindings: &Box<dyn Bindings>) -> Result<HashMap<String, f32>> {
    Python::with_gil(|py| {
        match bindings.as_any().downcast_ref::<Estimator>() {
            Some(estimator) => {
                let calculate_metric = get_module!(PY_MODULE).getattr(py, "decomposition_metrics")?;
                let metrics = calculate_metric
                    .call1(py, PyTuple::new(py, [&estimator.estimator]));
                let metrics = metrics
                    .format_traceback(py)?
                    .extract(py).format_traceback(py)?;
                Ok(metrics)
            }
            None => error!("Can't compute decomposition metrics for bindings other than sklearn")
        }
    })
}
17 changes: 14 additions & 3 deletions pgml-extension/src/bindings/sklearn/sklearn.py
@@ -43,7 +43,7 @@
    "elastic_net_regression": sklearn.linear_model.ElasticNet,
    "least_angle_regression": sklearn.linear_model.Lars,
    "lasso_least_angle_regression": sklearn.linear_model.LassoLars,
    "orthogonal_matching_persuit_regression": sklearn.linear_model.OrthogonalMatchingPursuit,
    "orthogonal_matching_pursuit_regression": sklearn.linear_model.OrthogonalMatchingPursuit,
    "bayesian_ridge_regression": sklearn.linear_model.BayesianRidge,
    "automatic_relevance_determination_regression": sklearn.linear_model.ARDRegression,
    "stochastic_gradient_descent_regression": sklearn.linear_model.SGDRegressor,
@@ -95,6 +95,7 @@
    "spectral_clustering": sklearn.cluster.SpectralClustering,
    "spectral_biclustering": sklearn.cluster.SpectralBiclustering,
    "spectral_coclustering": sklearn.cluster.SpectralCoclustering,
    "pca_decomposition": sklearn.decomposition.PCA,
}


@@ -182,7 +183,10 @@ def predictor_joint(estimator, num_targets):

    def predict(X):
        X = np.asarray(X).reshape((-1, estimator.n_features_in_))
        y_hat = estimator.predict(X)
        if hasattr(estimator.__class__, 'predict'):
            y_hat = estimator.predict(X)
        else:
            y_hat = estimator.transform(X)

        # Only support single value models for just now.
        if num_targets == 1:
@@ -238,6 +242,8 @@ def calculate_metric(metric_name):
        func = mean_absolute_error
    elif metric_name == "confusion_matrix":
        func = confusion_matrix
    elif metric_name == "variance":
        func = variance
    else:
        raise Exception(f"Unknown metric requested: {metric_name}")

@@ -300,10 +306,15 @@ def classification_metrics(y_true, y_hat):
    }


def cluster_metrics(num_features, inputs_labels):
def clustering_metrics(num_features, inputs_labels):
    inputs = np.asarray(inputs_labels[0]).reshape((-1, num_features))
    labels = np.asarray(inputs_labels[1]).reshape((-1, 1))

    return {
        "silhouette": silhouette_score(inputs, labels),
    }

def decomposition_metrics(pca):
    return {
        "cumulative_explained_variance": sum(pca.explained_variance_ratio_)
    }
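The predictor_joint change above routes estimators that lack a predict method, such as PCA, through transform instead. A minimal standalone check of that dispatch, assuming scikit-learn; the helper name and the estimators used here are illustrative, not part of the PR.

# Illustration of the hasattr-based dispatch added in predictor_joint:
# estimators without predict() fall back to transform().
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

def apply_estimator(estimator, x):
    X = np.asarray(x).reshape((-1, estimator.n_features_in_))
    if hasattr(estimator.__class__, "predict"):
        return estimator.predict(X)
    return estimator.transform(X)

X = np.random.rand(16, 4)
pca = PCA(n_components=2).fit(X)
reg = LinearRegression().fit(X, np.random.rand(16))

print(apply_estimator(pca, X[0]).shape)  # (1, 2): PCA has no predict(), so transform() is used
print(apply_estimator(reg, X[0]).shape)  # (1,): LinearRegression.predict() is used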
3 changes: 3 additions & 0 deletions pgml-extension/src/orm/algorithm.rs
@@ -48,6 +48,7 @@ pub enum Algorithm {
    spectral_bi,
    spectral_co,
    catboost,
    pca,
}

impl std::str::FromStr for Algorithm {
@@ -99,6 +100,7 @@ impl std::str::FromStr for Algorithm {
"spectral_bi" => Ok(Algorithm::spectral_bi),
"spectral_co" => Ok(Algorithm::spectral_co),
"catboost" => Ok(Algorithm::catboost),
"pca" => Ok(Algorithm::pca),
_ => Err(()),
}
}
@@ -151,6 +153,7 @@ impl std::string::ToString for Algorithm {
            Algorithm::spectral_bi => "spectral_bi".to_string(),
            Algorithm::spectral_co => "spectral_co".to_string(),
            Algorithm::catboost => "catboost".to_string(),
            Algorithm::pca => "pca".to_string(),
        }
    }
}