
add PCA as first decomposition method #1441

Merged: 5 commits merged on May 6, 2024
Changes from 1 commit
add PCA as first decomposition method
montanalow committed May 5, 2024
commit 3d5d6d575385cf9916e4bc226a8322039cec0926
File renamed without changes.
2 changes: 1 addition & 1 deletion pgml-extension/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pgml-extension/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "pgml"
version = "2.8.3"
version = "2.8.4"
edition = "2021"

[lib]
60 changes: 60 additions & 0 deletions pgml-extension/examples/decomposition.sql
@@ -0,0 +1,60 @@
-- This example reduces the dimensionality of images in the sklearn digits dataset
-- which is a copy of the test set of the UCI ML hand-written digits datasets
-- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
--
-- This demonstrates using a table with a single array feature column
-- for decomposition to reduce dimensionality.
--
-- Exit on error (psql)
-- \set ON_ERROR_STOP true
\timing on

SELECT pgml.load_dataset('digits');

-- view the dataset
SELECT left(image::text, 40) || ',...}', target FROM pgml.digits LIMIT 10;

-- create a view of just the vectors for decomposition, without any labels
CREATE VIEW digit_vectors AS
SELECT image FROM pgml.digits;

SELECT * FROM pgml.train('Handwritten Digits Reduction', 'decomposition', 'digit_vectors');

-- check out the decomposed vectors
SELECT target, pgml.decompose('Handwritten Digits Reduction', image) AS pca
FROM pgml.digits
LIMIT 10;

--
-- After a project has been trained, omitted parameters will be reused from previous training runs
-- In these examples we'll reuse the training data snapshots from the initial call.
--

-- We can reduce the image vectors from 64 dimensions to 3 components
SELECT * FROM pgml.train('Handwritten Digits Reduction', hyperparams => '{"n_components": 3}');

-- check out the reduced vectors
SELECT target, pgml.decompose('Handwritten Digits Reduction', image) AS pca
FROM pgml.digits
LIMIT 10;

-- check out all that hard work
SELECT trained_models.* FROM pgml.trained_models
JOIN pgml.models on models.id = trained_models.id
ORDER BY models.metrics->>'cumulative_explained_variance' DESC LIMIT 5;

-- deploy the PCA model for prediction use
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'most_recent', 'pca');
-- check out that throughput
SELECT * FROM pgml.deployed_models ORDER BY deployed_at DESC LIMIT 5;

-- deploy the "best" model for prediction use
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'best_score');
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'most_recent');
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'rollback');
SELECT * FROM pgml.deploy('Handwritten Digits Reduction', 'best_score', 'pca');

-- check out the improved predictions
SELECT target, pgml.predict('Handwritten Digits Reduction', image) AS prediction
FROM pgml.digits
LIMIT 10;
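For reference, the following is a rough standalone scikit-learn sketch of what this example computes outside of Postgres. The digits dataset, the 3-component reduction, and the cumulative_explained_variance metric come from the diff; everything else is illustrative and is not the pgml implementation.

# Rough scikit-learn equivalent of the SQL example above (illustrative only, not pgml internals).
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA

X, y = load_digits(return_X_y=True)          # 1797 flattened 8x8 images, 64 features each

pca = PCA(n_components=3)                    # reduce 64 dimensions to 3 components
components = pca.fit_transform(X)            # roughly what pgml.decompose() returns per row

print(components[:10])                       # compare with the LIMIT 10 query above
print(sum(pca.explained_variance_ratio_))    # the 'cumulative_explained_variance' metric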
5 changes: 2 additions & 3 deletions pgml-extension/examples/image_classification.sql
@@ -5,9 +5,8 @@
-- This demonstrates using a table with a single array feature column
-- for classification.
--
-- The final result after a few seconds of training is not terrible. Maybe not perfect
-- enough for mission critical applications, but it's telling how quickly "off the shelf"
-- solutions can solve problems these days.
-- Some algorithms converge on this trivial dataset in under a second, demonstrating the
-- speed with which modern machines can "learn" from example data.

-- Exit on error (psql)
-- \set ON_ERROR_STOP true
2 changes: 1 addition & 1 deletion pgml-extension/examples/regression.sql
@@ -1,4 +1,4 @@
-- This example trains models on the sklean diabetes dataset
-- This example trains models on the sklearn diabetes dataset
-- Source URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
-- For more information see:
-- Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004)
13 changes: 13 additions & 0 deletions pgml-extension/sql/pgml--2.8.3--2.8.4.sql
@@ -0,0 +1,13 @@
ALTER TYPE pgml.task ADD VALUE IF NOT EXISTS 'decomposition';

ALTER TYPE pgml.algorithm ADD VALUE IF NOT EXISTS 'pca';
ALTER TYPE pgml.task RENAME VALUE 'cluster' TO 'clustering';

-- pgml::api::decompose
CREATE FUNCTION pgml."decompose"(
    "project_name" TEXT, /* alloc::string::String */
    "vector" FLOAT4[] /* Vec<f32> */
) RETURNS FLOAT4[] /* Vec<f32> */
IMMUTABLE STRICT PARALLEL SAFE
LANGUAGE c /* Rust */
AS 'MODULE_PATHNAME', 'decompose_wrapper';
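Once this migration runs, the new pgml.decompose() function is callable from any SQL client. A minimal client-side sketch, assuming psycopg2 and a local pgml_development database (both assumptions, not part of the PR):

# Hypothetical call to the new pgml.decompose() function from Python.
# The DSN and the zero vector are placeholders, not values from this PR.
import psycopg2

conn = psycopg2.connect("dbname=pgml_development")  # assumed local database
with conn.cursor() as cur:
    vector = [0.0] * 64  # one flattened 8x8 digit image
    cur.execute(
        "SELECT pgml.decompose(%s, %s::real[])",
        ("Handwritten Digits Reduction", vector),
    )
    reduced = cur.fetchone()[0]  # FLOAT4[] arrives as a Python list
    print(reduced)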
11 changes: 10 additions & 1 deletion pgml-extension/src/api.rs
Expand Up @@ -225,8 +225,10 @@ fn train_joint(
};

    // fix up default algorithm for clustering and decomposition
    let algorithm = if algorithm == Algorithm::linear && project.task == Task::cluster {
    let algorithm = if algorithm == Algorithm::linear && project.task == Task::clustering {
        Algorithm::kmeans
    } else if algorithm == Algorithm::linear && project.task == Task::decomposition {
        Algorithm::pca
    } else {
        algorithm
    };
@@ -482,6 +484,13 @@ fn predict_batch(project_name: &str, features: Vec<f32>) -> SetOfIterator<'stati
))
}

#[pg_extern(immutable, parallel_safe, strict, name = "decompose")]
fn decompose(project_name: &str, vector: Vec<f32>) -> Vec<f32> {
let model_id = Project::get_deployed_model_id(project_name);
let model = unwrap_or_error!(Model::find_cached(model_id));
unwrap_or_error!(model.decompose(&vector))
}

#[pg_extern(immutable, parallel_safe, strict, name = "predict")]
fn predict_row(project_name: &str, row: pgrx::datum::AnyElement) -> f32 {
predict_model_row(Project::get_deployed_model_id(project_name), row)
16 changes: 14 additions & 2 deletions pgml-extension/src/bindings/mod.rs
@@ -78,12 +78,24 @@ pub mod xgboost;

pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Result<Box<dyn Bindings>>;

use std::any::Any;

pub trait AToAny: 'static {
    fn as_any(&self) -> &dyn Any;
}

impl<T: 'static> AToAny for T {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

/// The Bindings trait that has to be implemented by all algorithm
/// providers we use in PostgresML. We don't rely on Serde serialization,
/// since scikit-learn estimators were originally serialized in pure Python as
/// pickled objects, and neither xgboost or linfa estimators completely
/// pickled objects, and neither xgboost nor linfa estimators completely
/// implement serde.
pub trait Bindings: Send + Sync + Debug {
pub trait Bindings: Send + Sync + Debug + AToAny {
/// Predict a set of datapoints.
fn predict(&self, features: &[f32], num_features: usize, num_classes: usize) -> Result<Vec<f32>>;

30 changes: 25 additions & 5 deletions pgml-extension/src/bindings/sklearn/mod.rs
@@ -14,7 +14,8 @@ use anyhow::Result;
use pyo3::prelude::*;
use pyo3::types::PyTuple;

use crate::{bindings::Bindings, create_pymodule, orm::*};
use crate::{bindings::{Bindings, TracebackError}, create_pymodule, orm::*};


create_pymodule!("/src/bindings/sklearn/sklearn.py");

@@ -35,8 +36,8 @@ wrap_fit!(random_forest_regression, "random_forest_regression");
wrap_fit!(xgboost_regression, "xgboost_regression");
wrap_fit!(xgboost_random_forest_regression, "xgboost_random_forest_regression");
wrap_fit!(
orthogonal_matching_persuit_regression,
"orthogonal_matching_persuit_regression"
orthogonal_matching_pursuit_regression,
"orthogonal_matching_pursuit_regression"
);
wrap_fit!(bayesian_ridge_regression, "bayesian_ridge_regression");
wrap_fit!(
@@ -109,6 +110,8 @@ wrap_fit!(spectral, "spectral_clustering");
wrap_fit!(spectral_bi, "spectral_biclustering");
wrap_fit!(spectral_co, "spectral_coclustering");

wrap_fit!(pca, "pca_decomposition");

fn fit(dataset: &Dataset, hyperparams: &Hyperparams, algorithm_task: &'static str) -> Result<Box<dyn Bindings>> {
let hyperparams = serde_json::to_string(hyperparams).unwrap();

@@ -293,9 +296,9 @@ pub fn classification_metrics(ground_truth: &[f32], y_hat: &[f32], num_classes:
    Ok(scores)
}

pub fn cluster_metrics(num_features: usize, inputs: &[f32], labels: &[f32]) -> Result<HashMap<String, f32>> {
pub fn clustering_metrics(num_features: usize, inputs: &[f32], labels: &[f32]) -> Result<HashMap<String, f32>> {
    Python::with_gil(|py| {
        let calculate_metric = get_module!(PY_MODULE).getattr(py, "cluster_metrics")?;
        let calculate_metric = get_module!(PY_MODULE).getattr(py, "clustering_metrics")?;

        let scores: HashMap<String, f32> = calculate_metric
            .call1(py, (num_features, PyTuple::new(py, [inputs, labels])))?
@@ -304,3 +307,20 @@ pub fn cluster_metrics(num_features: usize, inputs: &[f32], labels: &[f32]) -> R
        Ok(scores)
    })
}

pub fn decomposition_metrics(bindings: &Box<dyn Bindings>) -> Result<HashMap<String, f32>> {
    Python::with_gil(|py| {
        match bindings.as_any().downcast_ref::<Estimator>() {
            Some(estimator) => {
                let calculate_metric = get_module!(PY_MODULE).getattr(py, "decomposition_metrics")?;
                let metrics = calculate_metric
                    .call1(py, PyTuple::new(py, [&estimator.estimator]));
                let metrics = metrics
                    .format_traceback(py)?
                    .extract(py).format_traceback(py)?;
                Ok(metrics)
            }
            None => error!("Can't compute decomposition metrics for bindings other than sklearn")
        }
    })
}
17 changes: 14 additions & 3 deletions pgml-extension/src/bindings/sklearn/sklearn.py
@@ -43,7 +43,7 @@
    "elastic_net_regression": sklearn.linear_model.ElasticNet,
    "least_angle_regression": sklearn.linear_model.Lars,
    "lasso_least_angle_regression": sklearn.linear_model.LassoLars,
    "orthogonal_matching_persuit_regression": sklearn.linear_model.OrthogonalMatchingPursuit,
    "orthogonal_matching_pursuit_regression": sklearn.linear_model.OrthogonalMatchingPursuit,
    "bayesian_ridge_regression": sklearn.linear_model.BayesianRidge,
    "automatic_relevance_determination_regression": sklearn.linear_model.ARDRegression,
    "stochastic_gradient_descent_regression": sklearn.linear_model.SGDRegressor,
@@ -95,6 +95,7 @@
    "spectral_clustering": sklearn.cluster.SpectralClustering,
    "spectral_biclustering": sklearn.cluster.SpectralBiclustering,
    "spectral_coclustering": sklearn.cluster.SpectralCoclustering,
    "pca_decomposition": sklearn.decomposition.PCA,
}


@@ -182,7 +183,10 @@ def predictor_joint(estimator, num_targets):

    def predict(X):
        X = np.asarray(X).reshape((-1, estimator.n_features_in_))
        y_hat = estimator.predict(X)
        if hasattr(estimator.__class__, 'predict'):
            y_hat = estimator.predict(X)
        else:
            y_hat = estimator.transform(X)

        # Only support single value models for just now.
        if num_targets == 1:
@@ -238,6 +242,8 @@ def calculate_metric(metric_name):
        func = mean_absolute_error
    elif metric_name == "confusion_matrix":
        func = confusion_matrix
    elif metric_name == "variance":
        func = variance
    else:
        raise Exception(f"Unknown metric requested: {metric_name}")

@@ -300,10 +306,15 @@ def classification_metrics(y_true, y_hat):
    }


def cluster_metrics(num_features, inputs_labels):
def clustering_metrics(num_features, inputs_labels):
    inputs = np.asarray(inputs_labels[0]).reshape((-1, num_features))
    labels = np.asarray(inputs_labels[1]).reshape((-1, 1))

    return {
        "silhouette": silhouette_score(inputs, labels),
    }

def decomposition_metrics(pca):
    return {
        "cumulative_explained_variance": sum(pca.explained_variance_ratio_)
    }
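The predictor_joint change above routes estimators that lack a predict method, such as PCA, through transform instead. A minimal standalone check of that dispatch, assuming scikit-learn; the helper name and the estimators used here are illustrative, not part of the PR.

# Illustration of the hasattr-based dispatch added in predictor_joint:
# estimators without predict() fall back to transform().
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

def apply_estimator(estimator, x):
    X = np.asarray(x).reshape((-1, estimator.n_features_in_))
    if hasattr(estimator.__class__, "predict"):
        return estimator.predict(X)
    return estimator.transform(X)

X = np.random.rand(16, 4)
pca = PCA(n_components=2).fit(X)
reg = LinearRegression().fit(X, np.random.rand(16))

print(apply_estimator(pca, X[0]).shape)  # (1, 2): PCA has no predict(), so transform() is used
print(apply_estimator(reg, X[0]).shape)  # (1,): LinearRegression.predict() is used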
3 changes: 3 additions & 0 deletions pgml-extension/src/orm/algorithm.rs
@@ -48,6 +48,7 @@ pub enum Algorithm {
    spectral_bi,
    spectral_co,
    catboost,
    pca,
}

impl std::str::FromStr for Algorithm {
@@ -99,6 +100,7 @@ impl std::str::FromStr for Algorithm {
"spectral_bi" => Ok(Algorithm::spectral_bi),
"spectral_co" => Ok(Algorithm::spectral_co),
"catboost" => Ok(Algorithm::catboost),
"pca" => Ok(Algorithm::pca),
_ => Err(()),
}
}
@@ -151,6 +153,7 @@ impl std::string::ToString for Algorithm {
            Algorithm::spectral_bi => "spectral_bi".to_string(),
            Algorithm::spectral_co => "spectral_co".to_string(),
            Algorithm::catboost => "catboost".to_string(),
            Algorithm::pca => "pca".to_string(),
        }
    }
}