Content-Length: 475813 | pFad | http://github.com/mwydmuch/xCOLUMNs/commit/3271e3e266bb53ef2b4d323af640240a2483dbc6

DA Add support for predictions without the k budget constraint · mwydmuch/xCOLUMNs@3271e3e · GitHub
Skip to content

Commit

Permalink
Add support for predictions without the k budget constraint
Browse files Browse the repository at this point in the history
  • Loading branch information
mwydmuch committed Jan 23, 2024
1 parent 42e1179 commit 3271e3e
Show file tree
Hide file tree
Showing 7 changed files with 273 additions and 427 deletions.
28 changes: 16 additions & 12 deletions xcolumns/block_coordinate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Callable

import numpy as np
from scipy.sparse import csr_matrix, load_npz, save_npz
from scipy.sparse import csr_matrix
from tqdm import tqdm, trange

from .default_types import *
Expand Down Expand Up @@ -78,6 +78,7 @@ def _calculate_utility(

return utility_aggregation_func(bin_utilities)


def _calculate_binary_gains(
bin_utility_func,
p_Etp: np.ndarray,
Expand Down Expand Up @@ -161,11 +162,14 @@ def bc_with_0approx_np_step(
if maximize:
gains = -gains

top_k = np.argpartition(gains, k)[:k]

# Update prediction
y_pred_i[:] = 0.0
y_pred_i[top_k] = 1.0

if k > 0:
top_k = np.argpartition(gains, k)[:k]
y_pred_i[top_k] = 1.0
else:
y_pred_i[gains <= 0] = 1.0

# Update local confusion matrix
if not only_pred:
Expand Down Expand Up @@ -496,13 +500,13 @@ def _calculate_coverage_utility(

cov_utility = 1 - np.mean(Ef)
if alpha < 1:
precision_at_k = 0
if isinstance(y_proba, np.ndarray):
precision_at_k = (np.sum(y_pred * y_proba, axis=0) / n / k).sum()
elif isinstance(y_proba, csr_matrix):
precision_at_k = (
np.asarray(calculate_tp_csr(y_proba, y_pred) / n / k).ravel().sum()
)
precision_at_k = (calculate_tp(y_proba, y_pred) / n / k).sum()
# if isinstance(y_proba, np.ndarray):
# precision_at_k = (np.sum(y_pred * y_proba, axis=0) / n / k).sum()
# elif isinstance(y_proba, csr_matrix):
# precision_at_k = (
# np.asarray(calculate_tp_csr(y_proba, y_pred) / n / k).ravel().sum()
# )
cov_utility = alpha * cov_utility + (1 - alpha) * precision_at_k

return cov_utility
Expand Down Expand Up @@ -567,7 +571,7 @@ def bc_coverage(
if isinstance(y_proba, np.ndarray):
Ef = np.product(1 - y_pred * y_proba, axis=0)
elif isinstance(y_proba, csr_matrix):
Ef = numba_calculate_prod_1_sparse_mat_mul_ones_minus_mat(
Ef = numba_calculate_prod_0_sparse_mat_mul_ones_minus_mat(
*unpack_csr_matrices(y_pred, y_proba), n, m
)

Expand Down
26 changes: 16 additions & 10 deletions xcolumns/frank_wolfe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from scipy.sparse import csr_matrix

from .default_types import *
from .metrics_on_conf_matrix import *
from .utils import *
from .weighted_prediction import predict_weighted_per_instance

Expand All @@ -17,7 +18,7 @@ def _get_grad_as_numpy(t):
return np.zeros(t.shape, dtype=FLOAT_TYPE)


def _utility_func_with_gradient(utility_func, tp, fp, fn, tn):
def utility_func_with_gradient(utility_func, tp, fp, fn, tn):
tp = torch.tensor(tp, requires_grad=True, dtype=TORCH_FLOAT_TYPE)
fp = torch.tensor(fp, requires_grad=True, dtype=TORCH_FLOAT_TYPE)
fn = torch.tensor(fn, requires_grad=True, dtype=TORCH_FLOAT_TYPE)
Expand Down Expand Up @@ -58,8 +59,8 @@ def find_optimal_randomized_classifier_using_frank_wolfe(
init_classifier: Union[str, Tuple[np.ndarray, np.ndarray]] = "topk", # or "random"
search_for_best_alpha: bool = True,
alpha_search_algo: str = "lin",
alpha_eps: float = 0.001,
alpha_lin_search_step: float = 0.001,
alpha_eps: float = 0.00001,
alpha_lin_search_step: float = 0.00001,
skip_tn=False,
verbose: bool = True,
return_meta: bool = False,
Expand All @@ -84,9 +85,10 @@ def find_optimal_randomized_classifier_using_frank_wolfe(
log(f" Initializing initial {init_classifier} classifier ...")
if init_classifier == "topk":
classifiers_a[0] = np.ones(m, dtype=FLOAT_TYPE)
classifiers_b[0] = np.full(m, -0.5, dtype=FLOAT_TYPE)
elif init_classifier == "random":
classifiers_a[0] = np.random.rand(m, dtype=FLOAT_TYPE)
classifiers_b[0] = np.random.rand(m, dtype=FLOAT_TYPE)
classifiers_a[0] = np.random.rand(m)
classifiers_b[0] = np.random.rand(m)
else:
raise ValueError(f"Unsuported type of init_classifier: {init_classifier}")
y_pred_i = predict_weighted_per_instance(y_proba, k, a=classifiers_a[0], b=classifiers_b[0])
Expand All @@ -107,11 +109,11 @@ def find_optimal_randomized_classifier_using_frank_wolfe(

for i in range(1, max_iters):
log(f" Starting iteration {i} ...")
old_utility, Gtp, Gfp, Gfn, Gtn = _utility_func_with_gradient(utility_func, tp, fp, fn, tn)
old_utility, Gtp, Gfp, Gfn, Gtn = utility_func_with_gradient(utility_func, tp, fp, fn, tn)

classifiers_a[i] = Gtp - Gfp - Gfn + Gtn
classifiers_b[i] = Gfp - Gtn
y_pred_i = predict_weighted_per_instance(y_proba, k, a=classifiers_a[i], b=classifiers_b[i])
y_pred_i = predict_weighted_per_instance(y_proba, k, t=0.0, a=classifiers_a[i], b=classifiers_b[i])
tp_i, fp_i, fn_i, tn_i = calculate_confusion_matrix(
y_true, y_pred_i, normalize=True, skip_tn=skip_tn
)
Expand Down Expand Up @@ -144,7 +146,7 @@ def find_optimal_randomized_classifier_using_frank_wolfe(
meta["utilities"].append(new_utility)
meta["iters"] = i

log(f" Iteration {i} finished, utility: (1 - {alpha}) * {old_utility} + {alpha} * {utility_i} -> {new_utility}")
log(f" Iteration {i} finished, alpha: {alpha}, utility: {old_utility} -> {new_utility}")

if alpha < alpha_eps:
print(f" Stopping because alpha is smaller than {alpha_eps}")
Expand Down Expand Up @@ -179,8 +181,12 @@ def _predict_using_randomized_classifier_np(
for i in range(n):
c_i = np.random.choice(classifiers_range, p=classifiers_proba)
gains = y_proba[i] * classifiers_a[c_i] + classifiers_b[c_i]
top_k = np.argpartition(-gains, k)[:k]
result[i, top_k] = 1.0

if k > 0:
top_k = np.argpartition(-gains, k)[:k]
result[i, top_k] = 1.0
else:
result[i, gains > 0] = 1.0

return result

Expand Down
130 changes: 128 additions & 2 deletions xcolumns/metrics_on_conf_matrix.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,132 @@
import numpy as np
from typing import Union
from numbers import Number
from typing import Tuple, Union, Callable

import numpy as np
from scipy.sparse import csr_matrix

from .default_types import *
from .numba_csr_methods import *
from .utils import *


def _calculate_tp_np(y_true: np.ndarray, y_pred: np.ndarray):
return np.sum(y_true * y_pred, axis=0)


# Alternative version, performance is similar
# def _calculate_tp_csr(y_true: csr_matrix, y_pred: csr_matrix):
# return (y_pred.multiply(y_true)).sum(axis=0)


def _calculate_tp_csr(y_true: csr_matrix, y_pred: csr_matrix):
n, m = y_true.shape
return numba_calculate_sum_0_sparse_mat_mul_mat(
*unpack_csr_matrices(y_pred, y_true), n, m
)


def _calculate_fp_np(y_true: np.ndarray, y_pred: np.ndarray):
return np.sum((1 - y_true) * y_pred, axis=0)


def _calculate_fp_csr_slow(y_true: csr_matrix, y_pred: csr_matrix):
n, m = y_true.shape
fp = np.zeros(m, dtype=FLOAT_TYPE)
dense_ones = np.ones(m, dtype=FLOAT_TYPE)
for i in range(n):
fp += y_pred[i].multiply(dense_ones - y_true[i])
return fp


def _calculate_fn_np(y_true: np.ndarray, y_pred: np.ndarray):
return np.sum(y_true * (1 - y_pred), axis=0)


def _calculate_fp_csr(y_true: csr_matrix, y_pred: csr_matrix):
n, m = y_true.shape
return numba_calculate_sum_0_sparse_mat_mul_ones_minus_mat(
*unpack_csr_matrices(y_pred, y_true), n, m
)


def _calculate_fn_csr_slow(y_true: csr_matrix, y_pred: csr_matrix):
n, m = y_true.shape
fn = np.zeros(m, dtype=FLOAT_TYPE)
dense_ones = np.ones(m, dtype=FLOAT_TYPE)
for i in range(n):
fn += y_true[i].multiply(dense_ones - y_pred[i])

return fn


def _calculate_fn_csr(y_true: csr_matrix, y_pred: csr_matrix):
n, m = y_true.shape
return numba_calculate_sum_0_sparse_mat_mul_ones_minus_mat(
*unpack_csr_matrices(y_true, y_pred), n, m
)


def _calculate_tn_np(y_true: np.ndarray, y_pred: np.ndarray):
return np.sum((1 - y_true) * (1 - y_pred), axis=0)


def _calculate_conf_mat_entry(
y_true: Union[np.ndarray, csr_matrix],
y_pred: Union[np.ndarray, csr_matrix],
func_for_np: Callable,
func_for_csr: Callable,
normalize: bool = False):

if y_true.shape != y_pred.shape:
raise ValueError("y_true and y_pred must have the same shape")

if isinstance(y_true, np.ndarray) and isinstance(y_pred, np.ndarray):
val = func_for_np(y_true, y_pred)
elif isinstance(y_true, csr_matrix) and isinstance(y_pred, csr_matrix):
val = func_for_csr(y_true, y_pred)
else:
raise ValueError("y_true and y_pred must be both dense or both sparse")

if normalize:
val /= y_true.shape[0]

return val


def calculate_tp(y_true: Union[np.ndarray, csr_matrix], y_pred: Union[np.ndarray, csr_matrix], normalize: bool = False):
return _calculate_conf_mat_entry(y_true, y_pred, _calculate_tp_np, _calculate_tp_csr, normalize=normalize)


def calculate_fp(y_true: Union[np.ndarray, csr_matrix], y_pred: Union[np.ndarray, csr_matrix], normalize: bool = False):
return _calculate_conf_mat_entry(y_true, y_pred, _calculate_fp_np, _calculate_fp_csr, normalize=normalize)


def calculate_fn(y_true: Union[np.ndarray, csr_matrix], y_pred: Union[np.ndarray, csr_matrix], normalize: bool = False):
return _calculate_conf_mat_entry(y_true, y_pred, _calculate_fn_np, _calculate_fn_csr, normalize=normalize)


def calculate_confusion_matrix(
y_true: Union[np.ndarray, csr_matrix],
y_pred: Union[np.ndarray, csr_matrix],
normalize: bool = False,
skip_tn: bool = False,
):
"""
Calculate confusion matrix for true and prediction.
"""

tp = calculate_tp(y_true, y_pred, normalize=normalize)
fp = calculate_fp(y_true, y_pred, normalize=normalize)
fn = calculate_fn(y_true, y_pred, normalize=normalize)

n, m = y_true.shape

if skip_tn:
tn = np.zeros(m, dtype=FLOAT_TYPE)
else:
tn = np.full(m, 1 if normalize else n, dtype=FLOAT_TYPE) - tp - fp - fn

return tp, fp, fn, tn


def bin_precision_at_k_on_conf_matrix(tp: Union[Number, np.ndarray], fp: Union[Number, np.ndarray], fn: Union[Number, np.ndarray], tn: Union[Number, np.ndarray], k: int):
Expand Down
Loading

0 comments on commit 3271e3e

Please sign in to comment.








ApplySandwichStrip

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier!      Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

Fetched URL: http://github.com/mwydmuch/xCOLUMNs/commit/3271e3e266bb53ef2b4d323af640240a2483dbc6

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy