From 1d26403f109f06769415fa872c6c877deec5d827 Mon Sep 17 00:00:00 2001 From: VariableVasasMT Date: Fri, 7 Jun 2019 08:54:30 +0530 Subject: [PATCH 1/7] adding initial code for StratifiedShuffleSplit --- src/lib/model_selection/_split.ts | 205 ++++++++++++++++++++++++++++-- src/lib/utils/tensors.ts | 17 ++- src/lib/utils/validation.ts | 14 ++ 3 files changed, 226 insertions(+), 10 deletions(-) diff --git a/src/lib/model_selection/_split.ts b/src/lib/model_selection/_split.ts index 1671c3cc..10fa0d6e 100644 --- a/src/lib/model_selection/_split.ts +++ b/src/lib/model_selection/_split.ts @@ -1,10 +1,18 @@ +import { Tensor } from '@tensorflow/tfjs'; import * as _ from 'lodash'; import * as Random from 'random-js'; import { Type1DMatrix, Type2DMatrix } from '../types'; import { ValidationError } from '../utils/Errors'; -import { inferShape } from '../utils/tensors'; -import { validateFitInputs } from '../utils/validation'; +import { convertToTensor, inferShape } from '../utils/tensors'; +import { _num_samples, validateFitInputs } from '../utils/validation'; +const testShapes = (X: Type1DMatrix | Type2DMatrix | Tensor = null, y: Type1DMatrix | Tensor = null) => { + const xShape = inferShape(X); + const yShape = inferShape(y); + if (xShape.length > 0 && yShape.length > 0 && xShape[0] !== yShape[0]) { + throw new ValidationError('X and y must have an identical size'); + } +}; /** * K-Folds cross-validator * @@ -47,7 +55,6 @@ export class KFold { this.k = k; this.shuffle = shuffle; } - /** * * @param X - Training data, where n_samples is the number of samples and n_features is the number of features. @@ -55,12 +62,7 @@ export class KFold { * @returns {any[]} */ public split(X: Type1DMatrix = null, y: Type1DMatrix = null): any[] { - const xShape = inferShape(X); - const yShape = inferShape(y); - if (xShape.length > 0 && yShape.length > 0 && xShape[0] !== yShape[0]) { - throw new ValidationError('X and y must have an identical size'); - } - + testShapes(X, y); if (this.k > X.length || this.k > y.length) { throw new ValidationError( `Cannot have number of splits k=${this.k} greater than the number of samples: ${_.size(X)}`, @@ -208,3 +210,188 @@ export function train_test_split( yTrain: clean(yTrain), }; } + +const rangeValidationError = (type, size, n_samples) => `${type}=${size} should be either +positive and smaller than number of samples ${n_samples} or a float in (0, 1) range`; + +const testRangeValidationError = (test_size, n_samples) => rangeValidationError('test_size', test_size, n_samples); + +const trainRangeValidationError = (test_size, n_samples) => rangeValidationError('test_size', test_size, n_samples); + +function uniq(array: Type1DMatrix = null): Type2DMatrix { + const uniqArray = _.uniq(_.flatten(array)).sort(); + let min = Number.MAX_VALUE; + let max = Number.MIN_VALUE; + + const valueCount = {}; + const uniqIndexMap = uniqArray.reduce((acc, ele, i) => { + if (min > ele) { + min = ele; + } + + if (max < ele) { + max = ele; + } + + return { + ...acc, + [ele]: i, + }; + }, {}); + + const indexMap = array.map((ele) => { + if (valueCount[ele]) { + valueCount[ele] += 1; + } else { + valueCount[ele] = 1; + } + return uniqIndexMap[ele]; + }); + + return [uniqArray, indexMap]; +} + +function binCount(array: number[]) { + const min: number = _.min(array); + const max: number = _.max(array); + + const arrToObj = array.reduce( + (acc, ele, i) => ({ + ...acc, + [ele]: i, + }), + {}, + ); + const retArray = Array(max - min).fill(0); + for (let i = 0; i < retArray.length; i++) { + if (arrToObj[i + min]) { + retArray[i] = arrToObj[i + min]; + } + } + + return retArray; +} +export class StratifiedShuffleSplit { + // private n_splits: number; + private test_size: number; + private train_size: number; + // private random_state: number; + private default_test_size: number = 0.1; + constructor( + // n_splits: number = 10, + test_size: number = null, + train_size: number = null, + // random_state: number = null + ) { + // this.n_splits = n_splits; + this.test_size = test_size; + this.train_size = train_size; + // this.random_state = random_state; + } + + split = (X: Type1DMatrix | Type2DMatrix = null, y: Type1DMatrix = null): any[] => { + const XTensor = convertToTensor(X); + // const yTensor = convertToTensor(y); + const n_samples = _num_samples(XTensor); + + const [n_test, n_train] = validate_shuffle_split( + n_samples, + this.test_size, + this.train_size, + this.default_test_size, + ); + + const [classes, y_indices] = uniq(y); + const n_classes = classes.length; + const class_counts = binCount(y_indices); + + if (_.min(class_counts) < 2) { + throw new Error( + `The least populated class in y=${y} has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.`, + ); + } + + if (n_train < n_classes) { + throw new Error(`The train_size = ${n_train} should be greater or equal to the number of classes = ${n_classes}`); + } + + if (n_test < n_classes) { + throw new Error(`The test_size = ${n_test} should be greater or equal to the number of classes = ${n_classes}`); + } + + return []; + }; + + private; +} + +function validate_shuffle_split( + n_samples: number, + test_size: number, + train_size: number, + default_test_size: number, +): number[] { + let n_train: number; + let n_test: number; + + if (!test_size && !train_size) { + test_size = default_test_size; + } + + if (test_size) { + if (Number.isInteger(test_size)) { + if (test_size >= n_samples || test_size <= 0) { + throw new ValidationError(testRangeValidationError(test_size, n_samples)); + } + + n_test = test_size; + } else { + if (test_size <= 0 || test_size >= 1) { + throw new ValidationError(testRangeValidationError(test_size, n_samples)); + } + + n_test = Math.ceil(test_size * n_samples); + } + } + + if (train_size) { + if (Number.isInteger(train_size)) { + if (train_size >= n_samples || train_size <= 0) { + throw new ValidationError(trainRangeValidationError(train_size, n_samples)); + } + + n_train = test_size; + } else { + if (test_size <= 0 || test_size >= 1) { + throw new ValidationError(trainRangeValidationError(train_size, n_samples)); + } + + n_train = Math.ceil(train_size * n_samples); + } + } + + if (!train_size) { + n_train = n_samples - n_test; + } else if (!test_size) { + n_test = n_samples - n_train; + } + + const total = n_train + n_test; + if (total > n_samples) { + throw new ValidationError( + `The sum of train_size and test_size = ${total}, ` + + 'should be smaller than the number of ' + + `samples ${n_samples}. Reduce test_size and/or ` + + 'train_size.', + ); + } + + if (n_train === 0) { + throw new ValidationError( + `With n_samples=${n_samples}, test_size=${test_size} and train_size=${train_size}, the ` + + 'resulting train set will be empty. Adjust any of the ' + + 'aforementioned parameters.', + ); + } + return [Math.round(n_test), Math.round(n_train)]; +} diff --git a/src/lib/utils/tensors.ts b/src/lib/utils/tensors.ts index 83eed70e..5398f79d 100644 --- a/src/lib/utils/tensors.ts +++ b/src/lib/utils/tensors.ts @@ -16,14 +16,29 @@ import { validateMatrix1D, validateMatrix2D } from './validation'; * @param X * @ignore */ -export function inferShape(X: TypeMatrix): number[] { +export function inferShape(X: TypeMatrix | tf.Tensor): number[] { try { + if (X instanceof tf.Tensor) { + return X.shape; + } return tf.tensor(X).shape; } catch (e) { throw new ValidationInconsistentShape(e); } } +/** + * + * @param array 1D and 2D + */ +export function convertToTensor(array: TypeMatrix): tf.Tensor { + try { + return tf.tensor(array); + } catch (e) { + throw new ValidationInconsistentShape(e); + } +} + /** * Reshapes any size of array into a new shape. * diff --git a/src/lib/utils/validation.ts b/src/lib/utils/validation.ts index e2992df7..a4e4d24a 100644 --- a/src/lib/utils/validation.ts +++ b/src/lib/utils/validation.ts @@ -197,3 +197,17 @@ export function validateShapesEqual( return [yTrueTensor, yPredTensor]; } + +/** + * get number of samples from an array + * @param array - type matrix or tensor + */ +export function _num_samples(array: TypeMatrix | tf.Tensor = null): number { + if (!array) { + throw new ValidationError(`array cant be null`); + } + if (array instanceof tf.Tensor) { + return array.shape[0]; + } + return array.length; +} From f4dded7fb73b681a5d2460f8bd19574631a71c9c Mon Sep 17 00:00:00 2001 From: VariableVasasMT Date: Sun, 30 Jun 2019 12:10:58 +0530 Subject: [PATCH 2/7] added following * added arraySplit * moved forward in stratifiedShuffleSplit --- src/lib/model_selection/_split.ts | 114 ++++++++------------------- src/lib/utils/tensors.ts | 123 +++++++++++++++++++++++++++++- src/lib/utils/validation.ts | 2 +- 3 files changed, 151 insertions(+), 88 deletions(-) diff --git a/src/lib/model_selection/_split.ts b/src/lib/model_selection/_split.ts index 10fa0d6e..b917c3e0 100644 --- a/src/lib/model_selection/_split.ts +++ b/src/lib/model_selection/_split.ts @@ -1,12 +1,12 @@ -import { Tensor } from '@tensorflow/tfjs'; +import * as tf from '@tensorflow/tfjs'; import * as _ from 'lodash'; import * as Random from 'random-js'; import { Type1DMatrix, Type2DMatrix } from '../types'; import { ValidationError } from '../utils/Errors'; -import { convertToTensor, inferShape } from '../utils/tensors'; -import { _num_samples, validateFitInputs } from '../utils/validation'; +import { arraySplit, convertToTensor, countBin, inferShape, invidualize } from '../utils/tensors'; +import { numSamples, validateFitInputs } from '../utils/validation'; -const testShapes = (X: Type1DMatrix | Type2DMatrix | Tensor = null, y: Type1DMatrix | Tensor = null) => { +const testShapes = (X: Type1DMatrix | Type2DMatrix, y: Type1DMatrix) => { const xShape = inferShape(X); const yShape = inferShape(y); if (xShape.length > 0 && yShape.length > 0 && xShape[0] !== yShape[0]) { @@ -218,114 +218,62 @@ const testRangeValidationError = (test_size, n_samples) => rangeValidationError( const trainRangeValidationError = (test_size, n_samples) => rangeValidationError('test_size', test_size, n_samples); -function uniq(array: Type1DMatrix = null): Type2DMatrix { - const uniqArray = _.uniq(_.flatten(array)).sort(); - let min = Number.MAX_VALUE; - let max = Number.MIN_VALUE; - - const valueCount = {}; - const uniqIndexMap = uniqArray.reduce((acc, ele, i) => { - if (min > ele) { - min = ele; - } - - if (max < ele) { - max = ele; - } - - return { - ...acc, - [ele]: i, - }; - }, {}); - - const indexMap = array.map((ele) => { - if (valueCount[ele]) { - valueCount[ele] += 1; - } else { - valueCount[ele] = 1; - } - return uniqIndexMap[ele]; - }); - - return [uniqArray, indexMap]; -} - -function binCount(array: number[]) { - const min: number = _.min(array); - const max: number = _.max(array); - - const arrToObj = array.reduce( - (acc, ele, i) => ({ - ...acc, - [ele]: i, - }), - {}, - ); - const retArray = Array(max - min).fill(0); - for (let i = 0; i < retArray.length; i++) { - if (arrToObj[i + min]) { - retArray[i] = arrToObj[i + min]; - } - } - - return retArray; -} export class StratifiedShuffleSplit { // private n_splits: number; - private test_size: number; - private train_size: number; - // private random_state: number; - private default_test_size: number = 0.1; + private testSize: number; + private trainSize: number; + // private seed: number; + private defaultTestSize: number = 0.1; constructor( // n_splits: number = 10, - test_size: number = null, - train_size: number = null, - // random_state: number = null + testSize: number = null, + trainSize: number = null, + // seed: number = null, ) { // this.n_splits = n_splits; - this.test_size = test_size; - this.train_size = train_size; - // this.random_state = random_state; + this.testSize = testSize; + this.trainSize = trainSize; + // this.seed = seed; } split = (X: Type1DMatrix | Type2DMatrix = null, y: Type1DMatrix = null): any[] => { const XTensor = convertToTensor(X); // const yTensor = convertToTensor(y); - const n_samples = _num_samples(XTensor); + const nSamples = numSamples(XTensor); - const [n_test, n_train] = validate_shuffle_split( - n_samples, - this.test_size, - this.train_size, - this.default_test_size, - ); + const [nTest, nTrain] = validateShuffleSplit(nSamples, this.testSize, this.trainSize, this.defaultTestSize); - const [classes, y_indices] = uniq(y); - const n_classes = classes.length; - const class_counts = binCount(y_indices); + const [classes, yIndices] = invidualize(y); + const nClasses = classes.length; + const classCounts = countBin(yIndices); - if (_.min(class_counts) < 2) { + if (_.min(classCounts) < 2) { throw new Error( `The least populated class in y=${y} has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.`, ); } - if (n_train < n_classes) { - throw new Error(`The train_size = ${n_train} should be greater or equal to the number of classes = ${n_classes}`); + if (nTrain < nClasses) { + throw new Error(`The train_size = ${nTrain} should be greater or equal to the number of classes = ${nClasses}`); } - if (n_test < n_classes) { - throw new Error(`The test_size = ${n_test} should be greater or equal to the number of classes = ${n_classes}`); + if (nTest < nClasses) { + throw new Error(`The test_size = ${nTest} should be greater or equal to the number of classes = ${nClasses}`); } + const cumsumClassCounts: tf.Tensor1D = tf.cumsum(classCounts); + // const classIndices = arraySplit( + // yIndices.sort(), + // cumsumClassCounts.slice(0, cumsumClassCounts.shape[0] - 1).arraySync(), + // ); + arraySplit(yIndices.sort(), cumsumClassCounts.slice(0, cumsumClassCounts.shape[0] - 1).arraySync()); return []; }; private; } -function validate_shuffle_split( +function validateShuffleSplit( n_samples: number, test_size: number, train_size: number, diff --git a/src/lib/utils/tensors.ts b/src/lib/utils/tensors.ts index 5398f79d..442d650c 100644 --- a/src/lib/utils/tensors.ts +++ b/src/lib/utils/tensors.ts @@ -1,4 +1,5 @@ import * as tf from '@tensorflow/tfjs'; +import { isInt } from '@tensorflow/tfjs-core/dist/util'; import * as _ from 'lodash'; import { Type1DMatrix, Type2DMatrix, TypeMatrix } from '../types'; import { ValidationError, ValidationInconsistentShape } from './Errors'; @@ -16,11 +17,8 @@ import { validateMatrix1D, validateMatrix2D } from './validation'; * @param X * @ignore */ -export function inferShape(X: TypeMatrix | tf.Tensor): number[] { +export function inferShape(X: TypeMatrix): number[] { try { - if (X instanceof tf.Tensor) { - return X.shape; - } return tf.tensor(X).shape; } catch (e) { throw new ValidationInconsistentShape(e); @@ -112,3 +110,120 @@ export const ensure2DMatrix = (X: Type2DMatrix | Type1DMatrix): const matrix1D = validateMatrix1D(X); return _.map(matrix1D, (o) => [o]); }; + +/** + * + * @param array - target matrix + * @ignore + */ +export function invidualize(array: Type1DMatrix = null): Type2DMatrix { + const uniqArray = _.uniq(_.flatten(array)).sort(); + let min = Number.MAX_VALUE; + let max = Number.MIN_VALUE; + + const valueCount = {}; + const uniqIndexMap = uniqArray.reduce((acc, ele, i) => { + if (min > ele) { + min = ele; + } + + if (max < ele) { + max = ele; + } + + return { + ...acc, + [ele]: i, + }; + }, {}); + + const indexMap = array.map((ele) => { + if (valueCount[ele]) { + valueCount[ele] += 1; + } else { + valueCount[ele] = 1; + } + return uniqIndexMap[ele]; + }); + + return [uniqArray, indexMap]; +} + +/** + * + * Count number of occurrences of each value in array of non-negative ints. + * countBin([0, 1, 1, 3, 2, 1, 7]) = [1, 3, 1, 1, 0, 0, 0, 1] + * countBin([0, 1, 1, 2, 2, 2], [0.3, 0.5, 0.2, 0.7, 1., -0.6]) = [ 0.3, 0.7, 1.1] + * countBin([7]) = [0, 0, 0, 0, 0, 0, 0, 1] + * @param array + */ +export function countBin(array: Type1DMatrix, weights?: Type1DMatrix): Type1DMatrix { + if (weights && array.length !== weights.length) { + throw Error(`weights=${weights} and targetArray=${array} should be of same length.`); + } + const min: number = _.min(array); + const max: number = _.max(array); + + const retArray = Array(max - min + 1).fill(0); + if (!weights) { + weights = Array(array.length).fill(1); + } + + const arrToObj = array.reduce((acc, ele, i) => { + if (!isInt(ele)) { + throw Error(`Only integer values are acceptable in the values of ${array}`); + } + return { + ...acc, + [ele]: (acc[ele] || 0) + weights[i], + }; + }, {}); + + for (let i = 0; i < retArray.length; i++) { + if (arrToObj[i + min]) { + retArray[i] = arrToObj[i + min]; + } + } + + return [...Array(min).fill(0), ...retArray]; +} + +/** + * Split an array into multiple sub-arrays. + * @param array + * @param indices_or_sections + */ + +export function arraySplit( + array: Type1DMatrix, + indices_or_sections: number | Type1DMatrix, +): Type2DMatrix { + const nTotal: number = array.length; + let nSections: number = null; + let divPoints: tf.Tensor1D = null; + if (indices_or_sections instanceof Array) { + nSections = indices_or_sections.length + 1; + divPoints = tf.tensor([0, ...indices_or_sections, nTotal]); + } else { + if (indices_or_sections <= 0) { + throw Error('The number of sections can not be less than one'); + } + nSections = Math.floor(indices_or_sections); + const nEachSection = Math.floor(nTotal / nSections); + const extras = nTotal % nSections; + divPoints = tf.cumsum([ + 0, + ...Array(extras).fill(nEachSection + 1), + ...Array(nSections - extras).fill(nEachSection), + ]); + } + + const subArrays: Type2DMatrix = []; + for (let i = 0; i < nSections; i++) { + const st = divPoints.get(i); + const end = divPoints.get(i + 1); + subArrays.push(array.slice(st, end)); + } + + return subArrays; +} diff --git a/src/lib/utils/validation.ts b/src/lib/utils/validation.ts index a4e4d24a..22bc931c 100644 --- a/src/lib/utils/validation.ts +++ b/src/lib/utils/validation.ts @@ -202,7 +202,7 @@ export function validateShapesEqual( * get number of samples from an array * @param array - type matrix or tensor */ -export function _num_samples(array: TypeMatrix | tf.Tensor = null): number { +export function numSamples(array: TypeMatrix | tf.Tensor = null): number { if (!array) { throw new ValidationError(`array cant be null`); } From 6dd151981d4ceefaefcc259019cd5b609627e493 Mon Sep 17 00:00:00 2001 From: VariableVasasMT Date: Fri, 2 Aug 2019 23:00:33 +0530 Subject: [PATCH 3/7] added following: * added random class * adding StratifiedShuffleSplit * adding approximateMode, cumSum, countBin in tensors.ts --- package.json | 1 + src/lib/model_selection/_split.ts | 61 +++++++++++--------- src/lib/utils/random.ts | 94 +++++++++++++++++++++++++++++++ src/lib/utils/tensors.ts | 55 +++++++++++++++++- yarn.lock | 17 ++++++ 5 files changed, 199 insertions(+), 29 deletions(-) create mode 100644 src/lib/utils/random.ts diff --git a/package.json b/package.json index 23f822e6..9af20138 100644 --- a/package.json +++ b/package.json @@ -73,6 +73,7 @@ "lodash": "4.17.11", "numeric": "1.2.6", "random-js": "1.0.8", + "seedrandom": "^3.0.1", "stopword": "0.1.10" }, "devDependencies": { diff --git a/src/lib/model_selection/_split.ts b/src/lib/model_selection/_split.ts index b917c3e0..de5da5da 100644 --- a/src/lib/model_selection/_split.ts +++ b/src/lib/model_selection/_split.ts @@ -1,9 +1,9 @@ import * as tf from '@tensorflow/tfjs'; import * as _ from 'lodash'; -import * as Random from 'random-js'; import { Type1DMatrix, Type2DMatrix } from '../types'; import { ValidationError } from '../utils/Errors'; -import { arraySplit, convertToTensor, countBin, inferShape, invidualize } from '../utils/tensors'; +import RandomState, { RandomStateObj } from '../utils/random'; +import { approximateMode, arraySplit, convertToTensor, countBin, inferShape, invidualize } from '../utils/tensors'; import { numSamples, validateFitInputs } from '../utils/validation'; const testShapes = (X: Type1DMatrix | Type2DMatrix, y: Type1DMatrix) => { @@ -168,8 +168,7 @@ export function train_test_split( throw new ValidationError('Sum of test_size and train_size does not equal 1'); } // Initiate Random engine - const randomEngine = Random.engines.mt19937(); - randomEngine.seed(random_state); + const randomEngine: RandomStateObj = new RandomState(random_state); // split const xTrain = []; @@ -179,7 +178,7 @@ export function train_test_split( // Getting X_train and y_train while (xTrain.length < trainSizeLength && yTrain.length < trainSizeLength) { - const index = Random.integer(0, X.length - 1)(randomEngine); + const index = randomEngine.rangedInt(0, X.length - 1); // X_train xTrain.push(_X[index]); @@ -191,7 +190,7 @@ export function train_test_split( } while (xTest.length < testSizeLength) { - const index = Random.integer(0, _X.length - 1)(randomEngine); + const index = randomEngine.rangedInt(0, _X.length - 1); // X test xTest.push(_X[index]); _X.splice(index, 1); @@ -219,24 +218,19 @@ const testRangeValidationError = (test_size, n_samples) => rangeValidationError( const trainRangeValidationError = (test_size, n_samples) => rangeValidationError('test_size', test_size, n_samples); export class StratifiedShuffleSplit { - // private n_splits: number; + private n_splits: number; private testSize: number; private trainSize: number; - // private seed: number; + private rng: RandomState; private defaultTestSize: number = 0.1; - constructor( - // n_splits: number = 10, - testSize: number = null, - trainSize: number = null, - // seed: number = null, - ) { - // this.n_splits = n_splits; + constructor(n_splits: number = 10, testSize: number = null, trainSize: number = null, seed: number = null) { + this.n_splits = n_splits; this.testSize = testSize; this.trainSize = trainSize; - // this.seed = seed; + this.rng = new RandomState(seed); } - split = (X: Type1DMatrix | Type2DMatrix = null, y: Type1DMatrix = null): any[] => { + split = (X: Type1DMatrix | Type2DMatrix = null, y: Type1DMatrix = null): Type1DMatrix => { const XTensor = convertToTensor(X); // const yTensor = convertToTensor(y); const nSamples = numSamples(XTensor); @@ -244,8 +238,8 @@ export class StratifiedShuffleSplit { const [nTest, nTrain] = validateShuffleSplit(nSamples, this.testSize, this.trainSize, this.defaultTestSize); const [classes, yIndices] = invidualize(y); - const nClasses = classes.length; - const classCounts = countBin(yIndices); + const nClasses: number = classes.length; + const classCounts: Type1DMatrix = countBin(yIndices); if (_.min(classCounts) < 2) { throw new Error( @@ -262,15 +256,28 @@ export class StratifiedShuffleSplit { } const cumsumClassCounts: tf.Tensor1D = tf.cumsum(classCounts); - // const classIndices = arraySplit( - // yIndices.sort(), - // cumsumClassCounts.slice(0, cumsumClassCounts.shape[0] - 1).arraySync(), - // ); - arraySplit(yIndices.sort(), cumsumClassCounts.slice(0, cumsumClassCounts.shape[0] - 1).arraySync()); - return []; - }; + const classIndices = arraySplit( + yIndices.sort(), + cumsumClassCounts.slice(0, cumsumClassCounts.shape[0] - 1).arraySync(), + ); + + const test = []; + const train = []; + for (let i = 0; i <= this.n_splits; i++) { + const n_i: Type1DMatrix = approximateMode(classCounts, nTrain, this.rng); + const classCountsRemaining: Type1DMatrix = classCounts.map((item, index) => n_i[index] - item); + const t_i: Type1DMatrix = approximateMode(classCountsRemaining, nTest, this.rng); + + for (let j = 0; j <= nClasses; j++) { + const permutation: Type1DMatrix = this.rng.permutation(classCounts[i]); + const permIndicesClassI = permutation.map((val) => classIndices[j][val]); + train.concat(permIndicesClassI.slice(0, n_i[i])); + test.concat(permIndicesClassI.slice(n_i[i], n_i[i] + t_i[i])); + } + } - private; + return [this.rng.shuffle(train), this.rng.shuffle(test)]; + }; } function validateShuffleSplit( diff --git a/src/lib/utils/random.ts b/src/lib/utils/random.ts new file mode 100644 index 00000000..85a70435 --- /dev/null +++ b/src/lib/utils/random.ts @@ -0,0 +1,94 @@ +import * as _ from 'lodash'; +import seedrandom from 'seedrandom'; +import { isNumber } from 'util'; +import { Type1DMatrix } from '../types'; + +export default class RandomState { + private random; + constructor(seed: string | number = Math.random()) { + this.random = seedrandom(seed.toString()); + } + + next(): number { + return this.random(); + } + + rangedInt(min, max): number { + return min + Math.floor((max - min) * this.next()); + } + /** + * shuffles 1D array in place + * taken from https://github.com/TimothyGu/knuth-shuffle-seeded/blob/gh-pages/index.js + * var random = new RandomState(4); + * random.shuffle([1, 2, 3, 4, 5]) + * random.shuffle([1, 2, 3, 4, 5]) + * output-1: [5, 3, 4, 1, 2] + * output-2: [3, 4, 2, 5, 1] + * @param array type: any[] + * @returns shuffled array + */ + shuffle(array: Type1DMatrix): Type1DMatrix { + let currentIndex = array.length; + + // While there remain elements to shuffle... + while (0 !== currentIndex) { + // Pick a remaining element... + const randomIndex = Math.floor(this.next() * currentIndex--); + + // And swap it with the current element. + const temporaryValue = array[currentIndex]; + array[currentIndex] = array[randomIndex]; + array[randomIndex] = temporaryValue; + } + + return array; + } + + pickRandomIndex(length: number, probability: Type1DMatrix): number { + const theFate: number = this.next(); + const indexToPick = Math.floor(theFate * length); + if (probability && probability[indexToPick] > theFate) { + return this.pickRandomIndex(length, probability); + } + return indexToPick; + } + + choice( + choiceArray: number | Type1DMatrix, + outputSize: number, + probability?: Type1DMatrix, + ): Type1DMatrix { + if (isNumber(choiceArray)) { + choiceArray = _.range(choiceArray); + } + + const lenChoiceArray: number = choiceArray.length; + + const outPutArray: Type1DMatrix = new Array(outputSize); + for (let i = 0; i < outputSize; i++) { + const index = this.pickRandomIndex(lenChoiceArray, probability); + outPutArray.push(choiceArray[index]); + } + return outPutArray; + } + + /** + * generates an array with number and permutates it. + * const random = new RandomState(4); + * random.shuffle(5) + * random.shuffle(5) + * output-1: [4, 2, 3, 0, 1] + * output-2: [2, 3, 1, 4, 0] + * @param num type: number + * @returns shuffled array + */ + permutation(num: number): Type1DMatrix { + return this.shuffle( + Array(num) + .fill(0) + .map(Number.call, Number), + ); + } +} + +export type RandomStateObj = RandomState; diff --git a/src/lib/utils/tensors.ts b/src/lib/utils/tensors.ts index 442d650c..1aabc0fe 100644 --- a/src/lib/utils/tensors.ts +++ b/src/lib/utils/tensors.ts @@ -1,8 +1,8 @@ import * as tf from '@tensorflow/tfjs'; -import { isInt } from '@tensorflow/tfjs-core/dist/util'; import * as _ from 'lodash'; import { Type1DMatrix, Type2DMatrix, TypeMatrix } from '../types'; import { ValidationError, ValidationInconsistentShape } from './Errors'; +import { RandomStateObj } from './random'; import { validateMatrix1D, validateMatrix2D } from './validation'; /** @@ -170,7 +170,7 @@ export function countBin(array: Type1DMatrix, weights?: Type1DMatrix { - if (!isInt(ele)) { + if (Math.floor(ele) !== ele) { throw Error(`Only integer values are acceptable in the values of ${array}`); } return { @@ -227,3 +227,54 @@ export function arraySplit( return subArrays; } + +export function approximateMode( + classCounts: Type1DMatrix, + nDraws: number, + rng: RandomStateObj, +): Type1DMatrix { + // this computes a bad approximation to the mode of the + // multivariate hypergeometric given by class_counts and n_draws + const countSum = _.sum(classCounts); + let flooredSum = 0; + // floored means we don't overshoot n_samples, but probably undershoot + const { floored, remainder } = classCounts.reduce( + (acc, val) => { + const value = nDraws * val / countSum; + const flooredVal = Math.floor(value); + const diff = value - flooredVal; + acc.continuous.push(value); + acc.floored.push(flooredVal); + acc.remainder.push(diff); + flooredSum += flooredVal; + return acc; + }, + { floored: [], continuous: [], remainder: [] }, + ); + + let needToAdd = Math.floor(nDraws - flooredSum); + // we add samples according to how much "left over" probability + // they had, until we arrive at n_samples + // need_to_add = int(n_draws - floored.sum()) + if (needToAdd > 0) { + const values = _.sortedUniq(remainder); + for (let i = 0; i < values.length; i++) { + const val = values[i]; + let inds = remainder.reduce((acc, rval, j) => { + if (rval === val) { + acc.push(j); + } + return acc; + }, []); + const addNow = Math.min(inds.length, needToAdd); + inds = rng.choice(inds, addNow); + floored[inds] += 1; + needToAdd -= addNow; + if (needToAdd === 0) { + break; + } + } + } + + return floored; +} diff --git a/yarn.lock b/yarn.lock index 28eb68a0..842bdcdc 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6472,6 +6472,13 @@ kleur@^3.0.2: resolved "https://registry.yarnpkg.com/kleur/-/kleur-3.0.2.tgz#83c7ec858a41098b613d5998a7b653962b504f68" integrity sha512-3h7B2WRT5LNXOtQiAaWonilegHcPSf9nLVXlSTci8lu1dZUuui61+EsPEZqSVxY7rXYmB2DVKMQILxaO5WL61Q== +knuth-shuffle-seeded@^1.0.6: + version "1.0.6" + resolved "https://registry.yarnpkg.com/knuth-shuffle-seeded/-/knuth-shuffle-seeded-1.0.6.tgz#01f1b65733aa7540ee08d8b0174164d22081e4e1" + integrity sha1-AfG2VzOqdUDuCNiwF0Fk0iCB5OE= + dependencies: + seed-random "~2.2.0" + koa-compose@^3.0.0, koa-compose@^3.2.1: version "3.2.1" resolved "https://registry.yarnpkg.com/koa-compose/-/koa-compose-3.2.1.tgz#a85ccb40b7d986d8e5a345b3a1ace8eabcf54de7" @@ -9012,10 +9019,20 @@ section-matter@^1.0.0: extend-shallow "^2.0.1" kind-of "^6.0.0" +seed-random@~2.2.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/seed-random/-/seed-random-2.2.0.tgz#2a9b19e250a817099231a5b99a4daf80b7fbed54" + integrity sha1-KpsZ4lCoFwmSMaW5mk2vgLf77VQ= + seedrandom@2.4.3: version "2.4.3" resolved "https://registry.yarnpkg.com/seedrandom/-/seedrandom-2.4.3.tgz#2438504dad33917314bff18ac4d794f16d6aaecc" +seedrandom@^3.0.1: + version "3.0.1" + resolved "https://registry.yarnpkg.com/seedrandom/-/seedrandom-3.0.1.tgz#eb3dde015bcf55df05a233514e5df44ef9dce083" + integrity sha512-1/02Y/rUeU1CJBAGLebiC5Lbo5FnB22gQbIFFYTLkwvp1xdABZJH1sn4ZT1MzXmPpzv+Rf/Lu2NcsLJiK4rcDg== + seedrandom@~2.4.3: version "2.4.4" resolved "https://registry.yarnpkg.com/seedrandom/-/seedrandom-2.4.4.tgz#b25ea98632c73e45f58b77cfaa931678df01f9ba" From 8ce44fde804e8c5408f3495cab8d8d1b6b382fee Mon Sep 17 00:00:00 2001 From: VariableVasasMT Date: Sun, 8 Sep 2019 11:54:57 +0530 Subject: [PATCH 4/7] new random number implemented --- src/lib/linear_model/stochastic_gradient.ts | 20 ++++----- src/lib/model_selection/_split.ts | 20 +++++---- src/lib/utils/random.ts | 39 +++++++++++++++--- src/lib/utils/tensors.ts | 12 ------ src/lib/utils/validation.ts | 4 +- test/model_selection/_split.test.ts | 45 +++++++++++++-------- 6 files changed, 84 insertions(+), 56 deletions(-) diff --git a/src/lib/linear_model/stochastic_gradient.ts b/src/lib/linear_model/stochastic_gradient.ts index 8e60a4e6..3015298d 100644 --- a/src/lib/linear_model/stochastic_gradient.ts +++ b/src/lib/linear_model/stochastic_gradient.ts @@ -1,7 +1,8 @@ import * as tf from '@tensorflow/tfjs'; import { cloneDeep, range } from 'lodash'; -import * as Random from 'random-js'; +// import * as Random from 'random-js'; import { IMlModel, Type1DMatrix, Type2DMatrix } from '../types'; +import RandomState, { RandomStateObj } from '../utils/random'; import { validateFitInputs, validateMatrix2D } from '../utils/validation'; export enum TypeLoss { @@ -29,7 +30,7 @@ export class BaseSGD implements IMlModel { protected regFactor: TypeRegFactor; private clone: boolean = true; private weights: tf.Tensor = null; - private randomEngine: Random.MT19937; // Random engine used to + private randomEngine: RandomStateObj; // Random engine used to private randomState: number; /** * @param preprocess - preprocess methodology can be either minmax or null. Default is minmax. @@ -42,7 +43,7 @@ export class BaseSGD implements IMlModel { learning_rate = 0.0001, epochs = 10000, clone = true, - random_state = null, + random_state, loss = TypeLoss.L2, reg_factor = null, }: { @@ -56,7 +57,7 @@ export class BaseSGD implements IMlModel { learning_rate: 0.0001, epochs: 10000, clone: true, - random_state: null, + random_state: undefined, loss: TypeLoss.L2, reg_factor: null, }, @@ -87,11 +88,7 @@ export class BaseSGD implements IMlModel { } // Random Engine - if (Number.isInteger(this.randomState)) { - this.randomEngine = Random.engines.mt19937().seed(this.randomState); - } else { - this.randomEngine = Random.engines.mt19937().autoSeed(); - } + this.randomEngine = new RandomState(this.randomState); } /** @@ -193,9 +190,8 @@ export class BaseSGD implements IMlModel { */ private initializeWeights(nFeatures: number): void { const limit = 1 / Math.sqrt(nFeatures); - const distribution = Random.real(-limit, limit); - const getRand = () => distribution(this.randomEngine); - this.weights = tf.tensor1d(range(0, nFeatures).map(() => getRand())); + const distribution = this.randomEngine.real(-limit, limit); + this.weights = tf.tensor1d(range(0, nFeatures).map(distribution)); } /** diff --git a/src/lib/model_selection/_split.ts b/src/lib/model_selection/_split.ts index de5da5da..c73ede5b 100644 --- a/src/lib/model_selection/_split.ts +++ b/src/lib/model_selection/_split.ts @@ -3,12 +3,12 @@ import * as _ from 'lodash'; import { Type1DMatrix, Type2DMatrix } from '../types'; import { ValidationError } from '../utils/Errors'; import RandomState, { RandomStateObj } from '../utils/random'; -import { approximateMode, arraySplit, convertToTensor, countBin, inferShape, invidualize } from '../utils/tensors'; +import { approximateMode, arraySplit, countBin, inferShape, invidualize } from '../utils/tensors'; import { numSamples, validateFitInputs } from '../utils/validation'; const testShapes = (X: Type1DMatrix | Type2DMatrix, y: Type1DMatrix) => { - const xShape = inferShape(X); - const yShape = inferShape(y); + const xShape: Type1DMatrix = inferShape(X); + const yShape: Type1DMatrix = inferShape(y); if (xShape.length > 0 && yShape.length > 0 && xShape[0] !== yShape[0]) { throw new ValidationError('X and y must have an identical size'); } @@ -217,11 +217,14 @@ const testRangeValidationError = (test_size, n_samples) => rangeValidationError( const trainRangeValidationError = (test_size, n_samples) => rangeValidationError('test_size', test_size, n_samples); +/** + * StratifiedShuffleSplit + */ export class StratifiedShuffleSplit { private n_splits: number; private testSize: number; private trainSize: number; - private rng: RandomState; + private rng: RandomStateObj; private defaultTestSize: number = 0.1; constructor(n_splits: number = 10, testSize: number = null, trainSize: number = null, seed: number = null) { this.n_splits = n_splits; @@ -230,8 +233,8 @@ export class StratifiedShuffleSplit { this.rng = new RandomState(seed); } - split = (X: Type1DMatrix | Type2DMatrix = null, y: Type1DMatrix = null): Type1DMatrix => { - const XTensor = convertToTensor(X); + split = (X: Type1DMatrix | Type2DMatrix = null, y: Type1DMatrix = null): Type2DMatrix => { + const XTensor = tf.tensor(X); // const yTensor = convertToTensor(y); const nSamples = numSamples(XTensor); @@ -268,9 +271,10 @@ export class StratifiedShuffleSplit { const classCountsRemaining: Type1DMatrix = classCounts.map((item, index) => n_i[index] - item); const t_i: Type1DMatrix = approximateMode(classCountsRemaining, nTest, this.rng); - for (let j = 0; j <= nClasses; j++) { + for (let j = 0; j < nClasses; j++) { const permutation: Type1DMatrix = this.rng.permutation(classCounts[i]); const permIndicesClassI = permutation.map((val) => classIndices[j][val]); + console.log(`permIndicesClassI=${JSON.stringify(permIndicesClassI)}`); //tslint:disable-line train.concat(permIndicesClassI.slice(0, n_i[i])); test.concat(permIndicesClassI.slice(n_i[i], n_i[i] + t_i[i])); } @@ -285,7 +289,7 @@ function validateShuffleSplit( test_size: number, train_size: number, default_test_size: number, -): number[] { +): Type1DMatrix { let n_train: number; let n_test: number; diff --git a/src/lib/utils/random.ts b/src/lib/utils/random.ts index 85a70435..ed89a886 100644 --- a/src/lib/utils/random.ts +++ b/src/lib/utils/random.ts @@ -3,7 +3,29 @@ import seedrandom from 'seedrandom'; import { isNumber } from 'util'; import { Type1DMatrix } from '../types'; -export default class RandomState { +/** + * instance of RandomState + * @ignore + */ +export interface RandomStateObj { + next(): number; + shuffle(array: Type1DMatrix): Type1DMatrix; + rangedInt(min: number, max: number): number; + choice( + choiceArray: number | Type1DMatrix, + outputSize: number, + probability?: Type1DMatrix, + ): Type1DMatrix; + permutation(num: number): Type1DMatrix; + rangedReal(min: number, max: number): number; + real(min: number, max: number): (() => number); +} + +/** + * All of Random works lie here + * @ignore + */ +export default class RandomState implements RandomStateObj { private random; constructor(seed: string | number = Math.random()) { this.random = seedrandom(seed.toString()); @@ -13,9 +35,18 @@ export default class RandomState { return this.random(); } - rangedInt(min, max): number { + rangedInt(min: number, max: number): number { return min + Math.floor((max - min) * this.next()); } + + rangedReal(min: number, max: number): number { + return min + (max - min) * this.next(); + } + + real(min: number, max: number): (() => number) { + const diff = max - min; + return () => min + diff * this.next(); + } /** * shuffles 1D array in place * taken from https://github.com/TimothyGu/knuth-shuffle-seeded/blob/gh-pages/index.js @@ -24,7 +55,7 @@ export default class RandomState { * random.shuffle([1, 2, 3, 4, 5]) * output-1: [5, 3, 4, 1, 2] * output-2: [3, 4, 2, 5, 1] - * @param array type: any[] + * @param array type: Type1DMatrix * @returns shuffled array */ shuffle(array: Type1DMatrix): Type1DMatrix { @@ -90,5 +121,3 @@ export default class RandomState { ); } } - -export type RandomStateObj = RandomState; diff --git a/src/lib/utils/tensors.ts b/src/lib/utils/tensors.ts index 1aabc0fe..3fa7fc1f 100644 --- a/src/lib/utils/tensors.ts +++ b/src/lib/utils/tensors.ts @@ -25,18 +25,6 @@ export function inferShape(X: TypeMatrix): number[] { } } -/** - * - * @param array 1D and 2D - */ -export function convertToTensor(array: TypeMatrix): tf.Tensor { - try { - return tf.tensor(array); - } catch (e) { - throw new ValidationInconsistentShape(e); - } -} - /** * Reshapes any size of array into a new shape. * diff --git a/src/lib/utils/validation.ts b/src/lib/utils/validation.ts index 22bc931c..b56a7851 100644 --- a/src/lib/utils/validation.ts +++ b/src/lib/utils/validation.ts @@ -177,7 +177,7 @@ export const validateFeaturesConsistency = ( export function validateShapesEqual( y_true: Type1DMatrix | Type2DMatrix = null, y_pred: Type1DMatrix | Type2DMatrix = null, -): tf.Tensor[] { +): Type1DMatrix> { const yTrueTensor = tf.tensor(y_true); const yPredTensor = tf.tensor(y_pred); const yTrueShape = yTrueTensor.shape; @@ -202,7 +202,7 @@ export function validateShapesEqual( * get number of samples from an array * @param array - type matrix or tensor */ -export function numSamples(array: TypeMatrix | tf.Tensor = null): number { +export function numSamples(array: TypeMatrix | tf.Tensor = null): number { if (!array) { throw new ValidationError(`array cant be null`); } diff --git a/test/model_selection/_split.test.ts b/test/model_selection/_split.test.ts index 60fcd006..b04b09a2 100644 --- a/test/model_selection/_split.test.ts +++ b/test/model_selection/_split.test.ts @@ -1,5 +1,5 @@ import * as _ from 'lodash'; -import { KFold, train_test_split } from '../../src/lib/model_selection/_split'; +import { KFold, StratifiedShuffleSplit, train_test_split } from '../../src/lib/model_selection/_split'; import { ValidationError } from '../../src/lib/utils/Errors'; describe('_split:KFold', () => { @@ -119,10 +119,10 @@ describe('_split:train_test_split', () => { train_size: 0.67, }); - expect(_.isEqual(xTrain, [[4, 5], [6, 7], [2, 3]])).toBe(true); - expect(_.isEqual(yTrain, [2, 3, 1])).toBe(true); - expect(_.isEqual(xTest, [[0, 1], [8, 9]])).toBe(true); - expect(_.isEqual(yTest, [0, 4])).toBe(true); + expect(_.isEqual(xTrain, [[0, 1], [2, 3]])).toBe(true); + expect(_.isEqual(yTrain, [0, 1])).toBe(true); + expect(_.isEqual(xTest, [[4, 5], [6, 7]])).toBe(true); + expect(_.isEqual(yTest, [2, 3])).toBe(true); }); it('Should split X1, y1 with random_state 100 test_size: .50 train_size: .50', () => { @@ -132,19 +132,19 @@ describe('_split:train_test_split', () => { train_size: 0.5, }); - expect(_.isEqual(xTrain, [[0, 1], [6, 7], [2, 3]])).toBe(true); - expect(_.isEqual(yTrain, [0, 3, 1])).toBe(true); - expect(_.isEqual(xTest, [[8, 9], [4, 5]])).toBe(true); - expect(_.isEqual(yTest, [4, 2])).toBe(true); + expect(_.isEqual(xTrain, [[4, 5], [2, 3], [8, 9]])).toBe(true); + expect(_.isEqual(yTrain, [2, 1, 4])).toBe(true); + expect(_.isEqual(xTest, [[0, 1], [6, 7]])).toBe(true); + expect(_.isEqual(yTest, [0, 3])).toBe(true); }); it('Should use default test and train sizes', () => { const { xTrain, yTrain, xTest, yTest } = train_test_split(X1, y1); - expect(_.isEqual(xTrain, [[8, 9], [6, 7], [0, 1]])).toBe(true); - expect(_.isEqual(yTrain, [4, 3, 0])).toBe(true); - expect(_.isEqual(xTest, [[4, 5]])).toBe(true); - expect(_.isEqual(yTest, [2])).toBe(true); + expect(_.isEqual(xTrain, [[6, 7], [0, 1], [8, 9], [4, 5]])).toBe(true); + expect(_.isEqual(yTrain, [3, 0, 4, 2])).toBe(true); + expect(_.isEqual(xTest, [[2, 3]])).toBe(true); + expect(_.isEqual(yTest, [1])).toBe(true); }); it('Should sum of test_size and train_size attempting to match the input size throw an error', () => { @@ -161,9 +161,20 @@ describe('_split:train_test_split', () => { it('Should split X2 y2 with random_state: 42 test_size: .33 and train_size: .67', () => { const { xTrain, yTrain, xTest, yTest } = train_test_split(X2, y2); - expect(_.isEqual(xTrain, [['five'], ['four'], ['one']])).toBe(true); - expect(_.isEqual(yTrain, ['e', 'd', 'a'])).toBe(true); - expect(_.isEqual(xTest, [['three']])).toBe(true); - expect(_.isEqual(yTest, ['c'])).toBe(true); + + expect(_.isEqual(xTrain, [['four'], ['one'], ['five'], ['three']])).toBe(true); + expect(_.isEqual(yTrain, ['d', 'a', 'e', 'c'])).toBe(true); + expect(_.isEqual(xTest, [['two']])).toBe(true); + expect(_.isEqual(yTest, ['b'])).toBe(true); + }); +}); + +describe('_split:StratifiedShuffleSplit', () => { + const X = [[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]; + const y = [0, 0, 0, 1, 1, 1]; + const sss = new StratifiedShuffleSplit(5, 0.5, 0.5, 0); + it('Should split X2 y2 with random_state: 42 test_size: .33 and train_size: .67', () => { + const [train, test] = sss.split(X, y); + console.log(train, test); // tslint:disable-line }); }); From a08187a77207deb7e35820154a9b759d5971da97 Mon Sep 17 00:00:00 2001 From: VariableVasasMT Date: Tue, 8 Oct 2019 01:09:15 +0530 Subject: [PATCH 5/7] working set of data --- src/lib/model_selection/_split.ts | 38 ++- src/lib/utils/tensors.ts | 12 +- .../manual_sgd_regressor.snap.ts | 300 +++++++++--------- test/linear_model/stochastic_gradient.test.ts | 1 + test/model_selection/_split.test.ts | 69 +++- 5 files changed, 243 insertions(+), 177 deletions(-) diff --git a/src/lib/model_selection/_split.ts b/src/lib/model_selection/_split.ts index c73ede5b..e4d781ac 100644 --- a/src/lib/model_selection/_split.ts +++ b/src/lib/model_selection/_split.ts @@ -25,7 +25,6 @@ const testShapes = (X: Type1DMatrix | Type2DMatrix, y: Type1DMatrix = countBin(yIndices); - if (_.min(classCounts) < 2) { - throw new Error( + throw new ValidationError( `The least populated class in y=${y} has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.`, ); } if (nTrain < nClasses) { - throw new Error(`The train_size = ${nTrain} should be greater or equal to the number of classes = ${nClasses}`); + throw new ValidationError( + `The train_size = ${nTrain} should be greater or equal to the number of classes = ${nClasses}`, + ); } if (nTest < nClasses) { - throw new Error(`The test_size = ${nTest} should be greater or equal to the number of classes = ${nClasses}`); + throw new ValidationError( + `The test_size = ${nTest} should be greater or equal to the number of classes = ${nClasses}`, + ); } const cumsumClassCounts: tf.Tensor1D = tf.cumsum(classCounts); @@ -266,21 +268,25 @@ export class StratifiedShuffleSplit { const test = []; const train = []; - for (let i = 0; i <= this.n_splits; i++) { + for (let i = 0; i < this.n_splits; i++) { const n_i: Type1DMatrix = approximateMode(classCounts, nTrain, this.rng); const classCountsRemaining: Type1DMatrix = classCounts.map((item, index) => n_i[index] - item); const t_i: Type1DMatrix = approximateMode(classCountsRemaining, nTest, this.rng); + const tempTest = []; + const tempTrain = []; + for (let j = 0; j < nClasses; j++) { - const permutation: Type1DMatrix = this.rng.permutation(classCounts[i]); + const permutation: Type1DMatrix = this.rng.permutation(classCounts[j]); const permIndicesClassI = permutation.map((val) => classIndices[j][val]); - console.log(`permIndicesClassI=${JSON.stringify(permIndicesClassI)}`); //tslint:disable-line - train.concat(permIndicesClassI.slice(0, n_i[i])); - test.concat(permIndicesClassI.slice(n_i[i], n_i[i] + t_i[i])); + tempTrain.push.apply(tempTrain, permIndicesClassI.slice(0, n_i[j])); + tempTest.push.apply(tempTest, permIndicesClassI.slice(n_i[j], n_i[j] + t_i[j])); } + test.push(this.rng.shuffle(tempTest)); + train.push(this.rng.shuffle(tempTrain)); } - return [this.rng.shuffle(train), this.rng.shuffle(test)]; + return [train, test]; }; } @@ -319,13 +325,13 @@ function validateShuffleSplit( throw new ValidationError(trainRangeValidationError(train_size, n_samples)); } - n_train = test_size; + n_train = train_size; } else { - if (test_size <= 0 || test_size >= 1) { + if (train_size <= 0 || train_size >= 1) { throw new ValidationError(trainRangeValidationError(train_size, n_samples)); } - n_train = Math.ceil(train_size * n_samples); + n_train = Math.floor(train_size * n_samples); } } @@ -352,5 +358,5 @@ function validateShuffleSplit( 'aforementioned parameters.', ); } - return [Math.round(n_test), Math.round(n_train)]; + return [Math.floor(n_test), Math.floor(n_train)]; } diff --git a/src/lib/utils/tensors.ts b/src/lib/utils/tensors.ts index 3fa7fc1f..97d54420 100644 --- a/src/lib/utils/tensors.ts +++ b/src/lib/utils/tensors.ts @@ -161,10 +161,9 @@ export function countBin(array: Type1DMatrix, weights?: Type1DMatrix { + floored[k] += 1; + }); + needToAdd -= addNow; if (needToAdd === 0) { break; diff --git a/test/linear_model/__snapshots__/manual_sgd_regressor.snap.ts b/test/linear_model/__snapshots__/manual_sgd_regressor.snap.ts index fb87d235..d3dd38ca 100644 --- a/test/linear_model/__snapshots__/manual_sgd_regressor.snap.ts +++ b/test/linear_model/__snapshots__/manual_sgd_regressor.snap.ts @@ -1,158 +1,158 @@ export const reg_l1_snap = [ - -0.0721491202712059, - 1.8499072790145874, - 2.000120162963867, - 1.763584017753601, - -0.06548641622066498, - 1.2569680213928223, - 1.408263087272644, - -0.09963376820087433, - 0.9498693346977234, - 0.9368916153907776, - 0.12602682411670685, - -0.016304979100823402, - 0.8815138339996338, - 1.0217688083648682, - 0.042637307196855545, - 1.2872182130813599, - 2.0576579570770264, - -0.001897446229122579, - 1.928894281387329, - 1.4728261232376099, - 0.838404655456543, - 2.0542337894439697, - -0.05303708836436272, - 2.005605936050415, - 0.024565843865275383, - 1.1904466152191162, - 1.5692096948623657, - 1.0676918029785156, - 1.246755838394165, - 1.692721962928772, - -0.13608022034168243, - -0.008093821816146374, - 1.7138903141021729, - 1.8486846685409546, - -0.0868988186120987, - 1.3307785987854004, - -0.032723069190979004, - -0.052975043654441833, - 1.5985339879989624, - 1.6508700847625732, - 1.2555382251739502, - -0.05963774397969246, - 1.898187279701233, - 1.5247799158096313, - -0.05670524761080742, - 1.8354581594467163, - 1.2046414613723755, - 1.8430787324905396, - -0.08569683134555817, - 1.1684170961380005, + -0.10187516361474991, + 1.9362775087356567, + -0.030817851424217224, + -0.15670014917850494, + -0.06435448676347733, + 1.3769986629486084, + -0.054627493023872375, + 1.4001959562301636, + 1.768455982208252, + 1.1212642192840576, + 1.5794532299041748, + 0.9440510272979736, + -0.09058281779289246, + 0.13453781604766846, + 2.2322444915771484, + 0.8844969868659973, + 1.173557162284851, + 1.5922285318374634, + -0.23827676475048065, + 1.999155879020691, + 1.9670710563659668, + 1.1842509508132935, + 1.8097987174987793, + 0.8924177885055542, + 1.0067024230957031, + 1.9101625680923462, + -0.04956323280930519, + 0.01447377260774374, + 1.1836856603622437, + 0.012781056575477123, + 1.0901933908462524, + 1.5491446256637573, + 1.2899250984191895, + 1.2064342498779297, + 1.4554414749145508, + 2.0235116481781006, + -0.00013770590885542333, + 0.03823625668883324, + -0.16355983912944794, + 1.4931367635726929, + -0.06580637395381927, + 1.3363806009292603, + 1.0733377933502197, + 1.5785175561904907, + 1.7249342203140259, + 1.2076889276504517, + 1.3013920783996582, + 2.0409014225006104, + 1.4154359102249146, + 1.1597988605499268, ]; export const reg_l12_snap = [ - 0.10738064348697662, - 1.7174302339553833, - 1.8375533819198608, - 1.67069411277771, - 0.06189228966832161, - 1.1819779872894287, - 1.3368662595748901, - 0.0677068829536438, - 0.9299587607383728, - 0.9379181265830994, - 0.35877126455307007, - 0.16697554290294647, - 0.8833820223808289, - 1.0770105123519897, - 0.20129679143428802, - 1.3402129411697388, - 2.03820538520813, - 0.11076389998197556, - 1.7500780820846558, - 1.469829797744751, - 0.8211827278137207, - 1.9208850860595703, - 0.09558970481157303, - 1.892223596572876, - 0.17387846112251282, - 1.138923168182373, - 1.5752410888671875, - 1.0453166961669922, - 1.308498501777649, - 1.45609712600708, - -0.03998078405857086, - 0.16131111979484558, - 1.56316339969635, - 1.8202002048492432, - 0.06510242074728012, - 1.346653699874878, - 0.10171855241060257, - 0.048874109983444214, - 1.4369295835494995, - 1.6515308618545532, - 1.2302302122116089, - 0.09436246007680893, - 1.8291091918945312, - 1.407045602798462, - 0.08854561299085617, - 1.6996674537658691, - 1.2097012996673584, - 1.7823750972747803, - 0.08302222937345505, - 1.217348575592041, + -0.09578026086091995, + 1.9175169467926025, + -0.044050805270671844, + -0.1154128909111023, + -0.07411445677280426, + 1.3834723234176636, + -0.04848529398441315, + 1.3901816606521606, + 1.7331737279891968, + 1.1252330541610718, + 1.6151249408721924, + 0.8965901732444763, + -0.09128819406032562, + 0.15496404469013214, + 2.22098445892334, + 0.8438668251037598, + 1.1376556158065796, + 1.5821704864501953, + -0.20684932172298431, + 1.9915281534194946, + 1.9458564519882202, + 1.1916320323944092, + 1.8428572416305542, + 0.8544819355010986, + 0.9816596508026123, + 1.894813895225525, + -0.034065138548612595, + 0.013514656573534012, + 1.1664959192276, + -0.007028540596365929, + 1.0804977416992188, + 1.542830228805542, + 1.2962431907653809, + 1.1974050998687744, + 1.462317943572998, + 1.9718060493469238, + -0.010182066820561886, + 0.04536003991961479, + -0.15250259637832642, + 1.4982532262802124, + -0.05850651115179062, + 1.3294603824615479, + 1.0583312511444092, + 1.564939260482788, + 1.7103776931762695, + 1.2344924211502075, + 1.3150047063827515, + 2.040947198867798, + 1.4060312509536743, + 1.1478352546691895, ]; export const reg_l2_snap = [ - -0.07334784418344498, - 1.8702806234359741, - 2.027863025665283, - 1.757415533065796, - -0.058840565383434296, - 1.2708457708358765, - 1.4022281169891357, - -0.12441510707139969, - 0.9141921401023865, - 0.9153953790664673, - 0.09964509308338165, - -0.012327668257057667, - 0.8524127006530762, - 0.9993201494216919, - 0.05692737177014351, - 1.260292649269104, - 2.080444574356079, - -0.009513204917311668, - 1.9556868076324463, - 1.4598970413208008, - 0.8319887518882751, - 2.089475631713867, - -0.05327172577381134, - 2.0235347747802734, - -0.0016155339544638991, - 1.2159310579299927, - 1.526334285736084, - 1.057511806488037, - 1.2104287147521973, - 1.7403795719146729, - -0.13033176958560944, - -0.025982998311519623, - 1.7425973415374756, - 1.8811771869659424, - -0.08381890505552292, - 1.3183915615081787, - -0.027015937492251396, - -0.058081306517124176, - 1.6068274974822998, - 1.6342045068740845, - 1.2668280601501465, - -0.07258858531713486, - 1.9060038328170776, - 1.5319108963012695, - -0.06700124591588974, - 1.8551995754241943, - 1.1812701225280762, - 1.8531256914138794, - -0.07763924449682236, - 1.150630235671997, + -0.1128714382648468, + 1.927524209022522, + -0.019957974553108215, + -0.15421128273010254, + -0.06358052790164948, + 1.3845850229263306, + -0.05573194846510887, + 1.4279766082763672, + 1.7814819812774658, + 1.1407538652420044, + 1.5841346979141235, + 0.9630727171897888, + -0.08736681193113327, + 0.13895414769649506, + 2.242431879043579, + 0.8999905586242676, + 1.1831176280975342, + 1.597848653793335, + -0.2701653838157654, + 2.004774570465088, + 1.967844009399414, + 1.1722862720489502, + 1.8184894323349, + 0.9122092127799988, + 1.0183953046798706, + 1.9147017002105713, + -0.057779841125011444, + -0.007610837463289499, + 1.1790218353271484, + 0.007723023183643818, + 1.0842615365982056, + 1.569187045097351, + 1.3011060953140259, + 1.2352399826049805, + 1.470099925994873, + 2.021085023880005, + 0.009398099966347218, + 0.029045211151242256, + -0.18687404692173004, + 1.4855189323425293, + -0.07027280330657959, + 1.3430536985397339, + 1.0793861150741577, + 1.584331750869751, + 1.7265660762786865, + 1.1936516761779785, + 1.2927387952804565, + 2.037707567214966, + 1.420951008796692, + 1.1704827547073364, ]; diff --git a/test/linear_model/stochastic_gradient.test.ts b/test/linear_model/stochastic_gradient.test.ts index cba70cf1..bc14a373 100644 --- a/test/linear_model/stochastic_gradient.test.ts +++ b/test/linear_model/stochastic_gradient.test.ts @@ -193,6 +193,7 @@ describe('linear_model:SGDRegressor', () => { const reg = new SGDRegressor(); reg.fit(xTrain, yTrain); const result = reg.predict(xTest); + const similarity = assertArrayAlmostEqual(reg_l2_snap, result, 1); expect(similarity).toBeGreaterThanOrEqual(accuracyExpected1); diff --git a/test/model_selection/_split.test.ts b/test/model_selection/_split.test.ts index b04b09a2..33523522 100644 --- a/test/model_selection/_split.test.ts +++ b/test/model_selection/_split.test.ts @@ -1,6 +1,7 @@ import * as _ from 'lodash'; import { KFold, StratifiedShuffleSplit, train_test_split } from '../../src/lib/model_selection/_split'; import { ValidationError } from '../../src/lib/utils/Errors'; +// import { x_1, y_1 as ySnap } from './__snapshots__/_split.test'; describe('_split:KFold', () => { const X1 = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]; @@ -170,11 +171,67 @@ describe('_split:train_test_split', () => { }); describe('_split:StratifiedShuffleSplit', () => { - const X = [[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]; - const y = [0, 0, 0, 1, 1, 1]; - const sss = new StratifiedShuffleSplit(5, 0.5, 0.5, 0); - it('Should split X2 y2 with random_state: 42 test_size: .33 and train_size: .67', () => { - const [train, test] = sss.split(X, y); - console.log(train, test); // tslint:disable-line + it('Check that error is raised if there is a class with only one sample', () => { + const X = [0, 1, 2, 3, 4, 5, 6]; + const y = [0, 1, 1, 1, 2, 2, 2]; + const initAndCall = (...values) => { + const sss = new StratifiedShuffleSplit(...values); + const [train, test] = sss.split(X, y); + }; + expect(() => initAndCall(3, 0.2)).toThrow(); + expect(() => initAndCall(3, 0.2)).toThrowError( + `The least populated class in y=${y} has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.`, + ); + + // expect(() => initAndCall(3, 2)).toThrow(); + // expect(() => initAndCall(3, 2)).toThrowError( + // `The least populated class in y=${y} has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.`, + // ); + }); + + it('Check that error is raised if the test/train set size is smaller than n_classes', () => { + const X = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + const y = [0, 0, 0, 1, 1, 1, 2, 2, 2]; + const initAndCall = (...values) => { + const sss = new StratifiedShuffleSplit(...values); + const [train, test] = sss.split(X, y); + }; + expect(() => initAndCall(3, 2)).toThrow(); + expect(() => initAndCall(3, 2)).toThrowError( + 'The test_size = 2 should be greater or equal to the number of classes = 3', + ); + + expect(() => initAndCall(3, 3, 2)).toThrow(); + expect(() => initAndCall(3, 3, 2)).toThrowError( + 'The train_size = 2 should be greater or equal to the number of classes = 3', + ); }); + + it('Test stratified shuffle split respects test size.', () => { + const y = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]; + const testSize = 5; + const trainSize = 10; + const [trainSet, testSet] = new StratifiedShuffleSplit(6, testSize, trainSize, 0).split( + new Array(y.length).fill(1), + y, + ); + for (let i = 0; i < trainSet.length; i++) { + const train = trainSet[i]; + const test = testSet[i]; + expect(train.length).toBe(trainSize); + expect(test.length).toBe(testSize); + expect(train.length + test.length).toBe(y.length); + } + }); + + // it('Test stratified shuffle split multilabel many labels.', () => { + // const y = ySnap; + // const X = x_1; + // const [trainSet, testSet] = new StratifiedShuffleSplit(6, 0.5, undefined, 0).split(X, y); + // for (let i = 0; i < trainSet.length; i++) { + // const train = trainSet[i]; + // const test = testSet[i]; + // expect(train.length + test.length).toBe(y.length); + // } + // }); }); From 09912fe7c9c4825967e6832e9eec3d18a616dbc9 Mon Sep 17 00:00:00 2001 From: VariableVasasMT Date: Tue, 8 Oct 2019 01:31:01 +0530 Subject: [PATCH 6/7] working set of data --- src/lib/model_selection/_split.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lib/model_selection/_split.ts b/src/lib/model_selection/_split.ts index e4d781ac..23f89565 100644 --- a/src/lib/model_selection/_split.ts +++ b/src/lib/model_selection/_split.ts @@ -234,7 +234,6 @@ export class StratifiedShuffleSplit { split = (X: Type1DMatrix | Type2DMatrix = null, y: Type1DMatrix = null): Type2DMatrix => { const XTensor = tf.tensor(X); - // const yTensor = convertToTensor(y); const nSamples = numSamples(XTensor); const [nTest, nTrain] = validateShuffleSplit(nSamples, this.testSize, this.trainSize, this.defaultTestSize); From 7907b25e50bdf0a072f03fc22be2766b1f935dfd Mon Sep 17 00:00:00 2001 From: VariableVasasMT Date: Tue, 8 Oct 2019 01:32:55 +0530 Subject: [PATCH 7/7] working set of data --- src/lib/model_selection/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib/model_selection/index.ts b/src/lib/model_selection/index.ts index 8d387b7d..20d125e3 100644 --- a/src/lib/model_selection/index.ts +++ b/src/lib/model_selection/index.ts @@ -1,3 +1,3 @@ -import { KFold, train_test_split } from './_split'; +import { KFold, StratifiedShuffleSplit, train_test_split } from './_split'; -export { KFold, train_test_split }; +export { KFold, train_test_split, StratifiedShuffleSplit }; pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy