50
50
#include "utils/lsyscache.h"
51
51
#include "utils/memutils.h"
52
52
#include "utils/pg_rusage.h"
53
+ #include "utils/sampling.h"
53
54
#include "utils/sortsupport.h"
54
55
#include "utils/syscache.h"
55
56
#include "utils/timestamp.h"
56
57
#include "utils/tqual.h"
57
58
58
59
59
- /* Data structure for Algorithm S from Knuth 3.4.2 */
60
- typedef struct
61
- {
62
- BlockNumber N ; /* number of blocks, known in advance */
63
- int n ; /* desired sample size */
64
- BlockNumber t ; /* current block number */
65
- int m ; /* blocks selected so far */
66
- } BlockSamplerData ;
67
-
68
- typedef BlockSamplerData * BlockSampler ;
69
-
70
60
/* Per-index data for ANALYZE */
71
61
typedef struct AnlIndexData
72
62
{
@@ -89,10 +79,6 @@ static void do_analyze_rel(Relation onerel, int options,
89
79
VacuumParams * params , List * va_cols ,
90
80
AcquireSampleRowsFunc acquirefunc , BlockNumber relpages ,
91
81
bool inh , bool in_outer_xact , int elevel );
92
- static void BlockSampler_Init (BlockSampler bs , BlockNumber nblocks ,
93
- int samplesize );
94
- static bool BlockSampler_HasMore (BlockSampler bs );
95
- static BlockNumber BlockSampler_Next (BlockSampler bs );
96
82
static void compute_index_stats (Relation onerel , double totalrows ,
97
83
AnlIndexData * indexdata , int nindexes ,
98
84
HeapTuple * rows , int numrows ,
@@ -950,94 +936,6 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr)
950
936
return stats ;
951
937
}
952
938
953
- /*
954
- * BlockSampler_Init -- prepare for random sampling of blocknumbers
955
- *
956
- * BlockSampler is used for stage one of our new two-stage tuple
957
- * sampling mechanism as discussed on pgsql-hackers 2004-04-02 (subject
958
- * "Large DB"). It selects a random sample of samplesize blocks out of
959
- * the nblocks blocks in the table. If the table has less than
960
- * samplesize blocks, all blocks are selected.
961
- *
962
- * Since we know the total number of blocks in advance, we can use the
963
- * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
964
- * algorithm.
965
- */
966
- static void
967
- BlockSampler_Init (BlockSampler bs , BlockNumber nblocks , int samplesize )
968
- {
969
- bs -> N = nblocks ; /* measured table size */
970
-
971
- /*
972
- * If we decide to reduce samplesize for tables that have less or not much
973
- * more than samplesize blocks, here is the place to do it.
974
- */
975
- bs -> n = samplesize ;
976
- bs -> t = 0 ; /* blocks scanned so far */
977
- bs -> m = 0 ; /* blocks selected so far */
978
- }
979
-
980
- static bool
981
- BlockSampler_HasMore (BlockSampler bs )
982
- {
983
- return (bs -> t < bs -> N ) && (bs -> m < bs -> n );
984
- }
985
-
986
- static BlockNumber
987
- BlockSampler_Next (BlockSampler bs )
988
- {
989
- BlockNumber K = bs -> N - bs -> t ; /* remaining blocks */
990
- int k = bs -> n - bs -> m ; /* blocks still to sample */
991
- double p ; /* probability to skip block */
992
- double V ; /* random */
993
-
994
- Assert (BlockSampler_HasMore (bs )); /* hence K > 0 and k > 0 */
995
-
996
- if ((BlockNumber ) k >= K )
997
- {
998
- /* need all the rest */
999
- bs -> m ++ ;
1000
- return bs -> t ++ ;
1001
- }
1002
-
1003
- /*----------
1004
- * It is not obvious that this code matches Knuth's Algorithm S.
1005
- * Knuth says to skip the current block with probability 1 - k/K.
1006
- * If we are to skip, we should advance t (hence decrease K), and
1007
- * repeat the same probabilistic test for the next block. The naive
1008
- * implementation thus requires an anl_random_fract() call for each block
1009
- * number. But we can reduce this to one anl_random_fract() call per
1010
- * selected block, by noting that each time the while-test succeeds,
1011
- * we can reinterpret V as a uniform random number in the range 0 to p.
1012
- * Therefore, instead of choosing a new V, we just adjust p to be
1013
- * the appropriate fraction of its former value, and our next loop
1014
- * makes the appropriate probabilistic test.
1015
- *
1016
- * We have initially K > k > 0. If the loop reduces K to equal k,
1017
- * the next while-test must fail since p will become exactly zero
1018
- * (we assume there will not be roundoff error in the division).
1019
- * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
1020
- * to be doubly sure about roundoff error.) Therefore K cannot become
1021
- * less than k, which means that we cannot fail to select enough blocks.
1022
- *----------
1023
- */
1024
- V = anl_random_fract ();
1025
- p = 1.0 - (double ) k / (double ) K ;
1026
- while (V < p )
1027
- {
1028
- /* skip */
1029
- bs -> t ++ ;
1030
- K -- ; /* keep K == N - t */
1031
-
1032
- /* adjust p to be new cutoff point in reduced range */
1033
- p *= 1.0 - (double ) k / (double ) K ;
1034
- }
1035
-
1036
- /* select */
1037
- bs -> m ++ ;
1038
- return bs -> t ++ ;
1039
- }
1040
-
1041
939
/*
1042
940
* acquire_sample_rows -- acquire a random sample of rows from the table
1043
941
*
@@ -1084,7 +982,7 @@ acquire_sample_rows(Relation onerel, int elevel,
1084
982
BlockNumber totalblocks ;
1085
983
TransactionId OldestXmin ;
1086
984
BlockSamplerData bs ;
1087
- double rstate ;
985
+ ReservoirStateData rstate ;
1088
986
1089
987
Assert (targrows > 0 );
1090
988
@@ -1094,9 +992,9 @@ acquire_sample_rows(Relation onerel, int elevel,
1094
992
OldestXmin = GetOldestXmin (onerel , true);
1095
993
1096
994
/* Prepare for sampling block numbers */
1097
- BlockSampler_Init (& bs , totalblocks , targrows );
995
+ BlockSampler_Init (& bs , totalblocks , targrows , random () );
1098
996
/* Prepare for sampling rows */
1099
- rstate = anl_init_selection_state ( targrows );
997
+ reservoir_init_selection_state ( & rstate , targrows );
1100
998
1101
999
/* Outer loop over blocks to sample */
1102
1000
while (BlockSampler_HasMore (& bs ))
@@ -1244,16 +1142,15 @@ acquire_sample_rows(Relation onerel, int elevel,
1244
1142
* t.
1245
1143
*/
1246
1144
if (rowstoskip < 0 )
1247
- rowstoskip = anl_get_next_S (samplerows , targrows ,
1248
- & rstate );
1145
+ rowstoskip = reservoir_get_next_S (& rstate , samplerows , targrows );
1249
1146
1250
1147
if (rowstoskip <= 0 )
1251
1148
{
1252
1149
/*
1253
1150
* Found a suitable tuple, so save it, replacing one
1254
1151
* old tuple at random
1255
1152
*/
1256
- int k = (int ) (targrows * anl_random_fract ());
1153
+ int k = (int ) (targrows * sampler_random_fract ());
1257
1154
1258
1155
Assert (k >= 0 && k < targrows );
1259
1156
heap_freetuple (rows [k ]);
@@ -1312,116 +1209,6 @@ acquire_sample_rows(Relation onerel, int elevel,
1312
1209
return numrows ;
1313
1210
}
1314
1211
1315
- /* Select a random value R uniformly distributed in (0 - 1) */
1316
- double
1317
- anl_random_fract (void )
1318
- {
1319
- return ((double ) random () + 1 ) / ((double ) MAX_RANDOM_VALUE + 2 );
1320
- }
1321
-
1322
- /*
1323
- * These two routines embody Algorithm Z from "Random sampling with a
1324
- * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
1325
- * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
1326
- * of the count S of records to skip before processing another record.
1327
- * It is computed primarily based on t, the number of records already read.
1328
- * The only extra state needed between calls is W, a random state variable.
1329
- *
1330
- * anl_init_selection_state computes the initial W value.
1331
- *
1332
- * Given that we've already read t records (t >= n), anl_get_next_S
1333
- * determines the number of records to skip before the next record is
1334
- * processed.
1335
- */
1336
- double
1337
- anl_init_selection_state (int n )
1338
- {
1339
- /* Initial value of W (for use when Algorithm Z is first applied) */
1340
- return exp (- log (anl_random_fract ()) / n );
1341
- }
1342
-
1343
- double
1344
- anl_get_next_S (double t , int n , double * stateptr )
1345
- {
1346
- double S ;
1347
-
1348
- /* The magic constant here is T from Vitter's paper */
1349
- if (t <= (22.0 * n ))
1350
- {
1351
- /* Process records using Algorithm X until t is large enough */
1352
- double V ,
1353
- quot ;
1354
-
1355
- V = anl_random_fract (); /* Generate V */
1356
- S = 0 ;
1357
- t += 1 ;
1358
- /* Note: "num" in Vitter's code is always equal to t - n */
1359
- quot = (t - (double ) n ) / t ;
1360
- /* Find min S satisfying (4.1) */
1361
- while (quot > V )
1362
- {
1363
- S += 1 ;
1364
- t += 1 ;
1365
- quot *= (t - (double ) n ) / t ;
1366
- }
1367
- }
1368
- else
1369
- {
1370
- /* Now apply Algorithm Z */
1371
- double W = * stateptr ;
1372
- double term = t - (double ) n + 1 ;
1373
-
1374
- for (;;)
1375
- {
1376
- double numer ,
1377
- numer_lim ,
1378
- denom ;
1379
- double U ,
1380
- X ,
1381
- lhs ,
1382
- rhs ,
1383
- y ,
1384
- tmp ;
1385
-
1386
- /* Generate U and X */
1387
- U = anl_random_fract ();
1388
- X = t * (W - 1.0 );
1389
- S = floor (X ); /* S is tentatively set to floor(X) */
1390
- /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
1391
- tmp = (t + 1 ) / term ;
1392
- lhs = exp (log (((U * tmp * tmp ) * (term + S )) / (t + X )) / n );
1393
- rhs = (((t + X ) / (term + S )) * term ) / t ;
1394
- if (lhs <= rhs )
1395
- {
1396
- W = rhs / lhs ;
1397
- break ;
1398
- }
1399
- /* Test if U <= f(S)/cg(X) */
1400
- y = (((U * (t + 1 )) / term ) * (t + S + 1 )) / (t + X );
1401
- if ((double ) n < S )
1402
- {
1403
- denom = t ;
1404
- numer_lim = term + S ;
1405
- }
1406
- else
1407
- {
1408
- denom = t - (double ) n + S ;
1409
- numer_lim = t + 1 ;
1410
- }
1411
- for (numer = t + S ; numer >= numer_lim ; numer -= 1 )
1412
- {
1413
- y *= numer / denom ;
1414
- denom -= 1 ;
1415
- }
1416
- W = exp (- log (anl_random_fract ()) / n ); /* Generate W in advance */
1417
- if (exp (log (y ) / n ) <= (t + X ) / t )
1418
- break ;
1419
- }
1420
- * stateptr = W ;
1421
- }
1422
- return S ;
1423
- }
1424
-
1425
1212
/*
1426
1213
* qsort comparator for sorting rows[] array
1427
1214
*/
0 commit comments