Content-Length: 1078566 | pFad | http://github.com/tonybelloni/postgres/commit/83e176ec18d2a91dbea1d0d1bd94c38dc47cd77c

15 Separate block sampling functions · tonybelloni/postgres@83e176e · GitHub
Skip to content

Commit 83e176e

Browse files
Separate block sampling functions
Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek
1 parent 5a3022f commit 83e176e

File tree

7 files changed

+287
-232
lines changed

7 files changed

+287
-232
lines changed

contrib/file_fdw/file_fdw.c

+5-4
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include "optimizer/var.h"
3535
#include "utils/memutils.h"
3636
#include "utils/rel.h"
37+
#include "utils/sampling.h"
3738

3839
PG_MODULE_MAGIC;
3940

@@ -1006,7 +1007,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
10061007
{
10071008
int numrows = 0;
10081009
double rowstoskip = -1; /* -1 means not set yet */
1009-
double rstate;
1010+
ReservoirStateData rstate;
10101011
TupleDesc tupDesc;
10111012
Datum *values;
10121013
bool *nulls;
@@ -1044,7 +1045,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
10441045
ALLOCSET_DEFAULT_MAXSIZE);
10451046

10461047
/* Prepare for sampling rows */
1047-
rstate = anl_init_selection_state(targrows);
1048+
reservoir_init_selection_state(&rstate, targrows);
10481049

10491050
/* Set up callback to identify error line number. */
10501051
errcallback.callback = CopyFromErrorCallback;
@@ -1088,15 +1089,15 @@ file_acquire_sample_rows(Relation onerel, int elevel,
10881089
* not-yet-incremented value of totalrows as t.
10891090
*/
10901091
if (rowstoskip < 0)
1091-
rowstoskip = anl_get_next_S(*totalrows, targrows, &rstate);
1092+
rowstoskip = reservoir_get_next_S(&rstate, *totalrows, targrows);
10921093

10931094
if (rowstoskip <= 0)
10941095
{
10951096
/*
10961097
* Found a suitable tuple, so save it, replacing one old tuple
10971098
* at random
10981099
*/
1099-
int k = (int) (targrows * anl_random_fract());
1100+
int k = (int) (targrows * sampler_random_fract());
11001101

11011102
Assert(k >= 0 && k < targrows);
11021103
heap_freetuple(rows[k]);

contrib/postgres_fdw/postgres_fdw.c

+5-5
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include "utils/lsyscache.h"
3838
#include "utils/memutils.h"
3939
#include "utils/rel.h"
40+
#include "utils/sampling.h"
4041

4142
PG_MODULE_MAGIC;
4243

@@ -202,7 +203,7 @@ typedef struct PgFdwAnalyzeState
202203
/* for random sampling */
203204
double samplerows; /* # of rows fetched */
204205
double rowstoskip; /* # of rows to skip before next sample */
205-
double rstate; /* random state */
206+
ReservoirStateData rstate; /* state for reservoir sampling*/
206207

207208
/* working memory contexts */
208209
MemoryContext anl_cxt; /* context for per-analyze lifespan data */
@@ -2411,7 +2412,7 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
24112412
astate.numrows = 0;
24122413
astate.samplerows = 0;
24132414
astate.rowstoskip = -1; /* -1 means not set yet */
2414-
astate.rstate = anl_init_selection_state(targrows);
2415+
reservoir_init_selection_state(&astate.rstate, targrows);
24152416

24162417
/* Remember ANALYZE context, and create a per-tuple temp context */
24172418
astate.anl_cxt = CurrentMemoryContext;
@@ -2551,13 +2552,12 @@ analyze_row_processor(PGresult *res, int row, PgFdwAnalyzeState *astate)
25512552
* analyze.c; see Jeff Vitter's paper.
25522553
*/
25532554
if (astate->rowstoskip < 0)
2554-
astate->rowstoskip = anl_get_next_S(astate->samplerows, targrows,
2555-
&astate->rstate);
2555+
astate->rowstoskip = reservoir_get_next_S(&astate->rstate, astate->samplerows, targrows);
25562556

25572557
if (astate->rowstoskip <= 0)
25582558
{
25592559
/* Choose a random reservoir element to replace. */
2560-
pos = (int) (targrows * anl_random_fract());
2560+
pos = (int) (targrows * sampler_random_fract());
25612561
Assert(pos >= 0 && pos < targrows);
25622562
heap_freetuple(astate->rows[pos]);
25632563
}

src/backend/commands/analyze.c

+6-219
Original file line numberDiff line numberDiff line change
@@ -50,23 +50,13 @@
5050
#include "utils/lsyscache.h"
5151
#include "utils/memutils.h"
5252
#include "utils/pg_rusage.h"
53+
#include "utils/sampling.h"
5354
#include "utils/sortsupport.h"
5455
#include "utils/syscache.h"
5556
#include "utils/timestamp.h"
5657
#include "utils/tqual.h"
5758

5859

59-
/* Data structure for Algorithm S from Knuth 3.4.2 */
60-
typedef struct
61-
{
62-
BlockNumber N; /* number of blocks, known in advance */
63-
int n; /* desired sample size */
64-
BlockNumber t; /* current block number */
65-
int m; /* blocks selected so far */
66-
} BlockSamplerData;
67-
68-
typedef BlockSamplerData *BlockSampler;
69-
7060
/* Per-index data for ANALYZE */
7161
typedef struct AnlIndexData
7262
{
@@ -89,10 +79,6 @@ static void do_analyze_rel(Relation onerel, int options,
8979
VacuumParams *params, List *va_cols,
9080
AcquireSampleRowsFunc acquirefunc, BlockNumber relpages,
9181
bool inh, bool in_outer_xact, int elevel);
92-
static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
93-
int samplesize);
94-
static bool BlockSampler_HasMore(BlockSampler bs);
95-
static BlockNumber BlockSampler_Next(BlockSampler bs);
9682
static void compute_index_stats(Relation onerel, double totalrows,
9783
AnlIndexData *indexdata, int nindexes,
9884
HeapTuple *rows, int numrows,
@@ -950,94 +936,6 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr)
950936
return stats;
951937
}
952938

953-
/*
954-
* BlockSampler_Init -- prepare for random sampling of blocknumbers
955-
*
956-
* BlockSampler is used for stage one of our new two-stage tuple
957-
* sampling mechanism as discussed on pgsql-hackers 2004-04-02 (subject
958-
* "Large DB"). It selects a random sample of samplesize blocks out of
959-
* the nblocks blocks in the table. If the table has less than
960-
* samplesize blocks, all blocks are selected.
961-
*
962-
* Since we know the total number of blocks in advance, we can use the
963-
* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
964-
* algorithm.
965-
*/
966-
static void
967-
BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
968-
{
969-
bs->N = nblocks; /* measured table size */
970-
971-
/*
972-
* If we decide to reduce samplesize for tables that have less or not much
973-
* more than samplesize blocks, here is the place to do it.
974-
*/
975-
bs->n = samplesize;
976-
bs->t = 0; /* blocks scanned so far */
977-
bs->m = 0; /* blocks selected so far */
978-
}
979-
980-
static bool
981-
BlockSampler_HasMore(BlockSampler bs)
982-
{
983-
return (bs->t < bs->N) && (bs->m < bs->n);
984-
}
985-
986-
static BlockNumber
987-
BlockSampler_Next(BlockSampler bs)
988-
{
989-
BlockNumber K = bs->N - bs->t; /* remaining blocks */
990-
int k = bs->n - bs->m; /* blocks still to sample */
991-
double p; /* probability to skip block */
992-
double V; /* random */
993-
994-
Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
995-
996-
if ((BlockNumber) k >= K)
997-
{
998-
/* need all the rest */
999-
bs->m++;
1000-
return bs->t++;
1001-
}
1002-
1003-
/*----------
1004-
* It is not obvious that this code matches Knuth's Algorithm S.
1005-
* Knuth says to skip the current block with probability 1 - k/K.
1006-
* If we are to skip, we should advance t (hence decrease K), and
1007-
* repeat the same probabilistic test for the next block. The naive
1008-
* implementation thus requires an anl_random_fract() call for each block
1009-
* number. But we can reduce this to one anl_random_fract() call per
1010-
* selected block, by noting that each time the while-test succeeds,
1011-
* we can reinterpret V as a uniform random number in the range 0 to p.
1012-
* Therefore, instead of choosing a new V, we just adjust p to be
1013-
* the appropriate fraction of its former value, and our next loop
1014-
* makes the appropriate probabilistic test.
1015-
*
1016-
* We have initially K > k > 0. If the loop reduces K to equal k,
1017-
* the next while-test must fail since p will become exactly zero
1018-
* (we assume there will not be roundoff error in the division).
1019-
* (Note: Knuth suggests a "<=" loop condition, but we use "<" just
1020-
* to be doubly sure about roundoff error.) Therefore K cannot become
1021-
* less than k, which means that we cannot fail to select enough blocks.
1022-
*----------
1023-
*/
1024-
V = anl_random_fract();
1025-
p = 1.0 - (double) k / (double) K;
1026-
while (V < p)
1027-
{
1028-
/* skip */
1029-
bs->t++;
1030-
K--; /* keep K == N - t */
1031-
1032-
/* adjust p to be new cutoff point in reduced range */
1033-
p *= 1.0 - (double) k / (double) K;
1034-
}
1035-
1036-
/* select */
1037-
bs->m++;
1038-
return bs->t++;
1039-
}
1040-
1041939
/*
1042940
* acquire_sample_rows -- acquire a random sample of rows from the table
1043941
*
@@ -1084,7 +982,7 @@ acquire_sample_rows(Relation onerel, int elevel,
1084982
BlockNumber totalblocks;
1085983
TransactionId OldestXmin;
1086984
BlockSamplerData bs;
1087-
double rstate;
985+
ReservoirStateData rstate;
1088986

1089987
Assert(targrows > 0);
1090988

@@ -1094,9 +992,9 @@ acquire_sample_rows(Relation onerel, int elevel,
1094992
OldestXmin = GetOldestXmin(onerel, true);
1095993

1096994
/* Prepare for sampling block numbers */
1097-
BlockSampler_Init(&bs, totalblocks, targrows);
995+
BlockSampler_Init(&bs, totalblocks, targrows, random());
1098996
/* Prepare for sampling rows */
1099-
rstate = anl_init_selection_state(targrows);
997+
reservoir_init_selection_state(&rstate, targrows);
1100998

1101999
/* Outer loop over blocks to sample */
11021000
while (BlockSampler_HasMore(&bs))
@@ -1244,16 +1142,15 @@ acquire_sample_rows(Relation onerel, int elevel,
12441142
* t.
12451143
*/
12461144
if (rowstoskip < 0)
1247-
rowstoskip = anl_get_next_S(samplerows, targrows,
1248-
&rstate);
1145+
rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows);
12491146

12501147
if (rowstoskip <= 0)
12511148
{
12521149
/*
12531150
* Found a suitable tuple, so save it, replacing one
12541151
* old tuple at random
12551152
*/
1256-
int k = (int) (targrows * anl_random_fract());
1153+
int k = (int) (targrows * sampler_random_fract());
12571154

12581155
Assert(k >= 0 && k < targrows);
12591156
heap_freetuple(rows[k]);
@@ -1312,116 +1209,6 @@ acquire_sample_rows(Relation onerel, int elevel,
13121209
return numrows;
13131210
}
13141211

1315-
/* Select a random value R uniformly distributed in (0 - 1) */
1316-
double
1317-
anl_random_fract(void)
1318-
{
1319-
return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2);
1320-
}
1321-
1322-
/*
1323-
* These two routines embody Algorithm Z from "Random sampling with a
1324-
* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
1325-
* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
1326-
* of the count S of records to skip before processing another record.
1327-
* It is computed primarily based on t, the number of records already read.
1328-
* The only extra state needed between calls is W, a random state variable.
1329-
*
1330-
* anl_init_selection_state computes the initial W value.
1331-
*
1332-
* Given that we've already read t records (t >= n), anl_get_next_S
1333-
* determines the number of records to skip before the next record is
1334-
* processed.
1335-
*/
1336-
double
1337-
anl_init_selection_state(int n)
1338-
{
1339-
/* Initial value of W (for use when Algorithm Z is first applied) */
1340-
return exp(-log(anl_random_fract()) / n);
1341-
}
1342-
1343-
double
1344-
anl_get_next_S(double t, int n, double *stateptr)
1345-
{
1346-
double S;
1347-
1348-
/* The magic constant here is T from Vitter's paper */
1349-
if (t <= (22.0 * n))
1350-
{
1351-
/* Process records using Algorithm X until t is large enough */
1352-
double V,
1353-
quot;
1354-
1355-
V = anl_random_fract(); /* Generate V */
1356-
S = 0;
1357-
t += 1;
1358-
/* Note: "num" in Vitter's code is always equal to t - n */
1359-
quot = (t - (double) n) / t;
1360-
/* Find min S satisfying (4.1) */
1361-
while (quot > V)
1362-
{
1363-
S += 1;
1364-
t += 1;
1365-
quot *= (t - (double) n) / t;
1366-
}
1367-
}
1368-
else
1369-
{
1370-
/* Now apply Algorithm Z */
1371-
double W = *stateptr;
1372-
double term = t - (double) n + 1;
1373-
1374-
for (;;)
1375-
{
1376-
double numer,
1377-
numer_lim,
1378-
denom;
1379-
double U,
1380-
X,
1381-
lhs,
1382-
rhs,
1383-
y,
1384-
tmp;
1385-
1386-
/* Generate U and X */
1387-
U = anl_random_fract();
1388-
X = t * (W - 1.0);
1389-
S = floor(X); /* S is tentatively set to floor(X) */
1390-
/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
1391-
tmp = (t + 1) / term;
1392-
lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
1393-
rhs = (((t + X) / (term + S)) * term) / t;
1394-
if (lhs <= rhs)
1395-
{
1396-
W = rhs / lhs;
1397-
break;
1398-
}
1399-
/* Test if U <= f(S)/cg(X) */
1400-
y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
1401-
if ((double) n < S)
1402-
{
1403-
denom = t;
1404-
numer_lim = term + S;
1405-
}
1406-
else
1407-
{
1408-
denom = t - (double) n + S;
1409-
numer_lim = t + 1;
1410-
}
1411-
for (numer = t + S; numer >= numer_lim; numer -= 1)
1412-
{
1413-
y *= numer / denom;
1414-
denom -= 1;
1415-
}
1416-
W = exp(-log(anl_random_fract()) / n); /* Generate W in advance */
1417-
if (exp(log(y) / n) <= (t + X) / t)
1418-
break;
1419-
}
1420-
*stateptr = W;
1421-
}
1422-
return S;
1423-
}
1424-
14251212
/*
14261213
* qsort comparator for sorting rows[] array
14271214
*/

src/backend/utils/misc/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ include $(top_builddir)/src/Makefile.global
1515
override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS)
1616

1717
OBJS = guc.o help_config.o pg_rusage.o ps_status.o rls.o \
18-
superuser.o timeout.o tzparser.o
18+
sampling.o superuser.o timeout.o tzparser.o
1919

2020
# This location might depend on the installation directories. Therefore
2121
# we can't subsitute it into pg_config.h.

0 commit comments

Comments
 (0)








ApplySandwichStrip

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier!      Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

Fetched URL: http://github.com/tonybelloni/postgres/commit/83e176ec18d2a91dbea1d0d1bd94c38dc47cd77c

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy