Skip to content

Commit 348104a

Browse files
author
Arseny Kositsyn
committed
[PGPRO-12159] Added the output of tsv lexemes positions.
If an index is created with the rum_tsvector_ops operator class, the positions of the lexemes are saved as additional information. The positions are stored in compressed form in a bytea. In the posting tree, the additional information for high keys is stored in a different way, so it cannot be output yet. In all other cases, the output of additional information works correctly. Tags: rum
1 parent 18562e2 commit 348104a

File tree

3 files changed

+182
-38
lines changed

3 files changed

+182
-38
lines changed

src/rum.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "storage/bufmgr.h"
2222
#include "utils/datum.h"
2323
#include "utils/memutils.h"
24+
#include "tsearch/ts_type.h"
2425

2526
#include "rumsort.h"
2627

@@ -836,6 +837,8 @@ extern RumItem *rumGetBAEntry(BuildAccumulator *accum,
836837
#define RUM_ADDINFO_JOIN 10
837838
#define RUMNProcs 10
838839

840+
#define LOWERMASK 0x1F
841+
839842
extern PGDLLEXPORT Datum rum_extract_tsvector(PG_FUNCTION_ARGS);
840843
extern PGDLLEXPORT Datum rum_extract_tsquery(PG_FUNCTION_ARGS);
841844
extern PGDLLEXPORT Datum rum_tsvector_config(PG_FUNCTION_ARGS);
@@ -847,6 +850,9 @@ extern PGDLLEXPORT Datum rum_ts_distance_td(PG_FUNCTION_ARGS);
847850

848851
extern PGDLLEXPORT Datum tsquery_to_distance_query(PG_FUNCTION_ARGS);
849852

853+
extern char* decompress_pos(char *ptr, WordEntryPos *pos);
854+
extern unsigned int count_pos(char *ptr, int len);
855+
850856
/* rum_arr_utils.c */
851857
typedef enum SimilarityType
852858
{

src/rum_debug_funcs.c

Lines changed: 174 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "access/relation.h"
2828
#include "utils/varlena.h"
2929
#include "rum.h"
30+
#include "tsearch/ts_type.h"
3031

3132
PG_FUNCTION_INFO_V1(rum_metapage_info);
3233
PG_FUNCTION_INFO_V1(rum_page_opaque_info);
@@ -115,6 +116,8 @@ static Datum category_get_datum_text(RumNullCategory category);
115116
static Oid find_add_info_oid(RumState *rum_state_ptr);
116117
static OffsetNumber find_add_info_atrr_num(RumState *rum_state_ptr);
117118

119+
static Datum get_positions_to_text_datum(Datum add_info);
120+
118121
/*
119122
* The rum_metapage_info() function is used to retrieve
120123
* information stored on the meta page of the rum index.
@@ -386,12 +389,6 @@ rum_leaf_data_page_items(PG_FUNCTION_ARGS)
386389
/* Allocating memory for a long-lived structure */
387390
inter_call_data = palloc(sizeof(rum_page_items_state));
388391

389-
/* Initializing the RumState structure */
390-
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
391-
initRumState(inter_call_data->rum_state_ptr, rel);
392-
393-
relation_close(rel, AccessShareLock);
394-
395392
/* Getting a copy of the page from the raw page */
396393
page = get_page_from_raw(raw_page);
397394

@@ -422,6 +419,12 @@ rum_leaf_data_page_items(PG_FUNCTION_ARGS)
422419
errdetail("Flags %04X, expected %04X",
423420
opaq->flags, (RUM_DATA | RUM_LEAF))));
424421

422+
/* Initializing the RumState structure */
423+
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
424+
initRumState(inter_call_data->rum_state_ptr, rel);
425+
426+
relation_close(rel, AccessShareLock);
427+
425428
/* Build a tuple descriptor for our result type */
426429
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
427430
elog(ERROR, "return type must be a row type");
@@ -494,9 +497,24 @@ rum_leaf_data_page_items(PG_FUNCTION_ARGS)
494497
values[2] = BoolGetDatum(high_key_ptr->addInfoIsNull);
495498

496499
/* Returning add info */
497-
if(!high_key_ptr->addInfoIsNull && inter_call_data->add_info_oid != 0)
500+
if(!(high_key_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
501+
&& inter_call_data->add_info_oid != BYTEAOID)
502+
{
498503
values[3] = get_datum_text_by_oid(high_key_ptr->addInfo,
499504
inter_call_data->add_info_oid);
505+
}
506+
507+
/*
508+
* In this case, we are dealing with the positions
509+
* of tokens and they need to be decoded.
510+
*/
511+
else if (!(high_key_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
512+
&& inter_call_data->add_info_oid == BYTEAOID)
513+
{
514+
/* values[3] = get_positions_to_text_datum(high_key_ptr->addInfo); */
515+
values[3] = CStringGetTextDatum("high key positions in posting tree is not supported");
516+
}
517+
500518
else nulls[3] = true;
501519

502520
/* Forming the returned tuple */
@@ -536,8 +554,23 @@ rum_leaf_data_page_items(PG_FUNCTION_ARGS)
536554
values[2] = BoolGetDatum(rum_item_ptr->addInfoIsNull);
537555

538556
/* Returning add info */
539-
if(!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0)
540-
values[3] = get_datum_text_by_oid(rum_item_ptr->addInfo, inter_call_data->add_info_oid);
557+
if(!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
558+
&& inter_call_data->add_info_oid != BYTEAOID)
559+
{
560+
values[3] = get_datum_text_by_oid(rum_item_ptr->addInfo,
561+
inter_call_data->add_info_oid);
562+
}
563+
564+
/*
565+
* In this case, we are dealing with the positions
566+
* of tokens and they need to be decoded.
567+
*/
568+
else if (!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
569+
&& inter_call_data->add_info_oid == BYTEAOID)
570+
{
571+
values[3] = get_positions_to_text_datum(rum_item_ptr->addInfo);
572+
}
573+
541574
else nulls[3] = true;
542575

543576
/* Forming the returned tuple */
@@ -619,12 +652,6 @@ rum_internal_data_page_items(PG_FUNCTION_ARGS)
619652
/* Allocating memory for a long-lived structure */
620653
inter_call_data = palloc(sizeof(rum_page_items_state));
621654

622-
/* Initializing the RumState structure */
623-
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
624-
initRumState(inter_call_data->rum_state_ptr, rel);
625-
626-
relation_close(rel, AccessShareLock);
627-
628655
/* Getting a copy of the page from the raw page */
629656
page = get_page_from_raw(raw_page);
630657

@@ -655,6 +682,12 @@ rum_internal_data_page_items(PG_FUNCTION_ARGS)
655682
errdetail("Flags %04X, expected %04X",
656683
opaq->flags, (RUM_DATA & ~RUM_LEAF))));
657684

685+
/* Initializing the RumState structure */
686+
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
687+
initRumState(inter_call_data->rum_state_ptr, rel);
688+
689+
relation_close(rel, AccessShareLock);
690+
658691
/* Build a tuple descriptor for our result type */
659692
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
660693
elog(ERROR, "return type must be a row type");
@@ -721,9 +754,24 @@ rum_internal_data_page_items(PG_FUNCTION_ARGS)
721754
values[3] = BoolGetDatum(high_key_ptr->addInfoIsNull);
722755

723756
/* Returning add info */
724-
if(!high_key_ptr->addInfoIsNull && inter_call_data->add_info_oid != 0)
757+
if(!(high_key_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
758+
&& inter_call_data->add_info_oid != BYTEAOID)
759+
{
725760
values[4] = get_datum_text_by_oid(high_key_ptr->addInfo,
726761
inter_call_data->add_info_oid);
762+
}
763+
764+
/*
765+
* In this case, we are dealing with the positions
766+
* of tokens and they need to be decoded.
767+
*/
768+
else if (!(high_key_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
769+
&& inter_call_data->add_info_oid == BYTEAOID)
770+
{
771+
/* values[4] = get_positions_to_text_datum(high_key_ptr->addInfo); */
772+
values[4] = CStringGetTextDatum("high key positions in posting tree is not supported");
773+
}
774+
727775
else nulls[4] = true;
728776

729777
/* Forming the returned tuple */
@@ -745,9 +793,24 @@ rum_internal_data_page_items(PG_FUNCTION_ARGS)
745793
values[3] = BoolGetDatum(posting_item_ptr->item.addInfoIsNull);
746794

747795
/* Returning add info */
748-
if(!posting_item_ptr->item.addInfoIsNull && inter_call_data->add_info_oid != 0)
796+
if(!posting_item_ptr->item.addInfoIsNull && inter_call_data->add_info_oid != 0
797+
&& inter_call_data->add_info_oid != BYTEAOID)
798+
{
749799
values[4] = get_datum_text_by_oid(posting_item_ptr->item.addInfo,
750800
inter_call_data->add_info_oid);
801+
}
802+
803+
/*
804+
* In this case, we are dealing with the positions
805+
* of tokens and they need to be decoded.
806+
*/
807+
else if (!posting_item_ptr->item.addInfoIsNull && inter_call_data->add_info_oid != 0
808+
&& inter_call_data->add_info_oid == BYTEAOID)
809+
{
810+
/* values[4] = get_positions_to_text_datum(posting_item_ptr->item.addInfo); */
811+
values[4] = CStringGetTextDatum("high key positions in posting tree is not supported");
812+
}
813+
751814
else nulls[4] = true;
752815

753816
/* Forming the returned tuple */
@@ -833,12 +896,6 @@ rum_leaf_entry_page_items(PG_FUNCTION_ARGS)
833896
/* Allocating memory for a long-lived structure */
834897
inter_call_data = palloc(sizeof(rum_page_items_state));
835898

836-
/* Initializing the RumState structure */
837-
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
838-
initRumState(inter_call_data->rum_state_ptr, rel);
839-
840-
relation_close(rel, AccessShareLock);
841-
842899
/* Getting a copy of the page from the raw page */
843900
page = get_page_from_raw(raw_page);
844901

@@ -869,6 +926,12 @@ rum_leaf_entry_page_items(PG_FUNCTION_ARGS)
869926
errdetail("Flags %04X, expected %04X",
870927
opaq->flags, RUM_LEAF)));
871928

929+
/* Initializing the RumState structure */
930+
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
931+
initRumState(inter_call_data->rum_state_ptr, rel);
932+
933+
relation_close(rel, AccessShareLock);
934+
872935
/* Build a tuple descriptor for our result type */
873936
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
874937
elog(ERROR, "return type must be a row type");
@@ -1008,10 +1071,23 @@ rum_leaf_entry_page_items(PG_FUNCTION_ARGS)
10081071
values[3] = ItemPointerGetDatum(&(rum_item_ptr->iptr));
10091072
values[4] = BoolGetDatum(rum_item_ptr->addInfoIsNull);
10101073

1011-
10121074
/* Returning add info */
1013-
if(!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0)
1075+
if (!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0 &&
1076+
inter_call_data->add_info_oid != BYTEAOID)
1077+
{
10141078
values[5] = get_datum_text_by_oid(rum_item_ptr->addInfo, inter_call_data->add_info_oid);
1079+
}
1080+
1081+
/*
1082+
* In this case, we are dealing with the positions
1083+
* of tokens and they need to be decoded.
1084+
*/
1085+
else if (!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
1086+
&& inter_call_data->add_info_oid == BYTEAOID)
1087+
{
1088+
values[5] = get_positions_to_text_datum(rum_item_ptr->addInfo);
1089+
}
1090+
10151091
else nulls[5] = true;
10161092

10171093
/* The current IndexTuple does not contain a posting tree */
@@ -1101,12 +1177,6 @@ rum_internal_entry_page_items(PG_FUNCTION_ARGS)
11011177
/* Allocating memory for a long-lived structure */
11021178
inter_call_data = palloc(sizeof(rum_page_items_state));
11031179

1104-
/* Initializing the RumState structure */
1105-
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
1106-
initRumState(inter_call_data->rum_state_ptr, rel);
1107-
1108-
relation_close(rel, AccessShareLock);
1109-
11101180
/* Getting a copy of the page from the raw page */
11111181
page = get_page_from_raw(raw_page);
11121182

@@ -1137,6 +1207,12 @@ rum_internal_entry_page_items(PG_FUNCTION_ARGS)
11371207
errdetail("Flags %04X, expected %04X",
11381208
opaq->flags, 0)));
11391209

1210+
/* Initializing the RumState structure */
1211+
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
1212+
initRumState(inter_call_data->rum_state_ptr, rel);
1213+
1214+
relation_close(rel, AccessShareLock);
1215+
11401216
/* Build a tuple descriptor for our result type */
11411217
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
11421218
elog(ERROR, "return type must be a row type");
@@ -1355,7 +1431,7 @@ get_page_from_raw(bytea *raw_page)
13551431
* TODO: All types accepted by rum must be checked, but
13561432
* perhaps some types are missing or some are superfluous.
13571433
*/
1358-
static Datum
1434+
static Datum
13591435
get_datum_text_by_oid(Datum info, Oid info_oid)
13601436
{
13611437
char *str_info = NULL;
@@ -1602,3 +1678,69 @@ find_add_info_atrr_num(RumState *rum_state_ptr)
16021678
/* Need to add 1 because the attributes are numbered from 1 */
16031679
return add_info_attr_num + 1;
16041680
}
1681+
1682+
#define POS_STR_BUF_LENGHT 1024
1683+
#define POS_MAX_VAL_LENGHT 6
1684+
1685+
/*
1686+
* Decodes the compressed lexeme positions stored in the additional
1687+
* information (a bytea) and returns them as a text datum holding a
1688+
* comma-separated list. The temporary C string is freed before returning.
1689+
*/
1690+
static Datum
1691+
get_positions_to_text_datum(Datum add_info)
1692+
{
1693+
bytea *positions;
1694+
char *ptrt;
1695+
WordEntryPos position = 0;
1696+
int32 npos;
1697+
1698+
Datum res;
1699+
char *positions_str;
1700+
char *positions_str_cur_ptr;
1701+
int cur_max_str_lenght;
1702+
1703+
positions = DatumGetByteaP(add_info);
1704+
ptrt = (char *) VARDATA_ANY(positions);
1705+
npos = count_pos(VARDATA_ANY(positions),
1706+
VARSIZE_ANY_EXHDR(positions));
1707+
1708+
/* Allocate the output buffer and start with an empty string */
1709+
positions_str = (char*) palloc(POS_STR_BUF_LENGHT * sizeof(char));
1710+
positions_str[0] = '\0';
1711+
cur_max_str_lenght = POS_STR_BUF_LENGHT;
1712+
positions_str_cur_ptr = positions_str;
1713+
1714+
/* Decode each position and append it, followed by a comma, to the string */
1715+
for (int i = 0; i < npos; i++)
1716+
{
1717+
/* Decode the next position; decompress_pos() returns the updated read pointer */
1718+
ptrt = decompress_pos(ptrt, &position);
1719+
1720+
/* Append the position and a trailing comma at the write cursor */
1721+
sprintf(positions_str_cur_ptr, "%d,", position);
1722+
1723+
/* Advance the write cursor to the new end of the string */
1724+
positions_str_cur_ptr += strlen(positions_str_cur_ptr);
1725+
1726+
/*
1727+
* If no more than POS_MAX_VAL_LENGHT bytes remain in the
1728+
* buffer, enlarge it by POS_STR_BUF_LENGHT bytes and
1729+
* recompute the write cursor, since repalloc() may move it.
1730+
*/
1731+
if (cur_max_str_lenght - (positions_str_cur_ptr - positions_str) <= POS_MAX_VAL_LENGHT)
1732+
{
1733+
cur_max_str_lenght += POS_STR_BUF_LENGHT;
1734+
positions_str = (char*) repalloc(positions_str, cur_max_str_lenght * sizeof(char));
1735+
positions_str_cur_ptr = positions_str + strlen(positions_str);
1736+
}
1737+
}
1738+
1739+
/* Strip the trailing comma if at least one position was written */
1740+
if (npos > 0)
1741+
positions_str[strlen(positions_str) - 1] = '\0';
1742+
1743+
res = CStringGetTextDatum(positions_str);
1744+
pfree(positions_str);
1745+
return res;
1746+
}

src/rum_ts_utils.c

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include "catalog/pg_type.h"
1717
#include "funcapi.h"
1818
#include "miscadmin.h"
19-
#include "tsearch/ts_type.h"
2019
#include "tsearch/ts_utils.h"
2120
#include "utils/array.h"
2221
#include "utils/builtins.h"
@@ -80,8 +79,6 @@ PG_FUNCTION_INFO_V1(rum_ts_join_pos);
8079

8180
PG_FUNCTION_INFO_V1(tsquery_to_distance_query);
8281

83-
static unsigned int count_pos(char *ptr, int len);
84-
static char *decompress_pos(char *ptr, WordEntryPos *pos);
8582
static Datum build_tsvector_entry(TSVector vector, WordEntry *we);
8683
static Datum build_tsvector_hash_entry(TSVector vector, WordEntry *we);
8784
static Datum build_tsquery_entry(TSQuery query, QueryOperand *operand);
@@ -964,7 +961,6 @@ rum_tsquery_timestamp_consistent(PG_FUNCTION_ARGS)
964961
}
965962

966963
#define SIXTHBIT 0x20
967-
#define LOWERMASK 0x1F
968964

969965
static unsigned int
970966
compress_pos(char *target, WordEntryPos *pos, int npos)
@@ -999,7 +995,7 @@ compress_pos(char *target, WordEntryPos *pos, int npos)
999995
return ptr - target;
1000996
}
1001997

1002-
static char *
998+
extern char *
1003999
decompress_pos(char *ptr, WordEntryPos *pos)
10041000
{
10051001
int i;
@@ -1027,7 +1023,7 @@ decompress_pos(char *ptr, WordEntryPos *pos)
10271023
}
10281024
}
10291025

1030-
static unsigned int
1026+
extern unsigned int
10311027
count_pos(char *ptr, int len)
10321028
{
10331029
int count = 0,

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy