Skip to content

Commit 5a21285

Browse files
committed
Add new searchAny and searchAll functions
The new functions are responsible for searching for the needle tokens in the text tokenized by the given tokenizer.
1 parent e774b60 commit 5a21285

File tree

6 files changed

+325
-0
lines changed

6 files changed

+325
-0
lines changed

src/Functions/search.cpp

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
#include <Columns/ColumnArray.h>
2+
#include <Columns/ColumnFixedString.h>
3+
#include <Columns/ColumnString.h>
4+
#include <DataTypes/DataTypeArray.h>
5+
#include <DataTypes/DataTypeFixedString.h>
6+
#include <DataTypes/DataTypeString.h>
7+
#include <DataTypes/DataTypesNumber.h>
8+
#include <Functions/FunctionFactory.h>
9+
#include <Functions/FunctionHelpers.h>
10+
#include <Functions/IFunction.h>
11+
#include <Interpreters/Context_fwd.h>
12+
#include <Interpreters/GinFilter.h>
13+
#include <Interpreters/ITokenExtractor.h>
14+
15+
#include <roaring.hh>
16+
17+
namespace DB
18+
{
19+
20+
namespace ErrorCodes
21+
{
22+
extern const int BAD_ARGUMENTS;
23+
}
24+
25+
struct SearchAnyProps
26+
{
27+
static constexpr String name = "searchAny";
28+
static constexpr GinSearchMode search_mode = GinSearchMode::ANY;
29+
};
30+
31+
struct SearchAllProps
32+
{
33+
static constexpr String name = "searchAll";
34+
static constexpr GinSearchMode search_mode = GinSearchMode::ALL;
35+
};
36+
37+
namespace
38+
{
39+
template <typename Type>
40+
std::optional<Type> getArgument(const ColumnsWithTypeAndName & arguments, size_t index)
41+
{
42+
if (index < arguments.size())
43+
{
44+
if constexpr (std::is_same_v<Type, UInt64>)
45+
return arguments[index].column->getUInt(0);
46+
else if constexpr (std::is_same_v<Type, String>)
47+
return arguments[index].column->getDataAt(0).toString();
48+
else if constexpr (std::is_same_v<Type, std::string_view>)
49+
return arguments[index].column->getDataAt(0).toView();
50+
else
51+
{
52+
throw Exception(
53+
ErrorCodes::BAD_ARGUMENTS,
54+
"Search function argument at index '{}' expected to be any of UInt64, String, or std::string_view",
55+
index);
56+
}
57+
}
58+
return std::nullopt;
59+
}
60+
}
61+
62+
template <typename SearchProps>
63+
class FunctionSearchImpl : public IFunction
64+
{
65+
static constexpr size_t arg_input = 0;
66+
static constexpr size_t arg_needle = 1;
67+
static constexpr size_t arg_tokenizer = 2;
68+
static constexpr size_t arg_ngrams = 3;
69+
70+
public:
71+
static constexpr auto name = SearchProps::name;
72+
73+
static FunctionPtr create(ContextPtr)
74+
{
75+
return std::make_shared<FunctionSearchImpl>();
76+
}
77+
78+
String getName() const override { return name; }
79+
size_t getNumberOfArguments() const override { return 0; }
80+
bool isVariadic() const override { return true; }
81+
bool useDefaultImplementationForConstants() const override { return true; }
82+
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
83+
84+
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
85+
{
86+
FunctionArgumentDescriptors mandatory_args{
87+
{"input", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isStringOrFixedString), nullptr, "String or FixedString"},
88+
{"needle", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isString), isColumnConst, "String"}};
89+
90+
FunctionArgumentDescriptors optional_args;
91+
92+
if (arguments.size() > 2)
93+
{
94+
optional_args.emplace_back("tokenizer", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isString), isColumnConst, "String");
95+
96+
validateFunctionArguments(
97+
*this, {arguments[arg_input], arguments[arg_needle], arguments[arg_tokenizer]}, mandatory_args, optional_args);
98+
99+
if (arguments.size() == 4)
100+
{
101+
const auto tokenizer = arguments[arg_tokenizer].column->getDataAt(0).toString();
102+
103+
if (tokenizer == NgramTokenExtractor::getExternalName())
104+
optional_args.emplace_back("ngrams", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isUInt8), isColumnConst, "UInt8");
105+
}
106+
}
107+
108+
validateFunctionArguments(*this, arguments, mandatory_args, optional_args);
109+
110+
return std::make_shared<DataTypeNumber<UInt8>>();
111+
}
112+
113+
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
114+
{
115+
auto col_input = arguments[arg_input].column;
116+
auto col_needle = arguments[arg_needle].column;
117+
auto col_result = ColumnVector<UInt8>::create();
118+
119+
if (input_rows_count == 0)
120+
return col_result;
121+
122+
const auto tokenizer_arg = getArgument<std::string_view>(arguments, arg_tokenizer).value_or(SplitTokenExtractor::getExternalName());
123+
124+
std::unique_ptr<ITokenExtractor> token_extractor;
125+
if (tokenizer_arg == SplitTokenExtractor::getExternalName())
126+
token_extractor = std::make_unique<SplitTokenExtractor>();
127+
else if (tokenizer_arg == NoOpTokenExtractor::getExternalName())
128+
token_extractor = std::make_unique<NoOpTokenExtractor>();
129+
else if (tokenizer_arg == NgramTokenExtractor::getExternalName())
130+
{
131+
auto ngrams = getArgument<UInt64>(arguments, arg_ngrams).value_or(3);
132+
if (ngrams < 2 || ngrams > 8)
133+
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Ngrams argument of function {} should be between 2 and 8, got: {}", name, ngrams);
134+
token_extractor = std::make_unique<NgramTokenExtractor>(ngrams);
135+
}
136+
else
137+
{
138+
throw Exception(
139+
ErrorCodes::BAD_ARGUMENTS,
140+
"Function '{}' supports only tokenizers 'default', 'ngram', and 'noop'", name);
141+
}
142+
143+
const auto & col_needle_tokens = col_needle->getDataAt(0);
144+
std::vector<String> needle_tokens = SplitTokenExtractor().getTokens(col_needle_tokens.data, col_needle_tokens.size);
145+
146+
if (const auto * column_string = checkAndGetColumn<ColumnString>(col_input.get()))
147+
executeImpl(std::move(token_extractor), *column_string, input_rows_count, needle_tokens, col_result->getData());
148+
else if (const auto * column_fixed_string = checkAndGetColumn<ColumnFixedString>(col_input.get()))
149+
executeImpl(std::move(token_extractor), *column_fixed_string, input_rows_count, needle_tokens, col_result->getData());
150+
151+
return col_result;
152+
}
153+
154+
private:
155+
template <typename StringColumnType>
156+
void executeImpl(
157+
std::unique_ptr<ITokenExtractor> token_extractor,
158+
StringColumnType & col_input,
159+
size_t rows_count_input,
160+
const std::vector<String>& needle_tokens,
161+
PaddedPODArray<UInt8> & col_result) const
162+
{
163+
col_result.resize(rows_count_input);
164+
165+
for (size_t i = 0; i < rows_count_input; ++i)
166+
{
167+
const auto value{col_input.getDataAt(i)};
168+
169+
col_result[i] = false;
170+
171+
[[maybe_unused]] roaring::Roaring mask;
172+
for (const auto& token : token_extractor->getTokens(value.data, value.size))
173+
{
174+
for (size_t pos = 0; pos < needle_tokens.size(); ++pos) {
175+
if (token == needle_tokens[pos]) {
176+
if constexpr (SearchProps::search_mode == GinSearchMode::ALL)
177+
{
178+
mask.add(pos);
179+
}
180+
else
181+
{
182+
col_result[i] = true;
183+
break;
184+
}
185+
}
186+
}
187+
}
188+
if constexpr (SearchProps::search_mode == GinSearchMode::ALL)
189+
col_result[i] = mask.cardinality() == needle_tokens.size();
190+
}
191+
}
192+
};
193+
194+
/// Registers the GinSearchMode::ANY flavour of FunctionSearchImpl in the function factory.
REGISTER_FUNCTION(SearchAny)
{
    using Function = FunctionSearchImpl<SearchAnyProps>;

    factory.registerFunction<Function>(FunctionDocumentation{
        .description = "Searches the needle tokens in the generated tokens from the text by a given tokenizer. "
                       "Returns true if any needle tokens exists in the text, otherwise false.",
        .category = FunctionDocumentation::Category::StringSearch});
}
201+
202+
/// Registers the GinSearchMode::ALL flavour of FunctionSearchImpl in the function factory.
REGISTER_FUNCTION(SearchAll)
{
    using Function = FunctionSearchImpl<SearchAllProps>;

    factory.registerFunction<Function>(FunctionDocumentation{
        .description = "Searches the needle tokens in the generated tokens from the text by a given tokenizer. "
                       "Returns true if all needle tokens exists in the text, otherwise false.",
        .category = FunctionDocumentation::Category::StringSearch});
}
209+
}

src/Interpreters/GinFilter.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@ static inline constexpr UInt64 UNLIMITED_ROWS_PER_POSTINGS_LIST = 0;
1313
static inline constexpr UInt64 MIN_ROWS_PER_POSTINGS_LIST = 8 * 1024;
1414
static inline constexpr UInt64 DEFAULT_MAX_ROWS_PER_POSTINGS_LIST = 64 * 1024;
1515

16+
/// How a set of needle tokens has to match the tokens of a text.
enum class GinSearchMode : uint8_t
{
    ANY = 0, /// used by searchAny
    ALL = 1, /// used by searchAll
};
21+
1622
struct GinFilterParameters
1723
{
1824
GinFilterParameters(String tokenizer_, UInt64 max_rows_per_postings_list_);

src/Parsers/CommonParsers.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,8 @@ namespace DB
561561
MR_MACROS(YEARS, "YEARS") \
562562
MR_MACROS(YY, "YY") \
563563
MR_MACROS(YYYY, "YYYY") \
564+
MR_MACROS(SEARCH_ANY, "SEARCH ANY") \
565+
MR_MACROS(SEARCH_ALL, "SEARCH ALL") \
564566
MR_MACROS(ZKPATH, "ZKPATH") \
565567
MR_MACROS(STALENESS, "STALENESS") \
566568

src/Parsers/ExpressionListParsers.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2407,6 +2407,8 @@ const std::vector<std::pair<std::string_view, Operator>> ParserExpressionImpl::o
24072407
{toStringView(Keyword::NOT_IN), Operator("notIn", 9, 2)},
24082408
{toStringView(Keyword::GLOBAL_IN), Operator("globalIn", 9, 2)},
24092409
{toStringView(Keyword::GLOBAL_NOT_IN), Operator("globalNotIn", 9, 2)},
2410+
{toStringView(Keyword::SEARCH_ANY), Operator("searchAny", 9, 2)},
2411+
{toStringView(Keyword::SEARCH_ALL), Operator("searchAll", 9, 2)},
24102412
{"||", Operator("concat", 10, 2, OperatorType::Mergeable)},
24112413
{"+", Operator("plus", 11, 2)},
24122414
{"-", Operator("minus", 11, 2)},
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
Negative tests
2+
Default tokenizer
3+
0
4+
1
5+
1
6+
1
7+
1
8+
0
9+
0
10+
0
11+
0
12+
0
13+
1
14+
1
15+
Ngram tokenizer
16+
0
17+
1
18+
1
19+
0
20+
0
21+
0
22+
0
23+
1
24+
0
25+
1
26+
0
27+
0
28+
NoOp tokenizer
29+
0
30+
0
31+
0
32+
1
33+
0
34+
0
35+
0
36+
1
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
SELECT 'Negative tests';
2+
-- Must accept two to four arguments
3+
SELECT searchAny(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
4+
SELECT searchAny('a', 'b', 'c', 'd', 'e'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
5+
SELECT searchAll(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
6+
SELECT searchAll('a', 'b', 'c', 'd', 'e'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
7+
-- 1st arg must be String or FixedString, 2nd arg must be String
8+
SELECT searchAny('a', 1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
9+
SELECT searchAny(1, 'a'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
10+
SELECT searchAny(1, 1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
11+
SELECT searchAll('a', 1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
12+
SELECT searchAll(1, 'a'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
13+
SELECT searchAll(1, 1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
14+
-- 2nd arg must be const String
15+
SELECT searchAny('a', toFixedString('b', 1)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
16+
SELECT searchAny('a', materialize('b')); -- { serverError ILLEGAL_COLUMN }
17+
SELECT searchAll('a', toFixedString('b', 1)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
18+
SELECT searchAll('a', materialize('b')); -- { serverError ILLEGAL_COLUMN }
19+
-- 3rd arg (if given) must be const String
20+
SELECT searchAny('a', 'b', toFixedString('b', 1)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
21+
SELECT searchAny('a', 'b', materialize('b')); -- { serverError ILLEGAL_COLUMN }
22+
SELECT searchAll('a', 'b', toFixedString('b', 1)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
23+
SELECT searchAll('a', 'b', materialize('b')); -- { serverError ILLEGAL_COLUMN }
24+
-- 3rd arg (if given) must be a supported tokenizer
25+
SELECT searchAny('a', 'b', 'unsupported_tokenizer'); -- { serverError BAD_ARGUMENTS }
26+
SELECT searchAll('a', 'b', 'unsupported_tokenizer'); -- { serverError BAD_ARGUMENTS }
27+
28+
SELECT 'Default tokenizer';
29+
30+
SELECT searchAny('', 'abc');
31+
SELECT searchAny('abc+ def- foo!', 'foo! bar?');
32+
SELECT searchAny('abc+ def- bar?', 'def- foo!', 'default');
33+
SELECT searchAny('abc+ def- foo!', 'abc foo');
34+
SELECT searchAny('abc+ def- bar?', 'abc bar', 'default');
35+
SELECT searchAny('abc+ def- foo!', 'bar ab');
36+
SELECT searchAny('abc+ def- bar?', 'foo de', 'default');
37+
38+
SELECT searchAll('', 'abc');
39+
SELECT searchAll('abc+ def- foo!', 'foo! bar?');
40+
SELECT searchAll('abc+ def- bar?', 'def- foo!', 'default');
41+
SELECT searchAll('abc+ def- foo!', 'foo abc+');
42+
SELECT searchAll('abc+ def- bar?', 'def- bar', 'default');
43+
44+
SELECT 'Ngram tokenizer';
45+
46+
SELECT searchAny('', 'abc', 'ngram');
47+
SELECT searchAny('abc def', 'foo def', 'ngram');
48+
SELECT searchAny('abc def', 'bar abc', 'ngram', 3);
49+
SELECT searchAny('abc def', 'bar abc', 'ngram', 2);
50+
SELECT searchAny('abc def', 'abc def', 'ngram', 8);
51+
52+
SELECT searchAll('', 'abc', 'ngram');
53+
SELECT searchAll('abc def', 'foo def', 'ngram');
54+
SELECT searchAll('abc def', 'abc def', 'ngram');
55+
SELECT searchAll('abc def', 'bar abc', 'ngram', 3);
56+
SELECT searchAll('abc def', 'def abc', 'ngram', 3);
57+
SELECT searchAll('abc def', 'bar abc', 'ngram', 2);
58+
SELECT searchAll('abc def', 'abc def', 'ngram', 8);
59+
60+
SELECT 'NoOp tokenizer';
61+
62+
SELECT searchAny('', 'abc', 'noop');
63+
SELECT searchAny('abc def', 'def abc', 'noop');
64+
SELECT searchAny('abc def', 'abc def', 'noop');
65+
SELECT searchAny('abcdef', 'abcdef', 'noop');
66+
67+
SELECT searchAll('', 'abc', 'noop');
68+
SELECT searchAll('abc def', 'def abc', 'noop');
69+
SELECT searchAll('abc def', 'abc def', 'noop');
70+
SELECT searchAll('abcdef', 'abcdef', 'noop');

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy