Skip to content

Commit 2d0c8ad

Browse files
airborne12claude
andcommitted
[fix](search) Make AND/OR/NOT operators case-sensitive in search DSL
Per specification requirement #5, only uppercase AND/OR/NOT should be recognized as operators in search DSL. Lowercase and/or/not should be treated as regular terms, causing parse errors when used as operators. Changes: - Update SearchLexer.g4 to only match uppercase keywords - Update unit tests to expect parse errors for lowercase operators - Update regression tests accordingly - Add comprehensive DSL operator test cases Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 73404df commit 2d0c8ad

File tree

6 files changed

+328
-49
lines changed

6 files changed

+328
-49
lines changed

fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchLexer.g4

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@ fragment QUOTED_CHAR
4141

4242
// ============== Default lexer rules ==============
4343

44-
AND : 'AND' | 'and' ;
45-
OR : 'OR' | 'or' ;
46-
NOT : 'NOT' | 'not' | '!' ;
44+
AND : 'AND' ;
45+
OR : 'OR' ;
46+
NOT : 'NOT' | '!' ;
4747

4848
LPAREN : '(' ;
4949
RPAREN : ')' ;

fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -872,16 +872,16 @@ public void testUppercaseAndOperator() {
872872

873873
@Test
874874
public void testLowercaseAndOperator() {
875-
// Test: Currently lowercase 'and' is also treated as operator
876-
// According to PDF requirement, only uppercase should be operators
877-
// This test documents current behavior - may need to change
875+
// Test: lowercase 'and' should NOT be treated as operator (per hubspot.md requirement #5)
876+
// Only uppercase AND is recognized as operator
877+
// 'field:a and field:b' is invalid DSL because 'and' is parsed as term, not operator
878878
String dsl = "field:a and field:b";
879-
QsPlan plan = SearchDslParser.parseDsl(dsl);
880879

881-
Assertions.assertNotNull(plan);
882-
// Current behavior: lowercase 'and' IS an operator
883-
Assertions.assertEquals(QsClauseType.AND, plan.getRoot().getType());
884-
// TODO: If PDF requires only uppercase, this should fail and return OR or different structure
880+
RuntimeException exception = Assertions.assertThrows(RuntimeException.class, () -> {
881+
SearchDslParser.parseDsl(dsl);
882+
});
883+
884+
Assertions.assertTrue(exception.getMessage().contains("Invalid search DSL syntax"));
885885
}
886886

887887
@Test
@@ -897,15 +897,16 @@ public void testUppercaseOrOperator() {
897897

898898
@Test
899899
public void testLowercaseOrOperator() {
900-
// Test: Currently lowercase 'or' is also treated as operator
901-
// According to PDF requirement, only uppercase should be operators
900+
// Test: lowercase 'or' should NOT be treated as operator (per hubspot.md requirement #5)
901+
// Only uppercase OR is recognized as operator
902+
// 'field:a or field:b' is invalid DSL because 'or' is parsed as term, not operator
902903
String dsl = "field:a or field:b";
903-
QsPlan plan = SearchDslParser.parseDsl(dsl);
904904

905-
Assertions.assertNotNull(plan);
906-
// Current behavior: lowercase 'or' IS an operator
907-
Assertions.assertEquals(QsClauseType.OR, plan.getRoot().getType());
908-
// TODO: If PDF requires only uppercase, this should fail
905+
RuntimeException exception = Assertions.assertThrows(RuntimeException.class, () -> {
906+
SearchDslParser.parseDsl(dsl);
907+
});
908+
909+
Assertions.assertTrue(exception.getMessage().contains("Invalid search DSL syntax"));
909910
}
910911

911912
@Test
@@ -920,15 +921,16 @@ public void testUppercaseNotOperator() {
920921

921922
@Test
922923
public void testLowercaseNotOperator() {
923-
// Test: Currently lowercase 'not' is also treated as operator
924-
// According to PDF requirement, only uppercase should be operators
924+
// Test: lowercase 'not' should NOT be treated as operator (per hubspot.md requirement #5)
925+
// Only uppercase NOT is recognized as operator
926+
// 'not field:spam' is invalid DSL because 'not' is parsed as term, not operator
925927
String dsl = "not field:spam";
926-
QsPlan plan = SearchDslParser.parseDsl(dsl);
927928

928-
Assertions.assertNotNull(plan);
929-
// Current behavior: lowercase 'not' IS an operator
930-
Assertions.assertEquals(QsClauseType.NOT, plan.getRoot().getType());
931-
// TODO: If PDF requires only uppercase, this should fail
929+
RuntimeException exception = Assertions.assertThrows(RuntimeException.class, () -> {
930+
SearchDslParser.parseDsl(dsl);
931+
});
932+
933+
Assertions.assertTrue(exception.getMessage().contains("Invalid search DSL syntax"));
932934
}
933935

934936
@Test
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !hubspot_or_chain --
3+
1 aterm bterm
4+
2 bterm cterm
5+
3 cterm dterm
6+
4 dterm eterm aterm
7+
8+
-- !hubspot_and_chain --
9+
4 dterm eterm aterm
10+
11+
-- !hubspot_and_or_mixed --
12+
1 aterm bterm
13+
4 dterm eterm aterm
14+
15+
-- !hubspot_and_not_or --
16+
4 dterm eterm aterm
17+
18+
-- !hubspot_implicit_and --
19+
3 cterm dterm
20+
21+
-- !hubspot_phrase_wrong_order --
22+
23+
-- !hubspot_phrase_correct_order --
24+
4 dterm eterm aterm
25+
26+
-- !hubspot_escaped_space_and --
27+
28+
-- !hubspot_phrase_and_term --
29+
4 dterm eterm aterm
30+
31+
-- !hubspot_phrase_wrong_and_term --
32+
33+
-- !hubspot_phrase_or_term_1 --
34+
2 bterm cterm
35+
3 cterm dterm
36+
37+
-- !hubspot_phrase_or_term_2 --
38+
2 bterm cterm
39+
3 cterm dterm
40+
4 dterm eterm aterm
41+
42+
-- !hubspot_and_or_min_should_1 --
43+
1 aterm bterm
44+

regression-test/data/search/test_search_escape.out

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,6 @@
2626
-- !uppercase_not --
2727
8 second fruit
2828

29-
-- !lowercase_and --
30-
7 first fruit
31-
32-
-- !lowercase_or --
33-
1 first content
34-
2 second content
35-
7 first fruit
36-
8 second fruit
37-
3829
-- !exclamation_not --
3930
8 second fruit
4031

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
/**
19+
* Tests for search DSL operator scenarios
20+
*
21+
* This test suite validates Lucene mode parsing against the exact test cases
22+
* documented in the specification to ensure behavior matches Elasticsearch/Lucene semantics.
23+
*
24+
* Test Data Setup:
25+
* | Email | Firstname |
26+
* | test+query+1@gmail.com | "aterm bterm" |
27+
* | test+query+2@gmail.com | "bterm cterm" |
28+
* | test+query+3@gmail.com | "cterm dterm" |
29+
* | test+query+4@gmail.com | "dterm eterm aterm" |
30+
*
31+
* Key Lucene Semantics:
32+
* - Operators are processed left-to-right as modifiers
33+
* - AND marks preceding and current terms as MUST (+)
34+
* - OR marks preceding and current terms as SHOULD
35+
* - NOT marks current term as MUST_NOT (-)
36+
* - With minimum_should_match=0 and MUST clauses present, SHOULD clauses are discarded
37+
*/
38+
suite("test_search_dsl_operators") {
39+
def tableName = "search_dsl_operators_test"
40+
41+
sql "DROP TABLE IF EXISTS ${tableName}"
42+
43+
// Create table with inverted indexes
44+
// Using parser=english to tokenize firstname field
45+
sql """
46+
CREATE TABLE ${tableName} (
47+
id INT,
48+
email VARCHAR(100),
49+
firstname VARCHAR(200),
50+
INDEX idx_firstname(firstname) USING INVERTED PROPERTIES("parser" = "english")
51+
) ENGINE=OLAP
52+
DUPLICATE KEY(id)
53+
DISTRIBUTED BY HASH(id) BUCKETS 1
54+
PROPERTIES ("replication_allocation" = "tag.location.default: 1")
55+
"""
56+
57+
// Insert test data
58+
sql """INSERT INTO ${tableName} VALUES
59+
(1, 'test+query+1@gmail.com', 'aterm bterm'),
60+
(2, 'test+query+2@gmail.com', 'bterm cterm'),
61+
(3, 'test+query+3@gmail.com', 'cterm dterm'),
62+
(4, 'test+query+4@gmail.com', 'dterm eterm aterm')
63+
"""
64+
65+
// Wait for index building
66+
Thread.sleep(3000)
67+
68+
// ============ Test 1: aterm OR bterm OR cterm ============
69+
// All OR operators -> at least one must match (minimum_should_match=1)
70+
// Expected: rows 1,2,3,4 (all match at least one term)
71+
qt_dsl_or_chain """
72+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
73+
FROM ${tableName}
74+
WHERE search('aterm OR bterm OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}')
75+
ORDER BY id
76+
"""
77+
78+
// ============ Test 2: dterm AND eterm AND aterm ============
79+
// All AND operators -> all must match
80+
// Expected: row 4 only (the only one with all three terms)
81+
qt_dsl_and_chain """
82+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
83+
FROM ${tableName}
84+
WHERE search('dterm AND eterm AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}')
85+
ORDER BY id
86+
"""
87+
88+
// ============ Test 3: aterm AND bterm OR cterm ============
89+
// Lucene left-to-right parsing with minimum_should_match=0:
90+
// - aterm: MUST (first term, default_operator=AND)
91+
// - bterm: MUST (AND introduces)
92+
// - cterm: SHOULD (OR introduces), bterm becomes SHOULD too
93+
// Final: +aterm bterm cterm
94+
// With minimum_should_match=0 and MUST present, SHOULD discarded
95+
// Result: effectively +aterm only
96+
// Expected: rows 1, 4 (rows containing "aterm")
97+
qt_dsl_and_or_mixed """
98+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
99+
FROM ${tableName}
100+
WHERE search('aterm AND bterm OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene","minimum_should_match":0}')
101+
ORDER BY id
102+
"""
103+
104+
// ============ Test 4: aterm AND NOT bterm OR cterm ============
105+
// Lucene left-to-right parsing:
106+
// - aterm: MUST
107+
// - bterm: MUST_NOT (NOT modifier)
108+
// - cterm: SHOULD (OR introduces)
109+
// Final: +aterm -bterm cterm
110+
// With minimum_should_match=0 and MUST present, SHOULD discarded
111+
// Result: +aterm -bterm
112+
// Expected: row 4 only (has "aterm" but NOT "bterm")
113+
qt_dsl_and_not_or """
114+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
115+
FROM ${tableName}
116+
WHERE search('aterm AND NOT bterm OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene","minimum_should_match":0}')
117+
ORDER BY id
118+
"""
119+
120+
// ============ Test 5: cterm dterm (implicit AND) ============
121+
// No explicit operators, default_operator=AND
122+
// Same as: cterm AND dterm
123+
// Expected: row 3 only (has both "cterm" AND "dterm")
124+
qt_dsl_implicit_and """
125+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
126+
FROM ${tableName}
127+
WHERE search('cterm dterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}')
128+
ORDER BY id
129+
"""
130+
131+
// ============ Test 6: "aterm eterm" (phrase query, wrong order) ============
132+
// Phrase query requires tokens in exact order
133+
// Data has "dterm eterm aterm" - "aterm" comes AFTER "eterm", not before
134+
// Expected: no match
135+
qt_dsl_phrase_wrong_order """
136+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
137+
FROM ${tableName}
138+
WHERE search('"aterm eterm"', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}')
139+
ORDER BY id
140+
"""
141+
142+
// ============ Test 7: "eterm aterm" (phrase query, correct order) ============
143+
// Phrase query requires tokens in exact order
144+
// Data has "dterm eterm aterm" - "eterm aterm" appears in this order
145+
// Expected: row 4
146+
qt_dsl_phrase_correct_order """
147+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
148+
FROM ${tableName}
149+
WHERE search('"eterm aterm"', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}')
150+
ORDER BY id
151+
"""
152+
153+
// ============ Test 8: eterm\ dterm AND aterm (escaped space joins terms) ============
154+
// Escaped space makes "eterm dterm" a single term
155+
// Query: +("eterm dterm") +aterm
156+
// But with tokenized index, "eterm dterm" as single term won't match
157+
// This test verifies escape handling in Lucene mode
158+
// Note: With parser=english, individual tokens are indexed, so this won't match
159+
// Expected: no match (escaped space creates single term that doesn't exist)
160+
qt_dsl_escaped_space_and """
161+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
162+
FROM ${tableName}
163+
WHERE search('eterm\\\\ dterm AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}')
164+
ORDER BY id
165+
"""
166+
167+
// ============ Test 9: "dterm eterm" AND aterm ============
168+
// Phrase query + AND
169+
// Row 4 has "dterm eterm aterm" - phrase "dterm eterm" matches, and "aterm" is also present
170+
// Expected: row 4
171+
qt_dsl_phrase_and_term """
172+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
173+
FROM ${tableName}
174+
WHERE search('"dterm eterm" AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}')
175+
ORDER BY id
176+
"""
177+
178+
// ============ Test 10: "eterm dterm" AND aterm (phrase wrong order) ============
179+
// Phrase "eterm dterm" is wrong order (data has "dterm eterm")
180+
// Expected: no match
181+
qt_dsl_phrase_wrong_and_term """
182+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
183+
FROM ${tableName}
184+
WHERE search('"eterm dterm" AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}')
185+
ORDER BY id
186+
"""
187+
188+
// ============ Test 11: "eterm dterm" OR cterm ============
189+
// Phrase OR term
190+
// Phrase "eterm dterm" won't match (wrong order)
191+
// cterm matches rows 2, 3
192+
// Expected: rows 2, 3
193+
qt_dsl_phrase_or_term_1 """
194+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
195+
FROM ${tableName}
196+
WHERE search('"eterm dterm" OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}')
197+
ORDER BY id
198+
"""
199+
200+
// ============ Test 12: "dterm eterm" OR cterm ============
201+
// Phrase OR term
202+
// Phrase "dterm eterm" matches row 4
203+
// cterm matches rows 2, 3
204+
// Expected: rows 2, 3, 4
205+
qt_dsl_phrase_or_term_2 """
206+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
207+
FROM ${tableName}
208+
WHERE search('"dterm eterm" OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}')
209+
ORDER BY id
210+
"""
211+
212+
// ============ Test 13: aterm AND bterm OR cterm with minimum_should_match=1 ============
213+
// Same as Test 3 but with minimum_should_match=1
214+
// Final state: +aterm bterm cterm (aterm is MUST, bterm and cterm are SHOULD)
215+
// With minimum_should_match=1, at least 1 SHOULD must match
216+
// Result: aterm AND (bterm OR cterm)
217+
// Row 1: has aterm, has bterm -> matches (MUST satisfied, one SHOULD satisfied)
218+
// Row 2: has bterm and cterm but no aterm -> doesn't match (MUST not satisfied)
219+
// Row 3: has cterm but no aterm -> doesn't match (MUST not satisfied)
220+
// Row 4: has aterm, doesn't have bterm or cterm -> doesn't match (no SHOULD satisfied)
221+
// So rows 1 and 4 both satisfy the MUST clause; only row 1 also satisfies a SHOULD clause
222+
// Expected: row 1 only
223+
qt_dsl_and_or_min_should_1 """
224+
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
225+
FROM ${tableName}
226+
WHERE search('aterm AND bterm OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene","minimum_should_match":1}')
227+
ORDER BY id
228+
"""
229+
230+
// Cleanup
231+
sql "DROP TABLE IF EXISTS ${tableName}"
232+
}

0 commit comments

Comments
 (0)