|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +/** |
| 19 | + * Tests for search DSL operator scenarios |
| 20 | + * |
| 21 | + * This test suite validates Lucene mode parsing against the exact test cases |
| 22 | + * documented in the specification to ensure behavior matches Elasticsearch/Lucene semantics. |
| 23 | + * |
| 24 | + * Test Data Setup: |
| 25 | + * | Email | Firstname | |
| 26 | + * | test+query+1@gmail.com | "aterm bterm" | |
| 27 | + * | test+query+2@gmail.com | "bterm cterm" | |
| 28 | + * | test+query+3@gmail.com | "cterm dterm" | |
| 29 | + * | test+query+4@gmail.com | "dterm eterm aterm" | |
| 30 | + * |
| 31 | + * Key Lucene Semantics: |
| 32 | + * - Operators are processed left-to-right as modifiers |
| 33 | + * - AND marks preceding and current terms as MUST (+) |
| 34 | + * - OR marks preceding and current terms as SHOULD |
| 35 | + * - NOT marks current term as MUST_NOT (-) |
| 36 | + * - With minimum_should_match=0 and MUST clauses present, SHOULD clauses are discarded |
| 37 | + */ |
| 38 | +suite("test_search_dsl_operators") { |
| 39 | + def tableName = "search_dsl_operators_test" |
| 40 | + |
| 41 | + sql "DROP TABLE IF EXISTS ${tableName}" |
| 42 | + |
| 43 | + // Create table with a single inverted index on the firstname column |
| 44 | + // Using parser=english to tokenize firstname field |
| 45 | + sql """ |
| 46 | + CREATE TABLE ${tableName} ( |
| 47 | + id INT, |
| 48 | + email VARCHAR(100), |
| 49 | + firstname VARCHAR(200), |
| 50 | + INDEX idx_firstname(firstname) USING INVERTED PROPERTIES("parser" = "english") |
| 51 | + ) ENGINE=OLAP |
| 52 | + DUPLICATE KEY(id) |
| 53 | + DISTRIBUTED BY HASH(id) BUCKETS 1 |
| 54 | + PROPERTIES ("replication_allocation" = "tag.location.default: 1") |
| 55 | + """ |
| 56 | + |
| 57 | + // Insert test data |
| 58 | + sql """INSERT INTO ${tableName} VALUES |
| 59 | + (1, 'test+query+1@gmail.com', 'aterm bterm'), |
| 60 | + (2, 'test+query+2@gmail.com', 'bterm cterm'), |
| 61 | + (3, 'test+query+3@gmail.com', 'cterm dterm'), |
| 62 | + (4, 'test+query+4@gmail.com', 'dterm eterm aterm') |
| 63 | + """ |
| 64 | + |
| 65 | + // Wait 3 seconds for index building before running the queries |
| 66 | + Thread.sleep(3000) |
| 67 | + |
| 68 | + // ============ Test 1: aterm OR bterm OR cterm ============ |
| 69 | + // All OR operators -> at least one must match (minimum_should_match=1) |
| 70 | + // Expected: rows 1,2,3,4 (all match at least one term) |
| 71 | + qt_dsl_or_chain """ |
| 72 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 73 | + FROM ${tableName} |
| 74 | + WHERE search('aterm OR bterm OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') |
| 75 | + ORDER BY id |
| 76 | + """ |
| 77 | + |
| 78 | + // ============ Test 2: dterm AND eterm AND aterm ============ |
| 79 | + // All AND operators -> all must match |
| 80 | + // Expected: row 4 only (the only one with all three terms) |
| 81 | + qt_dsl_and_chain """ |
| 82 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 83 | + FROM ${tableName} |
| 84 | + WHERE search('dterm AND eterm AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') |
| 85 | + ORDER BY id |
| 86 | + """ |
| 87 | + |
| 88 | + // ============ Test 3: aterm AND bterm OR cterm ============ |
| 89 | + // Lucene left-to-right parsing with minimum_should_match=0: |
| 90 | + // - aterm: MUST (first term, default_operator=AND) |
| 91 | + // - bterm: MUST when first read (AND introduces) |
| 92 | + // - cterm: SHOULD (OR introduces); the preceding bterm is demoted to SHOULD too |
| 93 | + // Final: +aterm bterm cterm |
| 94 | + // With minimum_should_match=0 and MUST present, SHOULD clauses are discarded |
| 95 | + // Result: effectively +aterm only |
| 96 | + // Expected: rows 1, 4 (rows containing "aterm") |
| 97 | + qt_dsl_and_or_mixed """ |
| 98 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 99 | + FROM ${tableName} |
| 100 | + WHERE search('aterm AND bterm OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene","minimum_should_match":0}') |
| 101 | + ORDER BY id |
| 102 | + """ |
| 103 | + |
| 104 | + // ============ Test 4: aterm AND NOT bterm OR cterm ============ |
| 105 | + // Lucene left-to-right parsing: |
| 106 | + // - aterm: MUST |
| 107 | + // - bterm: MUST_NOT (NOT modifier) |
| 108 | + // - cterm: SHOULD (OR introduces) |
| 109 | + // Final: +aterm -bterm cterm |
| 110 | + // With minimum_should_match=0 and MUST present, SHOULD clauses are discarded |
| 111 | + // Result: +aterm -bterm |
| 112 | + // Expected: row 4 only (has "aterm" but NOT "bterm") |
| 113 | + qt_dsl_and_not_or """ |
| 114 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 115 | + FROM ${tableName} |
| 116 | + WHERE search('aterm AND NOT bterm OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene","minimum_should_match":0}') |
| 117 | + ORDER BY id |
| 118 | + """ |
| 119 | + |
| 120 | + // ============ Test 5: cterm dterm (implicit AND) ============ |
| 121 | + // No explicit operators, default_operator=AND |
| 122 | + // Same as: cterm AND dterm |
| 123 | + // Expected: row 3 only (has both "cterm" AND "dterm") |
| 124 | + qt_dsl_implicit_and """ |
| 125 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 126 | + FROM ${tableName} |
| 127 | + WHERE search('cterm dterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') |
| 128 | + ORDER BY id |
| 129 | + """ |
| 130 | + |
| 131 | + // ============ Test 6: "aterm eterm" (phrase query, wrong order) ============ |
| 132 | + // Phrase query requires tokens in exact order |
| 133 | + // Data has "dterm eterm aterm" - "aterm" comes AFTER "eterm", not before |
| 134 | + // Expected: no match |
| 135 | + qt_dsl_phrase_wrong_order """ |
| 136 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 137 | + FROM ${tableName} |
| 138 | + WHERE search('"aterm eterm"', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') |
| 139 | + ORDER BY id |
| 140 | + """ |
| 141 | + |
| 142 | + // ============ Test 7: "eterm aterm" (phrase query, correct order) ============ |
| 143 | + // Phrase query requires tokens in exact order |
| 144 | + // Data has "dterm eterm aterm" - "eterm aterm" appears in this order |
| 145 | + // Expected: row 4 |
| 146 | + qt_dsl_phrase_correct_order """ |
| 147 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 148 | + FROM ${tableName} |
| 149 | + WHERE search('"eterm aterm"', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') |
| 150 | + ORDER BY id |
| 151 | + """ |
| 152 | + |
| 153 | + // ============ Test 8: eterm\ dterm AND aterm (escaped space joins terms) ============ |
| 154 | + // Escaped space makes "eterm dterm" a single term |
| 155 | + // Query: +("eterm dterm") +aterm |
| 156 | + // But with a tokenized index, "eterm dterm" as a single term won't match |
| 157 | + // This test verifies escape handling in Lucene mode |
| 158 | + // Note: With parser=english, individual tokens are indexed, so this won't match |
| 159 | + // Expected: no match (escaped space creates single term that doesn't exist) |
| 160 | + qt_dsl_escaped_space_and """ |
| 161 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 162 | + FROM ${tableName} |
| 163 | + WHERE search('eterm\\\\ dterm AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') |
| 164 | + ORDER BY id |
| 165 | + """ |
| 166 | + |
| 167 | + // ============ Test 9: "dterm eterm" AND aterm ============ |
| 168 | + // Phrase query + AND |
| 169 | + // Row 4 has "dterm eterm aterm" - phrase "dterm eterm" matches, and "aterm" is also present |
| 170 | + // Expected: row 4 |
| 171 | + qt_dsl_phrase_and_term """ |
| 172 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 173 | + FROM ${tableName} |
| 174 | + WHERE search('"dterm eterm" AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') |
| 175 | + ORDER BY id |
| 176 | + """ |
| 177 | + |
| 178 | + // ============ Test 10: "eterm dterm" AND aterm (phrase wrong order) ============ |
| 179 | + // Phrase "eterm dterm" is wrong order (data has "dterm eterm") |
| 180 | + // Expected: no match |
| 181 | + qt_dsl_phrase_wrong_and_term """ |
| 182 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 183 | + FROM ${tableName} |
| 184 | + WHERE search('"eterm dterm" AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') |
| 185 | + ORDER BY id |
| 186 | + """ |
| 187 | + |
| 188 | + // ============ Test 11: "eterm dterm" OR cterm ============ |
| 189 | + // Phrase OR term |
| 190 | + // Phrase "eterm dterm" won't match (wrong order) |
| 191 | + // cterm matches rows 2, 3 |
| 192 | + // Expected: rows 2, 3 |
| 193 | + qt_dsl_phrase_or_term_1 """ |
| 194 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 195 | + FROM ${tableName} |
| 196 | + WHERE search('"eterm dterm" OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') |
| 197 | + ORDER BY id |
| 198 | + """ |
| 199 | + |
| 200 | + // ============ Test 12: "dterm eterm" OR cterm ============ |
| 201 | + // Phrase OR term |
| 202 | + // Phrase "dterm eterm" matches row 4 |
| 203 | + // cterm matches rows 2, 3 |
| 204 | + // Expected: rows 2, 3, 4 |
| 205 | + qt_dsl_phrase_or_term_2 """ |
| 206 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 207 | + FROM ${tableName} |
| 208 | + WHERE search('"dterm eterm" OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') |
| 209 | + ORDER BY id |
| 210 | + """ |
| 211 | + |
| 212 | + // ============ Test 13: aterm AND bterm OR cterm with minimum_should_match=1 ============ |
| 213 | + // Same query as Test 3 but with minimum_should_match=1 |
| 214 | + // Final state: +aterm bterm cterm (aterm is MUST, bterm and cterm are SHOULD) |
| 215 | + // With minimum_should_match=1, at least 1 SHOULD must match |
| 216 | + // Result: aterm AND (bterm OR cterm) |
| 217 | + // Row 1: has aterm and bterm -> matches |
| 218 | + // Row 2: has bterm and cterm but no aterm -> doesn't match (MUST not satisfied) |
| 219 | + // Row 3: has cterm but no aterm -> doesn't match (MUST not satisfied) |
| 220 | + // Row 4: has aterm but neither bterm nor cterm -> doesn't match (no SHOULD satisfied) |
| 221 | + // So only one row satisfies both the MUST clause and the SHOULD requirement |
| 222 | + // Expected: row 1 only |
| 223 | + qt_dsl_and_or_min_should_1 """ |
| 224 | + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname |
| 225 | + FROM ${tableName} |
| 226 | + WHERE search('aterm AND bterm OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene","minimum_should_match":1}') |
| 227 | + ORDER BY id |
| 228 | + """ |
| 229 | + |
| 230 | + // Cleanup |
| 231 | + sql "DROP TABLE IF EXISTS ${tableName}" |
| 232 | +} |
0 commit comments