Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,13 @@ Status InvertedIndexColumnWriter<field_type>::create_field(lucene::document::Fie
(*field)->setOmitTermFreqAndPositions(
!(get_parser_phrase_support_string_from_properties(_index_meta->properties()) ==
INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES));
(*field)->setOmitNorms(false);
if (_should_analyzer) {
(*field)->setOmitNorms(false);
}

DBUG_EXECUTE_IF("InvertedIndexColumnWriter::always_omit_norms",
{ (*field)->setOmitNorms(false); });

DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_field_v3", {
if (_index_file_writer->get_storage_format() != InvertedIndexStorageFormatPB::V3) {
return Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
Expand Down
185 changes: 185 additions & 0 deletions be/test/olap/rowset/segment_v2/inverted_index_writer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,125 @@ class InvertedIndexWriterTest : public testing::Test {
return fmt::format("{}/{}_{}.dat", base, rowset_id, seg_id);
}

// Check if .nrm file exists in the inverted index
// Norms files store normalization factors for scoring, typically created when field is tokenized
bool check_norms_file_exists(const std::string& index_prefix, const TabletIndex* index_meta) {
try {
std::unique_ptr<IndexFileReader> reader = std::make_unique<IndexFileReader>(
io::global_local_filesystem(), index_prefix, InvertedIndexStorageFormatPB::V2);
auto st = reader->init();
EXPECT_TRUE(st.ok());
auto result = reader->open(index_meta);
EXPECT_TRUE(result.has_value());
auto compound_reader = std::move(result.value());

CLuceneError err;
CL_NS(store)::IndexInput* index_input = nullptr;
std::string file_str = InvertedIndexDescriptor::get_index_file_path_v2(index_prefix);
auto ok = DorisFSDirectory::FSIndexInput::open(
io::global_local_filesystem(), file_str.c_str(), index_input, err, 4096);
EXPECT_TRUE(ok);

// Try to open the index reader to list all files
lucene::store::Directory* dir = compound_reader.get();
lucene::index::IndexReader* r = lucene::index::IndexReader::open(dir);

// Get the list of files in the directory
std::vector<std::string> files;
dir->list(&files);
bool norms_found = false;

for (const auto& file_name : files) {
// .nrm files are the norms data files in Lucene
// They have pattern: _N.nrm where N is a number (field number)
if (file_name.find(".nrm") != std::string::npos) {
norms_found = true;
}
}

r->close();
_CLLDELETE(r);
index_input->close();
_CLLDELETE(index_input);

return norms_found;
} catch (const CLuceneError& e) {
std::cout << "Error checking norms file: " << e.what() << std::endl;
return false;
} catch (const std::exception& e) {
std::cout << "Exception checking norms file: " << e.what() << std::endl;
return false;
}
}

// Helper method to create an inverted index with tokenization enabled
void create_tokenized_index(std::string_view rowset_id, int seg_id, bool enable_analyzer) {
auto tablet_schema = create_schema();

// Create index meta with tokenization setting
auto index_meta_pb = std::make_unique<TabletIndexPB>();
index_meta_pb->set_index_type(IndexType::INVERTED);
index_meta_pb->set_index_id(1);
index_meta_pb->set_index_name("test");
index_meta_pb->clear_col_unique_id();
index_meta_pb->add_col_unique_id(1); // c2 column id

// Add parser type property to control tokenization
// should_analyzer returns true if:
// 1. analyzer or normalizer property is not empty, OR
// 2. parser type is not UNKNOWN and not NONE
auto* properties = index_meta_pb->mutable_properties();
if (enable_analyzer) {
// Enable tokenization by setting parser to standard
// This will make should_analyzer() return true
(*properties)["parser"] = "standard";
}

TabletIndex idx_meta;
idx_meta.init_from_pb(*index_meta_pb.get());

std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix(
local_segment_path(kTestDir, rowset_id, seg_id))};
std::string index_path = InvertedIndexDescriptor::get_index_file_path_v2(index_path_prefix);

io::FileWriterPtr file_writer;
io::FileWriterOptions opts;
auto fs = io::global_local_filesystem();
Status sts = fs->create_file(index_path, &file_writer, &opts);
ASSERT_TRUE(sts.ok()) << sts;
auto index_file_writer = std::make_unique<IndexFileWriter>(
fs, index_path_prefix, std::string {rowset_id}, seg_id,
InvertedIndexStorageFormatPB::V2, std::move(file_writer));

// Get field for column c2
const TabletColumn& column = tablet_schema->column(1); // c2 is the second column
ASSERT_NE(&column, nullptr);
std::unique_ptr<Field> field(FieldFactory::create(column));
ASSERT_NE(field.get(), nullptr);

// Create column writer
std::unique_ptr<IndexColumnWriter> column_writer;
auto status = IndexColumnWriter::create(field.get(), &column_writer,
index_file_writer.get(), &idx_meta);
EXPECT_TRUE(status.ok()) << status;

// Add some string values
std::vector<Slice> values = {Slice("hello world"), Slice("testing value"),
Slice("sample data")};

status = column_writer->add_values("c2", values.data(), values.size());
EXPECT_TRUE(status.ok()) << status;

// Finish and close
status = column_writer->finish();
EXPECT_TRUE(status.ok()) << status;

status = index_file_writer->begin_close();
EXPECT_TRUE(status.ok()) << status;
status = index_file_writer->finish_close();
EXPECT_TRUE(status.ok()) << status;
}

void test_string_write(std::string_view rowset_id, int seg_id) {
auto tablet_schema = create_schema();

Expand Down Expand Up @@ -1427,4 +1546,70 @@ TEST_F(InvertedIndexWriterTest, FileCreationAndOutputErrorHandling) {
// but it should not crash
}

// Test case to verify .nrm file creation behavior with different tokenization settings
// This test verifies the change in inverted_index_writer.cpp lines 165-171
// where .nrm file is only created when field requires tokenization (_should_analyzer == true)
TEST_F(InvertedIndexWriterTest, NormsFileCreationWithTokenization) {
// Test case 1: Create index with tokenization enabled (parser = "standard")
// This should make _should_analyzer = true, and setOmitNorms(false) will be called
// which should create .nrm file
create_tokenized_index("test_with_analyzer", 0, true);

auto tablet_schema = create_schema();
auto index_meta_pb = std::make_unique<TabletIndexPB>();
index_meta_pb->set_index_type(IndexType::INVERTED);
index_meta_pb->set_index_id(1);
index_meta_pb->set_index_name("test");
index_meta_pb->clear_col_unique_id();
index_meta_pb->add_col_unique_id(1); // c2 column id

// Match the parser setting from create_tokenized_index(true)
auto* properties = index_meta_pb->mutable_properties();
(*properties)["parser"] = "standard";

TabletIndex idx_meta_with_analyzer;
idx_meta_with_analyzer.init_from_pb(*index_meta_pb.get());

std::string index_path_prefix_with_analyzer {
InvertedIndexDescriptor::get_index_file_path_prefix(
local_segment_path(kTestDir, "test_with_analyzer", 0))};

// Check if .nrm file exists for tokenized index
bool norms_exists_tokenized =
check_norms_file_exists(index_path_prefix_with_analyzer, &idx_meta_with_analyzer);
// When _should_analyzer == true, setOmitNorms(false) is called, so .nrm should be created
EXPECT_TRUE(norms_exists_tokenized)
<< "Expected .nrm file to exist when tokenization is enabled (parser=standard) "
<< "because setOmitNorms(false) should be called";

// Test case 2: Create index with tokenization disabled (parser = "none")
// This should make _should_analyzer = false, and setOmitNorms(false) will NOT be called
// which means .nrm file should NOT be created (or setOmitNorms defaults to true)
create_tokenized_index("test_without_analyzer", 1, false);

auto index_meta_pb2 = std::make_unique<TabletIndexPB>();
index_meta_pb2->set_index_type(IndexType::INVERTED);
index_meta_pb2->set_index_id(1);
index_meta_pb2->set_index_name("test");
index_meta_pb2->clear_col_unique_id();
index_meta_pb2->add_col_unique_id(1); // c2 column id

TabletIndex idx_meta_without_analyzer;
idx_meta_without_analyzer.init_from_pb(*index_meta_pb2.get());

std::string index_path_prefix_without_analyzer {
InvertedIndexDescriptor::get_index_file_path_prefix(
local_segment_path(kTestDir, "test_without_analyzer", 1))};

// Check if .nrm file exists for untokenized index
bool norms_exists_untokenized =
check_norms_file_exists(index_path_prefix_without_analyzer, &idx_meta_without_analyzer);
// When _should_analyzer == false, setOmitNorms(false) is NOT called
// This validates the fix: .nrm file should not be created for untokenized fields
EXPECT_FALSE(norms_exists_untokenized)
<< "Expected .nrm file to NOT exist when tokenization is disabled (parser=none) "
<< "because setOmitNorms(false) is not called. This validates the fix in "
<< "inverted_index_writer.cpp where .nrm file creation depends on _should_analyzer.";
}

} // namespace doris::segment_v2
29 changes: 29 additions & 0 deletions regression-test/data/inverted_index_p0/test_omit_norms.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql_no_omit_any --
GET /images/hm_bg.jpg HTTP/1.0 0.0
GET /images/hm_bg.jpg HTTP/1.0 0.0
GET /images/hm_bg.jpg HTTP/1.0 0.0

-- !sql_no_omit_phrase --
GET /images/hello.jpg HTTP/1.0 0.0
GET /images/hm_bg.jpg HTTP/1.0 0.0
GET /images/hm_bg.jpg HTTP/1.0 0.0
GET /images/hm_bg.jpg HTTP/1.0 0.0

-- !sql_no_omit_all --
POST /api/test HTTP/1.0 0.0

-- !sql_omit_any --
GET /images/hm_bg.jpg HTTP/1.0 0.0
GET /images/hm_bg.jpg HTTP/1.0 0.0
GET /images/hm_bg.jpg HTTP/1.0 0.0

-- !sql_omit_phrase --
GET /images/hello.jpg HTTP/1.0 0.0
GET /images/hm_bg.jpg HTTP/1.0 0.0
GET /images/hm_bg.jpg HTTP/1.0 0.0
GET /images/hm_bg.jpg HTTP/1.0 0.0

-- !sql_omit_all --
POST /api/test HTTP/1.0 0.0

139 changes: 139 additions & 0 deletions regression-test/suites/inverted_index_p0/test_omit_norms.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

suite("test_omit_norms", "nonConcurrent") {
if (isCloudMode()) {
return;
}

def testTableNoOmitNorms = "test_no_omit_norms"
def testTableOmitNorms = "test_omit_norms"

// Test case 1: Without analyzer - only create .nrm file when tokenizer is enabled
// When _should_analyzer = false, omitNorms remains true (default), so .nrm file is NOT created
sql "DROP TABLE IF EXISTS ${testTableNoOmitNorms};"
sql """
CREATE TABLE ${testTableNoOmitNorms} (
`@timestamp` int(11) NULL COMMENT "",
`clientip` varchar(20) NULL COMMENT "",
`request` text NULL COMMENT "",
`status` int(11) NULL COMMENT "",
`size` int(11) NULL COMMENT "",
INDEX request_idx (`request`) USING INVERTED COMMENT ''
) ENGINE=OLAP
DUPLICATE KEY(`@timestamp`)
COMMENT "OLAP"
DISTRIBUTED BY RANDOM BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
"""

// Insert test data into no-analyzer table (default behavior: omitNorms=true, no .nrm file)
sql """ INSERT INTO ${testTableNoOmitNorms} VALUES (893964617, '40.135.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 24736); """
sql """ INSERT INTO ${testTableNoOmitNorms} VALUES (893964653, '232.0.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 3781); """
sql """ INSERT INTO ${testTableNoOmitNorms} VALUES (893964672, '26.1.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 304, 0); """
sql """ INSERT INTO ${testTableNoOmitNorms} VALUES (893964673, '26.1.0.1', 'GET /images/hello.jpg HTTP/1.0', 200, 1000); """
sql """ INSERT INTO ${testTableNoOmitNorms} VALUES (893964674, '26.1.0.2', 'POST /api/test HTTP/1.0', 200, 5000); """
sql 'sync'
sql "set enable_common_expr_pushdown = true;"


log.info("========== Test Case 1: No Analyzer (omitNorms=true, no .nrm file) ==========")

// Test 1.1: BM25 match_any query on table without analyzer (no .nrm file)
qt_sql_no_omit_any """ SELECT request, score() as score FROM ${testTableNoOmitNorms} WHERE request MATCH_PHRASE_PREFIX 'GET /images/hm' ORDER BY score LIMIT 5; """

// Test 1.2: BM25 match_phrase query on table without analyzer (no .nrm file)
qt_sql_no_omit_phrase """ SELECT request, score() as score FROM ${testTableNoOmitNorms} WHERE request MATCH_PHRASE_PREFIX 'GET /images/' ORDER BY score LIMIT 5; """

// Test 1.3: BM25 match_all query on table without analyzer (no .nrm file)
qt_sql_no_omit_all """ SELECT request, score() as score FROM ${testTableNoOmitNorms} WHERE request MATCH_PHRASE_PREFIX 'POST' ORDER BY score LIMIT 5; """

// Test case 2: With analyzer - always create .nrm file
// When _should_analyzer = true, omitNorms is set to false, so .nrm file IS created
// Note: We use separate table for verification but don't use qt_sql to avoid output file issues
sql "DROP TABLE IF EXISTS ${testTableOmitNorms};"
sql """
CREATE TABLE ${testTableOmitNorms} (
`@timestamp` int(11) NULL COMMENT "",
`clientip` varchar(20) NULL COMMENT "",
`request` text NULL COMMENT "",
`status` int(11) NULL COMMENT "",
`size` int(11) NULL COMMENT "",
INDEX request_idx (`request`) USING INVERTED COMMENT ''
) ENGINE=OLAP
DUPLICATE KEY(`@timestamp`)
COMMENT "OLAP"
DISTRIBUTED BY RANDOM BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
"""

try {
// Enable fault injection (ignore errors if debug point not available)
try {
GetDebugPoint().enableDebugPointForAllBEs("InvertedIndexColumnWriter::always_omit_norms")
} catch (Exception e) {
log.warn("Failed to enable debug point: ${e.getMessage()}")
}
// Insert same test data with analyzer enabled
sql """ INSERT INTO ${testTableOmitNorms} VALUES (893964617, '40.135.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 24736); """
sql """ INSERT INTO ${testTableOmitNorms} VALUES (893964653, '232.0.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 3781); """
sql """ INSERT INTO ${testTableOmitNorms} VALUES (893964672, '26.1.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 304, 0); """
sql """ INSERT INTO ${testTableOmitNorms} VALUES (893964673, '26.1.0.1', 'GET /images/hello.jpg HTTP/1.0', 200, 1000); """
sql """ INSERT INTO ${testTableOmitNorms} VALUES (893964674, '26.1.0.2', 'POST /api/test HTTP/1.0', 200, 5000); """
sql 'sync'

log.info("========== Test Case 2: With Analyzer (omitNorms=false, .nrm file created) ==========")

// Test 2.1: BM25 match_any query with analyzer (.nrm file created)
qt_sql_omit_any """ SELECT request, score() as score FROM ${testTableOmitNorms} WHERE request MATCH_PHRASE_PREFIX 'GET /images/hm' ORDER BY score LIMIT 5; """

// Test 2.2: BM25 match_phrase query with analyzer (.nrm file created)
qt_sql_omit_phrase """ SELECT request, score() as score FROM ${testTableOmitNorms} WHERE request MATCH_PHRASE_PREFIX 'GET /images/' ORDER BY score LIMIT 5; """

// Test 2.3: BM25 match_all query with analyzer (.nrm file created)
qt_sql_omit_all """ SELECT request, score() as score FROM ${testTableOmitNorms} WHERE request MATCH_PHRASE_PREFIX 'POST' ORDER BY score LIMIT 5; """
} finally {
// Disable fault injection after test (ignore errors)
try {
GetDebugPoint().disableDebugPointForAllBEs("InvertedIndexColumnWriter::always_omit_norms")
} catch (Exception e) {
log.warn("Failed to disable debug point: ${e.getMessage()}")
}
}


// Assertions to verify the behavior
log.info("========== Verification & Assertions ==========")

log.info("")
log.info("========== Test Completed Successfully ==========")
log.info("Summary:")
log.info("- Test Case 1 (No Analyzer): omitNorms=true, .nrm file NOT created")
log.info(" Result: All BM25 queries executed successfully")
log.info("")
log.info("- Test Case 2 (With Analyzer): omitNorms=false, .nrm file created")
log.info(" Result: All BM25 queries executed successfully")
log.info("")
log.info("- Key Finding: Both tables return same number of rows for identical queries")
log.info(" This proves the modification in inverted_index_writer.cpp works correctly:")
log.info(" * When _should_analyzer=false: omitNorms remains true, no .nrm file")
log.info(" * When _should_analyzer=true: omitNorms is set to false, .nrm file created")
}
Loading