[opt](rowset) Aggregate non-MOW segment key bounds to reduce rowset meta size (#62604)

liaoxin01 · Your Name · commit 6fdbe9f4f60c · 2026-04-24T07:53:35.000Z
For non-MOW (duplicate / aggregate key) tables, per-segment key bounds
are not consumed on the read path — only the rowset-level [min, max] is
used by the reader and ordered compaction. In cloud mode, persisting
bounds for every segment can blow past FDB's value size limit on
commit_rowset.

Introduce an `enable_aggregate_non_mow_key_bounds` BE config (default
off). When enabled, non-MOW rowsets collapse per-segment bounds into a
single [overall_min, overall_max] entry at write time, and compaction
preserves this behavior. MOW rowsets always retain per-segment bounds —
their `lookup_row_key` path relies on them for delete bitmap
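computation, and now returns an InternalError at runtime (replacing the
old DCHECK_EQ) if the rowset is marked aggregated or its bounds count
differs from `num_segments`.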

A new optional `segments_key_bounds_aggregated` flag is added to both
RowsetMetaPB and RowsetMetaCloudPB so consumers can distinguish
aggregated from per-segment layouts. Proto round-trip, pb_convert,
snapshot restore, and index builder all preserve both this flag and the
existing `segments_key_bounds_truncated` flag.

Correctness notes:
- `first_key/last_key` callers (`block_reader`, ordered compaction)
already bail out on overlapping rowsets, so for non-overlapping rowsets
the aggregated [min, max] equals seg[0].min / seg[last].max exactly.
- `merge_rowset_meta` (MOW partial-update publish) DCHECKs both sides
are non-aggregated.
diff --git a/be/src/cloud/cloud_snapshot_mgr.cpp b/be/src/cloud/cloud_snapshot_mgr.cpp
@@ -276,6 +276,14 @@ Status CloudSnapshotMgr::_create_rowset_meta(
     for (const auto& key_bound : source_meta_pb.segments_key_bounds()) {
         *new_rowset_meta_pb->add_segments_key_bounds() = key_bound;
     }
+    if (source_meta_pb.has_segments_key_bounds_truncated()) {
+        new_rowset_meta_pb->set_segments_key_bounds_truncated(
+                source_meta_pb.segments_key_bounds_truncated());
+    }
+    if (source_meta_pb.has_segments_key_bounds_aggregated()) {
+        new_rowset_meta_pb->set_segments_key_bounds_aggregated(
+                source_meta_pb.segments_key_bounds_aggregated());
+    }
     if (source_meta_pb.has_delete_predicate()) {
         DeletePredicatePB* new_delete_condition = new_rowset_meta_pb->mutable_delete_predicate();
         *new_delete_condition = source_meta_pb.delete_predicate();
diff --git a/be/src/cloud/pb_convert.cpp b/be/src/cloud/pb_convert.cpp
@@ -79,7 +79,12 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, const RowsetMetaPB& in)
     }
     out->set_txn_expiration(in.txn_expiration());
     out->set_segments_overlap_pb(in.segments_overlap_pb());
-    out->set_segments_key_bounds_truncated(in.segments_key_bounds_truncated());
+    if (in.has_segments_key_bounds_truncated()) {
+        out->set_segments_key_bounds_truncated(in.segments_key_bounds_truncated());
+    }
+    if (in.has_segments_key_bounds_aggregated()) {
+        out->set_segments_key_bounds_aggregated(in.segments_key_bounds_aggregated());
+    }
     out->mutable_num_segment_rows()->CopyFrom(in.num_segment_rows());
     out->mutable_segments_file_size()->CopyFrom(in.segments_file_size());
     out->set_index_id(in.index_id());
@@ -157,7 +162,12 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) {
     }
     out->set_txn_expiration(in.txn_expiration());
     out->set_segments_overlap_pb(in.segments_overlap_pb());
-    out->set_segments_key_bounds_truncated(in.segments_key_bounds_truncated());
+    if (in.has_segments_key_bounds_truncated()) {
+        out->set_segments_key_bounds_truncated(in.segments_key_bounds_truncated());
+    }
+    if (in.has_segments_key_bounds_aggregated()) {
+        out->set_segments_key_bounds_aggregated(in.segments_key_bounds_aggregated());
+    }
     out->mutable_num_segment_rows()->Swap(in.mutable_num_segment_rows());
     out->mutable_segments_file_size()->Swap(in.mutable_segments_file_size());
     out->set_index_id(in.index_id());
@@ -247,7 +257,12 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in)
     }
     out->set_txn_expiration(in.txn_expiration());
     out->set_segments_overlap_pb(in.segments_overlap_pb());
-    out->set_segments_key_bounds_truncated(in.segments_key_bounds_truncated());
+    if (in.has_segments_key_bounds_truncated()) {
+        out->set_segments_key_bounds_truncated(in.segments_key_bounds_truncated());
+    }
+    if (in.has_segments_key_bounds_aggregated()) {
+        out->set_segments_key_bounds_aggregated(in.segments_key_bounds_aggregated());
+    }
     out->mutable_num_segment_rows()->CopyFrom(in.num_segment_rows());
     out->mutable_segments_file_size()->CopyFrom(in.segments_file_size());
     out->set_index_id(in.index_id());
@@ -325,7 +340,12 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) {
     }
     out->set_txn_expiration(in.txn_expiration());
     out->set_segments_overlap_pb(in.segments_overlap_pb());
-    out->set_segments_key_bounds_truncated(in.segments_key_bounds_truncated());
+    if (in.has_segments_key_bounds_truncated()) {
+        out->set_segments_key_bounds_truncated(in.segments_key_bounds_truncated());
+    }
+    if (in.has_segments_key_bounds_aggregated()) {
+        out->set_segments_key_bounds_aggregated(in.segments_key_bounds_aggregated());
+    }
     out->mutable_num_segment_rows()->Swap(in.mutable_num_segment_rows());
     out->mutable_segments_file_size()->Swap(in.mutable_segments_file_size());
     out->set_index_id(in.index_id());
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
@@ -1618,6 +1618,10 @@ DEFINE_mBool(enable_fetch_rowsets_from_peer_replicas, "false");
 DEFINE_mInt32(segments_key_bounds_truncation_threshold, "-1");
 // ATTENTION: for test only, use random segments key bounds truncation threshold every time
 DEFINE_mBool(random_segments_key_bounds_truncation, "false");
+
+// If true, non-MOW rowsets store a single aggregated [rowset_min, rowset_max] key-bounds
+// entry instead of per-segment bounds, to reduce meta size on cloud FDB. Off by default.
+DEFINE_mBool(enable_aggregate_non_mow_key_bounds, "false");
 // p0, daily, rqg, external
 DEFINE_String(fuzzy_test_type, "");
 
diff --git a/be/src/common/config.h b/be/src/common/config.h
@@ -1709,6 +1709,10 @@ DECLARE_mInt32(segments_key_bounds_truncation_threshold);
 // ATTENTION: for test only, use random segments key bounds truncation threshold every time
 DECLARE_mBool(random_segments_key_bounds_truncation);
 
+// If true, non-MOW rowsets store a single aggregated [rowset_min, rowset_max]
+// key-bounds entry instead of per-segment bounds, to reduce meta size on cloud FDB.
+DECLARE_mBool(enable_aggregate_non_mow_key_bounds);
+
 DECLARE_mBool(enable_auto_clone_on_compaction_missing_version);
 
 DECLARE_mBool(enable_auto_clone_on_mow_publish_missing_version);
diff --git a/be/src/olap/base_tablet.cpp b/be/src/olap/base_tablet.cpp
@@ -465,7 +465,18 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest
         std::vector<KeyBoundsPB> segments_key_bounds;
         rs->rowset_meta()->get_segments_key_bounds(&segments_key_bounds);
         int num_segments = cast_set<int>(rs->num_segments());
-        DCHECK_EQ(segments_key_bounds.size(), num_segments);
+        // MOW lookup requires per-segment bounds. Aggregation must be disabled
+        // for MOW writers, but enforce at runtime too — indexing segments_key_bounds[j]
+        // below would be out-of-bounds otherwise.
+        if (UNLIKELY(rs->rowset_meta()->is_segments_key_bounds_aggregated() ||
+                     static_cast<int>(segments_key_bounds.size()) != num_segments)) {
+            return Status::InternalError(
+                    "MOW lookup got rowset with inconsistent segments_key_bounds, rowset_id={}, "
+                    "aggregated={}, bounds_size={}, num_segments={}",
+                    rs->rowset_id().to_string(),
+                    rs->rowset_meta()->is_segments_key_bounds_aggregated(),
+                    segments_key_bounds.size(), num_segments);
+        }
         std::vector<uint32_t> picked_segments;
         for (int j = num_segments - 1; j >= 0; j--) {
             if (key_is_not_in_segment(key_without_seq, segments_key_bounds[j],
diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp
@@ -363,13 +363,15 @@ Status CompactionMixin::do_compact_ordered_rowsets() {
     // link data to new rowset
     auto seg_id = 0;
     bool segments_key_bounds_truncated {false};
+    bool any_input_aggregated {false};
     std::vector<KeyBoundsPB> segment_key_bounds;
     std::vector<uint32_t> num_segment_rows;
     for (auto rowset : _input_rowsets) {
         RETURN_IF_ERROR(rowset->link_files_to(tablet()->tablet_path(),
                                               _output_rs_writer->rowset_id(), seg_id));
         seg_id += rowset->num_segments();
         segments_key_bounds_truncated |= rowset->is_segments_key_bounds_truncated();
+        any_input_aggregated |= rowset->rowset_meta()->is_segments_key_bounds_aggregated();
         std::vector<KeyBoundsPB> key_bounds;
         RETURN_IF_ERROR(rowset->get_segments_key_bounds(&key_bounds));
         segment_key_bounds.insert(segment_key_bounds.end(), key_bounds.begin(), key_bounds.end());
@@ -389,7 +391,13 @@ Status CompactionMixin::do_compact_ordered_rowsets() {
     rowset_meta->set_segments_overlap(NONOVERLAPPING);
     rowset_meta->set_rowset_state(VISIBLE);
     rowset_meta->set_segments_key_bounds_truncated(segments_key_bounds_truncated);
-    rowset_meta->set_segments_key_bounds(segment_key_bounds);
+    // If any input was already aggregated we have no way to recover per-segment
+    // bounds, so force aggregation on the output to keep the layout consistent
+    // with `num_segments` / the aggregated flag, even if the config is off now.
+    bool aggregate_key_bounds =
+            any_input_aggregated || (config::enable_aggregate_non_mow_key_bounds &&
+                                     !_tablet->enable_unique_key_merge_on_write());
+    rowset_meta->set_segments_key_bounds(segment_key_bounds, aggregate_key_bounds);
     rowset_meta->set_num_segment_rows(num_segment_rows);
 
     _output_rowset = _output_rs_writer->manual_build(rowset_meta);
diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp
@@ -98,7 +98,10 @@ void build_rowset_meta_with_spec_field(RowsetMeta& rowset_meta,
             spec_rowset_meta.is_segments_key_bounds_truncated());
     std::vector<KeyBoundsPB> segments_key_bounds;
     spec_rowset_meta.get_segments_key_bounds(&segments_key_bounds);
-    rowset_meta.set_segments_key_bounds(segments_key_bounds);
+    // Preserve source layout: if source was aggregated (size 1), re-aggregating
+    // the single entry is a no-op that also keeps the flag consistent.
+    rowset_meta.set_segments_key_bounds(segments_key_bounds,
+                                        spec_rowset_meta.is_segments_key_bounds_aggregated());
     std::vector<uint32_t> num_segment_rows;
     spec_rowset_meta.get_num_segment_rows(&num_segment_rows);
     rowset_meta.set_num_segment_rows(num_segment_rows);
@@ -1021,7 +1024,9 @@ Status BaseBetaRowsetWriter::_build_rowset_meta(RowsetMeta* rowset_meta, bool ch
                                      _total_index_size);
     rowset_meta->set_data_disk_size(total_data_size + _total_data_size);
     rowset_meta->set_index_disk_size(total_index_size + _total_index_size);
-    rowset_meta->set_segments_key_bounds(segments_encoded_key_bounds);
+    bool aggregate_key_bounds = config::enable_aggregate_non_mow_key_bounds &&
+                                !_context.enable_unique_key_merge_on_write;
+    rowset_meta->set_segments_key_bounds(segments_encoded_key_bounds, aggregate_key_bounds);
     // TODO write zonemap to meta
     rowset_meta->set_empty((num_rows_written + _num_rows_written) == 0);
     rowset_meta->set_creation_time(time(nullptr));
diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h
@@ -302,6 +302,10 @@ class Rowset : public std::enable_shared_from_this<Rowset>, public MetadataAdder
         return _rowset_meta->is_segments_key_bounds_truncated();
     }
 
+    bool is_segments_key_bounds_aggregated() const {
+        return _rowset_meta->is_segments_key_bounds_aggregated();
+    }
+
     bool check_rowset_segment();
 
     [[nodiscard]] virtual Status add_to_binlog() { return Status::OK(); }
diff --git a/be/src/olap/rowset/rowset_meta.cpp b/be/src/olap/rowset/rowset_meta.cpp
@@ -291,11 +291,31 @@ int64_t RowsetMeta::segment_file_size(int seg_id) const {
                    : -1;
 }
 
-void RowsetMeta::set_segments_key_bounds(const std::vector<KeyBoundsPB>& segments_key_bounds) {
-    for (const KeyBoundsPB& key_bounds : segments_key_bounds) {
-        KeyBoundsPB* new_key_bounds = _rowset_meta_pb.add_segments_key_bounds();
-        *new_key_bounds = key_bounds;
+void RowsetMeta::set_segments_key_bounds(const std::vector<KeyBoundsPB>& segments_key_bounds,
+                                         bool aggregate_into_single) {
+    _rowset_meta_pb.clear_segments_key_bounds();
+    bool did_aggregate = aggregate_into_single && !segments_key_bounds.empty();
+    if (did_aggregate) {
+        const std::string* overall_min = &segments_key_bounds.front().min_key();
+        const std::string* overall_max = &segments_key_bounds.front().max_key();
+        for (const KeyBoundsPB& key_bounds : segments_key_bounds) {
+            if (key_bounds.min_key() < *overall_min) {
+                overall_min = &key_bounds.min_key();
+            }
+            if (key_bounds.max_key() > *overall_max) {
+                overall_max = &key_bounds.max_key();
+            }
+        }
+        KeyBoundsPB* aggregated = _rowset_meta_pb.add_segments_key_bounds();
+        aggregated->set_min_key(*overall_min);
+        aggregated->set_max_key(*overall_max);
+    } else {
+        for (const KeyBoundsPB& key_bounds : segments_key_bounds) {
+            KeyBoundsPB* new_key_bounds = _rowset_meta_pb.add_segments_key_bounds();
+            *new_key_bounds = key_bounds;
+        }
     }
+    set_segments_key_bounds_aggregated(did_aggregate);
 
     int32_t truncation_threshold = config::segments_key_bounds_truncation_threshold;
     if (config::random_segments_key_bounds_truncation) {
@@ -328,6 +348,11 @@ void RowsetMeta::merge_rowset_meta(const RowsetMeta& other) {
     set_total_disk_size(data_disk_size() + index_disk_size());
     set_segments_key_bounds_truncated(is_segments_key_bounds_truncated() ||
                                       other.is_segments_key_bounds_truncated());
+    // merge_rowset_meta is used in the MOW partial-update publish path, which relies
+    // on per-segment bounds. Aggregation should never be enabled for MOW rowsets,
+    // so we do not expect either side to be aggregated here.
+    DCHECK(!is_segments_key_bounds_aggregated() && !other.is_segments_key_bounds_aggregated())
+            << "merge_rowset_meta encountered aggregated key bounds";
     if (_rowset_meta_pb.num_segment_rows_size() > 0) {
         if (other.num_segments() > 0) {
             if (other._rowset_meta_pb.num_segment_rows_size() > 0) {
diff --git a/be/src/olap/rowset/rowset_meta.h b/be/src/olap/rowset/rowset_meta.h
@@ -355,6 +355,17 @@ class RowsetMeta : public MetadataAdder<RowsetMeta> {
         _rowset_meta_pb.set_segments_key_bounds_truncated(truncated);
     }
 
+    // When true, `segments_key_bounds` holds a single aggregated
+    // [rowset_min, rowset_max] entry instead of per-segment bounds.
+    bool is_segments_key_bounds_aggregated() const {
+        return _rowset_meta_pb.has_segments_key_bounds_aggregated() &&
+               _rowset_meta_pb.segments_key_bounds_aggregated();
+    }
+
+    void set_segments_key_bounds_aggregated(bool aggregated) {
+        _rowset_meta_pb.set_segments_key_bounds_aggregated(aggregated);
+    }
+
     bool get_first_segment_key_bound(KeyBoundsPB* key_bounds) {
         // for compatibility, old version has not segment key bounds
         if (_rowset_meta_pb.segments_key_bounds_size() == 0) {
@@ -372,7 +383,10 @@ class RowsetMeta : public MetadataAdder<RowsetMeta> {
         return true;
     }
 
-    void set_segments_key_bounds(const std::vector<KeyBoundsPB>& segments_key_bounds);
+    // If `aggregate_into_single` is true, collapse per-segment bounds into a single
+    // [rowset_min, rowset_max] entry and mark this rowset as aggregated.
+    void set_segments_key_bounds(const std::vector<KeyBoundsPB>& segments_key_bounds,
+                                 bool aggregate_into_single = false);
 
     void add_segment_key_bounds(KeyBoundsPB segments_key_bounds) {
         *_rowset_meta_pb.add_segments_key_bounds() = std::move(segments_key_bounds);
diff --git a/be/src/olap/snapshot_manager.cpp b/be/src/olap/snapshot_manager.cpp
@@ -227,8 +227,9 @@ Result<std::vector<PendingRowsetGuard>> SnapshotManager::convert_rowset_ids(
             // src be local rowset
             RowsetId rowset_id = _engine.next_rowset_id();
             guards.push_back(_engine.pending_local_rowsets().add(rowset_id));
-            RETURN_IF_ERROR_RESULT(_rename_rowset_id(visible_rowset, clone_dir, tablet_schema,
-                                                     rowset_id, rowset_meta));
+            RETURN_IF_ERROR_RESULT(_rename_rowset_id(
+                    visible_rowset, clone_dir, tablet_schema, rowset_id, rowset_meta,
+                    new_tablet_meta_pb.enable_unique_key_merge_on_write()));
             RowsetId src_rs_id;
             if (visible_rowset.rowset_id() > 0) {
                 src_rs_id.init(visible_rowset.rowset_id());
@@ -269,8 +270,9 @@ Result<std::vector<PendingRowsetGuard>> SnapshotManager::convert_rowset_ids(
             // src be local rowset
             RowsetId rowset_id = _engine.next_rowset_id();
             guards.push_back(_engine.pending_local_rowsets().add(rowset_id));
-            RETURN_IF_ERROR_RESULT(_rename_rowset_id(stale_rowset, clone_dir, tablet_schema,
-                                                     rowset_id, rowset_meta));
+            RETURN_IF_ERROR_RESULT(_rename_rowset_id(
+                    stale_rowset, clone_dir, tablet_schema, rowset_id, rowset_meta,
+                    new_tablet_meta_pb.enable_unique_key_merge_on_write()));
             RowsetId src_rs_id;
             if (stale_rowset.rowset_id() > 0) {
                 src_rs_id.init(stale_rowset.rowset_id());
@@ -324,7 +326,8 @@ Result<std::vector<PendingRowsetGuard>> SnapshotManager::convert_rowset_ids(
 Status SnapshotManager::_rename_rowset_id(const RowsetMetaPB& rs_meta_pb,
                                           const std::string& new_tablet_path,
                                           TabletSchemaSPtr tablet_schema, const RowsetId& rowset_id,
-                                          RowsetMetaPB* new_rs_meta_pb) {
+                                          RowsetMetaPB* new_rs_meta_pb,
+                                          bool enable_unique_key_merge_on_write) {
     Status st = Status::OK();
     RowsetMetaSharedPtr rowset_meta(new RowsetMeta());
     rowset_meta->init_from_pb(rs_meta_pb);
@@ -350,6 +353,9 @@ Status SnapshotManager::_rename_rowset_id(const RowsetMetaPB& rs_meta_pb,
     context.newest_write_timestamp = org_rowset_meta->newest_write_timestamp();
     // keep segments_overlap same as origin rowset
     context.segments_overlap = rowset_meta->segments_overlap();
+    // propagate MOW flag so that non-MOW key-bounds aggregation is not applied
+    // when restoring a MOW tablet's rowset
+    context.enable_unique_key_merge_on_write = enable_unique_key_merge_on_write;
 
     auto rs_writer = DORIS_TRY(RowsetFactory::create_rowset_writer(_engine, context, false));
 
diff --git a/be/src/olap/snapshot_manager.h b/be/src/olap/snapshot_manager.h
@@ -132,7 +132,7 @@ class SnapshotManager {
 
     Status _rename_rowset_id(const RowsetMetaPB& rs_meta_pb, const std::string& new_tablet_path,
                              TabletSchemaSPtr tablet_schema, const RowsetId& next_id,
-                             RowsetMetaPB* new_rs_meta_pb);
+                             RowsetMetaPB* new_rs_meta_pb, bool enable_unique_key_merge_on_write);
 
     Status _rename_index_ids(TabletSchemaPB& schema_pb,
                              const TabletSchemaSPtr& tablet_schema) const;
diff --git a/be/src/olap/task/index_builder.cpp b/be/src/olap/task/index_builder.cpp
@@ -306,7 +306,10 @@ Status IndexBuilder::update_inverted_index_info() {
         RETURN_IF_ERROR(input_rowset->get_segments_key_bounds(&key_bounds));
         rowset_meta->set_segments_key_bounds_truncated(
                 input_rowset_meta->is_segments_key_bounds_truncated());
-        rowset_meta->set_segments_key_bounds(key_bounds);
+        // preserve aggregated layout via the setter so the aggregated flag is not
+        // clobbered by set_segments_key_bounds's default reset path.
+        rowset_meta->set_segments_key_bounds(
+                key_bounds, input_rowset_meta->is_segments_key_bounds_aggregated());
         std::vector<uint32_t> num_segment_rows;
         input_rowset_meta->get_num_segment_rows(&num_segment_rows);
         rowset_meta->set_num_segment_rows(num_segment_rows);
diff --git a/be/test/olap/rowset/rowset_meta_test.cpp b/be/test/olap/rowset/rowset_meta_test.cpp
diff --git a/be/test/olap/segments_key_bounds_truncation_test.cpp b/be/test/olap/segments_key_bounds_truncation_test.cpp
diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto
diff --git a/regression-test/suites/data_model_p0/duplicate/test_non_mow_key_bounds_aggregation.groovy b/regression-test/suites/data_model_p0/duplicate/test_non_mow_key_bounds_aggregation.groovy

Original file line number	Diff line number	Diff line change
`@@ -302,6 +302,10 @@ class Rowset : public std::enable_shared_from_this<Rowset>, public MetadataAdder`
`302`	`302`	`return _rowset_meta->is_segments_key_bounds_truncated();`
`303`	`303`	`}`
`304`	`304`
	`305`	`+ bool is_segments_key_bounds_aggregated() const {`
	`306`	`+ return _rowset_meta->is_segments_key_bounds_aggregated();`
	`307`	`+ }`
	`308`	`+`
`305`	`309`	`bool check_rowset_segment();`
`306`	`310`
`307`	`311`	`[[nodiscard]] virtual Status add_to_binlog() { return Status::OK(); }`