|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +#include "cloud/cloud_cluster_info.h" |
| 19 | + |
| 20 | +#include <glog/logging.h> |
| 21 | + |
| 22 | +#include "cloud/cloud_meta_mgr.h" |
| 23 | +#include "cloud/cloud_storage_engine.h" |
| 24 | +#include "cloud/cloud_tablet.h" |
| 25 | +#include "cloud/config.h" |
| 26 | +#include "gen_cpp/cloud.pb.h" |
| 27 | +#include "runtime/exec_env.h" |
| 28 | +#include "util/time.h" |
| 29 | + |
| 30 | +namespace doris { |
| 31 | + |
| 32 | +CloudClusterInfo::~CloudClusterInfo() { |
| 33 | + stop_bg_worker(); |
| 34 | +} |
| 35 | + |
| 36 | +void CloudClusterInfo::start_bg_worker() { |
| 37 | + bool expected = true; |
| 38 | + if (!_bg_worker_stopped.compare_exchange_strong(expected, false)) { |
| 39 | + // Already running |
| 40 | + return; |
| 41 | + } |
| 42 | + |
| 43 | + _bg_worker = std::thread(&CloudClusterInfo::_bg_worker_func, this); |
| 44 | + LOG(INFO) << "CloudClusterInfo background worker started, " |
| 45 | + << "refresh_interval=" << config::cluster_status_cache_refresh_interval_sec << "s"; |
| 46 | +} |
| 47 | + |
| 48 | +void CloudClusterInfo::stop_bg_worker() { |
| 49 | + bool expected = false; |
| 50 | + if (!_bg_worker_stopped.compare_exchange_strong(expected, true)) { |
| 51 | + // Already stopped |
| 52 | + return; |
| 53 | + } |
| 54 | + |
| 55 | + { |
| 56 | + std::lock_guard lock(_bg_worker_mutex); |
| 57 | + _bg_worker_cv.notify_all(); |
| 58 | + } |
| 59 | + |
| 60 | + if (_bg_worker.joinable()) { |
| 61 | + _bg_worker.join(); |
| 62 | + } |
| 63 | + |
| 64 | + LOG(INFO) << "CloudClusterInfo background worker stopped"; |
| 65 | +} |
| 66 | + |
| 67 | +void CloudClusterInfo::_bg_worker_func() { |
| 68 | + LOG(INFO) << "CloudClusterInfo background worker thread running"; |
| 69 | + |
| 70 | + while (!_bg_worker_stopped.load()) { |
| 71 | + _refresh_cluster_status(); |
| 72 | + |
| 73 | + std::unique_lock lock(_bg_worker_mutex); |
| 74 | + _bg_worker_cv.wait_for( |
| 75 | + lock, std::chrono::seconds(config::cluster_status_cache_refresh_interval_sec), |
| 76 | + [this] { return _bg_worker_stopped.load(); }); |
| 77 | + } |
| 78 | +} |
| 79 | + |
| 80 | +void CloudClusterInfo::_refresh_cluster_status() { |
| 81 | + if (!config::is_cloud_mode()) { |
| 82 | + return; |
| 83 | + } |
| 84 | + auto* cloud_engine = |
| 85 | + dynamic_cast<CloudStorageEngine*>(&ExecEnv::GetInstance()->storage_engine()); |
| 86 | + if (!cloud_engine) { |
| 87 | + return; |
| 88 | + } |
| 89 | + |
| 90 | + std::unordered_map<std::string, std::pair<int32_t, int64_t>> cluster_status; |
| 91 | + std::string resolved_cluster_id; |
| 92 | + Status st = cloud_engine->meta_mgr().get_cluster_status( |
| 93 | + &cluster_status, my_cluster_id().empty() ? &resolved_cluster_id : nullptr); |
| 94 | + if (!st.ok()) { |
| 95 | + LOG(WARNING) << "Failed to refresh cluster status: " << st; |
| 96 | + return; |
| 97 | + } |
| 98 | + |
| 99 | + // Update cache |
| 100 | + { |
| 101 | + std::unique_lock lock(_mutex); |
| 102 | + _cluster_status_cache.clear(); |
| 103 | + for (const auto& [cluster_id, status_pair] : cluster_status) { |
| 104 | + _cluster_status_cache[cluster_id] = {status_pair.first, status_pair.second}; |
| 105 | + } |
| 106 | + } |
| 107 | + |
| 108 | + VLOG_DEBUG << "Refreshed cluster status cache, " << cluster_status.size() << " clusters"; |
| 109 | + |
| 110 | + // Set our own cluster_id if resolved from the response |
| 111 | + if (my_cluster_id().empty() && !resolved_cluster_id.empty()) { |
| 112 | + set_my_cluster_id(resolved_cluster_id); |
| 113 | + LOG(INFO) << "Resolved my cluster_id: " << resolved_cluster_id; |
| 114 | + } |
| 115 | +} |
| 116 | + |
| 117 | +bool CloudClusterInfo::should_skip_compaction(CloudTablet* tablet) const { |
| 118 | + if (!config::enable_compaction_rw_separation) { |
| 119 | + return false; |
| 120 | + } |
| 121 | + |
| 122 | + std::string last_active_cluster = tablet->last_active_cluster_id(); |
| 123 | + std::string my_cluster = my_cluster_id(); |
| 124 | + int64_t tablet_id = tablet->tablet_id(); |
| 125 | + |
| 126 | + // Case 1: No active cluster record, any cluster can compact |
| 127 | + if (last_active_cluster.empty()) { |
| 128 | + VLOG_DEBUG << "tablet " << tablet_id << " has no last_active_cluster record, " |
| 129 | + << "my_cluster=" << my_cluster << ", allow compaction"; |
| 130 | + return false; |
| 131 | + } |
| 132 | + |
| 133 | + // Case 2: This is the active cluster, allow compaction |
| 134 | + if (last_active_cluster == my_cluster) { |
| 135 | + VLOG_DEBUG << "tablet " << tablet_id << " last_active_cluster=" << last_active_cluster |
| 136 | + << " equals my_cluster=" << my_cluster << ", allow compaction"; |
| 137 | + return false; |
| 138 | + } |
| 139 | + |
| 140 | + // Case 3: Check if the last active cluster is available |
| 141 | + ClusterStatusCache cache; |
| 142 | + if (!get_cluster_status(last_active_cluster, &cache)) { |
| 143 | + // Cluster not found in cache, might be deleted, allow takeover |
| 144 | + LOG(INFO) << "compaction_rw_separation: tablet " << tablet_id |
| 145 | + << " last_active_cluster=" << last_active_cluster |
| 146 | + << " not found in cache (maybe deleted), my_cluster=" << my_cluster |
| 147 | + << ", allow takeover"; |
| 148 | + return false; |
| 149 | + } |
| 150 | + |
| 151 | + // Force compaction if tablet has too many rowsets (>80% of max_tablet_version_num), |
| 152 | + // even on read clusters, to prevent version count from growing unbounded |
| 153 | + // when the write cluster can't keep up or has compaction disabled. |
| 154 | + int64_t num_rowsets = tablet->fetch_add_approximate_num_rowsets(0); |
| 155 | + auto threshold = static_cast<int64_t>(tablet->max_version_config() * |
| 156 | + config::compaction_rw_separation_version_threshold_ratio); |
| 157 | + if (num_rowsets > threshold) { |
| 158 | + LOG(INFO) << "compaction_rw_separation: force compaction on tablet " << tablet_id |
| 159 | + << ", num_rowsets=" << num_rowsets << " > threshold=" << threshold << " (80% of " |
| 160 | + << tablet->max_version_config() << ")" |
| 161 | + << ", my_cluster=" << my_cluster; |
| 162 | + return false; |
| 163 | + } |
| 164 | + |
| 165 | + auto status = static_cast<cloud::ClusterStatus>(cache.status); |
| 166 | + int64_t status_mtime = cache.mtime_ms; |
| 167 | + int64_t now = UnixMillis(); |
| 168 | + int64_t elapsed = now - status_mtime; |
| 169 | + int64_t timeout = config::compaction_cluster_takeover_timeout_ms; |
| 170 | + |
| 171 | + // Case 4: Original cluster is NORMAL (still active), cannot takeover |
| 172 | + if (status == cloud::ClusterStatus::NORMAL) { |
| 173 | + LOG_EVERY_N(INFO, 100) << "compaction_rw_separation: skip tablet " << tablet_id |
| 174 | + << ", last_active_cluster=" << last_active_cluster |
| 175 | + << " is NORMAL (active), my_cluster=" << my_cluster; |
| 176 | + return true; |
| 177 | + } |
| 178 | + |
| 179 | + // Case 5: Original cluster is unavailable (SUSPENDED/MANUAL_SHUTDOWN/deleted) |
| 180 | + if (elapsed > timeout) { |
| 181 | + // Takeover successful |
| 182 | + LOG(INFO) << "compaction_rw_separation: takeover tablet " << tablet_id |
| 183 | + << ", last_active_cluster=" << last_active_cluster |
| 184 | + << " status=" << cloud::ClusterStatus_Name(status) |
| 185 | + << " status_mtime=" << status_mtime << " elapsed=" << elapsed |
| 186 | + << "ms > timeout=" << timeout << "ms" |
| 187 | + << ", my_cluster=" << my_cluster; |
| 188 | + return false; |
| 189 | + } else { |
| 190 | + // Timeout not reached yet, waiting |
| 191 | + LOG_EVERY_N(INFO, 100) << "compaction_rw_separation: skip tablet " << tablet_id |
| 192 | + << ", last_active_cluster=" << last_active_cluster |
| 193 | + << " status=" << cloud::ClusterStatus_Name(status) |
| 194 | + << " status_mtime=" << status_mtime << " elapsed=" << elapsed |
| 195 | + << "ms <= timeout=" << timeout << "ms" |
| 196 | + << ", my_cluster=" << my_cluster << ", waiting for takeover"; |
| 197 | + return true; |
| 198 | + } |
| 199 | +} |
| 200 | + |
| 201 | +} // namespace doris |
0 commit comments