-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Add performance counter multiplexing #2083
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
double-fault
wants to merge
3
commits into
google:main
Choose a base branch
from
double-fault:perf_multiplexing
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+80
−42
Open
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,46 +39,77 @@ | |
| namespace benchmark { | ||
| namespace internal { | ||
|
|
||
| // Typically, we can only read a small number of counters. There is also a | ||
| // padding preceding counter values, when reading multiple counters with one | ||
| // syscall (which is desirable). PerfCounterValues abstracts these details. | ||
| // Typically, we only read a small number of counters. There is also a | ||
| // specific format when reading multiple counters with one syscall | ||
| // (which is desirable). PerfCounterValues abstracts these details. | ||
| // The implementation ensures the storage is inlined, and allows 0-based | ||
| // indexing into the counter values. | ||
| // The object is used in conjunction with a PerfCounters object, by passing it | ||
| // to Snapshot(). The Read() method relocates individual reads, discarding | ||
| // the initial padding from each group leader in the values buffer such that | ||
| // all user accesses through the [] operator are correct. | ||
| // to Snapshot(). The Read() method reads in the appropriate fields into | ||
| // the values buffer such that all user accesses through the [] operator are | ||
| // correct. | ||
| // The [] operator estimates the true value of the counter using the | ||
| // time_enabled and time_running values (which will be equal if there is no | ||
| // multiplexing, i.e. only a single group). | ||
| class BENCHMARK_EXPORT PerfCounterValues { | ||
| public: | ||
| explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) { | ||
| BM_CHECK_LE(nr_counters_, kMaxCounters); | ||
| } | ||
|
|
||
| // We are reading correctly now so the values don't need to skip padding | ||
| uint64_t operator[](size_t pos) const { return values_[pos]; } | ||
| double operator[](size_t pos) const { return values_[pos].GetEstimate(); } | ||
|
|
||
| PerfCounterValues operator-=(const PerfCounterValues& counter_values) { | ||
| for (size_t i = 0; i < nr_counters_; i++) { | ||
| values_[i] -= counter_values.values_[i]; | ||
| } | ||
|
|
||
| return *this; | ||
| } | ||
|
|
||
| // Increased the maximum to 32 only since the buffer | ||
| // is std::array<> backed | ||
| static constexpr size_t kMaxCounters = 32; | ||
|
|
||
| private: | ||
| // Represents the value of a counter. | ||
| // time_enabled_ = time_running_ if there is no multiplexing, | ||
| // i.e. only a single group. | ||
| class Value { | ||
| public: | ||
| void Set(uint64_t time_enabled, uint64_t time_running, uint64_t value) { | ||
| time_enabled_ = time_enabled; | ||
| time_running_ = time_running; | ||
| value_ = value; | ||
| } | ||
|
|
||
| double GetEstimate() const { | ||
| return static_cast<double>(value_ * time_enabled_) / | ||
| static_cast<double>(time_running_); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This looks iffy. What are the typical perf counter value ranges? We are either going to be losing precision here, |
||
| } | ||
|
|
||
| Value operator-=(const Value& value) { | ||
| time_enabled_ -= value.time_enabled_; | ||
| time_running_ -= value.time_running_; | ||
| value_ -= value.value_; | ||
|
|
||
| return *this; | ||
| } | ||
|
|
||
| private: | ||
| uint64_t time_enabled_; | ||
| uint64_t time_running_; | ||
| uint64_t value_; | ||
| }; | ||
|
|
||
| friend class PerfCounters; | ||
| // Get the byte buffer in which perf counters can be captured. | ||
| // This is used by PerfCounters::Read | ||
| std::pair<char*, size_t> get_data_buffer() { | ||
| return {reinterpret_cast<char*>(values_.data()), | ||
| sizeof(uint64_t) * (kPadding + nr_counters_)}; | ||
| } | ||
|
|
||
| // This reading is complex and as the goal of this class is to | ||
| // abstract away the intricacies of the reading process, this is | ||
| // a better place for it | ||
| size_t Read(const std::vector<int>& leaders); | ||
|
|
||
| // Move the padding to 2 due to the reading algorithm (1st padding plus a | ||
| // current read padding) | ||
| static constexpr size_t kPadding = 2; | ||
| std::array<uint64_t, kPadding + kMaxCounters> values_; | ||
| std::array<Value, kMaxCounters> values_; | ||
| const size_t nr_counters_; | ||
| }; | ||
|
|
||
|
|
@@ -174,10 +205,9 @@ class BENCHMARK_EXPORT PerfCountersMeasurement final { | |
| valid_read_ &= counters_.Snapshot(&end_values_); | ||
| ClobberMemory(); | ||
|
|
||
| end_values_ -= start_values_; | ||
| for (size_t i = 0; i < counters_.names().size(); ++i) { | ||
| double measurement = static_cast<double>(end_values_[i]) - | ||
| static_cast<double>(start_values_[i]); | ||
| measurements.push_back({counters_.names()[i], measurement}); | ||
| measurements.push_back({counters_.names()[i], end_values_[i]}); | ||
| } | ||
|
|
||
| return valid_read_; | ||
|
|
||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This `struct` thing is nice, but I'm like 80% sure there is UB here. The safest way would be to read into `array<uint64_t, 3+kMaxCounters>`, and then produce that `buffer` struct from it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, can that be sunk into `for (int lead : leaders) {`?