Skip to content

Commit a06f2e1

Browse files
authored
MAL: add safeDiv(divisor) on SampleFamily that yields 0 when the divisor is 0. (#13846)
1 parent 2319def commit a06f2e1

9 files changed

Lines changed: 204 additions & 22 deletions

File tree

docs/en/changes/changes.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
* Fix: remove `VirtualServiceAnalysisListener`'s dependency on `GenAIAnalyzerModule` if it is disabled.
4040
* MAL: register `TimeUnit` in `MALCodegenHelper.ENUM_FQCN` so rule YAML can write `.histogram("le", TimeUnit.MILLISECONDS)` for SDKs that emit histogram bucket bounds in ms (default `SECONDS` unit applies a ×1000 rescale that would otherwise inflate stored `le` labels 1000×).
4141
* Fix: potential unexpected current directory inclusion in Docker OAP classpath.
42+
* MAL: add `safeDiv(divisor)` on `SampleFamily` that yields `0` when the divisor is `0` instead of `Infinity`/`NaN`. Replace `/` with `safeDiv(...)` in Envoy AI Gateway latency-average rules so `sum / count * 1000` no longer produces dropped or out-of-range samples when a counter is zero in a window.
43+
* Fix: make the `envoy-ai-gateway` metrics rules return `0` instead of `Infinity`/`NaN` when the divisor is `0`.
4244

4345
#### UI
4446
* Add mobile menu icon and i18n labels for the iOS layer.

docs/en/concepts-and-designs/mal.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,22 @@ have no match and will not show up in the result:
152152
{region="asia-north",az="az-1"} 0.3333 // 11 / 33
153153
```
154154

155+
#### Safe division
156+
157+
The binary `/` operator produces `Infinity` when the right-hand value is `0`, and `NaN` when both
158+
sides are `0` (NaN samples are silently dropped during evaluation). Either result is rarely useful
159+
for downstream metrics. The `safeDiv` method on a sample family substitutes `0` for the result whenever
160+
the divisor is `0`. Otherwise it follows normal division semantics, except that if either side is an
161+
empty sample family, the result is an empty sample family rather than `Infinity` or `NaN`:
162+
163+
```
164+
gen_ai_server_request_duration_sum.sum(['service_name']).increase('PT1M')
165+
.safeDiv(gen_ai_server_request_duration_count.sum(['service_name']).increase('PT1M')) * 1000
166+
```
167+
168+
`safeDiv` accepts either a sample family or a scalar number argument. Use it when the denominator
169+
is a counter that may legitimately be zero in a given window (e.g. request count, error count).
170+
155171
### Aggregation Operation
156172

157173
Sample family supports the following aggregation operations that can be used to aggregate the samples of a single sample family,

oap-server/analyzer/meter-analyzer/src/main/java/org/apache/skywalking/oap/meter/analyzer/v2/dsl/SampleFamily.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,31 @@ public SampleFamily div(SampleFamily another) {
229229
return newValue(another, (a, b) -> a / b);
230230
}
231231

232+
/**
233+
* Safe variant of {@link #div(Number)}: when the divisor is zero, the
234+
* resulting samples are valued {@code 0.0} instead of {@code NaN}/{@code Infinity}.
235+
*/
236+
public SampleFamily safeDiv(Number number) {
237+
final double divisor = number.doubleValue();
238+
if (divisor == 0.0) {
239+
return newValue(v -> 0.0);
240+
}
241+
return newValue(v -> v / divisor);
242+
}
243+
244+
/**
245+
* Safe variant of {@link #div(SampleFamily)}: for each label-matched pair,
246+
* yields {@code 0.0} when the right-hand value is zero. When either this
247+
* family or the divisor family is {@link #EMPTY}, the result is {@link #EMPTY}
248+
* (no samples to evaluate against), instead of producing {@code NaN}/{@code Infinity}.
249+
*/
250+
public SampleFamily safeDiv(SampleFamily another) {
251+
if (this == EMPTY || another == EMPTY) {
252+
return SampleFamily.EMPTY;
253+
}
254+
return newValue(another, (a, b) -> b == 0.0 ? 0.0 : a / b);
255+
}
256+
232257
/* Aggregation operators */
233258
public SampleFamily sum(List<String> by) {
234259
return aggregate(by, Double::sum);

oap-server/analyzer/meter-analyzer/src/test/java/org/apache/skywalking/oap/meter/analyzer/v2/compiler/MALClassGeneratorTest.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,16 @@
1717

1818
package org.apache.skywalking.oap.meter.analyzer.v2.compiler;
1919

20+
import com.google.common.collect.ImmutableMap;
2021
import javassist.ClassPool;
2122
import org.apache.skywalking.oap.meter.analyzer.v2.dsl.MalExpression;
23+
import org.apache.skywalking.oap.meter.analyzer.v2.dsl.Sample;
24+
import org.apache.skywalking.oap.meter.analyzer.v2.dsl.SampleFamily;
25+
import org.apache.skywalking.oap.meter.analyzer.v2.dsl.SampleFamilyBuilder;
2226
import org.junit.jupiter.api.BeforeEach;
2327
import org.junit.jupiter.api.Test;
2428

29+
import static org.junit.jupiter.api.Assertions.assertEquals;
2530
import static org.junit.jupiter.api.Assertions.assertNotNull;
2631
import static org.junit.jupiter.api.Assertions.assertThrows;
2732
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -148,6 +153,56 @@ void compileMethodCallMultiply() throws Exception {
148153
assertNotNull(expr.run(java.util.Map.of()));
149154
}
150155

156+
@Test
157+
void compileSafeDivBetweenFamiliesYieldsZeroWhenDenominatorIsZero() throws Exception {
158+
final MalExpression expr = generator.compile(
159+
"test_safe_div",
160+
"metric_sum.safeDiv(metric_count) * 1000");
161+
assertNotNull(expr);
162+
163+
final SampleFamily sum = SampleFamilyBuilder.newBuilder(
164+
Sample.builder().labels(ImmutableMap.of("svc", "s1")).value(50.0).build()).build();
165+
final SampleFamily count = SampleFamilyBuilder.newBuilder(
166+
Sample.builder().labels(ImmutableMap.of("svc", "s1")).value(0.0).build()).build();
167+
168+
final SampleFamily result = expr.run(java.util.Map.of(
169+
"metric_sum", sum, "metric_count", count));
170+
assertNotNull(result);
171+
assertEquals(1, result.samples.length);
172+
assertEquals(0.0, result.samples[0].getValue(), 0.001);
173+
}
174+
175+
@Test
176+
void compileSafeDivBetweenFamiliesNonZeroDenominator() throws Exception {
177+
final MalExpression expr = generator.compile(
178+
"test_safe_div_ok",
179+
"metric_sum.safeDiv(metric_count) * 1000");
180+
assertNotNull(expr);
181+
182+
final SampleFamily sum = SampleFamilyBuilder.newBuilder(
183+
Sample.builder().labels(ImmutableMap.of("svc", "s1")).value(2.0).build()).build();
184+
final SampleFamily count = SampleFamilyBuilder.newBuilder(
185+
Sample.builder().labels(ImmutableMap.of("svc", "s1")).value(4.0).build()).build();
186+
187+
final SampleFamily result = expr.run(java.util.Map.of(
188+
"metric_sum", sum, "metric_count", count));
189+
assertEquals(500.0, result.samples[0].getValue(), 0.001);
190+
}
191+
192+
@Test
193+
void compileSafeDivByZeroNumberLiteral() throws Exception {
194+
final MalExpression expr = generator.compile(
195+
"test_safe_div_zero_lit",
196+
"metric.safeDiv(0)");
197+
assertNotNull(expr);
198+
199+
final SampleFamily input = SampleFamilyBuilder.newBuilder(
200+
Sample.builder().labels(ImmutableMap.of("k", "v")).value(123.0).build()).build();
201+
202+
final SampleFamily result = expr.run(java.util.Map.of("metric", input));
203+
assertEquals(0.0, result.samples[0].getValue(), 0.001);
204+
}
205+
151206
// ==================== Error handling tests ====================
152207

153208
@Test

oap-server/analyzer/meter-analyzer/src/test/java/org/apache/skywalking/oap/meter/analyzer/v2/dsl/DSLV2Test.java

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,4 +89,88 @@ void filterExpressionWithMalFilter() {
8989
assertEquals(1, filtered.samples.length);
9090
assertEquals(10.0, filtered.samples[0].getValue());
9191
}
92+
93+
@Test
94+
void safeDivByZeroNumberYieldsZero() {
95+
final SampleFamily sf = SampleFamilyBuilder.newBuilder(
96+
Sample.builder().name("m").labels(ImmutableMap.of("k", "v")).value(100.0).build()).build();
97+
98+
final SampleFamily result = sf.safeDiv(0);
99+
100+
assertNotNull(result);
101+
assertTrue(result != SampleFamily.EMPTY);
102+
assertEquals(1, result.samples.length);
103+
assertEquals(0.0, result.samples[0].getValue());
104+
}
105+
106+
@Test
107+
void safeDivByNonZeroNumberDividesNormally() {
108+
final SampleFamily sf = SampleFamilyBuilder.newBuilder(
109+
Sample.builder().name("m").labels(ImmutableMap.of("k", "v")).value(100.0).build()).build();
110+
111+
final SampleFamily result = sf.safeDiv(4);
112+
113+
assertEquals(25.0, result.samples[0].getValue());
114+
}
115+
116+
@Test
117+
void safeDivByFamilyYieldsZeroWhenDenominatorIsZero() {
118+
final SampleFamily numerator = SampleFamilyBuilder.newBuilder(
119+
Sample.builder().name("sum").labels(ImmutableMap.of("svc", "s1")).value(50.0).build()).build();
120+
final SampleFamily denominator = SampleFamilyBuilder.newBuilder(
121+
Sample.builder().name("count").labels(ImmutableMap.of("svc", "s1")).value(0.0).build()).build();
122+
123+
final SampleFamily result = numerator.safeDiv(denominator);
124+
125+
assertNotNull(result);
126+
assertTrue(result != SampleFamily.EMPTY);
127+
assertEquals(1, result.samples.length);
128+
assertEquals(0.0, result.samples[0].getValue());
129+
}
130+
131+
@Test
132+
void safeDivByFamilyMixesZeroAndNonZeroDenominators() {
133+
final SampleFamily numerator = SampleFamilyBuilder.newBuilder(
134+
Sample.builder().name("sum").labels(ImmutableMap.of("svc", "s1")).value(60.0).build(),
135+
Sample.builder().name("sum").labels(ImmutableMap.of("svc", "s2")).value(20.0).build()).build();
136+
final SampleFamily denominator = SampleFamilyBuilder.newBuilder(
137+
Sample.builder().name("count").labels(ImmutableMap.of("svc", "s1")).value(0.0).build(),
138+
Sample.builder().name("count").labels(ImmutableMap.of("svc", "s2")).value(4.0).build()).build();
139+
140+
final SampleFamily result = numerator.safeDiv(denominator);
141+
142+
assertEquals(2, result.samples.length);
143+
final double v1 = sampleValueByLabel(result, "svc", "s1");
144+
final double v2 = sampleValueByLabel(result, "svc", "s2");
145+
assertEquals(0.0, v1);
146+
assertEquals(5.0, v2);
147+
}
148+
149+
@Test
150+
void safeDivByEmptyFamilyReturnsEmpty() {
151+
final SampleFamily numerator = SampleFamilyBuilder.newBuilder(
152+
Sample.builder().name("sum").labels(ImmutableMap.of("svc", "s1")).value(60.0).build()).build();
153+
154+
final SampleFamily result = numerator.safeDiv(SampleFamily.EMPTY);
155+
156+
assertEquals(SampleFamily.EMPTY, result);
157+
}
158+
159+
@Test
160+
void safeDivOnEmptyFamilyReturnsEmpty() {
161+
final SampleFamily denominator = SampleFamilyBuilder.newBuilder(
162+
Sample.builder().name("count").labels(ImmutableMap.of("svc", "s1")).value(2.0).build()).build();
163+
164+
assertEquals(SampleFamily.EMPTY, SampleFamily.EMPTY.safeDiv(denominator));
165+
assertEquals(SampleFamily.EMPTY, SampleFamily.EMPTY.safeDiv(2));
166+
}
167+
168+
private static double sampleValueByLabel(final SampleFamily sf, final String labelKey, final String labelValue) {
169+
for (final Sample s : sf.samples) {
170+
if (labelValue.equals(s.getLabels().get(labelKey))) {
171+
return s.getValue();
172+
}
173+
}
174+
throw new IllegalStateException("No sample with " + labelKey + "=" + labelValue);
175+
}
92176
}

oap-server/server-starter/src/main/resources/otel-rules/envoy-ai-gateway/gateway-instance.yaml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ metricsRules:
3131

3232
# Request latency average (ms)
3333
- name: request_latency_avg
34-
exp: gen_ai_server_request_duration_sum.sum(['service_name', 'service_instance_id']).increase('PT1M') / gen_ai_server_request_duration_count.sum(['service_name', 'service_instance_id']).increase('PT1M') * 1000
34+
exp: gen_ai_server_request_duration_sum.sum(['service_name', 'service_instance_id']).increase('PT1M').safeDiv(gen_ai_server_request_duration_count.sum(['service_name', 'service_instance_id']).increase('PT1M')) * 1000
3535

3636
# Request latency percentile (ms)
3737
- name: request_latency_percentile
@@ -47,15 +47,15 @@ metricsRules:
4747

4848
# TTFT average (ms)
4949
- name: ttft_avg
50-
exp: gen_ai_server_time_to_first_token_sum.sum(['service_name', 'service_instance_id']).increase('PT1M') / gen_ai_server_time_to_first_token_count.sum(['service_name', 'service_instance_id']).increase('PT1M') * 1000
50+
exp: gen_ai_server_time_to_first_token_sum.sum(['service_name', 'service_instance_id']).increase('PT1M').safeDiv(gen_ai_server_time_to_first_token_count.sum(['service_name', 'service_instance_id']).increase('PT1M')) * 1000
5151

5252
# TTFT percentile (ms)
5353
- name: ttft_percentile
5454
exp: gen_ai_server_time_to_first_token.sum(['le', 'service_name', 'service_instance_id']).increase('PT1M').histogram().histogram_percentile([50,75,90,95,99]) * 1000
5555

5656
# TPOT average (ms)
5757
- name: tpot_avg
58-
exp: gen_ai_server_time_per_output_token_sum.sum(['service_name', 'service_instance_id']).increase('PT1M') / gen_ai_server_time_per_output_token_count.sum(['service_name', 'service_instance_id']).increase('PT1M') * 1000
58+
exp: gen_ai_server_time_per_output_token_sum.sum(['service_name', 'service_instance_id']).increase('PT1M').safeDiv(gen_ai_server_time_per_output_token_count.sum(['service_name', 'service_instance_id']).increase('PT1M')) * 1000
5959

6060
# TPOT percentile (ms)
6161
- name: tpot_percentile
@@ -73,7 +73,7 @@ metricsRules:
7373

7474
# Provider latency average (ms)
7575
- name: provider_latency_avg
76-
exp: gen_ai_server_request_duration_sum.sum(['gen_ai_provider_name', 'service_name', 'service_instance_id']).increase('PT1M') / gen_ai_server_request_duration_count.sum(['gen_ai_provider_name', 'service_name', 'service_instance_id']).increase('PT1M') * 1000
76+
exp: gen_ai_server_request_duration_sum.sum(['gen_ai_provider_name', 'service_name', 'service_instance_id']).increase('PT1M').safeDiv(gen_ai_server_request_duration_count.sum(['gen_ai_provider_name', 'service_name', 'service_instance_id']).increase('PT1M')) * 1000
7777

7878
# ===================== Per-model breakdown =====================
7979

@@ -87,12 +87,12 @@ metricsRules:
8787

8888
# Model latency average (ms)
8989
- name: model_latency_avg
90-
exp: gen_ai_server_request_duration_sum.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M') / gen_ai_server_request_duration_count.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M') * 1000
90+
exp: gen_ai_server_request_duration_sum.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M').safeDiv(gen_ai_server_request_duration_count.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M')) * 1000
9191

9292
# Model TTFT average (ms)
9393
- name: model_ttft_avg
94-
exp: gen_ai_server_time_to_first_token_sum.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M') / gen_ai_server_time_to_first_token_count.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M') * 1000
94+
exp: gen_ai_server_time_to_first_token_sum.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M').safeDiv(gen_ai_server_time_to_first_token_count.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M')) * 1000
9595

9696
# Model TPOT average (ms)
9797
- name: model_tpot_avg
98-
exp: gen_ai_server_time_per_output_token_sum.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M') / gen_ai_server_time_per_output_token_count.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M') * 1000
98+
exp: gen_ai_server_time_per_output_token_sum.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M').safeDiv(gen_ai_server_time_per_output_token_count.sum(['gen_ai_response_model', 'service_name', 'service_instance_id']).increase('PT1M')) * 1000

oap-server/server-starter/src/main/resources/otel-rules/envoy-ai-gateway/gateway-mcp-instance.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ metricsRules:
3131

3232
# MCP request latency average (ms)
3333
- name: request_latency_avg
34-
exp: mcp_request_duration_sum.sum(['service_name', 'service_instance_id']).increase('PT1M') / mcp_request_duration_count.sum(['service_name', 'service_instance_id']).increase('PT1M') * 1000
34+
exp: mcp_request_duration_sum.sum(['service_name', 'service_instance_id']).increase('PT1M').safeDiv(mcp_request_duration_count.sum(['service_name', 'service_instance_id']).increase('PT1M')) * 1000
3535

3636
# MCP request latency percentile (ms)
3737
- name: request_latency_percentile
@@ -47,7 +47,7 @@ metricsRules:
4747

4848
# MCP initialization latency average (ms)
4949
- name: initialization_latency_avg
50-
exp: mcp_initialization_duration_sum.sum(['service_name', 'service_instance_id']).increase('PT1M') / mcp_initialization_duration_count.sum(['service_name', 'service_instance_id']).increase('PT1M') * 1000
50+
exp: mcp_initialization_duration_sum.sum(['service_name', 'service_instance_id']).increase('PT1M').safeDiv(mcp_initialization_duration_count.sum(['service_name', 'service_instance_id']).increase('PT1M')) * 1000
5151

5252
# MCP initialization latency percentile (ms)
5353
- name: initialization_latency_percentile
@@ -65,7 +65,7 @@ metricsRules:
6565

6666
# Backend request latency average (ms)
6767
- name: backend_request_latency_avg
68-
exp: mcp_request_duration_sum.sum(['mcp_backend', 'service_name', 'service_instance_id']).increase('PT1M') / mcp_request_duration_count.sum(['mcp_backend', 'service_name', 'service_instance_id']).increase('PT1M') * 1000
68+
exp: mcp_request_duration_sum.sum(['mcp_backend', 'service_name', 'service_instance_id']).increase('PT1M').safeDiv(mcp_request_duration_count.sum(['mcp_backend', 'service_name', 'service_instance_id']).increase('PT1M')) * 1000
6969

7070
# Backend method CPM
7171
- name: backend_method_cpm
@@ -77,4 +77,4 @@ metricsRules:
7777

7878
# Backend initialization latency average (ms)
7979
- name: backend_initialization_latency_avg
80-
exp: mcp_initialization_duration_sum.sum(['mcp_backend', 'service_name', 'service_instance_id']).increase('PT1M') / mcp_initialization_duration_count.sum(['mcp_backend', 'service_name', 'service_instance_id']).increase('PT1M') * 1000
80+
exp: mcp_initialization_duration_sum.sum(['mcp_backend', 'service_name', 'service_instance_id']).increase('PT1M').safeDiv(mcp_initialization_duration_count.sum(['mcp_backend', 'service_name', 'service_instance_id']).increase('PT1M')) * 1000

oap-server/server-starter/src/main/resources/otel-rules/envoy-ai-gateway/gateway-mcp-service.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ metricsRules:
3636

3737
# MCP request latency average (ms)
3838
- name: request_latency_avg
39-
exp: mcp_request_duration_sum.sum(['service_name']).increase('PT1M') / mcp_request_duration_count.sum(['service_name']).increase('PT1M') * 1000
39+
exp: mcp_request_duration_sum.sum(['service_name']).increase('PT1M').safeDiv(mcp_request_duration_count.sum(['service_name']).increase('PT1M')) * 1000
4040

4141
# MCP request latency percentile (ms)
4242
- name: request_latency_percentile
@@ -52,7 +52,7 @@ metricsRules:
5252

5353
# MCP initialization latency average (ms)
5454
- name: initialization_latency_avg
55-
exp: mcp_initialization_duration_sum.sum(['service_name']).increase('PT1M') / mcp_initialization_duration_count.sum(['service_name']).increase('PT1M') * 1000
55+
exp: mcp_initialization_duration_sum.sum(['service_name']).increase('PT1M').safeDiv(mcp_initialization_duration_count.sum(['service_name']).increase('PT1M')) * 1000
5656

5757
# MCP initialization latency percentile (ms)
5858
- name: initialization_latency_percentile
@@ -70,7 +70,7 @@ metricsRules:
7070

7171
# Backend request latency average (ms) — labeled by mcp_backend
7272
- name: backend_request_latency_avg
73-
exp: mcp_request_duration_sum.sum(['mcp_backend', 'service_name']).increase('PT1M') / mcp_request_duration_count.sum(['mcp_backend', 'service_name']).increase('PT1M') * 1000
73+
exp: mcp_request_duration_sum.sum(['mcp_backend', 'service_name']).increase('PT1M').safeDiv(mcp_request_duration_count.sum(['mcp_backend', 'service_name']).increase('PT1M')) * 1000
7474

7575
# Backend method CPM — labeled by mcp_backend and mcp_method_name
7676
- name: backend_method_cpm
@@ -82,4 +82,4 @@ metricsRules:
8282

8383
# Backend initialization latency average (ms) — labeled by mcp_backend
8484
- name: backend_initialization_latency_avg
85-
exp: mcp_initialization_duration_sum.sum(['mcp_backend', 'service_name']).increase('PT1M') / mcp_initialization_duration_count.sum(['mcp_backend', 'service_name']).increase('PT1M') * 1000
85+
exp: mcp_initialization_duration_sum.sum(['mcp_backend', 'service_name']).increase('PT1M').safeDiv(mcp_initialization_duration_count.sum(['mcp_backend', 'service_name']).increase('PT1M')) * 1000

0 commit comments

Comments
 (0)