From a8ba0c8e9ece02aa38c04cbe77ace132939a1cbe Mon Sep 17 00:00:00 2001
From: Gagan Trivedi <gagandeeptrivedi47@gmail.com>
Date: Fri, 12 Jun 2026 13:37:54 +0530
Subject: [PATCH 1/5] feat(experimentation): Bayesian stats kernel

---
 api/experimentation/stats.py                 |  90 ++++++++++
 api/tests/unit/experimentation/test_stats.py | 173 +++++++++++++++++++
 2 files changed, 263 insertions(+)
 create mode 100644 api/experimentation/stats.py
 create mode 100644 api/tests/unit/experimentation/test_stats.py

diff --git a/api/experimentation/stats.py b/api/experimentation/stats.py
new file mode 100644
index 000000000000..cc7b27cadc48
--- /dev/null
+++ b/api/experimentation/stats.py
@@ -0,0 +1,90 @@
+import math
+from collections.abc import Sequence
+from dataclasses import dataclass
+from statistics import NormalDist
+
+_STANDARD_NORMAL = NormalDist()
+_Z_95 = 1.959963984540054
+
+
+@dataclass(frozen=True)
+class VariantStats:
+    n: int
+    sum: float
+    sum_squares: float
+
+    @property
+    def mean(self) -> float:
+        return self.sum / self.n
+
+    @property
+    def variance(self) -> float:
+        return max(0.0, (self.sum_squares - self.sum**2 / self.n) / (self.n - 1))
+
+
+@dataclass(frozen=True)
+class Inference:
+    lift: float
+    ci_low: float
+    ci_high: float
+    chance_to_win: float
+
+
+def compare_to_control(
+    control: VariantStats,
+    treatment: VariantStats,
+) -> Inference | None:
+    if control.n < 2 or treatment.n < 2 or control.mean == 0:
+        return None
+
+    lift = (treatment.mean - control.mean) / control.mean
+    # Delta-method variance of the relative lift; the arms are independent.
+    variance = treatment.variance / (
+        treatment.n * control.mean**2
+    ) + treatment.mean**2 * control.variance / (control.n * control.mean**4)
+    standard_error = math.sqrt(variance)
+    if standard_error == 0:
+        certainty = 0.5 if lift == 0 else float(lift > 0)
+        return Inference(lift=lift, ci_low=lift, ci_high=lift, chance_to_win=certainty)
+
+    return Inference(
+        lift=lift,
+        ci_low=lift - _Z_95 * standard_error,
+        ci_high=lift + _Z_95 * standard_error,
+        chance_to_win=_STANDARD_NORMAL.cdf(lift / standard_error),
+    )
+
+
+def srm_p_value(
+    observed: Sequence[int],
+    expected_shares: Sequence[float],
+) -> float | None:
+    total = sum(observed)
+    if len(observed) < 2 or total == 0 or any(s <= 0 for s in expected_shares):
+        return None
+
+    statistic = sum(
+        (count - total * share) ** 2 / (total * share)
+        for count, share in zip(observed, expected_shares, strict=True)
+    )
+    return _chi_squared_survival(statistic, degrees_of_freedom=len(observed) - 1)
+
+
+def _chi_squared_survival(statistic: float, degrees_of_freedom: int) -> float:
+    if statistic <= 0:
+        return 1.0
+    # The standard library has no chi-squared distribution, but for integer
+    # degrees of freedom the survival function is exact from the base cases
+    # Q(1/2, y) = erfc(√y) and Q(1, y) = e⁻ʸ via the recurrence
+    # Q(a+1, y) = Q(a, y) + yᵃe⁻ʸ/Γ(a+1).
+    y = statistic / 2.0
+    if degrees_of_freedom % 2:
+        a = 0.5
+        survival = math.erfc(math.sqrt(y))
+    else:
+        a = 1.0
+        survival = math.exp(-y)
+    while a + 1.0 <= degrees_of_freedom / 2.0:
+        survival += math.exp(a * math.log(y) - y - math.lgamma(a + 1.0))
+        a += 1.0
+    return survival
diff --git a/api/tests/unit/experimentation/test_stats.py b/api/tests/unit/experimentation/test_stats.py
new file mode 100644
index 000000000000..520390a74b5c
--- /dev/null
+++ b/api/tests/unit/experimentation/test_stats.py
@@ -0,0 +1,173 @@
+import pytest
+
+from experimentation.stats import (
+    Inference,
+    VariantStats,
+    compare_to_control,
+    srm_p_value,
+)
+
+
+def test_variant_stats__sufficient_statistics__derive_mean_and_variance() -> None:
+    # Given 1000 identities with 100 conversions (0/1 values)
+    stats = VariantStats(n=1000, sum=100.0, sum_squares=100.0)
+
+    # When / Then
+    assert stats.mean == 0.1
+    assert stats.variance == pytest.approx(90.0 / 999.0)
+
+
+def test_variant_stats__float_noise__variance_clamped_to_zero() -> None:
+    # Given sums whose rounding puts the raw variance just below zero
+    stats = VariantStats(n=2, sum=2.0, sum_squares=1.9999999999999996)
+
+    # When / Then
+    assert stats.variance == 0.0
+
+
+def test_compare_to_control__more_conversions__positive_lift_inference() -> None:
+    # Given a 10% control and a 12% treatment, 1000 identities each
+    control = VariantStats(n=1000, sum=100.0, sum_squares=100.0)
+    treatment = VariantStats(n=1000, sum=120.0, sum_squares=120.0)
+
+    # When
+    inference = compare_to_control(control, treatment)
+
+    # Then
+    assert inference is not None
+    assert inference.lift == pytest.approx(0.2)
+    assert inference.ci_low == pytest.approx(-0.10074, abs=1e-4)
+    assert inference.ci_high == pytest.approx(0.50074, abs=1e-4)
+    assert inference.chance_to_win == pytest.approx(0.90379, abs=1e-4)
+
+
+def test_compare_to_control__identical_arms__chance_is_even() -> None:
+    # Given two arms with the same conversions
+    arm = VariantStats(n=1000, sum=100.0, sum_squares=100.0)
+
+    # When
+    inference = compare_to_control(arm, arm)
+
+    # Then
+    assert inference is not None
+    assert inference.lift == 0.0
+    assert inference.chance_to_win == 0.5
+    assert inference.ci_low == pytest.approx(-inference.ci_high)
+
+
+@pytest.mark.parametrize(
+    "treatment, expected",
+    [
+        (
+            VariantStats(n=10, sum=20.0, sum_squares=40.0),
+            Inference(lift=1.0, ci_low=1.0, ci_high=1.0, chance_to_win=1.0),
+        ),
+        (
+            VariantStats(n=10, sum=5.0, sum_squares=2.5),
+            Inference(lift=-0.5, ci_low=-0.5, ci_high=-0.5, chance_to_win=0.0),
+        ),
+        (
+            VariantStats(n=10, sum=10.0, sum_squares=10.0),
+            Inference(lift=0.0, ci_low=0.0, ci_high=0.0, chance_to_win=0.5),
+        ),
+    ],
+    ids=["better", "worse", "equal"],
+)
+def test_compare_to_control__zero_variance_arms__degenerate_certainty(
+    treatment: VariantStats,
+    expected: Inference,
+) -> None:
+    # Given arms with constant values (zero variance)
+    control = VariantStats(n=10, sum=10.0, sum_squares=10.0)
+
+    # When / Then
+    assert compare_to_control(control, treatment) == expected
+
+
+def test_compare_to_control__zero_control_mean__returns_none() -> None:
+    # Given a control with no conversions: relative lift is undefined
+    control = VariantStats(n=1000, sum=0.0, sum_squares=0.0)
+    treatment = VariantStats(n=1000, sum=120.0, sum_squares=120.0)
+
+    # When / Then
+    assert compare_to_control(control, treatment) is None
+
+
+@pytest.mark.parametrize(
+    "control_n, treatment_n",
+    [(1, 1000), (1000, 1), (0, 1000)],
+    ids=["control_too_small", "treatment_too_small", "control_empty"],
+)
+def test_compare_to_control__insufficient_observations__returns_none(
+    control_n: int,
+    treatment_n: int,
+) -> None:
+    # Given an arm with fewer than two observations: variance is undefined
+    control = VariantStats(
+        n=control_n, sum=float(control_n), sum_squares=float(control_n)
+    )
+    treatment = VariantStats(
+        n=treatment_n, sum=float(treatment_n), sum_squares=float(treatment_n)
+    )
+
+    # When / Then
+    assert compare_to_control(control, treatment) is None
+
+
+def test_srm_p_value__balanced_split__no_mismatch() -> None:
+    # Given observed counts exactly matching the expected 50/50 split
+    # When
+    p_value = srm_p_value([5000, 5000], [0.5, 0.5])
+
+    # Then
+    assert p_value == pytest.approx(1.0)
+
+
+@pytest.mark.parametrize(
+    "observed, shares, expected_p",
+    [
+        # chi-squared = 4.0, 1 dof
+        ([5100, 4900], [0.5, 0.5], 0.04550),
+        # chi-squared = 2.0, 2 dof: survival is exp(-1)
+        ([3400, 3300, 3300], [1 / 3, 1 / 3, 1 / 3], 0.36788),
+        # chi-squared = 8.0, 3 dof
+        ([2600, 2500, 2500, 2400], [0.25, 0.25, 0.25, 0.25], 0.04601),
+    ],
+    ids=["one_dof", "two_dof", "three_dof"],
+)
+def test_srm_p_value__known_chi_squared__matches_reference(
+    observed: list[int],
+    shares: list[float],
+    expected_p: float,
+) -> None:
+    # Given observed counts with a hand-computed chi-squared statistic
+    # When / Then
+    assert srm_p_value(observed, shares) == pytest.approx(expected_p, abs=1e-4)
+
+
+def test_srm_p_value__heavy_imbalance__fails_threshold() -> None:
+    # Given a 60/40 observed split against an expected 50/50
+    # When
+    p_value = srm_p_value([6000, 4000], [0.5, 0.5])
+
+    # Then
+    assert p_value is not None
+    assert p_value < 0.001
+
+
+@pytest.mark.parametrize(
+    "observed, shares",
+    [
+        ([0, 0], [0.5, 0.5]),
+        ([5000, 5000], [1.0, 0.0]),
+        ([10000], [1.0]),
+    ],
+    ids=["no_observations", "zero_share", "single_variant"],
+)
+def test_srm_p_value__not_computable__returns_none(
+    observed: list[int],
+    shares: list[float],
+) -> None:
+    # Given inputs the chi-squared test is undefined for
+    # When / Then
+    assert srm_p_value(observed, shares) is None

From aa632bded8eb0da2162a28306903e82880cfa9ec Mon Sep 17 00:00:00 2001
From: Gagan Trivedi <gagandeeptrivedi47@gmail.com>
Date: Mon, 15 Jun 2026 12:53:34 +0530
Subject: [PATCH 2/5] docs(experimentation): explain the stats kernel for
 non-statisticians

---
 api/experimentation/stats.py | 55 ++++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 8 deletions(-)

diff --git a/api/experimentation/stats.py b/api/experimentation/stats.py
index cc7b27cadc48..ed1c45cfaa31 100644
--- a/api/experimentation/stats.py
+++ b/api/experimentation/stats.py
@@ -1,17 +1,35 @@
+"""Bayesian statistics for experiment results.
+
+Compares a treatment variant against control and reports, in plain terms:
+how much better/worse the treatment did (``lift``), how sure we are
+(``chance_to_win`` and the credible interval), and whether traffic was split
+fairly between variants (``srm_p_value``). All inputs are summary numbers; no
+raw events reach this module.
+"""
+
 import math
 from collections.abc import Sequence
 from dataclasses import dataclass
 from statistics import NormalDist
 
 _STANDARD_NORMAL = NormalDist()
+# A 95% interval spans the mean ± 1.96 standard deviations of a normal curve.
 _Z_95 = 1.959963984540054
 
 
 @dataclass(frozen=True)
 class VariantStats:
-    n: int
-    sum: float
-    sum_squares: float
+    """Everything we need to know about one variant, as three running totals.
+
+    For a conversion metric each identity contributes 0 or 1, so ``sum`` is the
+    conversion count; for a value metric (e.g. revenue) it is the total. These
+    three numbers are enough to recover the average and the spread, so the
+    warehouse never has to send per-identity rows.
+    """
+
+    n: int  # identities in the variant
+    sum: float  # total of their per-identity values
+    sum_squares: float  # total of the squares, used to derive the spread
 
     @property
     def mean(self) -> float:
@@ -19,34 +37,44 @@ def mean(self) -> float:
 
     @property
     def variance(self) -> float:
+        # Spread of the per-identity values. max(0, …) guards against tiny
+        # negative results from floating-point error when every value is equal.
         return max(0.0, (self.sum_squares - self.sum**2 / self.n) / (self.n - 1))
 
 
 @dataclass(frozen=True)
 class Inference:
-    lift: float
-    ci_low: float
-    ci_high: float
-    chance_to_win: float
+    lift: float  # relative change vs control, e.g. 0.12 == +12%
+    ci_low: float  # credible interval: we're 95% sure the true lift
+    ci_high: float  # lies between ci_low and ci_high
+    chance_to_win: float  # probability (0–1) the treatment really beats control
 
 
 def compare_to_control(
     control: VariantStats,
     treatment: VariantStats,
 ) -> Inference | None:
+    # Inference is undefined without two observations per arm (no spread to
+    # measure) or a zero control mean (relative lift divides by it).
     if control.n < 2 or treatment.n < 2 or control.mean == 0:
         return None
 
     lift = (treatment.mean - control.mean) / control.mean
-    # Delta-method variance of the relative lift; the arms are independent.
+    # How uncertain that lift is. Both arms are noisy, so the uncertainty of a
+    # ratio combines both; the delta method is the standard approximation, and
+    # the arms being independent means there is no covariance term.
     variance = treatment.variance / (
         treatment.n * control.mean**2
     ) + treatment.mean**2 * control.variance / (control.n * control.mean**4)
     standard_error = math.sqrt(variance)
     if standard_error == 0:
+        # No uncertainty (every value identical): the result is exact.
         certainty = 0.5 if lift == 0 else float(lift > 0)
         return Inference(lift=lift, ci_low=lift, ci_high=lift, chance_to_win=certainty)
 
+    # Treat the true lift as a normal curve centred on `lift`, `standard_error`
+    # wide. The interval is its middle 95%; chance_to_win is the share of the
+    # curve above zero (i.e. how much of our belief says the treatment is up).
     return Inference(
         lift=lift,
         ci_low=lift - _Z_95 * standard_error,
@@ -59,10 +87,19 @@ def srm_p_value(
     observed: Sequence[int],
     expected_shares: Sequence[float],
 ) -> float | None:
+    """Sample ratio mismatch check: was traffic split as configured?
+
+    Returns the probability that random assignment alone would drift from the
+    configured split at least as much as we observed. A tiny value (< 0.001 by
+    convention) means the split is broken and the results can't be trusted.
+    ``None`` when the check is undefined (no traffic, one variant, or a share <= 0).
+    """
     total = sum(observed)
     if len(observed) < 2 or total == 0 or any(s <= 0 for s in expected_shares):
         return None
 
+    # Chi-squared statistic: total squared gap between observed and expected
+    # counts, scaled by what's expected. Bigger gap == bigger number.
     statistic = sum(
         (count - total * share) ** 2 / (total * share)
         for count, share in zip(observed, expected_shares, strict=True)
@@ -71,6 +108,8 @@ def srm_p_value(
 
 
 def _chi_squared_survival(statistic: float, degrees_of_freedom: int) -> float:
+    # Turns the chi-squared statistic into a probability (the p-value above):
+    # how likely a gap this large is by chance. 0 gap → certain (1.0).
     if statistic <= 0:
         return 1.0
     # The standard library has no chi-squared distribution, but for integer

From 88db608338e695938840f372934ceff4eaf1f684 Mon Sep 17 00:00:00 2001
From: Gagan Trivedi <gagandeeptrivedi47@gmail.com>
Date: Mon, 15 Jun 2026 13:51:09 +0530
Subject: [PATCH 3/5] docs(experimentation): add experiment statistics page

---
 docs/docs/experiment-statistics.md | 155 +++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 docs/docs/experiment-statistics.md

diff --git a/docs/docs/experiment-statistics.md b/docs/docs/experiment-statistics.md
new file mode 100644
index 000000000000..6e8728b61dc2
--- /dev/null
+++ b/docs/docs/experiment-statistics.md
@@ -0,0 +1,155 @@
+---
+title: Experiment Statistics
+sidebar_label: Experiment Statistics
+sidebar_position: 5
+---
+
+Flagsmith's statistics engine answers three questions about an experiment: **Am I winning?** (is the variant better than
+control), **by how much?** (the lift), and **can I trust it?** (is the difference real, and was traffic split fairly).
+This page explains the terms you'll see, in plain language — no statistics background needed.
+
+:::info Availability
+
+- **Available now:** the **Exposures panel** — who was bucketed into each variant, over time.
+- **Coming soon:** the **Results scorecard** — lift, credible intervals, chance to beat control, and the sample ratio
+  mismatch check.
+
+:::
+
+## Terms you'll see
+
+**Experiment** — A controlled comparison: show different versions of a feature to different people, measure which
+performs better.
+
+**Variant** — One version being compared. Each person sees exactly one.
+
+**Control** — The "leave things as they are" variant — your current experience, the baseline everything is measured
+against (its key is the reserved value `control`).
+
+**Treatment** — Any variant that isn't the control — the change you're testing.
+
+**Identity** — One user, device, or account — the individual whose behaviour you measure.
+
+**Exposure** — The moment an identity is shown a variant. The exposure count is how many people entered the experiment.
+
+**Conversion rate** — The percentage of people who did the thing you care about (4.21% ≈ 4 in 100).
+
+**Metric** — What you measure to judge success — checkout rate, revenue, page views.
+
+**Goal metric** — A metric you want to improve (the reason you're running the experiment).
+
+**Guardrail metric** — A metric you want to keep an eye on so the change doesn't make it worse.
+
+**Lift** — How much better or worse a variant did than control, as a percentage. "+11%" means 11% better.
+
+**Credible interval** — Our confidence range — the band we're 95% sure the true lift falls inside. Narrow = precise;
+wide = needs more data.
+
+**Chance to beat control (chance to win)** — The probability a variant is genuinely better than control. "97%" reads
+exactly as it sounds.
+
+**Winning / losing / inconclusive** — The verdict. **Winning** = over 95% chance to win. **Losing** = under 5%.
+**Inconclusive** = can't tell yet, keep collecting.
+
+**Sample ratio mismatch (SRM)** — A health check that flags when traffic wasn't split the way you set it up. A broken
+split means the results can't be trusted.
+
+**Quarantined / excluded identity** — Someone recorded in more than one variant. Set aside so they don't distort the
+counts, and shown as a separate total.
+
+**Collecting data** — Not enough data yet to report a result, so numbers are withheld rather than shown when they'd be
+meaningless.
+
+**Last computed (as-of)** — Results are computed periodically, not on every page load. This timestamp tells you how
+fresh the figures are.
+
+## Exposures
+
+**Available now.**
+
+An identity is counted **once**, in the variant it saw **first**. Because exposures are deduplicated by identity and
+keep only the earliest timestamp, duplicate event delivery can't inflate your counts, and each identity lands in the
+time bucket of its first exposure.
+
+If an identity is recorded against **more than one** variant (a "flicker", or bucketing that changed mid-flight), it's
+**quarantined** — excluded from every variant's count and surfaced as a single excluded-identities figure. A small
+number is normal; a growing one means users are slipping between variants.
+
+The panel shows a headline total, a cumulative chart (one line per variant), a variant table (key, **Control** badge,
+identities, share %), the excluded note, and a "last computed" time.
+
+## Experiment results
+
+**Coming soon.**
+
+For each metric, the scorecard reports how each variant did against control, using a **Bayesian** engine. Three numbers
+per variant:
+
+**Lift** — relative change vs control. Control at 4.21%, variant at 4.68% → `(4.68 − 4.21) / 4.21 ≈ +11%`.
+
+**95% credible interval** — the range we're 95% sure the true lift sits in, drawn as a bar against a zero line:
+
+- **Clear of zero** (e.g. +2% to +20%) → confident the variant is genuinely better (or, on the negative side, worse).
+- **Crosses zero** (e.g. −3% to +14%) → inconclusive; the variant could be better or worse. Collect more data.
+
+**Chance to beat control** — the same belief as one number, e.g. _97%_. Over **95%** → **winning**; under **5%** →
+**losing**; in between → **inconclusive**.
+
+:::note Why Bayesian?
+
+"97% chance to beat control" means what it sounds like — unlike a p-value, which is routinely misread. It's also safe to
+check whenever you like: peeking doesn't inflate your error rate the way repeated p-value checks do.
+
+:::
+
+## Sample ratio mismatch (SRM)
+
+**Coming soon.**
+
+Before trusting a result, you need traffic to have split the way you configured. If you set 50/50 but see 9,120 in
+control and 6,400 in the variant, something is broken — and if assignment is broken, every other number is suspect,
+because the groups aren't comparable.
+
+Flagsmith compares the observed split against the configured one and raises a warning when the imbalance is a
+one-in-a-thousand event or rarer (it only checks once there are at least 100 identities). When it fires, **don't act on
+the results** — investigate one-variant crashes, redirects that bypass bucketing, flicker, or dropped events first.
+
+## Collecting data
+
+**Coming soon.**
+
+Statistics on a handful of people are meaningless, so a metric shows **collecting data** until every arm has at least
+**50 identities** (and, for conversion metrics, at least **5 conversions**). This stops you reading a "+300% lift" off
+three conversions.
+
+## Metric types
+
+**Metrics: available now. Inference over them: coming soon.**
+
+| Aggregation    | What it measures                               | Example                 |
+| -------------- | ---------------------------------------------- | ----------------------- |
+| **Occurrence** | Did the event happen at least once? (0 or 1)   | Did the user check out? |
+| **Count**      | How many times the event happened per identity | Number of page views    |
+| **Sum**        | The total of a numeric value across events     | Total revenue           |
+| **Mean**       | The average numeric value per identity         | Average order value     |
+
+A metric's **expected direction** (up, down, not-increase, not-decrease) tells Flagsmith which way is "good" and sorts
+metrics into **goals** and **guardrails**.
+
+## Summary
+
+| Capability                                      | Status                       |
+| ----------------------------------------------- | ---------------------------- |
+| Exposures panel (counts, chart, share)          | Available now                |
+| First-exposure attribution & duplicate immunity | Available now                |
+| Quarantined (multi-variant) identities          | Available now                |
+| Metric definitions                              | Available now                |
+| Results scorecard: lift, credible interval      | Coming soon                  |
+| Chance to beat control & winning/losing flags   | Coming soon                  |
+| Sample ratio mismatch (SRM) check               | Coming soon                  |
+| Collecting-data floor                           | Coming soon                  |
+| Risk / decision banner / trend chart            | Not planned for this version |
+| Frequentist engine                              | Deferred                     |
+
+For experiment setup — multivariate flags, bucketing, identities — see
+[Experimentation (A/B Testing)](/experimentation-ab-testing) and [managing identities](/flagsmith-concepts/identities).

From f33a4380560fe323845b14df0b9f632945c34cb1 Mon Sep 17 00:00:00 2001
From: Gagan Trivedi <gagandeeptrivedi47@gmail.com>
Date: Mon, 15 Jun 2026 13:54:41 +0530
Subject: [PATCH 4/5] fix(experimentation): treat non-positive control mean as
 undefined inference

---
 api/experimentation/stats.py                 |  5 +++--
 api/tests/unit/experimentation/test_stats.py | 10 ++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/api/experimentation/stats.py b/api/experimentation/stats.py
index ed1c45cfaa31..e9c57bf75422 100644
--- a/api/experimentation/stats.py
+++ b/api/experimentation/stats.py
@@ -55,8 +55,9 @@ def compare_to_control(
     treatment: VariantStats,
 ) -> Inference | None:
     # Inference is undefined without two observations per arm (no spread to
-    # measure) or a zero control mean (relative lift divides by it).
-    if control.n < 2 or treatment.n < 2 or control.mean == 0:
+    # measure) or a non-positive control mean (relative lift against it is
+    # meaningless, and divides by zero when the mean is exactly zero).
+    if control.n < 2 or treatment.n < 2 or control.mean <= 0:
         return None
 
     lift = (treatment.mean - control.mean) / control.mean
diff --git a/api/tests/unit/experimentation/test_stats.py b/api/tests/unit/experimentation/test_stats.py
index 520390a74b5c..9880255d6894 100644
--- a/api/tests/unit/experimentation/test_stats.py
+++ b/api/tests/unit/experimentation/test_stats.py
@@ -93,6 +93,16 @@ def test_compare_to_control__zero_control_mean__returns_none() -> None:
     assert compare_to_control(control, treatment) is None
 
 
+def test_compare_to_control__negative_control_mean__returns_none() -> None:
+    # Given a control whose values average below zero (e.g. a revenue metric
+    # with refunds): relative lift against it is meaningless
+    control = VariantStats(n=1000, sum=-50.0, sum_squares=600.0)
+    treatment = VariantStats(n=1000, sum=120.0, sum_squares=120.0)
+
+    # When / Then
+    assert compare_to_control(control, treatment) is None
+
+
 @pytest.mark.parametrize(
     "control_n, treatment_n",
     [(1, 1000), (1000, 1), (0, 1000)],

From 32c858220c0b0873cec4b55fdce4ac4bbebca04e Mon Sep 17 00:00:00 2001
From: Gagan Trivedi <gagandeeptrivedi47@gmail.com>
Date: Mon, 15 Jun 2026 14:38:02 +0530
Subject: [PATCH 5/5] docs(experimentation): frame stats as an upcoming
 Enterprise beta

---
 docs/docs/experiment-statistics.md | 44 ++++++++++++------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

diff --git a/docs/docs/experiment-statistics.md b/docs/docs/experiment-statistics.md
index 6e8728b61dc2..c9093941584e 100644
--- a/docs/docs/experiment-statistics.md
+++ b/docs/docs/experiment-statistics.md
@@ -8,11 +8,11 @@ Flagsmith's statistics engine answers three questions about an experiment: **Am
 control), **by how much?** (the lift), and **can I trust it?** (is the difference real, and was traffic split fairly).
 This page explains the terms you'll see, in plain language — no statistics background needed.
 
-:::info Availability
+:::info Coming soon — Enterprise beta
 
-- **Available now:** the **Exposures panel** — who was bucketed into each variant, over time.
-- **Coming soon:** the **Results scorecard** — lift, credible intervals, chance to beat control, and the sample ratio
-  mismatch check.
+Experiment statistics aren't generally available yet. They are launching as a beta on **Enterprise** plans — to join,
+[get in touch](https://www.flagsmith.com/contact-us). Everything described on this page is part of that upcoming
+release; it previews what the feature will do.
 
 :::
 
@@ -65,8 +65,6 @@ fresh the figures are.
 
 ## Exposures
 
-**Available now.**
-
 An identity is counted **once**, in the variant it saw **first**. Because exposures are deduplicated by identity and
 keep only the earliest timestamp, duplicate event delivery can't inflate your counts, and each identity lands in the
 time bucket of its first exposure.
@@ -80,8 +78,6 @@ identities, share %), the excluded note, and a "last computed" time.
 
 ## Experiment results
 
-**Coming soon.**
-
 For each metric, the scorecard reports how each variant did against control, using a **Bayesian** engine. Three numbers
 per variant:
 
@@ -104,8 +100,6 @@ check whenever you like: peeking doesn't inflate your error rate the way repeate
 
 ## Sample ratio mismatch (SRM)
 
-**Coming soon.**
-
 Before trusting a result, you need traffic to have split the way you configured. If you set 50/50 but see 9,120 in
 control and 6,400 in the variant, something is broken — and if assignment is broken, every other number is suspect,
 because the groups aren't comparable.
@@ -116,16 +110,12 @@ the results** — investigate one-variant crashes, redirects that bypass bucketi
 
 ## Collecting data
 
-**Coming soon.**
-
 Statistics on a handful of people are meaningless, so a metric shows **collecting data** until every arm has at least
 **50 identities** (and, for conversion metrics, at least **5 conversions**). This stops you reading a "+300% lift" off
 three conversions.
 
 ## Metric types
 
-**Metrics: available now. Inference over them: coming soon.**
-
 | Aggregation    | What it measures                               | Example                 |
 | -------------- | ---------------------------------------------- | ----------------------- |
 | **Occurrence** | Did the event happen at least once? (0 or 1)   | Did the user check out? |
@@ -138,18 +128,20 @@ metrics into **goals** and **guardrails**.
 
 ## Summary
 
-| Capability                                      | Status                       |
-| ----------------------------------------------- | ---------------------------- |
-| Exposures panel (counts, chart, share)          | Available now                |
-| First-exposure attribution & duplicate immunity | Available now                |
-| Quarantined (multi-variant) identities          | Available now                |
-| Metric definitions                              | Available now                |
-| Results scorecard: lift, credible interval      | Coming soon                  |
-| Chance to beat control & winning/losing flags   | Coming soon                  |
-| Sample ratio mismatch (SRM) check               | Coming soon                  |
-| Collecting-data floor                           | Coming soon                  |
-| Risk / decision banner / trend chart            | Not planned for this version |
-| Frequentist engine                              | Deferred                     |
+Nothing here is generally available yet — the table shows what the first Enterprise beta includes and what follows.
+
+| Capability                                      | Status                |
+| ----------------------------------------------- | --------------------- |
+| Exposures panel (counts, chart, share)          | In the first beta     |
+| First-exposure attribution & duplicate immunity | In the first beta     |
+| Quarantined (multi-variant) identities          | In the first beta     |
+| Metric definitions                              | In the first beta     |
+| Results scorecard: lift, credible interval      | Planned               |
+| Chance to beat control & winning/losing flags   | Planned               |
+| Sample ratio mismatch (SRM) check               | Planned               |
+| Collecting-data floor                           | Planned               |
+| Risk / decision banner / trend chart            | Not currently planned |
+| Frequentist engine                              | Deferred              |
 
 For experiment setup — multivariate flags, bucketing, identities — see
 [Experimentation (A/B Testing)](/experimentation-ab-testing) and [managing identities](/flagsmith-concepts/identities).