diff --git a/api/experimentation/stats.py b/api/experimentation/stats.py
new file mode 100644
index 000000000000..e9c57bf75422
--- /dev/null
+++ b/api/experimentation/stats.py
@@ -0,0 +1,130 @@
+"""Bayesian statistics for experiment results.
+
+Compares a treatment variant against control and reports, in plain terms:
+how much better/worse the treatment did (``lift``), how sure we are
+(``chance_to_win`` and the credible interval), and whether traffic was split
+fairly between variants (``srm_p_value``). All inputs are summary numbers; no
+raw events reach this module.
+"""
+
+import math
+from collections.abc import Sequence
+from dataclasses import dataclass
+from statistics import NormalDist
+
+_STANDARD_NORMAL = NormalDist()  # mean 0, sd 1; its CDF yields chance_to_win
+# z-score bounding the central 95% of a normal curve: mean ± 1.96 standard deviations.
+_Z_95 = 1.959963984540054
+
+
+@dataclass(frozen=True)
+class VariantStats:
+    """Sufficient statistics for one variant: a count and two running totals.
+
+    Each identity contributes one value: 0 or 1 for a conversion metric (so
+    ``sum`` is the conversion count) or an amount for a value metric such as
+    revenue. The mean and spread follow from these totals alone; ``mean`` needs
+    ``n >= 1`` and ``variance`` ``n >= 2``, else they raise ``ZeroDivisionError``.
+    """
+
+    n: int  # number of identities in the variant
+    sum: float  # sum of the per-identity values
+    sum_squares: float  # sum of the squared per-identity values
+
+    @property
+    def mean(self) -> float:
+        return self.sum / self.n
+
+    @property
+    def variance(self) -> float:
+        # Sample variance (n - 1 divisor); float noise below zero is clamped to 0.
+        squared_deviations = self.sum_squares - self.sum**2 / self.n
+        return max(0.0, squared_deviations / (self.n - 1))
+
+
+@dataclass(frozen=True)
+class Inference:
+    lift: float  # relative change vs control, e.g. 0.12 == +12%
+    ci_low: float  # lower bound of the 95% credible interval for the lift
+    ci_high: float  # upper bound of the 95% credible interval for the lift
+    chance_to_win: float  # probability (0–1) that the true lift is above zero
+
+
+def compare_to_control(
+    control: VariantStats,
+    treatment: VariantStats,
+) -> Inference | None:
+    """Estimate the treatment's relative lift over control and its uncertainty.
+
+    Returns ``None`` when either arm has fewer than two identities (no spread to
+    measure) or the control mean is not positive (a relative lift against it is
+    meaningless, and a mean of exactly zero would mean dividing by zero).
+    """
+    if min(control.n, treatment.n) < 2 or control.mean <= 0:
+        return None
+
+    base, trial = control.mean, treatment.mean
+    lift = (trial - base) / base
+    # Delta-method approximation for the variance of a ratio of two sample means;
+    # the arms are independent, so each adds its own term with no covariance.
+    from_treatment = treatment.variance / (treatment.n * base**2)
+    from_control = trial**2 * control.variance / (control.n * base**4)
+    spread = math.sqrt(from_treatment + from_control)
+    if spread == 0:
+        # Zero standard error (e.g. constant values in both arms): lift is exact.
+        sure = 0.5 if lift == 0 else (1.0 if lift > 0 else 0.0)
+        return Inference(lift=lift, ci_low=lift, ci_high=lift, chance_to_win=sure)
+
+    # Model the true lift as normal around `lift` with standard deviation `spread`:
+    # the interval is the central 95%, chance_to_win the share above zero.
+    margin = _Z_95 * spread
+    above_zero = _STANDARD_NORMAL.cdf(lift / spread)
+    return Inference(
+        lift=lift, ci_low=lift - margin, ci_high=lift + margin, chance_to_win=above_zero
+    )
+
+
+def srm_p_value(
+    observed: Sequence[int],
+    expected_shares: Sequence[float],
+) -> float | None:
+    """Sample ratio mismatch check: was traffic split as configured?
+
+    Returns the chi-squared p-value: the probability that random assignment
+    alone drifts from the configured split at least as much as observed (tiny,
+    < 0.001 by convention, means a broken split). Shares are relative weights,
+    rescaled to sum to 1. ``None`` for fewer than two variants, no traffic or a
+    non-positive share; otherwise ``ValueError`` when the two lengths differ.
+    """
+    total = sum(observed)
+    if len(observed) < 2 or total == 0 or any(s <= 0 for s in expected_shares):
+        return None
+
+    # Expected counts must add up to `total` for the statistic to be chi-squared
+    # distributed, so spread `total` over the shares in proportion.
+    per_share = total / sum(expected_shares)
+    expected = [per_share * share for share in expected_shares]
+    statistic = sum((o - e) ** 2 / e for o, e in zip(observed, expected, strict=True))
+    return _chi_squared_survival(statistic, degrees_of_freedom=len(observed) - 1)
+
+
+def _chi_squared_survival(statistic: float, degrees_of_freedom: int) -> float:
+    """Return P(X >= statistic) for X chi-squared with the given degrees of freedom.
+
+    This is the regularised upper incomplete gamma function Q(k/2, x/2). For
+    integer k it is built exactly from Q(1/2, y) = erfc(√y) (odd k) or
+    Q(1, y) = e⁻ʸ (even k), stepping up with Q(a+1, y) = Q(a, y) + yᵃe⁻ʸ/Γ(a+1).
+    A statistic of zero or less means no gap at all, so the result is 1.0.
+    """
+    if statistic <= 0:
+        return 1.0
+    half_statistic = statistic / 2.0
+    half_dof = degrees_of_freedom / 2.0
+    odd = degrees_of_freedom % 2
+    shape = 0.5 if odd else 1.0
+    tail = math.erfc(math.sqrt(half_statistic)) if odd else math.exp(-half_statistic)
+    while shape + 1.0 <= half_dof:
+        log_term = shape * math.log(half_statistic) - half_statistic
+        tail += math.exp(log_term - math.lgamma(shape + 1.0))
+        shape += 1.0
+    return tail
diff --git a/api/tests/unit/experimentation/test_stats.py b/api/tests/unit/experimentation/test_stats.py
new file mode 100644
index 000000000000..9880255d6894
--- /dev/null
+++ b/api/tests/unit/experimentation/test_stats.py
@@ -0,0 +1,183 @@
+import pytest
+
+from experimentation.stats import (
+    Inference,
+    VariantStats,
+    compare_to_control,
+    srm_p_value,
+)
+
+
+def test_variant_stats__sufficient_statistics__derive_mean_and_variance() -> None:
+    # Given a conversion metric: 100 of 1000 identities converted (values 0/1)
+    variant = VariantStats(n=1000, sum=100.0, sum_squares=100.0)
+
+    # When / Then the mean is the rate and the variance uses the n - 1 divisor
+    assert variant.mean == 0.1
+    assert variant.variance == pytest.approx(90.0 / 999.0)
+
+
+def test_variant_stats__float_noise__variance_clamped_to_zero() -> None:
+    # Given totals whose raw variance lands a rounding error below zero
+    noisy = VariantStats(n=2, sum=2.0, sum_squares=1.9999999999999996)
+
+    # When / Then the clamp reports exactly zero rather than a negative spread
+    assert noisy.variance == 0.0
+
+
+def test_compare_to_control__more_conversions__positive_lift_inference() -> None:
+    # Given 1000 identities per arm: control converts 10%, treatment 12%
+    baseline = VariantStats(n=1000, sum=100.0, sum_squares=100.0)
+    variant = VariantStats(n=1000, sum=120.0, sum_squares=120.0)
+
+    # When
+    result = compare_to_control(baseline, variant)
+
+    # Then the lift is +20% with an interval that still straddles zero
+    assert result is not None
+    assert result.lift == pytest.approx(0.2)
+    interval = (result.ci_low, result.ci_high)
+    assert interval == pytest.approx((-0.10074, 0.50074), abs=1e-4)
+    assert result.chance_to_win == pytest.approx(0.90379, abs=1e-4)
+
+
+def test_compare_to_control__identical_arms__chance_is_even() -> None:
+    # Given the same statistics on both sides of the comparison
+    both = VariantStats(n=1000, sum=100.0, sum_squares=100.0)
+
+    # When
+    result = compare_to_control(both, both)
+
+    # Then there is no lift and the interval is symmetric around zero
+    assert result is not None
+    assert result.lift == 0.0
+    assert result.chance_to_win == 0.5
+    assert result.ci_low == pytest.approx(-result.ci_high)
+
+
+@pytest.mark.parametrize(
+    "treatment, expected",
+    [
+        pytest.param(
+            VariantStats(n=10, sum=20.0, sum_squares=40.0),
+            Inference(lift=1.0, ci_low=1.0, ci_high=1.0, chance_to_win=1.0),
+            id="better",
+        ),
+        pytest.param(
+            VariantStats(n=10, sum=5.0, sum_squares=2.5),
+            Inference(lift=-0.5, ci_low=-0.5, ci_high=-0.5, chance_to_win=0.0),
+            id="worse",
+        ),
+        pytest.param(
+            VariantStats(n=10, sum=10.0, sum_squares=10.0),
+            Inference(lift=0.0, ci_low=0.0, ci_high=0.0, chance_to_win=0.5),
+            id="equal",
+        ),
+    ],
+)
+def test_compare_to_control__zero_variance_arms__degenerate_certainty(
+    treatment: VariantStats, expected: Inference
+) -> None:
+    # Given a control whose values are all 1, against an equally constant treatment
+    control = VariantStats(n=10, sum=10.0, sum_squares=10.0)
+    # When / Then the lift is reported as exact, with a collapsed interval
+    assert compare_to_control(control, treatment) == expected
+
+
+def test_compare_to_control__zero_control_mean__returns_none() -> None:
+    # Given a control arm without a single conversion, so its mean is zero
+    empty_handed = VariantStats(n=1000, sum=0.0, sum_squares=0.0)
+    variant = VariantStats(n=1000, sum=120.0, sum_squares=120.0)
+
+    # When / Then a relative lift would divide by zero, so nothing is reported
+    assert compare_to_control(empty_handed, variant) is None
+
+
+def test_compare_to_control__negative_control_mean__returns_none() -> None:
+    # Given a control averaging below zero (e.g. revenue net of refunds)
+    negative_control = VariantStats(n=1000, sum=-50.0, sum_squares=600.0)
+    variant = VariantStats(n=1000, sum=120.0, sum_squares=120.0)
+
+    # When / Then no lift is reported: it is meaningless against that baseline
+    outcome = compare_to_control(negative_control, variant)
+    assert outcome is None
+
+
+@pytest.mark.parametrize(
+    "control_n, treatment_n",
+    [
+        pytest.param(1, 1000, id="control_too_small"),
+        pytest.param(1000, 1, id="treatment_too_small"),
+        pytest.param(0, 1000, id="control_empty"),
+    ],
+)
+def test_compare_to_control__insufficient_observations__returns_none(
+    control_n: int, treatment_n: int
+) -> None:
+    # Given all-ones arms, one with fewer than the two observations variance needs
+    control, treatment = (
+        VariantStats(n=size, sum=float(size), sum_squares=float(size))
+        for size in (control_n, treatment_n)
+    )
+
+    # When / Then
+    assert compare_to_control(control, treatment) is None
+
+
+def test_srm_p_value__balanced_split__no_mismatch() -> None:
+    # Given counts that land exactly on the configured 50/50 split
+    observed, shares = [5000, 5000], [0.5, 0.5]
+
+    # When / Then the statistic is zero, so the p-value is (approximately) one
+    result = srm_p_value(observed, shares)
+    assert result == pytest.approx(1.0)
+
+
+@pytest.mark.parametrize(
+    "observed, shares, expected_p",
+    [
+        # chi-squared = 4.0 on 1 degree of freedom
+        pytest.param([5100, 4900], [0.5] * 2, 0.04550, id="one_dof"),
+        # chi-squared = 2.0 on 2 degrees of freedom, where survival is exp(-1)
+        pytest.param([3400, 3300, 3300], [1 / 3] * 3, 0.36788, id="two_dof"),
+        # chi-squared = 8.0 on 3 degrees of freedom
+        pytest.param([2600, 2500, 2500, 2400], [0.25] * 4, 0.04601, id="three_dof"),
+    ],
+)
+def test_srm_p_value__known_chi_squared__matches_reference(
+    observed: list[int],
+    shares: list[float],
+    expected_p: float,
+) -> None:
+    # Given counts whose chi-squared statistic was worked out by hand
+    # When / Then
+    p_value = srm_p_value(observed, shares)
+    assert p_value == pytest.approx(expected_p, abs=1e-4)
+
+
+def test_srm_p_value__heavy_imbalance__fails_threshold() -> None:
+    # Given a 50/50 configuration that delivered 60/40
+    lopsided = [6000, 4000]
+
+    # When
+    p_value = srm_p_value(lopsided, [0.5, 0.5])
+    # Then the mismatch falls below the conventional 0.001 alarm threshold
+    assert p_value is not None and p_value < 0.001
+
+
+@pytest.mark.parametrize(
+    "observed, shares",
+    [
+        pytest.param([0, 0], [0.5, 0.5], id="no_observations"),
+        pytest.param([5000, 5000], [1.0, 0.0], id="zero_share"),
+        pytest.param([10000], [1.0], id="single_variant"),
+    ],
+)
+def test_srm_p_value__not_computable__returns_none(
+    observed: list[int],
+    shares: list[float],
+) -> None:
+    # Given inputs for which the chi-squared test has no meaning
+    # When / Then
+    p_value = srm_p_value(observed, shares)
+    assert p_value is None
diff --git a/docs/docs/experiment-statistics.md b/docs/docs/experiment-statistics.md
new file mode 100644
index 000000000000..c9093941584e
--- /dev/null
+++ b/docs/docs/experiment-statistics.md
@@ -0,0 +1,147 @@
+---
+title: Experiment Statistics
+sidebar_label: Experiment Statistics
+sidebar_position: 5
+---
+
+Flagsmith's statistics engine answers three questions about an experiment: **Am I winning?** (is the variant better than
+control), **by how much?** (the lift), and **can I trust it?** (is the difference real, and was traffic split fairly).
+This page explains the terms you'll see, in plain language — no statistics background needed.
+
+:::info Coming soon — Enterprise beta
+
+Experiment statistics aren't generally available yet. They are launching as a beta on **Enterprise** plans — to join,
+[get in touch](https://www.flagsmith.com/contact-us). Everything described on this page is part of that upcoming
+release; it previews what the feature will do.
+
+:::
+
+## Terms you'll see
+
+**Experiment** — A controlled comparison: show different versions of a feature to different people, measure which
+performs better.
+
+**Variant** — One version being compared. Each person sees exactly one.
+
+**Control** — The "leave things as they are" variant — your current experience, the baseline everything is measured
+against (its key is the reserved value `control`).
+
+**Treatment** — Any variant that isn't the control — the change you're testing.
+
+**Identity** — One user, device, or account — the individual whose behaviour you measure.
+
+**Exposure** — The moment an identity is shown a variant. The exposure count is how many people entered the experiment.
+
+**Conversion rate** — The percentage of people who did the thing you care about (4.21% ≈ 4 in 100).
+
+**Metric** — What you measure to judge success — checkout rate, revenue, page views.
+
+**Goal metric** — A metric you want to improve (the reason you're running the experiment).
+
+**Guardrail metric** — A metric you want to keep an eye on so the change doesn't make it worse.
+
+**Lift** — How much better or worse a variant did than control, as a percentage. "+11%" means 11% better.
+
+**Credible interval** — Our confidence range — the band we're 95% sure the true lift falls inside. Narrow = precise;
+wide = needs more data.
+
+**Chance to beat control (chance to win)** — The probability a variant is genuinely better than control. "97%" reads
+exactly as it sounds.
+
+**Winning / losing / inconclusive** — The verdict. **Winning** = over 95% chance to win. **Losing** = under 5%.
+**Inconclusive** = can't tell yet, keep collecting.
+
+**Sample ratio mismatch (SRM)** — A health check that flags when traffic wasn't split the way you set it up. A broken
+split means the results can't be trusted.
+
+**Quarantined / excluded identity** — Someone recorded in more than one variant. Set aside so they don't distort the
+counts, and shown as a separate total.
+
+**Collecting data** — Not enough data yet to report a result, so numbers are withheld rather than shown when they'd be
+meaningless.
+
+**Last computed (as-of)** — Results are computed periodically, not on every page load. This timestamp tells you how
+fresh the figures are.
+
+## Exposures
+
+An identity is counted **once**, in the variant it saw **first**. Because exposures are deduplicated by identity and
+keep only the earliest timestamp, duplicate event delivery can't inflate your counts, and each identity lands in the
+time bucket of its first exposure.
+
+The exception: an identity recorded against **more than one** variant (a "flicker", or bucketing that changed
+mid-flight) is **quarantined** — excluded from every variant's count and surfaced as a single excluded-identities
+figure. A small number is normal; a growing one means users are slipping between variants.
+
+The panel shows a headline total, a cumulative chart (one line per variant), a variant table (key, **Control** badge,
+identities, share %), the excluded note, and a "last computed" time.
+
+## Experiment results
+
+For each metric, the scorecard reports how each variant did against control, using a **Bayesian** engine. Three numbers
+per variant:
+
+**Lift** — relative change vs control. Control at 4.21%, variant at 4.68% → `(4.68 − 4.21) / 4.21 ≈ +11%`.
+
+**95% credible interval** — the range we're 95% sure the true lift sits in, drawn as a bar measured against zero:
+
+- **Clear of zero** (e.g. +2% to +20%) → confident the variant is genuinely better (or, on the negative side, worse).
+- **Crosses zero** (e.g. −3% to +14%) → inconclusive; the true lift could be positive or negative. Collect more data.
+
+**Chance to beat control** — the same belief as one number, e.g. _97%_. Over **95%** → **winning**; under **5%** →
+**losing**; in between → **inconclusive**.
+
+:::note Why Bayesian?
+
+"97% chance to beat control" means what it sounds like — unlike a p-value, which is routinely misread. It's also safe to
+check whenever you like: peeking doesn't inflate your error rate the way repeated p-value checks do.
+
+:::
+
+## Sample ratio mismatch (SRM)
+
+Before trusting a result, you need traffic to have split the way you configured. If you set 50/50 but see 9,120 in
+control and 6,400 in the variant, something is broken — and if assignment is broken, every other number is suspect,
+because the groups aren't comparable.
+
+Flagsmith compares the observed split against the configured one and raises a warning when the imbalance is a
+one-in-a-thousand event or rarer (it only checks once there are at least 100 identities). When it fires, **don't act on
+the results** — investigate one-variant crashes, redirects that bypass bucketing, flicker, or dropped events first.
+
+## Collecting data
+
+Statistics on a handful of people are meaningless, so a metric shows **collecting data** until every arm has at least
+**50 identities** (and, for conversion metrics, at least **5 conversions**). This stops you reading a "+300% lift" off
+three conversions.
+
+## Metric types
+
+| Aggregation    | What it measures                               | Example                 |
+| -------------- | ---------------------------------------------- | ----------------------- |
+| **Occurrence** | Did the event happen at least once? (0 or 1)   | Did the user check out? |
+| **Count**      | How many times the event happened per identity | Number of page views    |
+| **Sum**        | The total of a numeric value across events     | Total revenue           |
+| **Mean**       | The average numeric value per identity         | Average order value     |
+
+A metric's **expected direction** (up, down, not-increase, not-decrease) tells Flagsmith which way is "good" and sorts
+metrics into **goals** and **guardrails**.
+
+## Summary
+
+Nothing here is generally available yet — the table shows what the upcoming Enterprise beta will include.
+
+| Capability                                      | Status                |
+| ----------------------------------------------- | --------------------- |
+| Exposures panel (counts, chart, share)          | In the first beta     |
+| First-exposure attribution & duplicate immunity | In the first beta     |
+| Quarantined (multi-variant) identities          | In the first beta     |
+| Metric definitions                              | In the first beta     |
+| Results scorecard: lift, credible interval      | Planned               |
+| Chance to beat control & winning/losing flags   | Planned               |
+| Sample ratio mismatch (SRM) check               | Planned               |
+| Collecting-data floor                           | Planned               |
+| Risk / decision banner / trend chart            | Not currently planned |
+| Frequentist engine                              | Deferred              |
+
+For experiment setup — multivariate flags, bucketing, identities — see
+[Experimentation (A/B Testing)](/experimentation-ab-testing) and [managing identities](/flagsmith-concepts/identities).