Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"dev": "vite dev",
"dev:port": "vite dev --port",
"prebuild": "bun run generate:changelog && bun run scripts/copy-docs-images.cjs",
"build": "NODE_OPTIONS=--max-old-space-size=5120 vite build && bun run scripts/generate-static-cache.ts && bun run scripts/generate-search-index.ts",
"build": "NODE_OPTIONS=--max-old-space-size=5120 vite build && bun run scripts/generate-static-cache.ts && bun run scripts/generate-search-index.ts && bun run scripts/generate-sitemap.ts",
"build:cf": "bun run build",
"build:cf:staging": "CLOUDFLARE_ENV=staging bun run build",
"sync:mixedbread": "mxbai vs sync $MIXEDBREAD_STORE_ID './content/docs' --ci",
Expand Down
269 changes: 269 additions & 0 deletions scripts/generate-sitemap.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@
/**
* Post-build script: generates a static `sitemap.xml` with real `<lastmod>`
* dates derived from git history.
*
* Cloudflare Workers have no filesystem / git at request time, so the dates are
* resolved here (Node.js, full repo) and baked into a static asset served at
* `/docs/sitemap.xml` — the same delivery path as `search-index.json`.
*
* Usage: bun run scripts/generate-sitemap.ts
* Called automatically as part of `bun run build` (after `vite build`, which
* empties `dist/`, so this must run last alongside the other generators).
*/

import { execFile } from "node:child_process";
import { readFileSync } from "node:fs";
import fs from "node:fs/promises";
import path from "node:path";
import { promisify } from "node:util";
import { createServer } from "vite";
import react from "@vitejs/plugin-react";
import tsConfigPaths from "vite-tsconfig-paths";
import mdx from "fumadocs-mdx/vite";

/** Promise-returning wrapper around `execFile`, used for every `git` invocation in this script. */
const execFileAsync = promisify(execFile);

/** Client build output directory, resolved against the current working directory. */
const DIST_CLIENT = path.join(process.cwd(), "dist/client");
/** Destination of the generated sitemap inside the client build output. */
const OUTPUT_PATH = path.join(DIST_CLIENT, "docs/sitemap.xml");

/**
* Sentinel prefixing each commit-date line in the `git log` output so it can
* never be mistaken for a file path (no tracked path begins with it).
*/
const COMMIT_PREFIX = "@@commit@@";

/**
* Fumadocs `<include>relative/path.mdx</include>` directives. A page's rendered
* body includes these files, so an edit to a shared include must count toward
* the page's `lastmod` even though the wrapper file itself didn't change.
*
* Capture group 1 is the raw text between the tags; attributes on the opening
* tag are matched but not captured.
*/
const INCLUDE_PATTERN = /<include[^>]*>([\s\S]*?)<\/include>/g;

/**
* Extra tracked data files a page renders through a React component (not via
* `<include>`), so an edit to the data bumps the page's `lastmod`. Keyed by the
* wrapper's repo-relative path. Extend this when a page's rendered output depends
* on a committed data file imported by a component.
*/
const SUPPLEMENTAL_SOURCES: Record<string, string[]> = {
// /docs/changelog renders <ChangelogTimeline/>, which imports this generated,
// committed JSON — changelog regenerations don't touch the wrapper MDX.
"content/docs/changelog/index.mdx": ["src/lib/changelog-entries.json"],
};
Comment thread
dcrawbuck marked this conversation as resolved.

/**
 * Builds a lookup from file path (as printed by `git log --name-only`) to the
 * committer date, formatted YYYY-MM-DD, of the newest non-merge commit that
 * touched that file.
 *
 * A single `git log` run is parsed top to bottom. Because git prints the newest
 * commit first, the first date recorded for a path is the one that is kept.
 *
 * @returns The path → date map. If `git` cannot be run, a warning is logged and
 *   an empty map is returned, so the build continues with a date-less sitemap.
 */
async function buildGitDateMap(): Promise<Map<string, string>> {
  const latestByFile = new Map<string, string>();

  const gitArgs = [
    "-c",
    "core.quotePath=false",
    "log",
    "--no-merges",
    "--name-only",
    `--pretty=format:${COMMIT_PREFIX}%cs`,
  ];

  let logOutput: string;
  try {
    const result = await execFileAsync("git", gitArgs, { maxBuffer: 1024 * 1024 * 128 });
    logOutput = result.stdout;
  } catch (err) {
    const reason = (err as Error).message;
    console.warn(` ⚠️ Could not read git history (${reason}); emitting sitemap without <lastmod>.`);
    return latestByFile;
  }

  // Date of the commit whose file list is currently being read.
  let activeDate: string | undefined;
  for (const rawLine of logOutput.split("\n")) {
    if (rawLine.startsWith(COMMIT_PREFIX)) {
      const stamp = rawLine.slice(COMMIT_PREFIX.length).trim();
      activeDate = stamp === "" ? undefined : stamp;
      continue;
    }

    const filePath = rawLine.trim();
    // Skip blank separator lines and any path seen before a valid commit header.
    if (filePath === "" || activeDate === undefined) continue;
    // An earlier (newer) commit already claimed this path.
    if (latestByFile.has(filePath)) continue;
    latestByFile.set(filePath, activeDate);
  }

  return latestByFile;
}

/** Memo of dependency lists already computed by `directIncludes`, keyed by repo-relative path. */
const includeCache = new Map<string, string[]>();

/**
 * Lists the files a given file depends on directly, as repo-relative POSIX
 * paths: every `<include>` target found in an `.mdx` file (resolved against the
 * file's own directory) followed by any entries registered for it in
 * `SUPPLEMENTAL_SOURCES`.
 *
 * Results are memoized per path. A file that cannot be read contributes no
 * include targets but still receives its supplemental entries.
 *
 * @param relPath Repo-relative path of the file to inspect.
 * @returns The direct dependencies; the same array instance is returned on
 *   repeated calls for the same path.
 */
function directIncludes(relPath: string): string[] {
  const memo = includeCache.get(relPath);
  if (memo !== undefined) return memo;

  const deps: string[] = [];

  if (relPath.endsWith(".mdx")) {
    try {
      const body = readFileSync(path.join(process.cwd(), relPath), "utf8");
      const baseDir = path.posix.dirname(relPath);
      for (const [, captured] of body.matchAll(INCLUDE_PATTERN)) {
        const target = captured.trim();
        if (target === "") continue;
        deps.push(path.posix.normalize(path.posix.join(baseDir, target)));
      }
    } catch {
      // The wrapper could not be read; only the supplemental entries below apply.
    }
  }

  for (const extra of SUPPLEMENTAL_SOURCES[relPath] ?? []) {
    deps.push(extra);
  }

  includeCache.set(relPath, deps);
  return deps;
}

/**
 * Computes the transitive closure of a page's source files: the declared paths
 * plus everything reachable from them through `directIncludes`.
 *
 * Each path is visited at most once, so include cycles terminate. Empty path
 * strings are ignored.
 *
 * @param declaredPaths Repo-relative paths declared as the page's sources.
 * @returns Every distinct path reached, in the order it was first visited.
 */
function expandSources(declaredPaths: string[]): string[] {
  const visited = new Set<string>();
  const pending = Array.from(declaredPaths);

  while (pending.length !== 0) {
    const next = pending.pop();
    if (!next) continue;
    if (visited.has(next)) continue;

    visited.add(next);
    const unseen = directIncludes(next).filter((dep) => !visited.has(dep));
    pending.push(...unseen);
  }

  return Array.from(visited);
}

/**
 * Picks the newest date recorded for any of the given source files.
 *
 * Dates are compared as strings, which is correct for the YYYY-MM-DD format
 * because it sorts lexicographically in chronological order.
 *
 * @param sourcePaths Paths to look up in `dateMap`.
 * @param dateMap Path → date lookup; paths that are missing or map to an empty
 *   string are ignored.
 * @returns The greatest date found, or `undefined` when no path has one.
 */
function resolveLastModified(
  sourcePaths: string[],
  dateMap: Map<string, string>,
): string | undefined {
  return sourcePaths.reduce<string | undefined>((newest, sourcePath) => {
    const candidate = dateMap.get(sourcePath);
    if (!candidate) return newest;
    return !newest || candidate > newest ? candidate : newest;
  }, undefined);
}

/**
 * Makes sure the repository has complete git history, deepening a shallow clone
 * with `git fetch --unshallow` when necessary.
 *
 * Deploy environments like Cloudflare Workers Builds shallow-clone with no
 * fetch-depth setting, so unshallowing is the only way to obtain real per-file
 * dates there (anonymous fetch works because the repo is public).
 *
 * @returns `true` when the clone is already complete or was deepened
 *   successfully. `false` when the shallow-repository probe fails or the fetch
 *   fails, so the caller can omit `<lastmod>` instead of publishing clustered,
 *   misleading dates. This function logs and returns; it never throws on a git
 *   failure.
 */
async function ensureFullHistory(): Promise<boolean> {
  let needsDeepening: boolean;
  try {
    const probe = await execFileAsync("git", ["rev-parse", "--is-shallow-repository"]);
    needsDeepening = probe.stdout.trim() === "true";
  } catch (err) {
    const reason = (err as Error).message;
    console.warn(
      ` ⚠️ Could not determine clone depth (${reason}); omitting <lastmod> rather than risk inaccurate dates.`,
    );
    return false;
  }

  if (needsDeepening) {
    console.warn(" ⚠️ Shallow clone — fetching full history with `git fetch --unshallow`…");
    try {
      // On success the clone is converted to complete history.
      await execFileAsync("git", ["fetch", "--unshallow", "--quiet"], { timeout: 180_000 });
    } catch (err) {
      console.warn(` ⚠️ Could not deepen history (${(err as Error).message}).`);
      return false;
    }
    console.log(" ✓ Fetched full git history.");
  }

  return true;
}

/**
 * Script entry point: generates the static sitemap and writes it to `OUTPUT_PATH`.
 *
 * Steps:
 * 1. Start a lightweight Vite SSR server so the fumadocs virtual modules resolve.
 * 2. Load the page list and the sitemap helpers through that server.
 * 3. Expand each entry's source paths to include transitive dependencies.
 * 4. Resolve per-entry dates from git history, when full history is available.
 * 5. Render the XML and write it, creating the output directory if needed.
 *
 * Git problems never fail the build; they only cause `<lastmod>` to be omitted.
 * Errors from Vite, module loading, or the filesystem propagate to the caller.
 * The Vite server is always closed, whether or not generation succeeds.
 */
async function main(): Promise<void> {
  console.log("Generating static sitemap.xml…");

  // Lightweight Vite SSR server (no Cloudflare plugin) to resolve the fumadocs
  // virtual modules, mirroring the other post-build generators.
  const server = await createServer({
    configFile: false,
    logLevel: "error",
    server: { port: 0, host: "127.0.0.1" },
    resolve: {
      alias: { "@": path.resolve(process.cwd(), "./src") },
    },
    plugins: [
      mdx(await import("../source.config")),
      tsConfigPaths({ projects: ["./tsconfig.json"] }),
      react(),
    ],
  });

  try {
    const { source } = await server.ssrLoadModule("./src/lib/source");
    const { getSitemapSourceEntries, attachLastModified, buildSitemapXml } =
      await server.ssrLoadModule("./src/lib/sitemap");

    // Each content page is backed by exactly one file under content/docs.
    const pages = source.getPages() as Array<{ url: string; path: string }>;
    const contentPages = pages.map((page) => ({
      url: page.url,
      sourcePaths: [`content/docs/${page.path}`],
    }));

    const sourceEntries = getSitemapSourceEntries(contentPages).map((entry) => ({
      ...entry,
      sourcePaths: expandSources(entry.sourcePaths),
    }));

    // Real dates need full history. If we can't get it (e.g. an un-deepenable
    // shallow clone), omit <lastmod> rather than publish one wrong date — and
    // never fail the build over it.
    let dateMap = new Map<string, string>();
    if (await ensureFullHistory()) {
      dateMap = await buildGitDateMap();
    } else {
      console.warn(" ⚠️ Omitting <lastmod>: full git history unavailable.");
    }

    const entries = attachLastModified(sourceEntries, (sourcePaths) =>
      resolveLastModified(sourcePaths, dateMap),
    );

    const xml = buildSitemapXml(entries);

    await fs.mkdir(path.dirname(OUTPUT_PATH), { recursive: true });
    await fs.writeFile(OUTPUT_PATH, xml);

    const withDates = entries.filter((entry) => entry.lastModified).length;
    console.log(
      ` ✓ sitemap.xml: ${entries.length} urls (${withDates} with <lastmod>) → ` +
        `${path.relative(process.cwd(), OUTPUT_PATH)}`,
    );
    if (withDates < entries.length) {
      console.log(` ℹ ${entries.length - withDates} url(s) had no git date and omit <lastmod>.`);
    }
  } finally {
    // Release the SSR server even when generation throws.
    await server.close();
  }
}

// Run the generator. Any error that escapes main() is logged and the process
// exits with status 1, so the surrounding `&&`-chained build command stops.
main().catch((err) => {
console.error("Sitemap generation failed:", err);
process.exit(1);
});
89 changes: 74 additions & 15 deletions src/lib/seo-routes.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,27 @@
import assert from "node:assert/strict";
import { describe, test } from "node:test";
import { buildRobotsTxt } from "../routes/robots[.]txt";
import { buildSitemapXml, getSitemapEntries } from "./sitemap";
import {
attachLastModified,
buildSitemapXml,
getSitemapSourceEntries,
type SitemapSourceEntry,
} from "./sitemap";

// Fixture content pages shared by the sitemap tests: two index pages (ios,
// android) and one nested page, each backed by a single content file.
const CONTENT_PAGES = [
{ url: "/docs/ios", sourcePaths: ["content/docs/ios/index.mdx"] },
{ url: "/docs/android", sourcePaths: ["content/docs/android/index.mdx"] },
{
url: "/docs/ios/quickstart/install",
sourcePaths: ["content/docs/ios/quickstart/install.mdx"],
},
];

/**
 * Returns the first entry whose `url` equals the given URL, failing the
 * assertion with a descriptive message when no entry matches.
 */
function entryFor(entries: SitemapSourceEntry[], url: string) {
  const [found] = entries.filter(({ url: candidateUrl }) => candidateUrl === url);
  assert.ok(found, `expected sitemap entry for ${url}`);
  return found;
}

describe("seo routes", () => {
test("buildRobotsTxt includes sitemap declaration", () => {
Expand All @@ -12,28 +32,67 @@ describe("seo routes", () => {
assert.match(robots, /^Sitemap: https:\/\/superwall\.com\/docs\/sitemap\.xml$/m);
});

test("getSitemapEntries includes docs root and generated pages", () => {
const entries = getSitemapEntries(
["/docs/ios/quickstart/install", "/docs/android/quickstart/install"],
new Date("2026-03-02T00:00:00.000Z"),
);
test("getSitemapSourceEntries includes the docs root and content pages", () => {
const entries = getSitemapSourceEntries(CONTENT_PAGES);
const urls = entries.map((entry) => entry.url);

assert.ok(urls.includes("https://superwall.com/docs/"));
assert.ok(urls.some((url) => url.startsWith("https://superwall.com/docs/ios")));
assert.ok(urls.some((url) => url.startsWith("https://superwall.com/docs/android")));
assert.ok(urls.includes("https://superwall.com/docs/ios"));
assert.ok(urls.includes("https://superwall.com/docs/android"));
});

test("docs root is backed by its route component, not a content file", () => {
const entries = getSitemapSourceEntries(CONTENT_PAGES);
const root = entryFor(entries, "https://superwall.com/docs/");

assert.equal(root.priority, 1.0);
assert.deepEqual(root.sourcePaths, ["src/routes/index.tsx"]);
});

test("landing pages keep their content source but get a bumped priority", () => {
const entries = getSitemapSourceEntries(CONTENT_PAGES);
const ios = entryFor(entries, "https://superwall.com/docs/ios");

// Priority bumped from the 0.8 content default to the 0.9 landing priority…
assert.equal(ios.priority, 0.9);
// …while the content file remains the single source of truth for the date.
assert.deepEqual(ios.sourcePaths, ["content/docs/ios/index.mdx"]);
});

test("buildSitemapXml renders XML urlset output", () => {
const entries = getSitemapEntries(
["/docs/ios/quickstart/install"],
new Date("2026-03-02T00:00:00.000Z"),
test("entries are sorted by priority descending", () => {
const entries = getSitemapSourceEntries(CONTENT_PAGES);
const priorities = entries.map((entry) => entry.priority);
const sorted = [...priorities].sort((a, b) => b - a);

assert.deepEqual(priorities, sorted);
});

test("attachLastModified omits the date when the source has no git history", () => {
const entries = getSitemapSourceEntries(CONTENT_PAGES);
const dated = attachLastModified(entries, (sourcePaths) =>
sourcePaths.includes("content/docs/ios/index.mdx") ? "2026-03-02" : undefined,
);
const xml = buildSitemapXml(entries.slice(0, 2));

const ios = dated.find((entry) => entry.url === "https://superwall.com/docs/ios");
const android = dated.find((entry) => entry.url === "https://superwall.com/docs/android");

assert.equal(ios?.lastModified, "2026-03-02");
assert.equal(android?.lastModified, undefined);
});

test("buildSitemapXml renders urlset output and only emits known lastmod", () => {
const entries = attachLastModified(getSitemapSourceEntries(CONTENT_PAGES), (sourcePaths) =>
sourcePaths.includes("content/docs/ios/index.mdx") ? "2026-03-02" : undefined,
);
const xml = buildSitemapXml(entries);

assert.match(xml, /^<\?xml version="1\.0" encoding="UTF-8"\?>/);
assert.match(xml, /<urlset xmlns="http:\/\/www\.sitemaps\.org\/schemas\/sitemap\/0\.9">/);
assert.match(xml, /<loc>https:\/\/superwall\.com\/docs\//);
assert.match(xml, /<lastmod>2026-03-02T00:00:00\.000Z<\/lastmod>/);
assert.match(xml, /<loc>https:\/\/superwall\.com\/docs\/ios<\/loc>/);
assert.match(xml, /<lastmod>2026-03-02<\/lastmod>/);

// A single <lastmod> for ios; entries without a known date emit none.
const lastmodCount = xml.match(/<lastmod>/g)?.length ?? 0;
assert.equal(lastmodCount, 1);
});
});
Loading
Loading