-
Notifications
You must be signed in to change notification settings - Fork 3
feat(seo): static sitemap.xml with git-based lastmod #222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
dcrawbuck
wants to merge
3
commits into
main
Choose a base branch
from
dcrawbuck/raleigh-v2
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,269 @@ | ||
| /** | ||
| * Post-build script: generates a static `sitemap.xml` with real `<lastmod>` | ||
| * dates derived from git history. | ||
| * | ||
| * Cloudflare Workers have no filesystem / git at request time, so the dates are | ||
| * resolved here (Node.js, full repo) and baked into a static asset served at | ||
| * `/docs/sitemap.xml` — the same delivery path as `search-index.json`. | ||
| * | ||
| * Usage: bun run scripts/generate-sitemap.ts | ||
| * Called automatically as part of `bun run build` (after `vite build`, which | ||
| * empties `dist/`, so this must run last alongside the other generators). | ||
| */ | ||
|
|
||
| import { execFile } from "node:child_process"; | ||
| import { readFileSync } from "node:fs"; | ||
| import fs from "node:fs/promises"; | ||
| import path from "node:path"; | ||
| import { promisify } from "node:util"; | ||
| import { createServer } from "vite"; | ||
| import react from "@vitejs/plugin-react"; | ||
| import tsConfigPaths from "vite-tsconfig-paths"; | ||
| import mdx from "fumadocs-mdx/vite"; | ||
|
|
||
| /** Promise-returning form of `child_process.execFile`; every `git` invocation in this script goes through it. */ | ||
| const execFileAsync = promisify(execFile); | ||
|
|
||
| /** `dist/client` under the current working directory; the sitemap is written beneath it. */ | ||
| const DIST_CLIENT = path.join(process.cwd(), "dist/client"); | ||
| /** Absolute path of the generated file: `<cwd>/dist/client/docs/sitemap.xml`. */ | ||
| const OUTPUT_PATH = path.join(DIST_CLIENT, "docs/sitemap.xml"); | ||
|
|
||
| /** | ||
| * Sentinel prefixing each commit-date line in the `git log` output so a date | ||
| * line can be told apart from a file-path line (assumes no tracked path begins | ||
| * with this text). | ||
| * | ||
| * It is interpolated into the `--pretty=format:` argument directly before `%cs`; | ||
| * the parser detects date lines with `startsWith` and removes the prefix with | ||
| * `slice` to recover the date. | ||
| */ | ||
| const COMMIT_PREFIX = "@@commit@@"; | ||
|
|
||
| /** | ||
| * Fumadocs `<include>relative/path.mdx</include>` directives. A page's rendered | ||
| * body includes these files, so an edit to a shared include must count toward | ||
| * the page's `lastmod` even though the wrapper file itself didn't change. | ||
| * | ||
| * Capture group 1 is the raw text between the tags (possibly spanning lines; | ||
| * the caller trims it). Attributes on the opening tag are matched but not | ||
| * captured. The `g` flag is required because the pattern is used with | ||
| * `String.prototype.matchAll`. | ||
| */ | ||
| const INCLUDE_PATTERN = /<include[^>]*>([\s\S]*?)<\/include>/g; | ||
|
|
||
| /** | ||
| * Extra tracked data files a page renders through a React component (not via | ||
| * `<include>`), so an edit to the data bumps the page's `lastmod`. Keyed by the | ||
| * wrapper's repo-relative path. Extend this when a page's rendered output depends | ||
| * on a committed data file imported by a component. | ||
| * | ||
| * Values are repo-relative paths. They are appended to the wrapper's dependency | ||
| * list after `<include>` scanning, including when the wrapper could not be read. | ||
| */ | ||
| const SUPPLEMENTAL_SOURCES: Record<string, string[]> = { | ||
| // /docs/changelog renders <ChangelogTimeline/>, which imports this generated, | ||
| // committed JSON — changelog regenerations don't touch the wrapper MDX. | ||
| "content/docs/changelog/index.mdx": ["src/lib/changelog-entries.json"], | ||
| }; | ||
|
|
||
/**
 * Runs a single `git log` over the repository and records, for every file path
 * it lists, the short committer date (YYYY-MM-DD) of the first commit in the
 * output that names that path. Because the log is emitted newest-first, that
 * first sighting is the file's most recent change.
 *
 * Merge commits are excluded and path quoting of non-ASCII names is disabled
 * so the listed paths compare directly against repo-relative source paths.
 *
 * @returns A map from repo-relative path to date. If `git` cannot be run the
 *   failure is logged and an empty map is returned, so the build continues and
 *   the sitemap is simply emitted without `<lastmod>`.
 */
async function buildGitDateMap(): Promise<Map<string, string>> {
  const gitArgs = [
    "-c",
    "core.quotePath=false",
    "log",
    "--no-merges",
    "--name-only",
    `--pretty=format:${COMMIT_PREFIX}%cs`,
  ];

  let logOutput: string;
  try {
    // Large buffer: the full history's name list can be far bigger than the default.
    const result = await execFileAsync("git", gitArgs, { maxBuffer: 1024 * 1024 * 128 });
    logOutput = result.stdout;
  } catch (err) {
    console.warn(
      ` ⚠️ Could not read git history (${(err as Error).message}); ` +
        "emitting sitemap without <lastmod>.",
    );
    return new Map<string, string>();
  }

  const newestByFile = new Map<string, string>();
  let activeDate: string | undefined;

  for (const rawLine of logOutput.split("\n")) {
    if (rawLine.startsWith(COMMIT_PREFIX)) {
      // Commit header line: remember its date for the file names that follow.
      const stamp = rawLine.slice(COMMIT_PREFIX.length).trim();
      activeDate = stamp === "" ? undefined : stamp;
    } else {
      const trackedPath = rawLine.trim();
      // Skip blank separators and names seen before any dated header; keep only
      // the first (newest) date per path.
      if (trackedPath && activeDate && !newestByFile.has(trackedPath)) {
        newestByFile.set(trackedPath, activeDate);
      }
    }
  }

  return newestByFile;
}
|
|
||
/** Memoized results of `directIncludes`, keyed by repo-relative path. */
const includeCache = new Map<string, string[]>();

/**
 * Direct dependencies of a file as repo-relative POSIX paths: its `<include>`
 * targets (for `.mdx`) plus any supplemental component-data files mapped to it.
 *
 * Include targets are reduced to the file they name before being resolved:
 * - a trailing `#section` reference is dropped, since git tracks the file, not
 *   the section, and a path carrying the suffix would never match a git date;
 * - a target whose opening tag carries the `cwd` attribute is resolved from the
 *   working directory (which this script treats as the repo root) instead of
 *   from the including file's directory.
 *
 * @param relPath Repo-relative POSIX path of the file to inspect.
 * @returns The file's direct dependencies; empty when it has none. The array is
 *   cached and shared between calls, so callers must not mutate it.
 */
function directIncludes(relPath: string): string[] {
  const cached = includeCache.get(relPath);
  if (cached) return cached;

  const targets: string[] = [];
  if (relPath.endsWith(".mdx")) {
    try {
      const content = readFileSync(path.join(process.cwd(), relPath), "utf8");
      const fromDir = path.posix.dirname(relPath);
      for (const match of content.matchAll(INCLUDE_PATTERN)) {
        const tag = match[0];
        const raw = (match[1] ?? "").trim();

        // Strip a `#section` suffix so the path names the tracked file itself.
        const hashAt = raw.indexOf("#");
        const file = (hashAt === -1 ? raw : raw.slice(0, hashAt)).trim();
        if (!file) continue;

        // Only the opening tag (text before the first `>`) can carry attributes.
        const openingTag = tag.slice(0, tag.indexOf(">"));
        const fromCwd = /\scwd(?=[\s=/]|$)/.test(openingTag);

        // `join` already normalizes `.` and `..` segments.
        targets.push(path.posix.join(fromCwd ? "." : fromDir, file));
      }
    } catch {
      // Unreadable wrapper: fall back to whatever supplemental sources are mapped.
    }
  }

  targets.push(...(SUPPLEMENTAL_SOURCES[relPath] ?? []));

  includeCache.set(relPath, targets);
  return targets;
}
|
|
||
/**
 * Computes the transitive closure of a page's source files: the declared files
 * plus everything reachable from them through `directIncludes`. Each file is
 * visited once, so include cycles terminate.
 *
 * @param declaredPaths Repo-relative paths the page is declared to come from.
 * @returns Every reachable path, each appearing once, in visit order.
 */
function expandSources(declaredPaths: string[]): string[] {
  const seen = new Set<string>();
  const pending = declaredPaths.slice();

  while (pending.length !== 0) {
    const candidate = pending.pop();
    if (candidate && !seen.has(candidate)) {
      seen.add(candidate);
      // Queue only dependencies that have not been visited yet.
      const unseen = directIncludes(candidate).filter((dep) => !seen.has(dep));
      pending.push(...unseen);
    }
  }

  return Array.from(seen);
}
|
|
||
/**
 * Picks the newest date recorded for any of the given source files.
 *
 * @param sourcePaths Repo-relative paths contributing to one page.
 * @param dateMap Path → `YYYY-MM-DD` date lookup.
 * @returns The greatest date found, or `undefined` when none of the paths has
 *   a (non-empty) entry in `dateMap`.
 */
function resolveLastModified(
  sourcePaths: string[],
  dateMap: Map<string, string>,
): string | undefined {
  return sourcePaths.reduce<string | undefined>((newest, file) => {
    const candidate = dateMap.get(file);
    if (!candidate) return newest;
    // ISO `YYYY-MM-DD` strings order the same way as the dates they encode.
    return newest && newest >= candidate ? newest : candidate;
  }, undefined);
}
|
|
||
/**
 * Ensure full git history is available, deepening a shallow clone if needed.
 * Deploy environments like Cloudflare Workers Builds shallow-clone with no
 * fetch-depth setting, so `git fetch --unshallow` is the only way to get real
 * per-file dates there (anonymous fetch works because the repo is public).
 *
 * Returns false — so the caller omits <lastmod> rather than publishing clustered,
 * misleading dates — whenever full history can't be *confirmed*: the depth probe
 * fails, the probe prints anything other than `true`/`false`, or a shallow clone
 * can't be deepened. Never fails the build.
 *
 * @returns `true` only when the repository is known to have complete history.
 */
async function ensureFullHistory(): Promise<boolean> {
  let probe: string;
  try {
    const { stdout } = await execFileAsync("git", ["rev-parse", "--is-shallow-repository"]);
    probe = stdout.trim();
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(
      ` ⚠️ Could not determine clone depth (${reason}); ` +
        "omitting <lastmod> rather than risk inaccurate dates.",
    );
    return false;
  }

  // Git releases that predate `--is-shallow-repository` echo the unknown flag
  // back instead of answering, so only a literal "false" confirms full history.
  if (probe !== "true" && probe !== "false") {
    console.warn(
      ` ⚠️ Could not determine clone depth (unexpected output ${JSON.stringify(probe)}); ` +
        "omitting <lastmod> rather than risk inaccurate dates.",
    );
    return false;
  }

  if (probe === "false") return true;

  console.warn(" ⚠️ Shallow clone — fetching full history with `git fetch --unshallow`…");
  try {
    // A successful --unshallow converts the clone to complete history.
    await execFileAsync("git", ["fetch", "--unshallow", "--quiet"], { timeout: 180_000 });
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(` ⚠️ Could not deepen history (${reason}).`);
    return false;
  }

  console.log(" ✓ Fetched full git history.");
  return true;
}
|
|
||
/**
 * Script entry point. Starts a throwaway Vite SSR server to load the project's
 * `source` and `sitemap` modules, derives each page's source files (including
 * transitive includes), attaches git-based last-modified dates when full
 * history is available, and writes the resulting XML to `OUTPUT_PATH`.
 *
 * The server is always closed, even when generation fails.
 */
async function main() {
  console.log("Generating static sitemap.xml…");

  // Lightweight Vite SSR server (no Cloudflare plugin) to resolve the fumadocs
  // virtual modules, mirroring the other post-build generators.
  const sourceConfig = await import("../source.config");
  const server = await createServer({
    configFile: false,
    logLevel: "error",
    server: { port: 0, host: "127.0.0.1" },
    resolve: {
      alias: { "@": path.resolve(process.cwd(), "./src") },
    },
    plugins: [mdx(sourceConfig), tsConfigPaths({ projects: ["./tsconfig.json"] }), react()],
  });

  try {
    const { source } = await server.ssrLoadModule("./src/lib/source");
    const { getSitemapSourceEntries, attachLastModified, buildSitemapXml } =
      await server.ssrLoadModule("./src/lib/sitemap");

    // Each docs page starts with one declared source: its own file under content/docs.
    const pages = source.getPages() as Array<{ url: string; path: string }>;
    const contentPages = pages.map(({ url, path: pagePath }) => ({
      url,
      sourcePaths: [`content/docs/${pagePath}`],
    }));

    const sourceEntries = getSitemapSourceEntries(contentPages).map((entry) => ({
      ...entry,
      sourcePaths: expandSources(entry.sourcePaths),
    }));

    // Real dates need full history. If we can't get it (e.g. an un-deepenable
    // shallow clone), omit <lastmod> rather than publish one wrong date — and
    // never fail the build over it.
    const historyAvailable = await ensureFullHistory();
    if (!historyAvailable) {
      console.warn(" ⚠️ Omitting <lastmod>: full git history unavailable.");
    }
    const dateMap = historyAvailable ? await buildGitDateMap() : new Map<string, string>();

    const entries = attachLastModified(sourceEntries, (sourcePaths) =>
      resolveLastModified(sourcePaths, dateMap),
    );

    await fs.mkdir(path.dirname(OUTPUT_PATH), { recursive: true });
    await fs.writeFile(OUTPUT_PATH, buildSitemapXml(entries));

    // Summary: how many URLs were written and how many carry a date.
    const withDates = entries.filter((entry) => entry.lastModified).length;
    const undated = entries.length - withDates;
    console.log(
      ` ✓ sitemap.xml: ${entries.length} urls (${withDates} with <lastmod>) → ` +
        `${path.relative(process.cwd(), OUTPUT_PATH)}`,
    );
    if (undated > 0) {
      console.log(` ℹ ${undated} url(s) had no git date and omit <lastmod>.`);
    }
  } finally {
    await server.close();
  }
}

// Any failure that escapes main is reported and turned into a non-zero exit code.
main().catch((err) => {
  console.error("Sitemap generation failed:", err);
  process.exit(1);
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.