johnsoncodehk · theoephraim · Jun 6, 2026
diff --git a/src/api.ts b/src/api.ts
@@ -1,4 +1,4 @@
-import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, NewlineConfig, TokenPattern } from './types.ts';
+import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, NewlineConfig, StringInterpolation, TokenPattern } from './types.ts';
 import {
   altPattern, anyChar, followedBy, isTokenPattern, lit, never, noneOf, notFollowedBy,
   notPrecededBy, oneOf, optPattern, plus, precededBy, range, repeat,
@@ -17,6 +17,9 @@ interface TokenOptions {
   skip?: boolean;
   scope?: string;
   escape?: TokenPatternInput;
+  // Highlight-only interpolation regions for ordinary string tokens (e.g. env-spec `${…}` / `$(…)`).
+  // The parser/lexer stay token-based; generators re-express these as nested regions.
+  interpolation?: StringInterpolation | StringInterpolation[];
   // A regex matching exactly one well-formed escape sequence. Engine-scanned tokens
   // (templates) validate each `\`-escape against it and reject any that don't match —
   // unlike `escape` (highlight-only), this drives tokenization. Skipped in tag
@@ -414,6 +417,9 @@ export function defineGrammar(config: GrammarConfig): CstGrammar & { name: strin
       flags,
       scope: tok.opts.scope,
       escapePattern: tok.opts.escape,
+      interpolation: tok.opts.interpolation
+        ? (Array.isArray(tok.opts.interpolation) ? tok.opts.interpolation : [tok.opts.interpolation]).map((i) => ({ ...i }))
+        : undefined,
       escapeValidPattern: tok.opts.escapeValid,
       embed: tok.opts.embed,
       identifier: tok.opts.identifier,

diff --git a/src/gen-lexer.ts b/src/gen-lexer.ts
@@ -966,7 +966,7 @@ export function createLexer(grammar: CstGrammar) {
           //  • LINE-LEAD at the document root (a bare top-level `"a\nb`, or `---\n"a\nb`) → -1.
           // Blank (whitespace-only) continuation lines are skipped — they are folded line breaks, legal
           // at any column. Flow is exempt (indentation suspended). yaml-test-suite DK95[1] / QB6E.
-          if (tm.isString && indent && flowDepth === 0 && m[0].includes('\n')) {
+          if (tm.isString && indent?.blockScalar && flowDepth === 0 && m[0].includes('\n')) {
             const prevT = tokens[tokens.length - 1];
             const prevIsDocMarker = !!prevT && blockScalarDocMarkers.includes(prevT.text);
             let parentCol: number;

diff --git a/src/gen-monarch.ts b/src/gen-monarch.ts
@@ -488,6 +488,11 @@ export function generateMonarch(grammar: CstGrammar): MonarchLanguage {
 
   const stringTopRules: MonarchRule[] = [];      // entered from root/value
   const stringNestedRules: MonarchRule[] = [];   // entered from interpolation holes
+  // Highlight-only string interpolation regions (e.g. env-spec `${…}` / `$(…)`): per region we add a
+  // begin rule into the string body and build a dedicated interp state (re-enter the expression body,
+  // pop on the region's end). Specs are collected here; the states are built after templates, once the
+  // nested string/template rules they include are populated.
+  const interpStateSpecs: { name: string; end: string }[] = [];
 
   for (const t of grammar.tokens) {
     if (t.flags.includes('skip') || t.flags.includes('regex') || t.template) continue;
@@ -505,7 +510,19 @@ export function generateMonarch(grammar: CstGrammar): MonarchLanguage {
         const body: MonarchRule[] = [];
         const escapePattern = tokenEscapePatternSource(t);
         if (escapePattern) body.push([anchoredSource(escapePattern), 'string.escape']);
-        body.push([`[^${escapeForCharClass(delim[0])}\\\\]+`, tok]);
+        // Interpolation openers come BEFORE the content run so they win; the content run then excludes
+        // any position that begins an interpolation (negative lookahead) so it can't swallow `${`.
+        const interps = t.interpolation ?? [];
+        interps.forEach((interp, i) => {
+          const name = `string_interp_${suffix}_${i + 1}`;
+          body.push([interp.begin, { token: 'delimiter.bracket', next: `@${name}` }]);
+          interpStateSpecs.push({ name, end: interp.end });
+        });
+        const dc = escapeForCharClass(delim[0]);
+        const content = interps.length
+          ? `(?:(?!${interps.map(p => p.begin).join('|')})[^${dc}\\\\])+`
+          : `[^${dc}\\\\]+`;
+        body.push([content, tok]);
         body.push(['\\\\.', 'string.escape']);
         tokenizer[bodyState] = body;
       }
@@ -591,6 +608,28 @@ export function generateMonarch(grammar: CstGrammar): MonarchLanguage {
     ];
   }
 
+  // String-interpolation states (collected in the string loop above). Built here, after templates,
+  // so the nested string/template rules they include are populated; `@interpExprBody` is a lazy
+  // include resolved by Monarch. A bare `{` pushes a brace-counting frame (shared with templates).
+  if (interpStateSpecs.length) {
+    if (!tokenizer['bracketCounting']) {
+      tokenizer['bracketCounting'] = [
+        wsRule, ...commentRules, ...stringNestedRules, ...templateNestedRules,
+        ['\\{', { token: 'delimiter.bracket', next: '@bracketCounting' }],
+        ['\\}', { token: 'delimiter.bracket', next: '@pop' }],
+        { include: '@interpExprBody' },
+      ];
+    }
+    for (const spec of interpStateSpecs) {
+      tokenizer[spec.name] = [
+        wsRule, ...commentRules, ...stringNestedRules, ...templateNestedRules,
+        ['\\{', { token: 'delimiter.bracket', next: '@bracketCounting' }],
+        [spec.end, { token: 'delimiter.bracket', next: '@pop' }],
+        { include: '@interpExprBody' },
+      ];
+    }
+  }
+
   // ── Numbers (most-specific first; token decl order encodes specificity) ──
   const numberRules: MonarchRule[] = [];
   for (const t of grammar.tokens) {

diff --git a/src/gen-tm.ts b/src/gen-tm.ts
@@ -4464,12 +4464,26 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra
     } else if (tokenEscapePatternSource(tok) && scope.startsWith('string.')) {
       // String with escape sequences: generate begin/end for each delimiter
       const escapePat: TmPattern = { match: tokenEscapePatternSource(tok)!, name: `constant.character.escape.${langName}` };
+      // Highlight-only interpolation regions (e.g. env-spec `${…}` / `$(…)`): each becomes a nested
+      // begin/end region — the same shape a template literal's hole gets. `begin`/`end` are
+      // author-supplied regex SOURCES (not literals), so they are NOT re-escaped here.
+      const interpPats: TmPattern[] = (tok.interpolation ?? []).map((interp) => {
+        const p: TmPattern = { begin: interp.begin, end: interp.end, patterns: [{ include: interp.include ?? '$self' }] };
+        if (interp.beginScope) p.beginCaptures = { '0': { name: `${interp.beginScope}.${langName}` } };
+        if (interp.endScope) p.endCaptures = { '0': { name: `${interp.endScope}.${langName}` } };
+        if (interp.contentScope) p.name = `${interp.contentScope}.${langName}`;
+        return p;
+      });
+      const stringPats: (TmPattern | { include: string })[] = [escapePat, ...interpPats];
       const delimiters: [string, string][] = [];
+      // Drive the delimiter scope off the EXTRACTED delimiter generically: `"`/`'` keep their
+      // canonical scopes; any other delimiter (e.g. a backtick string) takes the token's own scope
+      // instead of the old loop's `"`-fallback (which mis-delimited backtick strings).
+      const scopeForDelim = (d: string) => d === '"' ? 'string.quoted.double' : d === "'" ? 'string.quoted.single' : scope;
       for (const delim of tokenPatternStringDelimiters(tok)) {
-        if (delim === '"') delimiters.push(['"', 'string.quoted.double']);
-        else if (delim === "'") delimiters.push(["'", 'string.quoted.single']);
+        delimiters.push([delim, scopeForDelim(delim)]);
       }
-      if (delimiters.length === 0) delimiters.push(['"', scope]); // fallback
+      if (delimiters.length === 0) delimiters.push(['"', scope]); // fallback: no delimiter extractable
 
       if (delimiters.length === 1) {
         const [delim, delimScope] = delimiters[0];
@@ -4479,7 +4493,7 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra
           beginCaptures: { '0': { name: `punctuation.definition.string.begin.${langName}` } },
           end: `${escapeRegex(delim)}|$`,
           endCaptures: { '0': { name: `punctuation.definition.string.end.${langName}` } },
-          patterns: [escapePat],
+          patterns: stringPats,
         };
         topPatterns.push({ include: `#${key}` });
         rememberLiteralKey(delimScope, key, tok.name);
@@ -4493,7 +4507,7 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra
             beginCaptures: { '0': { name: `punctuation.definition.string.begin.${langName}` } },
             end: `${escapeRegex(delim)}|$`,
             endCaptures: { '0': { name: `punctuation.definition.string.end.${langName}` } },
-            patterns: [escapePat],
+            patterns: stringPats,
           };
           topPatterns.push({ include: `#${subKey}` });
           rememberLiteralKey(delimScope, subKey, tok.name);

diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts
@@ -149,6 +149,9 @@ interface GrammarJsContext {
    * `template_chars` token. `null` when no template token exists.
    */
   templatePlan: TemplatePlan | null;
+  /** String tokens carrying highlight-only interpolation regions, each re-expressed as a rule
+   *  backed by an external `<rule>_chars` token (parallel to `templatePlan`). Empty if none. */
+  interpolationPlans: InterpolationPlan[];
   /**
    * Ref nodes (the identifier right after a definition keyword) that should be
    * wrapped in `field('name', …)` so highlights.scm can target them with the
@@ -358,6 +361,8 @@ function buildTokenBody(name: string, ctx: GrammarJsContext): string | null {
   // The interpolated-template token is re-expressed as a `template` RULE (with
   // `${ … }` holes that re-enter the expression grammar), emitted separately.
   if (ctx.templatePlan && ctx.templatePlan.tokenName === name) return null;
+  // A string token with interpolation regions is likewise re-expressed as a rule (emitted separately).
+  if (ctx.interpolationPlans.some(ip => ip.tokenName === name)) return null;
   // Skip-flagged tokens (comments, whitespace) go in `extras`, not as a named
   // rule reference — but we still emit them so highlights can capture comments.
   // tree-sitter's token() DFA rejects zero-width assertions, so strip them first.
@@ -538,6 +543,50 @@ function planTemplate(grammar: CstGrammar): TemplatePlan | null {
   };
 }
 
+/**
+ * Plan for one string token carrying highlight-only interpolation regions (e.g. env-spec `${…}` /
+ * `$(…)`). The token is re-expressed as a tree-sitter RULE (open delim + chars/interpolation runs +
+ * close delim) — the same shape a template literal gets. The literal text between regions is an
+ * external `<rule>_chars` token (the scanner stops it at the close delim or any region opener).
+ */
+interface InterpolationPlan {
+  tokenName: string;     // original token name (e.g. 'DQ') — now emitted as a rule, not a token
+  ruleSnake: string;     // toSnake(tokenName) (e.g. 'dq') — keeps `$.dq` references valid
+  charsSnake: string;    // ruleSnake + '_chars' (e.g. 'dq_chars'): external scanner symbol for the literal text
+  open: string;          // opening delimiter (e.g. '"'); the planner falls back to '"' when none can be extracted
+  close: string;         // closing delimiter (the planner always sets it equal to `open`)
+  regions: { ruleSnake: string; open: string; close: string }[]; // one sub-rule per interpolation entry; open/close are the decoded literal delimiters
+}
+
+// Decode an author-supplied interpolation begin/end REGEX fragment to the literal text it matches:
+// remove EVERY `\\?` (optional backslash — the env-spec `\${` vs `${` allowance), not just a leading
+// one, then unescape each remaining `\x` to `x` (e.g. `\\?\$\{` → `${`). No other regex syntax is decoded. Targets the scanner-friendly forms (decoded literal length 1–2; see PR #9).
+function decodeInterpDelim(src: string): string {
+  return src.replace(/\\\\\?/g, '').replace(/\\(.)/g, '$1');
+}
+
+function planInterpolations(grammar: CstGrammar): InterpolationPlan[] { // one plan per token that declares interpolation regions
+  const plans: InterpolationPlan[] = [];
+  for (const tok of grammar.tokens) {
+    if (!tok.interpolation?.length) continue; // skip tokens with no (or an empty list of) interpolation entries
+    const open = tokenPatternStringDelimiters(tok)[0] ?? '"'; // only the first extracted delimiter is used; '"' when none is found
+    const ruleSnake = toSnake(tok.name);
+    plans.push({
+      tokenName: tok.name,
+      ruleSnake,
+      charsSnake: ruleSnake + '_chars',
+      open,
+      close: open, // a string token closes with the same delimiter that opened it
+      regions: tok.interpolation.map((interp, i) => ({
+        ruleSnake: `${ruleSnake}_interpolation_${i + 1}`, // numbered from 1, in declaration order
+        open: decodeInterpDelim(interp.begin), // regex source → literal text
+        close: decodeInterpDelim(interp.end),
+      })),
+    });
+  }
+  return plans;
+}
+
 /** Determine which tokens the external scanner must provide. */
 function planScannerTokens(grammar: CstGrammar): Map<string, string> {
   const map = new Map<string, string>();
@@ -560,6 +609,7 @@ function planScannerTokens(grammar: CstGrammar): Map<string, string> {
 function externalSymbols(ctx: GrammarJsContext): string[] {
   const syms = [...ctx.scannerTokenFor.values()];
   if (ctx.templatePlan) syms.push(ctx.templatePlan.charsSnake);
+  for (const ip of ctx.interpolationPlans) syms.push(ip.charsSnake); // one `<rule>_chars` symbol per interpolated string token, appended after the template one
   return syms;
 }
 
@@ -725,8 +775,10 @@ export function generateTreeSitter(grammar: CstGrammar, langName?: string): Tree
 
   const scannerTokenFor = planScannerTokens(grammar);
   const templatePlan = planTemplate(grammar);
+  const interpolationPlans = planInterpolations(grammar);
   const externalSnake = new Set([...scannerTokenFor.values()]);
   if (templatePlan) externalSnake.add(templatePlan.charsSnake);
+  for (const ip of interpolationPlans) externalSnake.add(ip.charsSnake);
 
   // Find the identifier nodes that follow a declaration keyword, so we can wrap
   // them in `field('name', …)` in grammar.js AND emit standard `name:` highlight
@@ -736,6 +788,7 @@ export function generateTreeSitter(grammar: CstGrammar, langName?: string): Tree
   const ctx: GrammarJsContext = {
     grammar, tokenNames, ruleSnake, tokenSnake, prattRules, externalSnake, scannerTokenFor,
     templatePlan,
+    interpolationPlans,
     nameFieldNodes: nameFields.nodes,
   };
 
@@ -859,6 +912,27 @@ function buildGrammarJs(ctx: GrammarJsContext, grammarName: string): string {
     );
   }
 
+  // String-interpolation tokens: re-expressed as a rule (open + chars/interpolation runs + close);
+  // each interpolation region is a sub-rule whose hole re-enters the expression grammar (like a template).
+  const interpExprName = [...ctx.prattRules][0];
+  const interpExprSnake = interpExprName ? ctx.ruleSnake.get(interpExprName)! : null;
+  const interpHole = interpExprSnake ? `optional($.${interpExprSnake})` : 'blank()';
+  for (const ip of ctx.interpolationPlans) {
+    const choices = [`$.${ip.charsSnake}`, ...ip.regions.map(r => `$.${r.ruleSnake}`)].join(', ');
+    ruleEntries.push(
+      `    ${ip.ruleSnake}: $ => seq(\n` +
+      `      ${jsString(ip.open)},\n` +
+      `      repeat(choice(${choices})),\n` +
+      `      ${jsString(ip.close)}\n` +
+      `    )`,
+    );
+    for (const r of ip.regions) {
+      ruleEntries.push(
+        `    ${r.ruleSnake}: $ => seq(${jsString(r.open)}, ${interpHole}, ${jsString(r.close)})`,
+      );
+    }
+  }
+
   lines.push(ruleEntries.join(',\n\n'));
   lines.push('  }');
   lines.push('});');
@@ -1087,6 +1161,15 @@ function buildHighlightsScm(
     tokenNodeCaptures.push({ query: `(${tpl.substRuleSnake} ${jsString(tpl.interpOpen)})`, capture: '@punctuation.special' });
     tokenNodeCaptures.push({ query: `(${tpl.substRuleSnake} ${jsString(tpl.interpClose)})`, capture: '@punctuation.special' });
   }
+  // String-interpolation regions: the literal text reads as string; the region delimiters as
+  // punctuation — same treatment as a template hole, derived from the interpolation metadata.
+  for (const ip of ctx.interpolationPlans) {
+    tokenNodeCaptures.push({ query: `(${ip.charsSnake})`, capture: '@string' });
+    for (const r of ip.regions) {
+      tokenNodeCaptures.push({ query: `(${r.ruleSnake} ${jsString(r.open)})`, capture: '@punctuation.special' });
+      tokenNodeCaptures.push({ query: `(${r.ruleSnake} ${jsString(r.close)})`, capture: '@punctuation.special' });
+    }
+  }
 
   // ── D. Contextual node captures via emitted fields ──
   // Operators carry an `operator` field in Pratt rules; they're already covered by
@@ -1753,6 +1836,49 @@ function buildScannerC(
     L.push('');
   }
 
+  // ── Interpolated-string char scanners (one per string token carrying interpolation) ──
+  // Each scans the literal run inside the string, stopping before the close delimiter or any
+  // interpolation opener (so the opener re-enters the expression grammar via its sub-rule). The
+  // openers are DATA from the interpolation metadata (decoded literals, length 1–2).
+  {
+    const cChar = (ch: string) => ch === '\\' ? "'\\\\'" : ch === "'" ? "'\\''" : `'${ch}'`;
+    for (const ip of ctx.interpolationPlans) {
+      const charsSym = ip.charsSnake.toUpperCase();
+      const up = ip.ruleSnake.toUpperCase();
+      const openerInit = ip.regions.map(r => jsString(r.open)).join(', ');
+      L.push(`// ── Interpolated-string scan (${ip.tokenName}): literal text up to the close delim or an opener ──`);
+      L.push(`static const char *${up}_OPENERS[] = { ${openerInit} };`);
+      L.push(`static const unsigned ${up}_OPENER_COUNT = ${ip.regions.length};`);
+      L.push(`static bool scan_${ip.ruleSnake}_chars(TSLexer *lexer) {`);
+      L.push('  bool has_content = false;');
+      L.push('  for (;;) {');
+      L.push('    lexer->mark_end(lexer);');
+      L.push('    int32_t c = lexer->lookahead;');
+      L.push('    if (c == 0) return false; // EOF — let the CFG report the unterminated string');
+      L.push(`    if (c == ${cChar(ip.close)}) break; // closing delimiter`);
+      L.push('    bool first_match = false;');
+      L.push(`    for (unsigned i = 0; i < ${up}_OPENER_COUNT; i++) if ((int32_t)${up}_OPENERS[i][0] == c) { first_match = true; break; }`);
+      L.push('    if (first_match) {');
+      L.push('      advance(lexer);                 // peek past the opener\'s first char');
+      L.push('      int32_t c2 = lexer->lookahead;');
+      L.push('      bool real = false;');
+      L.push(`      for (unsigned i = 0; i < ${up}_OPENER_COUNT; i++)`);
+      L.push(`        if ((int32_t)${up}_OPENERS[i][0] == c && (${up}_OPENERS[i][1] == 0 || (int32_t)${up}_OPENERS[i][1] == c2)) { real = true; break; }`);
+      L.push('      if (real) break;                // a real opener — token ends before it (mark_end frozen above)');
+      L.push('      has_content = true; continue;   // lone first char → literal content');
+      L.push('    }');
+      L.push('    if (c == \'\\\\\') { advance(lexer); if (lexer->lookahead != 0) advance(lexer); has_content = true; continue; }');
+      L.push('    advance(lexer);');
+      L.push('    has_content = true;');
+      L.push('  }');
+      L.push('  if (!has_content) return false;');
+      L.push(`  lexer->result_symbol = ${charsSym};`);
+      L.push('  return true;');
+      L.push('}');
+      L.push('');
+    }
+  }
+
   // ── scan() entry ──
   L.push('bool tree_sitter_' + grammarName + '_external_scanner_scan(void *payload, TSLexer *lexer,');
   L.push('                                                          const bool *valid_symbols) {');
@@ -1797,6 +1923,14 @@ function buildScannerC(
     L.push('  }');
     L.push('');
   }
+  for (const ip of ctx.interpolationPlans) {
+    const charsSym = ip.charsSnake.toUpperCase();
+    L.push(`  // ${ip.tokenName} interpolated-string literal text (whitespace inside is content, not skipped).`);
+    L.push(`  if (valid_symbols[${charsSym}]) {`);
+    L.push(`    if (scan_${ip.ruleSnake}_chars(lexer)) return true;`);
+    L.push('  }');
+    L.push('');
+  }
   L.push('  return false;');
   L.push('}');
   L.push('');