Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/api.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, NewlineConfig, TokenPattern } from './types.ts';
import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, NewlineConfig, StringInterpolation, TokenPattern } from './types.ts';
import {
altPattern, anyChar, followedBy, isTokenPattern, lit, never, noneOf, notFollowedBy,
notPrecededBy, oneOf, optPattern, plus, precededBy, range, repeat,
Expand All @@ -17,6 +17,9 @@ interface TokenOptions {
skip?: boolean;
scope?: string;
escape?: TokenPatternInput;
// Highlight-only interpolation regions for ordinary string tokens (e.g. env-spec `${…}` / `$(…)`).
// The parser/lexer stay token-based; generators re-express these as nested regions.
interpolation?: StringInterpolation | StringInterpolation[];
// A regex matching exactly one well-formed escape sequence. Engine-scanned tokens
// (templates) validate each `\`-escape against it and reject any that don't match —
// unlike `escape` (highlight-only), this drives tokenization. Skipped in tag
Expand Down Expand Up @@ -414,6 +417,9 @@ export function defineGrammar(config: GrammarConfig): CstGrammar & { name: strin
flags,
scope: tok.opts.scope,
escapePattern: tok.opts.escape,
interpolation: tok.opts.interpolation
? (Array.isArray(tok.opts.interpolation) ? tok.opts.interpolation : [tok.opts.interpolation]).map((i) => ({ ...i }))
: undefined,
escapeValidPattern: tok.opts.escapeValid,
embed: tok.opts.embed,
identifier: tok.opts.identifier,
Expand Down
2 changes: 1 addition & 1 deletion src/gen-lexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -966,7 +966,7 @@ export function createLexer(grammar: CstGrammar) {
// • LINE-LEAD at the document root (a bare top-level `"a\nb`, or `---\n"a\nb`) → -1.
// Blank (whitespace-only) continuation lines are skipped — they are folded line breaks, legal
// at any column. Flow is exempt (indentation suspended). yaml-test-suite DK95[1] / QB6E.
if (tm.isString && indent && flowDepth === 0 && m[0].includes('\n')) {
if (tm.isString && indent?.blockScalar && flowDepth === 0 && m[0].includes('\n')) {
const prevT = tokens[tokens.length - 1];
const prevIsDocMarker = !!prevT && blockScalarDocMarkers.includes(prevT.text);
let parentCol: number;
Expand Down
41 changes: 40 additions & 1 deletion src/gen-monarch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,11 @@ export function generateMonarch(grammar: CstGrammar): MonarchLanguage {

const stringTopRules: MonarchRule[] = []; // entered from root/value
const stringNestedRules: MonarchRule[] = []; // entered from interpolation holes
// Highlight-only string interpolation regions (e.g. env-spec `${…}` / `$(…)`): per region we add a
// begin rule into the string body and build a dedicated interp state (re-enter the expression body,
// pop on the region's end). Specs are collected here; the states are built after templates, once the
// nested string/template rules they include are populated.
const interpStateSpecs: { name: string; end: string }[] = [];

for (const t of grammar.tokens) {
if (t.flags.includes('skip') || t.flags.includes('regex') || t.template) continue;
Expand All @@ -505,7 +510,19 @@ export function generateMonarch(grammar: CstGrammar): MonarchLanguage {
const body: MonarchRule[] = [];
const escapePattern = tokenEscapePatternSource(t);
if (escapePattern) body.push([anchoredSource(escapePattern), 'string.escape']);
body.push([`[^${escapeForCharClass(delim[0])}\\\\]+`, tok]);
// Interpolation openers come BEFORE the content run so they win; the content run then excludes
// any position that begins an interpolation (negative lookahead) so it can't swallow `${`.
const interps = t.interpolation ?? [];
interps.forEach((interp, i) => {
const name = `string_interp_${suffix}_${i + 1}`;
body.push([interp.begin, { token: 'delimiter.bracket', next: `@${name}` }]);
interpStateSpecs.push({ name, end: interp.end });
});
const dc = escapeForCharClass(delim[0]);
const content = interps.length
? `(?:(?!${interps.map(p => p.begin).join('|')})[^${dc}\\\\])+`
: `[^${dc}\\\\]+`;
body.push([content, tok]);
body.push(['\\\\.', 'string.escape']);
tokenizer[bodyState] = body;
}
Expand Down Expand Up @@ -591,6 +608,28 @@ export function generateMonarch(grammar: CstGrammar): MonarchLanguage {
];
}

// String-interpolation states (collected in the string loop above). Built here, after templates,
// so the nested string/template rules they include are populated; `@interpExprBody` is a lazy
// include resolved by Monarch. A bare `{` pushes a brace-counting frame (shared with templates).
if (interpStateSpecs.length) {
if (!tokenizer['bracketCounting']) {
tokenizer['bracketCounting'] = [
wsRule, ...commentRules, ...stringNestedRules, ...templateNestedRules,
['\\{', { token: 'delimiter.bracket', next: '@bracketCounting' }],
['\\}', { token: 'delimiter.bracket', next: '@pop' }],
{ include: '@interpExprBody' },
];
}
for (const spec of interpStateSpecs) {
tokenizer[spec.name] = [
wsRule, ...commentRules, ...stringNestedRules, ...templateNestedRules,
['\\{', { token: 'delimiter.bracket', next: '@bracketCounting' }],
[spec.end, { token: 'delimiter.bracket', next: '@pop' }],
{ include: '@interpExprBody' },
];
}
}

// ── Numbers (most-specific first; token decl order encodes specificity) ──
const numberRules: MonarchRule[] = [];
for (const t of grammar.tokens) {
Expand Down
24 changes: 19 additions & 5 deletions src/gen-tm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4464,12 +4464,26 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra
} else if (tokenEscapePatternSource(tok) && scope.startsWith('string.')) {
// String with escape sequences: generate begin/end for each delimiter
const escapePat: TmPattern = { match: tokenEscapePatternSource(tok)!, name: `constant.character.escape.${langName}` };
// Highlight-only interpolation regions (e.g. env-spec `${…}` / `$(…)`): each becomes a nested
// begin/end region — the same shape a template literal's hole gets. `begin`/`end` are
// author-supplied regex SOURCES (not literals), so they are NOT re-escaped here.
const interpPats: TmPattern[] = (tok.interpolation ?? []).map((interp) => {
const p: TmPattern = { begin: interp.begin, end: interp.end, patterns: [{ include: interp.include ?? '$self' }] };
if (interp.beginScope) p.beginCaptures = { '0': { name: `${interp.beginScope}.${langName}` } };
if (interp.endScope) p.endCaptures = { '0': { name: `${interp.endScope}.${langName}` } };
if (interp.contentScope) p.name = `${interp.contentScope}.${langName}`;
return p;
});
const stringPats: (TmPattern | { include: string })[] = [escapePat, ...interpPats];
const delimiters: [string, string][] = [];
// Drive the delimiter scope off the EXTRACTED delimiter generically: `"`/`'` keep their
// canonical scopes; any other delimiter (e.g. a backtick string) takes the token's own scope
// instead of the old loop's `"`-fallback (which mis-delimited backtick strings).
const scopeForDelim = (d: string) => d === '"' ? 'string.quoted.double' : d === "'" ? 'string.quoted.single' : scope;
for (const delim of tokenPatternStringDelimiters(tok)) {
if (delim === '"') delimiters.push(['"', 'string.quoted.double']);
else if (delim === "'") delimiters.push(["'", 'string.quoted.single']);
delimiters.push([delim, scopeForDelim(delim)]);
}
if (delimiters.length === 0) delimiters.push(['"', scope]); // fallback
if (delimiters.length === 0) delimiters.push(['"', scope]); // fallback: no delimiter extractable

if (delimiters.length === 1) {
const [delim, delimScope] = delimiters[0];
Expand All @@ -4479,7 +4493,7 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra
beginCaptures: { '0': { name: `punctuation.definition.string.begin.${langName}` } },
end: `${escapeRegex(delim)}|$`,
endCaptures: { '0': { name: `punctuation.definition.string.end.${langName}` } },
patterns: [escapePat],
patterns: stringPats,
};
topPatterns.push({ include: `#${key}` });
rememberLiteralKey(delimScope, key, tok.name);
Expand All @@ -4493,7 +4507,7 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra
beginCaptures: { '0': { name: `punctuation.definition.string.begin.${langName}` } },
end: `${escapeRegex(delim)}|$`,
endCaptures: { '0': { name: `punctuation.definition.string.end.${langName}` } },
patterns: [escapePat],
patterns: stringPats,
};
topPatterns.push({ include: `#${subKey}` });
rememberLiteralKey(delimScope, subKey, tok.name);
Expand Down
134 changes: 134 additions & 0 deletions src/gen-treesitter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ interface GrammarJsContext {
* `template_chars` token. `null` when no template token exists.
*/
templatePlan: TemplatePlan | null;
/** String tokens carrying highlight-only interpolation regions, each re-expressed as a rule
* backed by an external `<rule>_chars` token (parallel to `templatePlan`). Empty if none. */
interpolationPlans: InterpolationPlan[];
/**
* Ref nodes (the identifier right after a definition keyword) that should be
* wrapped in `field('name', …)` so highlights.scm can target them with the
Expand Down Expand Up @@ -358,6 +361,8 @@ function buildTokenBody(name: string, ctx: GrammarJsContext): string | null {
// The interpolated-template token is re-expressed as a `template` RULE (with
// `${ … }` holes that re-enter the expression grammar), emitted separately.
if (ctx.templatePlan && ctx.templatePlan.tokenName === name) return null;
// A string token with interpolation regions is likewise re-expressed as a rule (emitted separately).
if (ctx.interpolationPlans.some(ip => ip.tokenName === name)) return null;
// Skip-flagged tokens (comments, whitespace) go in `extras`, not as a named
// rule reference — but we still emit them so highlights can capture comments.
// tree-sitter's token() DFA rejects zero-width assertions, so strip them first.
Expand Down Expand Up @@ -538,6 +543,50 @@ function planTemplate(grammar: CstGrammar): TemplatePlan | null {
};
}

/**
 * A string token carrying highlight-only interpolation regions (e.g. env-spec `${…}` / `$(…)`),
 * re-expressed as a tree-sitter RULE (open delim + chars/interpolation runs + close delim) — the
 * same shape a template literal gets. The literal text between regions is an external
 * `<rule>_chars` token (the scanner stops it at the close delim or any region opener).
 *
 * One plan is produced per token that declares at least one interpolation entry. `regions`
 * holds one element per entry, in declaration order; each region's `open`/`close` are the
 * DECODED literal delimiters (not the author-supplied regex sources).
 */
interface InterpolationPlan {
tokenName: string; // original token name (e.g. 'DQ') — now emitted as a rule, not a token
ruleSnake: string; // snake rule name (e.g. 'dq') — keeps `$.dq` references valid
charsSnake: string; // external scanner symbol for the literal text (e.g. 'dq_chars'): `ruleSnake` + '_chars'
open: string; // opening delimiter (e.g. '"')
close: string; // closing delimiter (same as open for a string token)
regions: { ruleSnake: string; open: string; close: string }[]; // one sub-rule per interpolation entry
}

/**
 * Decode an author-supplied interpolation begin/end REGEX fragment to the literal text it matches.
 *
 * The fragment is read left to right, one regex atom at a time:
 *   - `\\?` (an OPTIONAL escaped backslash — the env-spec `\${` vs `${` allowance) contributes nothing;
 *   - `\x`  (any other escaped character) contributes `x`;
 *   - every other character is copied through unchanged.
 *
 * Decoding in a single pass keeps escape boundaries intact. Stripping `\\?` first and unescaping
 * afterwards would misread a fragment such as `\\\?` (escaped backslash followed by an escaped
 * `?`) by treating its last three characters as the optional-backslash marker.
 *
 * Targets the scanner-friendly forms (decoded literal length 1–2; see PR #9). Other regex
 * operators (classes, alternation, quantifiers besides the `\\?` case) are not interpreted.
 *
 * @param src Regex source of an interpolation `begin` or `end` delimiter.
 * @returns The literal delimiter text the fragment matches.
 */
function decodeInterpDelim(src: string): string {
  // Alternation order matters: the optional-backslash form must be tried before the generic escape.
  return src.replace(/\\\\\?|\\(.)/g, (_match: string, escaped?: string) => escaped ?? '');
}

/**
 * Build one InterpolationPlan for every grammar token that declares at least one interpolation
 * region. Tokens without interpolation entries are skipped; plans keep token declaration order.
 *
 * The string delimiter is the first one extracted from the token pattern (falling back to `"`),
 * and is used for both the opening and the closing side. Each interpolation entry becomes a
 * numbered sub-rule (`<rule>_interpolation_<n>`, 1-based) whose delimiters are the decoded
 * literal forms of the entry's `begin`/`end` regex sources.
 */
function planInterpolations(grammar: CstGrammar): InterpolationPlan[] {
  const collected: InterpolationPlan[] = [];

  for (const token of grammar.tokens) {
    const entries = token.interpolation;
    if (!entries || entries.length === 0) continue;

    const delimiter = tokenPatternStringDelimiters(token)[0] ?? '"';
    const snakeName = toSnake(token.name);

    const regions = entries.map((entry, index) => {
      const ordinal = index + 1;
      return {
        ruleSnake: `${snakeName}_interpolation_${ordinal}`,
        open: decodeInterpDelim(entry.begin),
        close: decodeInterpDelim(entry.end),
      };
    });

    collected.push({
      tokenName: token.name,
      ruleSnake: snakeName,
      charsSnake: `${snakeName}_chars`,
      open: delimiter,
      close: delimiter,
      regions,
    });
  }

  return collected;
}

/** Determine which tokens the external scanner must provide. */
function planScannerTokens(grammar: CstGrammar): Map<string, string> {
const map = new Map<string, string>();
Expand All @@ -560,6 +609,7 @@ function planScannerTokens(grammar: CstGrammar): Map<string, string> {
/**
 * List the symbols the external scanner provides, in emission order: the scanner-backed
 * tokens first, then the template `_chars` symbol (when a template plan exists), then one
 * `_chars` symbol per interpolated-string plan.
 */
function externalSymbols(ctx: GrammarJsContext): string[] {
  const templateChars = ctx.templatePlan ? [ctx.templatePlan.charsSnake] : [];
  const interpolationChars = ctx.interpolationPlans.map((plan) => plan.charsSnake);
  return [...ctx.scannerTokenFor.values(), ...templateChars, ...interpolationChars];
}

Expand Down Expand Up @@ -725,8 +775,10 @@ export function generateTreeSitter(grammar: CstGrammar, langName?: string): Tree

const scannerTokenFor = planScannerTokens(grammar);
const templatePlan = planTemplate(grammar);
const interpolationPlans = planInterpolations(grammar);
const externalSnake = new Set([...scannerTokenFor.values()]);
if (templatePlan) externalSnake.add(templatePlan.charsSnake);
for (const ip of interpolationPlans) externalSnake.add(ip.charsSnake);

// Find the identifier nodes that follow a declaration keyword, so we can wrap
// them in `field('name', …)` in grammar.js AND emit standard `name:` highlight
Expand All @@ -736,6 +788,7 @@ export function generateTreeSitter(grammar: CstGrammar, langName?: string): Tree
const ctx: GrammarJsContext = {
grammar, tokenNames, ruleSnake, tokenSnake, prattRules, externalSnake, scannerTokenFor,
templatePlan,
interpolationPlans,
nameFieldNodes: nameFields.nodes,
};

Expand Down Expand Up @@ -859,6 +912,27 @@ function buildGrammarJs(ctx: GrammarJsContext, grammarName: string): string {
);
}

// String-interpolation tokens: re-expressed as a rule (open + chars/interpolation runs + close);
// each interpolation region is a sub-rule whose hole re-enters the expression grammar (like a template).
const interpExprName = [...ctx.prattRules][0];
const interpExprSnake = interpExprName ? ctx.ruleSnake.get(interpExprName)! : null;
const interpHole = interpExprSnake ? `optional($.${interpExprSnake})` : 'blank()';
for (const ip of ctx.interpolationPlans) {
const choices = [`$.${ip.charsSnake}`, ...ip.regions.map(r => `$.${r.ruleSnake}`)].join(', ');
ruleEntries.push(
` ${ip.ruleSnake}: $ => seq(\n` +
` ${jsString(ip.open)},\n` +
` repeat(choice(${choices})),\n` +
` ${jsString(ip.close)}\n` +
` )`,
);
for (const r of ip.regions) {
ruleEntries.push(
` ${r.ruleSnake}: $ => seq(${jsString(r.open)}, ${interpHole}, ${jsString(r.close)})`,
);
}
}

lines.push(ruleEntries.join(',\n\n'));
lines.push(' }');
lines.push('});');
Expand Down Expand Up @@ -1087,6 +1161,15 @@ function buildHighlightsScm(
tokenNodeCaptures.push({ query: `(${tpl.substRuleSnake} ${jsString(tpl.interpOpen)})`, capture: '@punctuation.special' });
tokenNodeCaptures.push({ query: `(${tpl.substRuleSnake} ${jsString(tpl.interpClose)})`, capture: '@punctuation.special' });
}
// String-interpolation regions: the literal text reads as string; the region delimiters as
// punctuation — same treatment as a template hole, derived from the interpolation metadata.
for (const ip of ctx.interpolationPlans) {
tokenNodeCaptures.push({ query: `(${ip.charsSnake})`, capture: '@string' });
for (const r of ip.regions) {
tokenNodeCaptures.push({ query: `(${r.ruleSnake} ${jsString(r.open)})`, capture: '@punctuation.special' });
tokenNodeCaptures.push({ query: `(${r.ruleSnake} ${jsString(r.close)})`, capture: '@punctuation.special' });
}
}

// ── D. Contextual node captures via emitted fields ──
// Operators carry an `operator` field in Pratt rules; they're already covered by
Expand Down Expand Up @@ -1753,6 +1836,49 @@ function buildScannerC(
L.push('');
}

// ── Interpolated-string char scanners (one per string token carrying interpolation) ──
// Each scans the literal run inside the string, stopping before the close delimiter or any
// interpolation opener (so the opener re-enters the expression grammar via its sub-rule). The
// openers are DATA from the interpolation metadata (decoded literals, length 1–2).
{
const cChar = (ch: string) => ch === '\\' ? "'\\\\'" : ch === "'" ? "'\\''" : `'${ch}'`;
for (const ip of ctx.interpolationPlans) {
const charsSym = ip.charsSnake.toUpperCase();
const up = ip.ruleSnake.toUpperCase();
const openerInit = ip.regions.map(r => jsString(r.open)).join(', ');
L.push(`// ── Interpolated-string scan (${ip.tokenName}): literal text up to the close delim or an opener ──`);
L.push(`static const char *${up}_OPENERS[] = { ${openerInit} };`);
L.push(`static const unsigned ${up}_OPENER_COUNT = ${ip.regions.length};`);
L.push(`static bool scan_${ip.ruleSnake}_chars(TSLexer *lexer) {`);
L.push(' bool has_content = false;');
L.push(' for (;;) {');
L.push(' lexer->mark_end(lexer);');
L.push(' int32_t c = lexer->lookahead;');
L.push(' if (c == 0) return false; // EOF — let the CFG report the unterminated string');
L.push(` if (c == ${cChar(ip.close)}) break; // closing delimiter`);
L.push(' bool first_match = false;');
L.push(` for (unsigned i = 0; i < ${up}_OPENER_COUNT; i++) if ((int32_t)${up}_OPENERS[i][0] == c) { first_match = true; break; }`);
L.push(' if (first_match) {');
L.push(' advance(lexer); // peek past the opener\'s first char');
L.push(' int32_t c2 = lexer->lookahead;');
L.push(' bool real = false;');
L.push(` for (unsigned i = 0; i < ${up}_OPENER_COUNT; i++)`);
L.push(` if ((int32_t)${up}_OPENERS[i][0] == c && (${up}_OPENERS[i][1] == 0 || (int32_t)${up}_OPENERS[i][1] == c2)) { real = true; break; }`);
L.push(' if (real) break; // a real opener — token ends before it (mark_end frozen above)');
L.push(' has_content = true; continue; // lone first char → literal content');
L.push(' }');
L.push(' if (c == \'\\\\\') { advance(lexer); if (lexer->lookahead != 0) advance(lexer); has_content = true; continue; }');
L.push(' advance(lexer);');
L.push(' has_content = true;');
L.push(' }');
L.push(' if (!has_content) return false;');
L.push(` lexer->result_symbol = ${charsSym};`);
L.push(' return true;');
L.push('}');
L.push('');
}
}

// ── scan() entry ──
L.push('bool tree_sitter_' + grammarName + '_external_scanner_scan(void *payload, TSLexer *lexer,');
L.push(' const bool *valid_symbols) {');
Expand Down Expand Up @@ -1797,6 +1923,14 @@ function buildScannerC(
L.push(' }');
L.push('');
}
for (const ip of ctx.interpolationPlans) {
const charsSym = ip.charsSnake.toUpperCase();
L.push(` // ${ip.tokenName} interpolated-string literal text (whitespace inside is content, not skipped).`);
L.push(` if (valid_symbols[${charsSym}]) {`);
L.push(` if (scan_${ip.ruleSnake}_chars(lexer)) return true;`);
L.push(' }');
L.push('');
}
L.push(' return false;');
L.push('}');
L.push('');
Expand Down
Loading