From c82735eba55878ef14fd364f535ae34506f84132 Mon Sep 17 00:00:00 2001 From: 8times4 <46720448+8times4@users.noreply.github.com> Date: Wed, 27 May 2026 16:46:26 +0200 Subject: [PATCH 1/4] add diarization support --- .../openai-transcription-diarization.md | 7 + docs/adapters/openai.md | 32 +- docs/comparison/vercel-ai-sdk.md | 2 +- docs/media/generation-hooks.md | 2 +- docs/media/transcription.md | 39 ++- .../interfaces/TranscriptionOptions.md | 2 +- packages/ai-client/src/generation-types.ts | 8 +- .../ai-openai/src/adapters/transcription.ts | 179 +++++++++-- .../audio/transcription-provider-options.ts | 10 + .../tests/transcription-adapter.test.ts | 300 ++++++++++++++++++ .../skills/ai-core/media-generation/SKILL.md | 8 +- .../activities/generateTranscription/index.ts | 8 +- packages/ai/src/types.ts | 8 +- 13 files changed, 571 insertions(+), 34 deletions(-) create mode 100644 .changeset/openai-transcription-diarization.md create mode 100644 packages/ai-openai/tests/transcription-adapter.test.ts diff --git a/.changeset/openai-transcription-diarization.md b/.changeset/openai-transcription-diarization.md new file mode 100644 index 000000000..67769277c --- /dev/null +++ b/.changeset/openai-transcription-diarization.md @@ -0,0 +1,7 @@ +--- +'@tanstack/ai': minor +'@tanstack/ai-client': minor +'@tanstack/ai-openai': minor +--- + +Add OpenAI transcription diarization support with `diarized_json` output, speaker-labeled segments, diarization model validation, chunking strategy options, and docs. diff --git a/docs/adapters/openai.md b/docs/adapters/openai.md index 5ad088e71..af5de6d1f 100644 --- a/docs/adapters/openai.md +++ b/docs/adapters/openai.md @@ -298,10 +298,11 @@ console.log(result.text); // Transcribed text const result = await generateTranscription({ adapter: openaiTranscription("whisper-1"), audio: audioFile, + responseFormat: "verbose_json", + prompt: "Technical terms: API, SDK", modelOptions: { - response_format: "verbose_json", // Get timestamps temperature: 0, - prompt: "Technical terms: API, SDK", + timestamp_granularities: ["word", "segment"], }, }); @@ -309,6 +310,31 @@ const result = await generateTranscription({ console.log(result.segments); ``` +### Speaker Diarization + +Use `gpt-4o-transcribe-diarize` for speaker-labeled transcripts: + +```typescript +const result = await generateTranscription({ + adapter: openaiTranscription("gpt-4o-transcribe-diarize"), + audio: meetingAudioFile, + modelOptions: { + chunking_strategy: "auto", + known_speaker_names: ["agent", "customer"], + known_speaker_references: [ + "data:audio/wav;base64,...", + "data:audio/wav;base64,...", + ], + }, +}); + +for (const segment of result.segments ?? []) { + console.log(segment.speaker, segment.start, segment.end, segment.text); +} +``` + +`gpt-4o-transcribe-diarize` defaults to `responseFormat: "diarized_json"` and `chunking_strategy: "auto"`. OpenAI does not support `prompt`, `include`, or `timestamp_granularities` with diarized transcription. + ## Environment Variables Set your API key in environment variables: @@ -357,7 +383,7 @@ Creates an OpenAI text-to-speech adapter. ### `openaiTranscription(model, config?)` / `createOpenaiTranscription(model, apiKey, config?)` -Creates an OpenAI transcription adapter (Whisper). +Creates an OpenAI transcription adapter for Whisper, GPT-4o transcription, and GPT-4o diarized transcription models. ### `openaiVideo(model, config?)` / `createOpenaiVideo(model, apiKey, config?)` diff --git a/docs/comparison/vercel-ai-sdk.md b/docs/comparison/vercel-ai-sdk.md index 1674ac506..c6c8e001b 100644 --- a/docs/comparison/vercel-ai-sdk.md +++ b/docs/comparison/vercel-ai-sdk.md @@ -409,7 +409,7 @@ const result = await generateSpeech({ }) ``` -**Transcription** - `generateTranscription()` supports 5 output formats (json, text, srt, verbose_json, vtt), word-level timestamps with confidence scores, and four providers (OpenAI, Grok, ElevenLabs, fal.ai), with speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model. +**Transcription** - `generateTranscription()` supports 6 output formats (json, text, srt, verbose_json, vtt, diarized_json), word-level timestamps with confidence scores, and four providers (OpenAI, Grok, ElevenLabs, fal.ai), with speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model. ```ts import { generateTranscription } from '@tanstack/ai' diff --git a/docs/media/generation-hooks.md b/docs/media/generation-hooks.md index 8120e9e9c..eda6c2496 100644 --- a/docs/media/generation-hooks.md +++ b/docs/media/generation-hooks.md @@ -214,7 +214,7 @@ The `generate` function accepts a `TranscriptionGenerateInput`: | `audio` | `string \| File \| Blob \| ArrayBuffer` | Audio data -- base64 string, File, Blob, or ArrayBuffer (required) | | `language` | `string` | Language in ISO-639-1 format (e.g., `"en"`) | | `prompt` | `string` | Optional prompt to guide the transcription | -| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt'` | Output format | +| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt' \| 'diarized_json'` | Output format | | `modelOptions` | `Record` | Model-specific options | ## useSummarize diff --git a/docs/media/transcription.md b/docs/media/transcription.md index 40747ed79..c38a82585 100644 --- a/docs/media/transcription.md +++ b/docs/media/transcription.md @@ -2,7 +2,7 @@ title: Transcription id: transcription order: 4 -description: "Transcribe audio to text with OpenAI Whisper and GPT-4o-transcribe via TanStack AI's generateTranscription() API." +description: "Transcribe audio to text with OpenAI Whisper and GPT-4o transcription models, including speaker diarization, via TanStack AI's generateTranscription() API." keywords: - tanstack ai - transcription @@ -22,7 +22,7 @@ TanStack AI provides support for audio transcription (speech-to-text) through de Audio transcription is handled by transcription adapters that follow the same tree-shakeable architecture as other adapters in TanStack AI. Currently supported: -- **OpenAI**: Whisper-1, GPT-4o-transcribe, GPT-4o-mini-transcribe +- **OpenAI**: Whisper-1, GPT-4o-transcribe, GPT-4o-mini-transcribe, GPT-4o-transcribe-diarize - **fal.ai**: Whisper, Wizper, speech-to-text turbo, ElevenLabs speech-to-text ## Basic Usage @@ -104,6 +104,8 @@ for (const segment of result.segments ?? []) { |--------|------|-------------| | `audio` | `File \| string` | Audio data (File object or base64 string) - required | | `language` | `string` | Language code (e.g., "en", "es", "fr") | +| `prompt` | `string` | Optional prompt to guide transcription style or terms. Not supported with `gpt-4o-transcribe-diarize`. | +| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt' \| 'diarized_json'` | Output format | ### Supported Languages @@ -136,6 +138,7 @@ const result = await generateTranscription({ prompt: 'Technical terms: API, SDK, CLI', // Top-level: guide transcription modelOptions: { temperature: 0, // Lower = more deterministic (provider option) + timestamp_granularities: ['word', 'segment'], }, }) ``` @@ -143,8 +146,11 @@ const result = await generateTranscription({ | Option | Type | Description | |--------|------|-------------| | `temperature` | `number` | Sampling temperature (0 to 1) | -| `timestamp_granularities` | `Array<'word' \| 'segment'>` | Timestamp granularity to populate (requires top-level `responseFormat: 'verbose_json'`) | +| `timestamp_granularities` | `Array<'word' \| 'segment'>` | Timestamp granularity to populate (`whisper-1` only; requires top-level `responseFormat: 'verbose_json'`) | | `include` | `string[]` | Additional values to include in the response (e.g., `logprobs`) | +| `chunking_strategy` | `'auto' \| { type: 'server_vad', ... } \| null` | Audio chunking strategy for `gpt-4o-transcribe-diarize`; required by OpenAI for diarization inputs longer than 30 seconds | +| `known_speaker_names` | `string[]` | Up to four speaker labels for diarization | +| `known_speaker_references` | `string[]` | 2-10 second data URL audio samples matching `known_speaker_names` | > `responseFormat` and `prompt` are **top-level** options on `generateTranscription`, not `modelOptions` keys. @@ -157,6 +163,32 @@ const result = await generateTranscription({ | `srt` | SubRip subtitle format | | `verbose_json` | Detailed JSON with timestamps and segments | | `vtt` | WebVTT subtitle format | +| `diarized_json` | JSON with speaker-labeled segments. Only supported by `gpt-4o-transcribe-diarize`. | + +### Speaker Diarization + +Use `gpt-4o-transcribe-diarize` when you need speaker labels. TanStack AI defaults this model to `responseFormat: 'diarized_json'` and sends `chunking_strategy: 'auto'` unless you provide a chunking strategy yourself. + +```typescript +const result = await generateTranscription({ + adapter: openaiTranscription('gpt-4o-transcribe-diarize'), + audio: meetingAudioFile, + modelOptions: { + chunking_strategy: 'auto', + known_speaker_names: ['agent', 'customer'], + known_speaker_references: [ + 'data:audio/wav;base64,...', + 'data:audio/wav;base64,...', + ], + }, +}) + +for (const segment of result.segments ?? []) { + console.log(segment.speaker, segment.start, segment.end, segment.text) +} +``` + +OpenAI accepts up to four known speaker references. The diarization model does not support `prompt`, `include`, or `timestamp_granularities`; the adapter rejects those combinations before making the API request. ## Response Format @@ -543,3 +575,4 @@ const adapter = createOpenaiTranscription('whisper-1', 'your-openai-api-key') 6. **Timestamps**: Request `verbose_json` format and enable `timestamp_granularities: ['word', 'segment']` when you need timing information for captions or synchronization. +7. **Diarization**: Use `gpt-4o-transcribe-diarize` with `diarized_json` output for multi-speaker audio. Keep `chunking_strategy: 'auto'` unless you need custom VAD tuning. diff --git a/docs/reference/interfaces/TranscriptionOptions.md b/docs/reference/interfaces/TranscriptionOptions.md index 3a87dcb06..ab31f7418 100644 --- a/docs/reference/interfaces/TranscriptionOptions.md +++ b/docs/reference/interfaces/TranscriptionOptions.md @@ -95,7 +95,7 @@ An optional prompt to guide the transcription ### responseFormat? ```ts -optional responseFormat: "text" | "json" | "srt" | "verbose_json" | "vtt"; +optional responseFormat: "text" | "json" | "srt" | "verbose_json" | "vtt" | "diarized_json"; ``` Defined in: [packages/ai/src/types.ts:1724](https://github.com/TanStack/ai/blob/main/packages/ai/src/types.ts#L1724) diff --git a/packages/ai-client/src/generation-types.ts b/packages/ai-client/src/generation-types.ts index 05d695cf8..b3b824e29 100644 --- a/packages/ai-client/src/generation-types.ts +++ b/packages/ai-client/src/generation-types.ts @@ -265,7 +265,13 @@ export interface TranscriptionGenerateInput { /** An optional prompt to guide the transcription */ prompt?: string /** The format of the transcription output */ - responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' + responseFormat?: + | 'json' + | 'text' + | 'srt' + | 'verbose_json' + | 'vtt' + | 'diarized_json' /** Model-specific options */ modelOptions?: Record } diff --git a/packages/ai-openai/src/adapters/transcription.ts b/packages/ai-openai/src/adapters/transcription.ts index 249cbf8da..9893c6517 100644 --- a/packages/ai-openai/src/adapters/transcription.ts +++ b/packages/ai-openai/src/adapters/transcription.ts @@ -14,6 +14,24 @@ import type { OpenAITranscriptionModel } from '../model-meta' import type { OpenAITranscriptionProviderOptions } from '../audio/transcription-provider-options' import type { OpenAIClientConfig } from '../utils/client' +const DIARIZE_MODELS = ['gpt-4o-transcribe-diarize'] as const + +type DiarizeModel = (typeof DIARIZE_MODELS)[number] + +function isDiarizeModel(model: string): model is DiarizeModel { + return DIARIZE_MODELS.includes(model as DiarizeModel) +} + +function mapDiarizedSegmentId(id: string, index: number): number { + const match = /^seg_(\d+)$/.exec(id) + if (match) return Number(match[1]) + + const numericId = Number(id) + if (!Number.isNaN(numericId)) return numericId + + return index +} + /** * Build TokenUsage from transcription response. * Whisper-1 uses duration-based billing, GPT-4o models use token-based billing. @@ -86,12 +104,12 @@ export interface OpenAITranscriptionConfig extends OpenAIClientConfig {} * OpenAI Transcription (Speech-to-Text) Adapter * * Tree-shakeable adapter for OpenAI audio transcription functionality. - * Supports whisper-1, gpt-4o-transcribe, gpt-4o-mini-transcribe, and gpt-4o-transcribe-diarize models. + * Supports whisper-1, gpt-4o-transcribe, gpt-4o-mini-transcribe, and gpt-4o-transcribe-diarize. * * Features: * - Multiple transcription models with different capabilities * - Language detection or specification - * - Multiple output formats: json, text, srt, verbose_json, vtt + * - Multiple output formats: json, text, srt, verbose_json, vtt, diarized_json * - Word and segment-level timestamps (with verbose_json — whisper-1 only; * gpt-4o-* transcribe models accept only json/text and reject verbose_json * with HTTP 400) @@ -116,12 +134,24 @@ export class OpenAITranscriptionAdapter< options const file = this.prepareAudioFile(audio) + const isDiarizeTranscriptionModel = isDiarizeModel(model) + const useDiarized = + responseFormat === 'diarized_json' || + (isDiarizeTranscriptionModel && responseFormat === undefined) + this.validateDiarizationOptions({ + model, + prompt, + responseFormat, + modelOptions, + }) // With exactOptionalPropertyTypes, vendor SDK request shapes reject // `T | undefined` in optional fields. Build the request incrementally and // only set optional fields when they're actually defined. - const responseFormatValue = this.mapResponseFormat(responseFormat) - const request: OpenAI_SDK.Audio.TranscriptionCreateParams = { + const responseFormatValue = useDiarized + ? 'diarized_json' + : this.mapResponseFormat(responseFormat) + const request: OpenAI_SDK.Audio.TranscriptionCreateParamsNonStreaming = { model, file, ...(modelOptions ?? {}), @@ -132,6 +162,12 @@ export class OpenAITranscriptionAdapter< if (prompt !== undefined) { request.prompt = prompt } + if ( + isDiarizeTranscriptionModel && + modelOptions?.chunking_strategy === undefined + ) { + request.chunking_strategy = 'auto' + } if (responseFormatValue !== undefined) { request.response_format = responseFormatValue } @@ -139,14 +175,44 @@ export class OpenAITranscriptionAdapter< // Only Whisper supports verbose_json. The gpt-4o-* transcribe models // accept only json/text and reject verbose_json with HTTP 400. const useVerbose = - responseFormat === 'verbose_json' || + (!useDiarized && responseFormat === 'verbose_json') || (!responseFormat && model === 'whisper-1') try { options.logger.request( - `activity=transcription provider=${this.name} model=${model} verbose=${useVerbose}`, + `activity=transcription provider=${this.name} model=${model} verbose=${useVerbose} diarized=${useDiarized}`, { provider: this.name, model }, ) + if (useDiarized) { + const response = (await this.client.audio.transcriptions.create( + request, + )) as OpenAI_SDK.Audio.TranscriptionDiarized + + const segments = response.segments.map( + (segment, index): TranscriptionSegment => ({ + id: mapDiarizedSegmentId(segment.id, index), + start: segment.start, + end: segment.end, + text: segment.text, + speaker: segment.speaker, + }), + ) + + const usage = buildTranscriptionUsage( + model, + response.duration, + response, + ) + return { + id: generateId(this.name), + model, + text: response.text, + duration: response.duration, + ...(segments.length > 0 && { segments }), + ...(usage !== undefined && { usage }), + } + } + if (useVerbose) { const response = (await this.client.audio.transcriptions.create({ ...request, @@ -188,20 +254,20 @@ export class OpenAITranscriptionAdapter< ...(words !== undefined && { words }), ...(usage !== undefined && { usage }), } - } else { - const response = await this.client.audio.transcriptions.create(request) + } - const usage = - typeof response === 'string' - ? undefined - : buildTranscriptionUsage(model, undefined, response) - return { - id: generateId(this.name), - model, - text: typeof response === 'string' ? response : response.text, - ...(language !== undefined && { language }), - ...(usage !== undefined && { usage }), - } + const response = await this.client.audio.transcriptions.create(request) + + const usage = + typeof response === 'string' + ? undefined + : buildTranscriptionUsage(model, undefined, response) + return { + id: generateId(this.name), + model, + text: typeof response === 'string' ? response : response.text, + ...(language !== undefined && { language }), + ...(usage !== undefined && { usage }), } } catch (error: unknown) { options.logger.errors(`${this.name}.transcribe fatal`, { @@ -257,8 +323,81 @@ export class OpenAITranscriptionAdapter< } } + private validateDiarizationOptions({ + model, + prompt, + responseFormat, + modelOptions, + }: Pick< + TranscriptionOptions, + 'model' | 'prompt' | 'responseFormat' | 'modelOptions' + >): void { + const isDiarizeTranscriptionModel = isDiarizeModel(model) + + if ( + !isDiarizeTranscriptionModel && + (responseFormat === 'diarized_json' || + modelOptions?.known_speaker_names !== undefined || + modelOptions?.known_speaker_references !== undefined) + ) { + throw new Error( + 'OpenAI speaker diarization options are only supported with OpenAI diarization transcription models.', + ) + } + + if (!isDiarizeTranscriptionModel) return + + if (prompt !== undefined) { + throw new Error( + 'OpenAI diarization transcription models do not support prompts.', + ) + } + + if (modelOptions?.include !== undefined) { + throw new Error( + 'OpenAI diarization transcription models do not support the include option.', + ) + } + + if (modelOptions?.timestamp_granularities !== undefined) { + throw new Error( + 'OpenAI diarization transcription models do not support timestamp_granularities.', + ) + } + + if (modelOptions?.known_speaker_names !== undefined) { + const knownSpeakerCount = modelOptions.known_speaker_names.length + if (knownSpeakerCount > 4) { + throw new Error( + 'OpenAI diarization transcription models support at most 4 known speaker names.', + ) + } + } + + if (modelOptions?.known_speaker_references !== undefined) { + const knownSpeakerReferenceCount = + modelOptions.known_speaker_references.length + if (knownSpeakerReferenceCount > 4) { + throw new Error( + 'OpenAI diarization transcription models support at most 4 known speaker references.', + ) + } + } + + if ( + modelOptions?.known_speaker_names !== undefined && + modelOptions.known_speaker_references !== undefined && + modelOptions.known_speaker_names.length !== + modelOptions.known_speaker_references.length + ) { + throw new Error( + 'OpenAI diarization known_speaker_names and known_speaker_references must have matching lengths.', + ) + } + } + protected mapResponseFormat( - format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt', + format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' | 'diarized_json', ): OpenAI_SDK.Audio.TranscriptionCreateParams['response_format'] { if (!format) return 'json' return format diff --git a/packages/ai-openai/src/audio/transcription-provider-options.ts b/packages/ai-openai/src/audio/transcription-provider-options.ts index 17f619cb0..befa5df6a 100644 --- a/packages/ai-openai/src/audio/transcription-provider-options.ts +++ b/packages/ai-openai/src/audio/transcription-provider-options.ts @@ -38,4 +38,14 @@ export interface OpenAITranscriptionProviderOptions { * Optional list of audio samples (as data URLs) that contain known speaker references matching known_speaker_names[]. Each sample must be between 2 and 10 seconds, and can use any of the same input audio formats supported by file. */ known_speaker_references?: Array + /** + * Controls how the audio is cut into chunks. Required by OpenAI when + * `gpt-4o-transcribe-diarize` input is longer than 30 seconds. Use `"auto"` + * for the service-managed VAD strategy, or pass a `server_vad` config to tune + * segmentation. + */ + chunking_strategy?: + | 'auto' + | OpenAI.Audio.TranscriptionCreateParams.VadConfig + | null } diff --git a/packages/ai-openai/tests/transcription-adapter.test.ts b/packages/ai-openai/tests/transcription-adapter.test.ts new file mode 100644 index 000000000..fc68d3b86 --- /dev/null +++ b/packages/ai-openai/tests/transcription-adapter.test.ts @@ -0,0 +1,300 @@ +import { describe, expect, it, vi } from 'vitest' +import { resolveDebugOption } from '@tanstack/ai/adapter-internals' +import { + OpenAITranscriptionAdapter, + createOpenaiTranscription, +} from '../src/adapters/transcription' +import type OpenAI from 'openai' +import type { OpenAITranscriptionModel } from '../src/model-meta' + +const testLogger = resolveDebugOption(false) + +class TestOpenAITranscriptionAdapter< + TModel extends OpenAITranscriptionModel, +> extends OpenAITranscriptionAdapter { + spyOnTranscriptionsCreate() { + return vi.spyOn(this.client.audio.transcriptions, 'create') + } +} + +describe('OpenAI transcription adapter', () => { + it('creates a diarization-capable adapter', () => { + const adapter = createOpenaiTranscription( + 'gpt-4o-transcribe-diarize', + 'test-api-key', + ) + + expect(adapter).toBeInstanceOf(OpenAITranscriptionAdapter) + expect(adapter.name).toBe('openai') + }) + + it('defaults the diarization model to diarized_json with automatic chunking', async () => { + const mockResponse: OpenAI.Audio.TranscriptionDiarized = { + text: 'Agent: Hello\nCustomer: Hi', + duration: 2.2, + task: 'transcribe', + segments: [ + { + id: 'seg_0', + type: 'transcript.text.segment', + start: 0, + end: 1.4, + text: 'Hello', + speaker: 'agent', + }, + { + id: 'seg_1', + type: 'transcript.text.segment', + start: 1.5, + end: 2.2, + text: 'Hi', + speaker: 'customer', + }, + ], + } + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + const mockCreate = adapter + .spyOnTranscriptionsCreate() + .mockResolvedValueOnce(mockResponse) + + const result = await adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'meeting.wav', { type: 'audio/wav' }), + logger: testLogger, + }) + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'gpt-4o-transcribe-diarize', + response_format: 'diarized_json', + chunking_strategy: 'auto', + }), + ) + expect(result.text).toBe('Agent: Hello\nCustomer: Hi') + expect(result.segments).toEqual([ + { + id: 0, + start: 0, + end: 1.4, + text: 'Hello', + speaker: 'agent', + }, + { + id: 1, + start: 1.5, + end: 2.2, + text: 'Hi', + speaker: 'customer', + }, + ]) + }) + + it('passes explicit diarization chunking and known speaker references', async () => { + const mockResponse: OpenAI.Audio.TranscriptionDiarized = { + text: 'Speaker text', + duration: 1, + task: 'transcribe', + segments: [ + { + id: 'speaker-intro', + type: 'transcript.text.segment', + start: 0, + end: 1, + text: 'Speaker text', + speaker: 'agent', + }, + ], + } + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + const mockCreate = adapter + .spyOnTranscriptionsCreate() + .mockResolvedValueOnce(mockResponse) + + const result = await adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'meeting.wav', { type: 'audio/wav' }), + responseFormat: 'diarized_json', + modelOptions: { + chunking_strategy: { + type: 'server_vad', + threshold: 0.5, + prefix_padding_ms: 300, + silence_duration_ms: 500, + }, + known_speaker_names: ['agent'], + known_speaker_references: ['data:audio/wav;base64,AAA='], + }, + logger: testLogger, + }) + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + response_format: 'diarized_json', + chunking_strategy: { + type: 'server_vad', + threshold: 0.5, + prefix_padding_ms: 300, + silence_duration_ms: 500, + }, + known_speaker_names: ['agent'], + known_speaker_references: ['data:audio/wav;base64,AAA='], + }), + ) + expect(result.segments?.[0]?.id).toBe(0) + }) + + it('respects explicit null chunking for short diarization inputs', async () => { + const mockResponse: OpenAI.Audio.TranscriptionDiarized = { + text: 'Hello', + duration: 1, + task: 'transcribe', + segments: [], + } + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + const mockCreate = adapter + .spyOnTranscriptionsCreate() + .mockResolvedValueOnce(mockResponse) + + await adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'short.wav', { type: 'audio/wav' }), + modelOptions: { + chunking_strategy: null, + }, + logger: testLogger, + }) + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + chunking_strategy: null, + }), + ) + }) + + it('allows json or text response formats for the diarization model', async () => { + const mockResponse: OpenAI.Audio.Transcription = { + text: 'Hello', + } + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + const mockCreate = adapter + .spyOnTranscriptionsCreate() + .mockResolvedValueOnce(mockResponse) + + const result = await adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'short.wav', { type: 'audio/wav' }), + responseFormat: 'json', + logger: testLogger, + }) + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + response_format: 'json', + chunking_strategy: 'auto', + }), + ) + expect(result).toMatchObject({ + model: 'gpt-4o-transcribe-diarize', + text: 'Hello', + }) + }) + + it('rejects diarized_json with non-diarization models', async () => { + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'whisper-1', + ) + + await expect( + adapter.transcribe({ + model: 'whisper-1', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + responseFormat: 'diarized_json', + logger: testLogger, + }), + ).rejects.toThrow('speaker diarization options') + }) + + it('rejects unsupported diarization prompt and timestamp options', async () => { + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + prompt: 'Use product vocabulary', + logger: testLogger, + }), + ).rejects.toThrow('do not support prompts') + + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + timestamp_granularities: ['word'], + }, + logger: testLogger, + }), + ).rejects.toThrow('timestamp_granularities') + }) + + it('rejects unsupported diarization include and too many known speakers', async () => { + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + include: ['logprobs'], + }, + logger: testLogger, + }), + ).rejects.toThrow('include') + + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + known_speaker_names: ['a', 'b', 'c', 'd', 'e'], + }, + logger: testLogger, + }), + ).rejects.toThrow('at most 4') + + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + known_speaker_names: ['agent'], + known_speaker_references: [ + 'data:audio/wav;base64,AAA=', + 'data:audio/wav;base64,BBB=', + ], + }, + logger: testLogger, + }), + ).rejects.toThrow('matching lengths') + }) +}) diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md index 09a552b73..239e29308 100644 --- a/packages/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/ai/skills/ai-core/media-generation/SKILL.md @@ -259,7 +259,7 @@ const { generate, result, isLoading } = useGenerateSpeech({ ### 4. Audio Transcription Adapter: `openaiTranscription` (whisper-1, gpt-4o-transcribe, -gpt-4o-mini-transcribe). +gpt-4o-mini-transcribe, gpt-4o-transcribe-diarize). ```typescript import { generateTranscription } from '@tanstack/ai' @@ -271,7 +271,7 @@ const result = await generateTranscription({ language: 'en', responseFormat: 'verbose_json', modelOptions: { - include: ['segment', 'word'], + timestamp_granularities: ['word', 'segment'], }, }) @@ -281,6 +281,10 @@ const result = await generateTranscription({ // result.segments -- timestamped segments with optional word-level timestamps ``` +For speaker diarization, use `openaiTranscription('gpt-4o-transcribe-diarize')`. +It defaults to `responseFormat: 'diarized_json'` and `chunking_strategy: 'auto'`; +do not pass `prompt`, `include`, or `timestamp_granularities` with this model. + Client hook: ```tsx diff --git a/packages/ai/src/activities/generateTranscription/index.ts b/packages/ai/src/activities/generateTranscription/index.ts index 90262e9e9..9705d57b0 100644 --- a/packages/ai/src/activities/generateTranscription/index.ts +++ b/packages/ai/src/activities/generateTranscription/index.ts @@ -59,7 +59,13 @@ export interface TranscriptionActivityOptions< /** An optional prompt to guide the transcription */ prompt?: string /** The format of the transcription output */ - responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' + responseFormat?: + | 'json' + | 'text' + | 'srt' + | 'verbose_json' + | 'vtt' + | 'diarized_json' /** Provider-specific options for transcription */ modelOptions?: TranscriptionProviderOptions /** diff --git a/packages/ai/src/types.ts b/packages/ai/src/types.ts index 29b1c8032..c49f5ea41 100644 --- a/packages/ai/src/types.ts +++ b/packages/ai/src/types.ts @@ -1721,7 +1721,13 @@ export interface TranscriptionOptions< /** An optional prompt to guide the transcription */ prompt?: string /** The format of the transcription output */ - responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' + responseFormat?: + | 'json' + | 'text' + | 'srt' + | 'verbose_json' + | 'vtt' + | 'diarized_json' /** Model-specific options for transcription */ modelOptions?: TProviderOptions /** From fbb57a0b778c6c56eb47ac14040c08afed2b6095 Mon Sep 17 00:00:00 2001 From: 8times4 <46720448+8times4@users.noreply.github.com> Date: Thu, 28 May 2026 13:08:58 +0200 Subject: [PATCH 2/4] fix coderabbit recommendations --- .../ai-openai/src/adapters/transcription.ts | 35 ++++++++++++++++--- .../tests/transcription-adapter.test.ts | 20 +++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/packages/ai-openai/src/adapters/transcription.ts b/packages/ai-openai/src/adapters/transcription.ts index 9893c6517..d805a78f8 100644 --- a/packages/ai-openai/src/adapters/transcription.ts +++ b/packages/ai-openai/src/adapters/transcription.ts @@ -15,8 +15,12 @@ import type { OpenAITranscriptionProviderOptions } from '../audio/transcription- import type { OpenAIClientConfig } from '../utils/client' const DIARIZE_MODELS = ['gpt-4o-transcribe-diarize'] as const +const DIARIZE_RESPONSE_FORMATS = ['json', 'text', 'diarized_json'] as const type DiarizeModel = (typeof DIARIZE_MODELS)[number] +type OpenAITranscriptionResponseFormat = NonNullable< + TranscriptionOptions['responseFormat'] +> function isDiarizeModel(model: string): model is DiarizeModel { return DIARIZE_MODELS.includes(model as DiarizeModel) @@ -168,9 +172,7 @@ export class OpenAITranscriptionAdapter< ) { request.chunking_strategy = 'auto' } - if (responseFormatValue !== undefined) { - request.response_format = responseFormatValue - } + request.response_format = responseFormatValue // Only Whisper supports verbose_json. The gpt-4o-* transcribe models // accept only json/text and reject verbose_json with HTTP 400. @@ -347,6 +349,29 @@ export class OpenAITranscriptionAdapter< if (!isDiarizeTranscriptionModel) return + const modelOptionsResponseFormat = ( + modelOptions as + | { responseFormat?: OpenAITranscriptionResponseFormat } + | undefined + )?.responseFormat + const requestedResponseFormats = [ + this.mapResponseFormat(responseFormat), + ...(modelOptionsResponseFormat !== undefined + ? [this.mapResponseFormat(modelOptionsResponseFormat)] + : []), + ] + const unsupportedResponseFormat = requestedResponseFormats.find( + (format) => + !DIARIZE_RESPONSE_FORMATS.includes( + format as (typeof DIARIZE_RESPONSE_FORMATS)[number], + ), + ) + if (unsupportedResponseFormat !== undefined) { + throw new Error( + 'OpenAI diarization transcription models only support json, text, and diarized_json response formats.', + ) + } + if (prompt !== undefined) { throw new Error( 'OpenAI diarization transcription models do not support prompts.', @@ -397,8 +422,8 @@ export class OpenAITranscriptionAdapter< } protected mapResponseFormat( - format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' | 'diarized_json', - ): OpenAI_SDK.Audio.TranscriptionCreateParams['response_format'] { + format?: OpenAITranscriptionResponseFormat, + ): OpenAITranscriptionResponseFormat { if (!format) return 'json' return format } diff --git a/packages/ai-openai/tests/transcription-adapter.test.ts b/packages/ai-openai/tests/transcription-adapter.test.ts index fc68d3b86..993bcb135 100644 --- a/packages/ai-openai/tests/transcription-adapter.test.ts +++ b/packages/ai-openai/tests/transcription-adapter.test.ts @@ -211,6 +211,26 @@ describe('OpenAI transcription adapter', () => { }) }) + it('rejects unsupported response formats for the diarization model', async () => { + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + + for (const responseFormat of ['srt', 'vtt', 'verbose_json'] as const) { + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + responseFormat, + logger: testLogger, + }), + ).rejects.toThrow( + 'diarization transcription models only support json, text, and diarized_json', + ) + } + }) + it('rejects diarized_json with non-diarization models', async () => { const adapter = new TestOpenAITranscriptionAdapter( { apiKey: 'test-api-key' }, From c7cf3fc776cc47ec63b5d2d84316561b9a1f5dcf Mon Sep 17 00:00:00 2001 From: 8times4 <46720448+8times4@users.noreply.github.com> Date: Fri, 12 Jun 2026 18:27:38 +0200 Subject: [PATCH 3/4] refactor: update transcription options and support for diarization - Removed `transcribe-provider-options.ts` file and integrated its options into `transcription-provider-options.ts`. - Updated documentation to reflect changes in response formats, emphasizing the use of `modelOptions.response_format` for diarization. - Enhanced the transcription adapter to handle new model options and response formats, including support for speaker diarization. - Adjusted various components and tests to accommodate the new structure and ensure compatibility with the updated transcription features. --- docs/adapters/openai.md | 2 +- docs/comparison/vercel-ai-sdk.md | 2 +- docs/media/generation-hooks.md | 2 +- docs/media/transcription.md | 12 +- .../interfaces/TranscriptionOptions.md | 2 +- .../ts-react-chat/src/lib/audio-providers.ts | 26 ++- .../src/lib/server-audio-adapters.ts | 2 + examples/ts-react-chat/src/lib/server-fns.ts | 14 +- .../src/routes/api.transcribe.ts | 13 +- .../src/routes/generations.transcription.tsx | 15 +- knip.json | 1 - packages/ai-client/src/generation-types.ts | 9 +- .../ai-openai/src/adapters/transcription.ts | 166 +++++++++++------- .../src/audio/transcribe-provider-options.ts | 128 -------------- .../audio/transcription-provider-options.ts | 17 ++ .../tests/transcription-adapter.test.ts | 95 +++++++++- .../skills/ai-core/media-generation/SKILL.md | 2 +- .../activities/generateTranscription/index.ts | 14 +- packages/ai/src/types.ts | 15 +- testing/e2e/fixtures/transcription/basic.json | 2 +- .../fixtures/transcription/diarization.json | 32 ++++ .../e2e/src/components/TranscriptionUI.tsx | 57 +++++- testing/e2e/src/lib/feature-support.ts | 1 + testing/e2e/src/lib/features.ts | 4 + testing/e2e/src/lib/media-providers.ts | 7 +- testing/e2e/src/lib/server-functions.ts | 7 + testing/e2e/src/lib/types.ts | 2 + testing/e2e/src/routes/$provider/$feature.tsx | 3 + .../src/routes/api.transcription.stream.ts | 26 ++- testing/e2e/src/routes/api.transcription.ts | 26 ++- testing/e2e/tests/transcription.spec.ts | 40 +++++ 31 files changed, 501 insertions(+), 243 deletions(-) delete mode 100644 packages/ai-openai/src/audio/transcribe-provider-options.ts create mode 100644 testing/e2e/fixtures/transcription/diarization.json diff --git a/docs/adapters/openai.md b/docs/adapters/openai.md index af5de6d1f..05e5da9b5 100644 --- a/docs/adapters/openai.md +++ b/docs/adapters/openai.md @@ -333,7 +333,7 @@ for (const segment of result.segments ?? []) { } ``` -`gpt-4o-transcribe-diarize` defaults to `responseFormat: "diarized_json"` and `chunking_strategy: "auto"`. OpenAI does not support `prompt`, `include`, or `timestamp_granularities` with diarized transcription. +`gpt-4o-transcribe-diarize` defaults to `modelOptions.response_format: "diarized_json"` and `chunking_strategy: "auto"`. OpenAI does not support `prompt`, `include`, or `timestamp_granularities` with diarized transcription. ## Environment Variables diff --git a/docs/comparison/vercel-ai-sdk.md b/docs/comparison/vercel-ai-sdk.md index c6c8e001b..5e1f3954a 100644 --- a/docs/comparison/vercel-ai-sdk.md +++ b/docs/comparison/vercel-ai-sdk.md @@ -409,7 +409,7 @@ const result = await generateSpeech({ }) ``` -**Transcription** - `generateTranscription()` supports 6 output formats (json, text, srt, verbose_json, vtt, diarized_json), word-level timestamps with confidence scores, and four providers (OpenAI, Grok, ElevenLabs, fal.ai), with speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model. +**Transcription** - `generateTranscription()` supports common output formats (json, text, srt, verbose_json, vtt), word-level timestamps with confidence scores, and four providers (OpenAI, Grok, ElevenLabs, fal.ai), with speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model. ```ts import { generateTranscription } from '@tanstack/ai' diff --git a/docs/media/generation-hooks.md b/docs/media/generation-hooks.md index eda6c2496..743ad1458 100644 --- a/docs/media/generation-hooks.md +++ b/docs/media/generation-hooks.md @@ -214,7 +214,7 @@ The `generate` function accepts a `TranscriptionGenerateInput`: | `audio` | `string \| File \| Blob \| ArrayBuffer` | Audio data -- base64 string, File, Blob, or ArrayBuffer (required) | | `language` | `string` | Language in ISO-639-1 format (e.g., `"en"`) | | `prompt` | `string` | Optional prompt to guide the transcription | -| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt' \| 'diarized_json'` | Output format | +| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt'` | Common output format | | `modelOptions` | `Record` | Model-specific options | ## useSummarize diff --git a/docs/media/transcription.md b/docs/media/transcription.md index c38a82585..b7b86dbbc 100644 --- a/docs/media/transcription.md +++ b/docs/media/transcription.md @@ -105,7 +105,7 @@ for (const segment of result.segments ?? []) { | `audio` | `File \| string` | Audio data (File object or base64 string) - required | | `language` | `string` | Language code (e.g., "en", "es", "fr") | | `prompt` | `string` | Optional prompt to guide transcription style or terms. Not supported with `gpt-4o-transcribe-diarize`. | -| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt' \| 'diarized_json'` | Output format | +| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt'` | Common output format | ### Supported Languages @@ -148,6 +148,7 @@ const result = await generateTranscription({ | `temperature` | `number` | Sampling temperature (0 to 1) | | `timestamp_granularities` | `Array<'word' \| 'segment'>` | Timestamp granularity to populate (`whisper-1` only; requires top-level `responseFormat: 'verbose_json'`) | | `include` | `string[]` | Additional values to include in the response (e.g., `logprobs`) | +| `response_format` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt' \| 'diarized_json'` | Raw OpenAI response format. Use `diarized_json` here for speaker-labeled diarization output. | | `chunking_strategy` | `'auto' \| { type: 'server_vad', ... } \| null` | Audio chunking strategy for `gpt-4o-transcribe-diarize`; required by OpenAI for diarization inputs longer than 30 seconds | | `known_speaker_names` | `string[]` | Up to four speaker labels for diarization | | `known_speaker_references` | `string[]` | 2-10 second data URL audio samples matching `known_speaker_names` | @@ -163,11 +164,12 @@ const result = await generateTranscription({ | `srt` | SubRip subtitle format | | `verbose_json` | Detailed JSON with timestamps and segments | | `vtt` | WebVTT subtitle format | -| `diarized_json` | JSON with speaker-labeled segments. Only supported by `gpt-4o-transcribe-diarize`. | + +OpenAI's `gpt-4o-transcribe-diarize` also supports `modelOptions.response_format: 'diarized_json'` for speaker-labeled segments. ### Speaker Diarization -Use `gpt-4o-transcribe-diarize` when you need speaker labels. TanStack AI defaults this model to `responseFormat: 'diarized_json'` and sends `chunking_strategy: 'auto'` unless you provide a chunking strategy yourself. +Use `gpt-4o-transcribe-diarize` when you need speaker labels. TanStack AI defaults this model to `modelOptions.response_format: 'diarized_json'` and sends `chunking_strategy: 'auto'` unless you provide a chunking strategy yourself. ```typescript const result = await generateTranscription({ @@ -573,6 +575,6 @@ const adapter = createOpenaiTranscription('whisper-1', 'your-openai-api-key') 5. **Prompting**: Use the `prompt` option to provide context or expected vocabulary (e.g., technical terms, names). -6. **Timestamps**: Request `verbose_json` format and enable `timestamp_granularities: ['word', 'segment']` when you need timing information for captions or synchronization. +6. **Timestamps**: Request `responseFormat: 'verbose_json'` and set `modelOptions.timestamp_granularities` when you need timing information for captions or synchronization. -7. **Diarization**: Use `gpt-4o-transcribe-diarize` with `diarized_json` output for multi-speaker audio. Keep `chunking_strategy: 'auto'` unless you need custom VAD tuning. +7. **Diarization**: Use `gpt-4o-transcribe-diarize` with `modelOptions.response_format: 'diarized_json'` output for multi-speaker audio. Keep `chunking_strategy: 'auto'` unless you need custom VAD tuning. diff --git a/docs/reference/interfaces/TranscriptionOptions.md b/docs/reference/interfaces/TranscriptionOptions.md index ab31f7418..3a87dcb06 100644 --- a/docs/reference/interfaces/TranscriptionOptions.md +++ b/docs/reference/interfaces/TranscriptionOptions.md @@ -95,7 +95,7 @@ An optional prompt to guide the transcription ### responseFormat? ```ts -optional responseFormat: "text" | "json" | "srt" | "verbose_json" | "vtt" | "diarized_json"; +optional responseFormat: "text" | "json" | "srt" | "verbose_json" | "vtt"; ``` Defined in: [packages/ai/src/types.ts:1724](https://github.com/TanStack/ai/blob/main/packages/ai/src/types.ts#L1724) diff --git a/examples/ts-react-chat/src/lib/audio-providers.ts b/examples/ts-react-chat/src/lib/audio-providers.ts index 5ff72fae2..283f21779 100644 --- a/examples/ts-react-chat/src/lib/audio-providers.ts +++ b/examples/ts-react-chat/src/lib/audio-providers.ts @@ -6,6 +6,8 @@ * and audio generation flows. */ +import type { TranscriptionGenerateInput } from '@tanstack/ai-client' + export type SpeechProviderId = | 'openai' | 'gemini' @@ -87,13 +89,22 @@ export const SPEECH_PROVIDERS: ReadonlyArray = [ }, ] -export type TranscriptionProviderId = 'openai' | 'fal' | 'grok' | 'elevenlabs' +export type TranscriptionProviderId = + | 'openai' + | 'openai-diarize' + | 'fal' + | 'grok' + | 'elevenlabs' export interface TranscriptionProviderConfig { id: TranscriptionProviderId label: string model: string description: string + transcriptionOptions?: Pick< + TranscriptionGenerateInput, + 'responseFormat' | 'modelOptions' + > } export const TRANSCRIPTION_PROVIDERS: ReadonlyArray = @@ -104,6 +115,19 @@ export const TRANSCRIPTION_PROVIDERS: ReadonlyArray model: 'whisper-1', description: 'OpenAI Whisper transcription with optional streaming.', }, + { + id: 'openai-diarize', + label: 'OpenAI Diarize', + model: 'gpt-4o-transcribe-diarize', + description: + 'OpenAI diarized transcription with speaker-labeled segments.', + transcriptionOptions: { + modelOptions: { + response_format: 'diarized_json', + chunking_strategy: 'auto', + }, + }, + }, { id: 'fal', label: 'Fal Whisper', diff --git a/examples/ts-react-chat/src/lib/server-audio-adapters.ts b/examples/ts-react-chat/src/lib/server-audio-adapters.ts index ff9d8f47c..46cbb7c42 100644 --- a/examples/ts-react-chat/src/lib/server-audio-adapters.ts +++ b/examples/ts-react-chat/src/lib/server-audio-adapters.ts @@ -65,6 +65,8 @@ export function buildTranscriptionAdapter( switch (config.id) { case 'openai': return openaiTranscription(config.model as 'whisper-1') + case 'openai-diarize': + return openaiTranscription(config.model as 'gpt-4o-transcribe-diarize') case 'fal': return falTranscription(config.model) case 'grok': diff --git a/examples/ts-react-chat/src/lib/server-fns.ts b/examples/ts-react-chat/src/lib/server-fns.ts index b9b8ef62b..1c8109be4 100644 --- a/examples/ts-react-chat/src/lib/server-fns.ts +++ b/examples/ts-react-chat/src/lib/server-fns.ts @@ -78,7 +78,11 @@ const SPEECH_PROVIDER_SCHEMA = z .optional() const TRANSCRIPTION_PROVIDER_SCHEMA = z - .enum(['openai', 'fal', 'grok', 'elevenlabs']) + .enum(['openai', 'openai-diarize', 'fal', 'grok', 'elevenlabs']) + .optional() + +const TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA = z + .enum(['json', 'text', 'srt', 'verbose_json', 'vtt']) .optional() const AUDIO_PROVIDER_SCHEMA = z @@ -144,6 +148,8 @@ export const transcribeFn = createServerFn({ method: 'POST' }) z.object({ audio: z.string(), language: z.string().optional(), + responseFormat: TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA, + modelOptions: z.record(z.string(), z.any()).optional(), provider: TRANSCRIPTION_PROVIDER_SCHEMA, }), ) @@ -162,6 +168,8 @@ export const transcribeFn = createServerFn({ method: 'POST' }) adapter, audio: data.audio, language: data.language, + responseFormat: data.responseFormat, + modelOptions: data.modelOptions, }) }) @@ -316,6 +324,8 @@ export const transcribeStreamFn = createServerFn({ method: 'POST' }) z.object({ audio: z.string(), language: z.string().optional(), + responseFormat: TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA, + modelOptions: z.record(z.string(), z.any()).optional(), provider: TRANSCRIPTION_PROVIDER_SCHEMA, }), ) @@ -335,6 +345,8 @@ export const transcribeStreamFn = createServerFn({ method: 'POST' }) adapter, audio: data.audio, language: data.language, + responseFormat: data.responseFormat, + modelOptions: data.modelOptions, stream: true, }), ) diff --git a/examples/ts-react-chat/src/routes/api.transcribe.ts b/examples/ts-react-chat/src/routes/api.transcribe.ts index b841ea904..a26800547 100644 --- a/examples/ts-react-chat/src/routes/api.transcribe.ts +++ b/examples/ts-react-chat/src/routes/api.transcribe.ts @@ -8,12 +8,18 @@ import { } from '../lib/server-audio-adapters' const TRANSCRIPTION_PROVIDER_SCHEMA = z - .enum(['openai', 'fal', 'grok', 'elevenlabs']) + .enum(['openai', 'openai-diarize', 'fal', 'grok', 'elevenlabs']) + .optional() + +const TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA = z + .enum(['json', 'text', 'srt', 'verbose_json', 'vtt']) .optional() const TRANSCRIBE_BODY_SCHEMA = z.object({ audio: z.string().min(1), language: z.string().optional(), + responseFormat: TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA, + modelOptions: z.record(z.string(), z.any()).optional(), provider: TRANSCRIPTION_PROVIDER_SCHEMA, }) @@ -55,7 +61,8 @@ export const Route = createFileRoute('/api/transcribe')({ }) } - const { audio, language, provider } = parsed.data + const { audio, language, responseFormat, modelOptions, provider } = + parsed.data try { const adapter = buildTranscriptionAdapter(provider ?? 'openai') @@ -64,6 +71,8 @@ export const Route = createFileRoute('/api/transcribe')({ adapter, audio, language, + responseFormat, + modelOptions, stream: true, }) diff --git a/examples/ts-react-chat/src/routes/generations.transcription.tsx b/examples/ts-react-chat/src/routes/generations.transcription.tsx index b03f8838f..d889cd452 100644 --- a/examples/ts-react-chat/src/routes/generations.transcription.tsx +++ b/examples/ts-react-chat/src/routes/generations.transcription.tsx @@ -34,6 +34,8 @@ function TranscriptionForm({ data: { audio: input.audio as string, language: input.language, + responseFormat: input.responseFormat, + modelOptions: input.modelOptions, provider: config.id, }, }), @@ -45,6 +47,8 @@ function TranscriptionForm({ data: { audio: input.audio as string, language: input.language, + responseFormat: input.responseFormat, + modelOptions: input.modelOptions, provider: config.id, }, }), @@ -75,7 +79,11 @@ function TranscriptionUI({ ) const dataUrl = `data:${file.type};base64,${base64}` - await generate({ audio: dataUrl, language: 'en' }) + await generate({ + audio: dataUrl, + language: 'en', + ...config.transcriptionOptions, + }) if (fileInputRef.current) { fileInputRef.current.value = '' @@ -159,6 +167,11 @@ function TranscriptionUI({ {seg.start.toFixed(1)}s - {seg.end.toFixed(1)}s + {seg.speaker && ( + + {seg.speaker} + + )} {seg.text} ))} diff --git a/knip.json b/knip.json index 67ae81303..0b58e1a4c 100644 --- a/knip.json +++ b/knip.json @@ -13,7 +13,6 @@ "packages/ai-openai/live-tests/**", "packages/ai-openai/src/**/*.test.ts", "packages/ai-openai/src/audio/audio-provider-options.ts", - "packages/ai-openai/src/audio/transcribe-provider-options.ts", "packages/ai-openai/src/image/image-provider-options.ts", "packages/ai-devtools/src/production.ts", "codemods/**/__testfixtures__/**" diff --git a/packages/ai-client/src/generation-types.ts b/packages/ai-client/src/generation-types.ts index b3b824e29..d2d8ed47a 100644 --- a/packages/ai-client/src/generation-types.ts +++ b/packages/ai-client/src/generation-types.ts @@ -1,4 +1,5 @@ import type { StreamChunk } from '@tanstack/ai/client' +import type { TranscriptionResponseFormat } from '@tanstack/ai' import type { ConnectConnectionAdapter } from './connection-adapters' import type { AIDevtoolsClientMetadata } from './devtools' import type { @@ -265,13 +266,7 @@ export interface TranscriptionGenerateInput { /** An optional prompt to guide the transcription */ prompt?: string /** The format of the transcription output */ - responseFormat?: - | 'json' - | 'text' - | 'srt' - | 'verbose_json' - | 'vtt' - | 'diarized_json' + responseFormat?: TranscriptionResponseFormat /** Model-specific options */ modelOptions?: Record } diff --git a/packages/ai-openai/src/adapters/transcription.ts b/packages/ai-openai/src/adapters/transcription.ts index d805a78f8..1e26bb5af 100644 --- a/packages/ai-openai/src/adapters/transcription.ts +++ b/packages/ai-openai/src/adapters/transcription.ts @@ -11,16 +11,22 @@ import type { } from '@tanstack/ai' import type OpenAI_SDK from 'openai' import type { OpenAITranscriptionModel } from '../model-meta' -import type { OpenAITranscriptionProviderOptions } from '../audio/transcription-provider-options' +import type { + OpenAITranscriptionProviderOptions, + OpenAITranscriptionResponseFormat, +} from '../audio/transcription-provider-options' import type { OpenAIClientConfig } from '../utils/client' const DIARIZE_MODELS = ['gpt-4o-transcribe-diarize'] as const const DIARIZE_RESPONSE_FORMATS = ['json', 'text', 'diarized_json'] as const type DiarizeModel = (typeof DIARIZE_MODELS)[number] -type OpenAITranscriptionResponseFormat = NonNullable< - TranscriptionOptions['responseFormat'] -> +type OpenAITranscriptionResponseMode = 'diarized' | 'verbose' | 'plain' + +interface OpenAITranscriptionRequestPlan { + request: OpenAI_SDK.Audio.TranscriptionCreateParamsNonStreaming + responseMode: OpenAITranscriptionResponseMode +} function isDiarizeModel(model: string): model is DiarizeModel { return DIARIZE_MODELS.includes(model as DiarizeModel) @@ -134,58 +140,15 @@ export class OpenAITranscriptionAdapter< async transcribe( options: TranscriptionOptions, ): Promise { - const { model, audio, language, prompt, responseFormat, modelOptions } = - options - - const file = this.prepareAudioFile(audio) - const isDiarizeTranscriptionModel = isDiarizeModel(model) - const useDiarized = - responseFormat === 'diarized_json' || - (isDiarizeTranscriptionModel && responseFormat === undefined) - this.validateDiarizationOptions({ - model, - prompt, - responseFormat, - modelOptions, - }) - - // With exactOptionalPropertyTypes, vendor SDK request shapes reject - // `T | undefined` in optional fields. Build the request incrementally and - // only set optional fields when they're actually defined. - const responseFormatValue = useDiarized - ? 'diarized_json' - : this.mapResponseFormat(responseFormat) - const request: OpenAI_SDK.Audio.TranscriptionCreateParamsNonStreaming = { - model, - file, - ...(modelOptions ?? {}), - } - if (language !== undefined) { - request.language = language - } - if (prompt !== undefined) { - request.prompt = prompt - } - if ( - isDiarizeTranscriptionModel && - modelOptions?.chunking_strategy === undefined - ) { - request.chunking_strategy = 'auto' - } - request.response_format = responseFormatValue - - // Only Whisper supports verbose_json. The gpt-4o-* transcribe models - // accept only json/text and reject verbose_json with HTTP 400. - const useVerbose = - (!useDiarized && responseFormat === 'verbose_json') || - (!responseFormat && model === 'whisper-1') + const { model, language } = options + const { request, responseMode } = this.buildTranscriptionRequest(options) try { options.logger.request( - `activity=transcription provider=${this.name} model=${model} verbose=${useVerbose} diarized=${useDiarized}`, + `activity=transcription provider=${this.name} model=${model} verbose=${responseMode === 'verbose'} diarized=${responseMode === 'diarized'}`, { provider: this.name, model }, ) - if (useDiarized) { + if (responseMode === 'diarized') { const response = (await this.client.audio.transcriptions.create( request, )) as OpenAI_SDK.Audio.TranscriptionDiarized @@ -215,7 +178,7 @@ export class OpenAITranscriptionAdapter< } } - if (useVerbose) { + if (responseMode === 'verbose') { const response = (await this.client.audio.transcriptions.create({ ...request, response_format: 'verbose_json', @@ -280,6 +243,89 @@ export class OpenAITranscriptionAdapter< } } + private buildTranscriptionRequest( + options: TranscriptionOptions, + ): OpenAITranscriptionRequestPlan { + const { model, audio, language, prompt, responseFormat, modelOptions } = + options + const file = this.prepareAudioFile(audio) + const isDiarizeTranscriptionModel = isDiarizeModel(model) + const topLevelResponseFormat = responseFormat as + | OpenAITranscriptionResponseFormat + | undefined + const effectiveResponseFormat = + topLevelResponseFormat ?? modelOptions?.response_format + + this.validateDiarizationOptions({ + model, + prompt, + responseFormat: topLevelResponseFormat, + modelOptions, + }) + + const responseMode = this.resolveResponseMode({ + model, + isDiarizeTranscriptionModel, + effectiveResponseFormat, + }) + const responseFormatValue = + responseMode === 'diarized' + ? 'diarized_json' + : this.mapResponseFormat(effectiveResponseFormat) + + // With exactOptionalPropertyTypes, vendor SDK request shapes reject + // `T | undefined` in optional fields. Build the request incrementally and + // only set optional fields when they're actually defined. + const request: OpenAI_SDK.Audio.TranscriptionCreateParamsNonStreaming = { + model, + file, + ...(modelOptions ?? {}), + } + if (language !== undefined) { + request.language = language + } + if (prompt !== undefined) { + request.prompt = prompt + } + if ( + isDiarizeTranscriptionModel && + modelOptions?.chunking_strategy === undefined + ) { + request.chunking_strategy = 'auto' + } + request.response_format = responseFormatValue + + return { request, responseMode } + } + + private resolveResponseMode({ + model, + isDiarizeTranscriptionModel, + effectiveResponseFormat, + }: { + model: string + isDiarizeTranscriptionModel: boolean + effectiveResponseFormat?: OpenAITranscriptionResponseFormat + }): OpenAITranscriptionResponseMode { + if ( + effectiveResponseFormat === 'diarized_json' || + (isDiarizeTranscriptionModel && effectiveResponseFormat === undefined) + ) { + return 'diarized' + } + + // Only Whisper supports verbose_json. The gpt-4o-* transcribe models + // accept only json/text and reject verbose_json with HTTP 400. + if ( + effectiveResponseFormat === 'verbose_json' || + (effectiveResponseFormat === undefined && model === 'whisper-1') + ) { + return 'verbose' + } + + return 'plain' + } + protected prepareAudioFile(audio: string | File | Blob | ArrayBuffer): File { if (typeof File !== 'undefined' && audio instanceof File) { return audio @@ -332,13 +378,18 @@ export class OpenAITranscriptionAdapter< modelOptions, }: Pick< TranscriptionOptions, - 'model' | 'prompt' | 'responseFormat' | 'modelOptions' - >): void { + 'model' | 'prompt' | 'modelOptions' + > & { + responseFormat?: OpenAITranscriptionResponseFormat + }): void { const isDiarizeTranscriptionModel = isDiarizeModel(model) + const modelOptionsResponseFormat = modelOptions?.response_format if ( !isDiarizeTranscriptionModel && (responseFormat === 'diarized_json' || + modelOptionsResponseFormat === 'diarized_json' || + modelOptions?.chunking_strategy !== undefined || modelOptions?.known_speaker_names !== undefined || modelOptions?.known_speaker_references !== undefined) ) { @@ -349,11 +400,6 @@ export class OpenAITranscriptionAdapter< if (!isDiarizeTranscriptionModel) return - const modelOptionsResponseFormat = ( - modelOptions as - | { responseFormat?: OpenAITranscriptionResponseFormat } - | undefined - )?.responseFormat const requestedResponseFormats = [ this.mapResponseFormat(responseFormat), ...(modelOptionsResponseFormat !== undefined @@ -372,7 +418,7 @@ export class OpenAITranscriptionAdapter< ) } - if (prompt !== undefined) { + if (prompt !== undefined || modelOptions?.prompt !== undefined) { throw new Error( 'OpenAI diarization transcription models do not support prompts.', ) diff --git a/packages/ai-openai/src/audio/transcribe-provider-options.ts b/packages/ai-openai/src/audio/transcribe-provider-options.ts deleted file mode 100644 index 063e719ff..000000000 --- a/packages/ai-openai/src/audio/transcribe-provider-options.ts +++ /dev/null @@ -1,128 +0,0 @@ -export interface TranscribeProviderOptions { - /** - * The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. - * https://platform.openai.com/docs/api-reference/audio/createTranscription#audio_createtranscription-file - */ - file: File - /** - * The model to use for transcription. - * https://platform.openai.com/docs/api-reference/audio/createTranscription#audio_createtranscription-model - */ - model: string - - chunking_strategy: - | 'auto' - | { - type: 'server_vad' - /** - * Amount of audio to include before the VAD detected speech (in milliseconds). - * @default 300 - */ - prefix_padding_ms?: number - /** - * Duration of silence to detect speech stop (in milliseconds). With shorter values the model will respond more quickly, but may jump in on short pauses from the user. - * @default 200 - */ - silence_duration_ms: number - /** - * Sensitivity threshold (0.0 to 1.0) for voice activity detection. A higher threshold will require louder audio to activate the model, and thus might perform better in noisy environments. - * @default 0.5 - */ - threshold?: number - } - /** - * Additional information to include in the transcription response. logprobs will return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. logprobs only works with response_format set to json and only with the models gpt-4o-transcribe and gpt-4o-mini-transcribe. This field is not supported when using gpt-4o-transcribe-diarize. - */ - include?: Array - /** - * Optional list of speaker names that correspond to the audio samples provided in known_speaker_references[]. Each entry should be a short identifier (for example customer or agent). Up to 4 speakers are supported. - */ - known_speaker_names: Array - /** - * Optional list of audio samples (as data URLs) that contain known speaker references matching known_speaker_names[]. Each sample must be between 2 and 10 seconds, and can use any of the same input audio formats supported by file. - */ - known_speaker_references?: Array - /** - * The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency. - */ - language?: string - /** - * An optional prompt to guide the transcription model's style or to help with uncommon words or phrases. - */ - prompt?: string - /** - * The format of the output, in one of these options: json, text, srt, verbose_json, vtt, or diarized_json. For gpt-4o-transcribe and gpt-4o-mini-transcribe, the only supported format is json. For gpt-4o-transcribe-diarize, the supported formats are json, text, and diarized_json, with diarized_json required to receive speaker annotations. - */ - response_format?: - | 'json' - | 'text' - | 'srt' - | 'verbose_json' - | 'vtt' - | 'diarized_json' - - /** - * If set to true, the model response data will be streamed to the client as it is generated using server-sent events - * Note: Streaming is not supported for the whisper-1 model and will be ignored. - */ - stream?: boolean - /** - * The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit. - */ - temperature?: number - /** - * The timestamp granularities to populate for this transcription. response_format must be set verbose_json to use timestamp granularities. Either or both of these options are supported: word, or segment. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. This option is not available for gpt-4o-transcribe-diarize. - */ - timestamp_granularities?: Array<'word' | 'segment'> -} - -export const validateTemperature = (options: TranscribeProviderOptions) => { - if (options.temperature) { - if (options.temperature < 0 || options.temperature > 1) { - throw new Error('Temperature must be between 0 and 1.') - } - } -} - -export const validateStream = (options: TranscribeProviderOptions) => { - const unsupportedModels = ['whisper-1'] - if (options.stream) { - if (unsupportedModels.includes(options.model)) { - throw new Error(`The model ${options.model} does not support streaming.`) - } - } -} - -export const validatePrompt = (options: TranscribeProviderOptions) => { - const unsupportedModels = ['gpt-4o-transcribe-diarize'] - if (options.prompt) { - if (unsupportedModels.includes(options.model)) { - throw new Error(`The model ${options.model} does not support prompts.`) - } - } -} - -export const validateKnownSpeakerNames = ( - options: TranscribeProviderOptions, -) => { - if (options.known_speaker_names.length > 4) { - throw new Error('A maximum of 4 known speaker names are supported.') - } -} - -export const validateInclude = (options: TranscribeProviderOptions) => { - const unsupportedModels = ['gpt-4o-transcribe-diarize'] - if (options.include) { - if (unsupportedModels.includes(options.model)) { - throw new Error( - `The model ${options.model} does not support the include field.`, - ) - } - } - - if (options.include && options.response_format !== 'json') { - throw new Error( - 'The include field is only supported when response_format is set to json.', - ) - } -} diff --git a/packages/ai-openai/src/audio/transcription-provider-options.ts b/packages/ai-openai/src/audio/transcription-provider-options.ts index befa5df6a..487ef0155 100644 --- a/packages/ai-openai/src/audio/transcription-provider-options.ts +++ b/packages/ai-openai/src/audio/transcription-provider-options.ts @@ -1,4 +1,9 @@ import type OpenAI from 'openai' +import type { TranscriptionResponseFormat } from '@tanstack/ai' + +export type OpenAITranscriptionResponseFormat = + | TranscriptionResponseFormat + | 'diarized_json' /** * Provider-specific options for OpenAI Transcription @@ -30,6 +35,18 @@ export interface OpenAITranscriptionProviderOptions { * Either or both of these options are supported: word, or segment. */ timestamp_granularities?: Array<'word' | 'segment'> + /** + * Raw OpenAI response_format option. Prefer the top-level responseFormat + * argument for common transcription formats when using + * generateTranscription(). Use `diarized_json` here for OpenAI diarization + * output. + */ + response_format?: OpenAITranscriptionResponseFormat + /** + * Raw OpenAI prompt option. Prefer the top-level prompt argument when using + * generateTranscription(). + */ + prompt?: string /** * Optional list of speaker names that correspond to the audio samples provided in known_speaker_references[]. Each entry should be a short identifier (for example customer or agent). Up to 4 speakers are supported. */ diff --git a/packages/ai-openai/tests/transcription-adapter.test.ts b/packages/ai-openai/tests/transcription-adapter.test.ts index 993bcb135..128bb9da7 100644 --- a/packages/ai-openai/tests/transcription-adapter.test.ts +++ b/packages/ai-openai/tests/transcription-adapter.test.ts @@ -119,8 +119,8 @@ describe('OpenAI transcription adapter', () => { const result = await adapter.transcribe({ model: 'gpt-4o-transcribe-diarize', audio: new File([], 'meeting.wav', { type: 'audio/wav' }), - responseFormat: 'diarized_json', modelOptions: { + response_format: 'diarized_json', chunking_strategy: { type: 'server_vad', threshold: 0.5, @@ -149,6 +149,49 @@ describe('OpenAI transcription adapter', () => { expect(result.segments?.[0]?.id).toBe(0) }) + it('uses snake_case modelOptions response_format for diarized output', async () => { + const mockResponse: OpenAI.Audio.TranscriptionDiarized = { + text: 'Agent: Hello', + duration: 1, + task: 'transcribe', + segments: [ + { + id: 'seg_0', + type: 'transcript.text.segment', + start: 0, + end: 1, + text: 'Hello', + speaker: 'agent', + }, + ], + } + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + const mockCreate = adapter + .spyOnTranscriptionsCreate() + .mockResolvedValueOnce(mockResponse) + + const result = await adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'meeting.wav', { type: 'audio/wav' }), + modelOptions: { + response_format: 'diarized_json', + chunking_strategy: null, + }, + logger: testLogger, + }) + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + response_format: 'diarized_json', + chunking_strategy: null, + }), + ) + expect(result.segments?.[0]?.speaker).toBe('agent') + }) + it('respects explicit null chunking for short diarization inputs', async () => { const mockResponse: OpenAI.Audio.TranscriptionDiarized = { text: 'Hello', @@ -229,9 +272,22 @@ describe('OpenAI transcription adapter', () => { 'diarization transcription models only support json, text, and diarized_json', ) } + + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + response_format: 'verbose_json', + }, + logger: testLogger, + }), + ).rejects.toThrow( + 'diarization transcription models only support json, text, and diarized_json', + ) }) - it('rejects diarized_json with non-diarization models', async () => { + it('rejects diarization-only options with non-diarization models', async () => { const adapter = new TestOpenAITranscriptionAdapter( { apiKey: 'test-api-key' }, 'whisper-1', @@ -241,7 +297,29 @@ describe('OpenAI transcription adapter', () => { adapter.transcribe({ model: 'whisper-1', audio: new File([], 'audio.wav', { type: 'audio/wav' }), - responseFormat: 'diarized_json', + responseFormat: 'diarized_json' as never, + logger: testLogger, + }), + ).rejects.toThrow('speaker diarization options') + + await expect( + adapter.transcribe({ + model: 'whisper-1', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + response_format: 'diarized_json', + }, + logger: testLogger, + }), + ).rejects.toThrow('speaker diarization options') + + await expect( + adapter.transcribe({ + model: 'whisper-1', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + chunking_strategy: 'auto', + }, logger: testLogger, }), ).rejects.toThrow('speaker diarization options') @@ -262,6 +340,17 @@ describe('OpenAI transcription adapter', () => { }), ).rejects.toThrow('do not support prompts') + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + prompt: 'Use product vocabulary', + }, + logger: testLogger, + }), + ).rejects.toThrow('do not support prompts') + await expect( adapter.transcribe({ model: 'gpt-4o-transcribe-diarize', diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md index 239e29308..1ea23f01e 100644 --- a/packages/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/ai/skills/ai-core/media-generation/SKILL.md @@ -282,7 +282,7 @@ const result = await generateTranscription({ ``` For speaker diarization, use `openaiTranscription('gpt-4o-transcribe-diarize')`. -It defaults to `responseFormat: 'diarized_json'` and `chunking_strategy: 'auto'`; +It defaults to `modelOptions.response_format: 'diarized_json'` and `chunking_strategy: 'auto'`; do not pass `prompt`, `include`, or `timestamp_granularities` with this model. Client hook: diff --git a/packages/ai/src/activities/generateTranscription/index.ts b/packages/ai/src/activities/generateTranscription/index.ts index 9705d57b0..5c2653e68 100644 --- a/packages/ai/src/activities/generateTranscription/index.ts +++ b/packages/ai/src/activities/generateTranscription/index.ts @@ -11,7 +11,11 @@ import { resolveDebugOption } from '../../logger/resolve' import type { InternalLogger } from '../../logger/internal-logger' import type { DebugOption } from '../../logger/types' import type { TranscriptionAdapter } from './adapter' -import type { StreamChunk, TranscriptionResult } from '../../types' +import type { + StreamChunk, + TranscriptionResponseFormat, + TranscriptionResult, +} from '../../types' // =========================== // Activity Kind @@ -59,13 +63,7 @@ export interface TranscriptionActivityOptions< /** An optional prompt to guide the transcription */ prompt?: string /** The format of the transcription output */ - responseFormat?: - | 'json' - | 'text' - | 'srt' - | 'verbose_json' - | 'vtt' - | 'diarized_json' + responseFormat?: TranscriptionResponseFormat /** Provider-specific options for transcription */ modelOptions?: TranscriptionProviderOptions /** diff --git a/packages/ai/src/types.ts b/packages/ai/src/types.ts index c49f5ea41..9ed744855 100644 --- a/packages/ai/src/types.ts +++ b/packages/ai/src/types.ts @@ -1709,6 +1709,13 @@ export interface TTSResult { * Options for audio transcription. * These are the common options supported across providers. */ +export type TranscriptionResponseFormat = + | 'json' + | 'text' + | 'srt' + | 'verbose_json' + | 'vtt' + export interface TranscriptionOptions< TProviderOptions extends object = object, > { @@ -1721,13 +1728,7 @@ export interface TranscriptionOptions< /** An optional prompt to guide the transcription */ prompt?: string /** The format of the transcription output */ - responseFormat?: - | 'json' - | 'text' - | 'srt' - | 'verbose_json' - | 'vtt' - | 'diarized_json' + responseFormat?: TranscriptionResponseFormat /** Model-specific options for transcription */ modelOptions?: TProviderOptions /** diff --git a/testing/e2e/fixtures/transcription/basic.json b/testing/e2e/fixtures/transcription/basic.json index 2a1616e26..a936b4bff 100644 --- a/testing/e2e/fixtures/transcription/basic.json +++ b/testing/e2e/fixtures/transcription/basic.json @@ -1,7 +1,7 @@ { "fixtures": [ { - "match": { "userMessage": "audio.mpeg" }, + "match": { "model": "whisper-1", "userMessage": "audio.mpeg" }, "response": { "transcription": { "text": "I would like to buy a Fender Stratocaster please" diff --git a/testing/e2e/fixtures/transcription/diarization.json b/testing/e2e/fixtures/transcription/diarization.json new file mode 100644 index 000000000..bb2d8041b --- /dev/null +++ b/testing/e2e/fixtures/transcription/diarization.json @@ -0,0 +1,32 @@ +{ + "fixtures": [ + { + "match": { "model": "gpt-4o-transcribe-diarize" }, + "response": { + "transcription": { + "text": "agent: Welcome to the store.\ncustomer: I need a Fender Stratocaster.", + "language": "english", + "duration": 3.2, + "segments": [ + { + "id": "seg_0", + "type": "transcript.text.segment", + "start": 0, + "end": 1.4, + "text": "Welcome to the store.", + "speaker": "agent" + }, + { + "id": "seg_1", + "type": "transcript.text.segment", + "start": 1.5, + "end": 3.2, + "text": "I need a Fender Stratocaster.", + "speaker": "customer" + } + ] + } + } + } + ] +} diff --git a/testing/e2e/src/components/TranscriptionUI.tsx b/testing/e2e/src/components/TranscriptionUI.tsx index 47a76bac9..536118f43 100644 --- a/testing/e2e/src/components/TranscriptionUI.tsx +++ b/testing/e2e/src/components/TranscriptionUI.tsx @@ -6,10 +6,16 @@ import { import { generateTranscriptionFn } from '@/lib/server-functions' import type { TranscriptionResult } from '@tanstack/ai' import type { TranscriptionGenerateInput } from '@tanstack/ai-client' -import type { Mode, Provider } from '@/lib/types' +import type { Feature, Mode, Provider } from '@/lib/types' + +type TranscriptionFeature = Extract< + Feature, + 'transcription' | 'transcription-diarization' +> interface TranscriptionUIProps { provider: Provider + feature: TranscriptionFeature mode: Mode testId?: string aimockPort?: number @@ -21,12 +27,29 @@ const TEST_AUDIO_BASE64 = 'data:audio/mpeg;base64,SGVsbG8=' export function TranscriptionUI({ provider, + feature, mode, testId, aimockPort, }: TranscriptionUIProps) { + const isDiarization = feature === 'transcription-diarization' + const transcriptionInput: TranscriptionGenerateInput = { + audio: TEST_AUDIO_BASE64, + language: 'en', + ...(isDiarization + ? { + modelOptions: { + response_format: 'diarized_json', + chunking_strategy: 'auto', + known_speaker_names: ['agent', 'customer'], + known_speaker_references: [TEST_AUDIO_BASE64, TEST_AUDIO_BASE64], + }, + } + : {}), + } + const connectionOptions = () => { - const body = { provider, testId, aimockPort } + const body = { provider, feature, testId, aimockPort } if (mode === 'sse') { return { connection: fetchServerSentEvents('/api/transcription'), body } @@ -40,7 +63,10 @@ export function TranscriptionUI({ data: { audio: input.audio as string, language: input.language, + responseFormat: input.responseFormat, + modelOptions: input.modelOptions, provider, + feature, aimockPort, testId, }, @@ -56,7 +82,7 @@ export function TranscriptionUI({
)} {result && ( -

- {result.text} -

+
+

+ {result.text} +

+ {result.segments && result.segments.length > 0 && ( +
+ {result.segments.map((segment, index) => ( +
+ {segment.speaker && ( + + {segment.speaker} + + )} + {segment.text} +
+ ))} +
+ )} +
)} ) diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts index 6d6b950bd..9a61ed97c 100644 --- a/testing/e2e/src/lib/feature-support.ts +++ b/testing/e2e/src/lib/feature-support.ts @@ -182,6 +182,7 @@ export const matrix: Record> = { 'sound-effects': new Set(['elevenlabs']), tts: new Set(['openai', 'grok', 'elevenlabs']), transcription: new Set(['openai', 'grok', 'elevenlabs']), + 'transcription-diarization': new Set(['openai']), 'video-gen': new Set(['openai']), // Only Gemini currently surfaces a first-class stateful conversation API via // the adapter (geminiTextInteractions, behind @tanstack/ai-gemini/experimental). diff --git a/testing/e2e/src/lib/features.ts b/testing/e2e/src/lib/features.ts index 446859ce6..4656f7dbb 100644 --- a/testing/e2e/src/lib/features.ts +++ b/testing/e2e/src/lib/features.ts @@ -119,6 +119,10 @@ export const featureConfigs: Record = { tools: [], modelOptions: {}, }, + 'transcription-diarization': { + tools: [], + modelOptions: {}, + }, 'video-gen': { tools: [], modelOptions: {}, diff --git a/testing/e2e/src/lib/media-providers.ts b/testing/e2e/src/lib/media-providers.ts index 2fb534291..20ad722bf 100644 --- a/testing/e2e/src/lib/media-providers.ts +++ b/testing/e2e/src/lib/media-providers.ts @@ -96,11 +96,16 @@ export function createTranscriptionAdapter( provider: Provider, aimockPort?: number, testId?: string, + feature: Feature = 'transcription', ) { const headers = testHeaders(testId) + const openaiTranscriptionModel = + feature === 'transcription-diarization' + ? 'gpt-4o-transcribe-diarize' + : 'whisper-1' const factories: Record any> = { openai: () => - createOpenaiTranscription('whisper-1', DUMMY_KEY, { + createOpenaiTranscription(openaiTranscriptionModel, DUMMY_KEY, { baseURL: openaiUrl(aimockPort), defaultHeaders: headers, }), diff --git a/testing/e2e/src/lib/server-functions.ts b/testing/e2e/src/lib/server-functions.ts index 03132c193..d9124bdd6 100644 --- a/testing/e2e/src/lib/server-functions.ts +++ b/testing/e2e/src/lib/server-functions.ts @@ -7,6 +7,7 @@ import { generateVideo, getVideoJobStatus, } from '@tanstack/ai' +import type { TranscriptionResponseFormat } from '@tanstack/ai' import type { Feature, Provider } from '@/lib/types' import { createAudioAdapter, @@ -77,7 +78,10 @@ export const generateTranscriptionFn = createServerFn({ method: 'POST' }) (data: { audio: string language?: string + responseFormat?: TranscriptionResponseFormat + modelOptions?: Record provider: Provider + feature?: Feature aimockPort?: number testId?: string }) => { @@ -92,11 +96,14 @@ export const generateTranscriptionFn = createServerFn({ method: 'POST' }) data.provider, data.aimockPort, data.testId, + data.feature, ) return generateTranscription({ adapter, audio: data.audio, language: data.language, + responseFormat: data.responseFormat, + modelOptions: data.modelOptions, }) }) diff --git a/testing/e2e/src/lib/types.ts b/testing/e2e/src/lib/types.ts index 018e7744f..23c0dc21b 100644 --- a/testing/e2e/src/lib/types.ts +++ b/testing/e2e/src/lib/types.ts @@ -35,6 +35,7 @@ export type Feature = | 'sound-effects' | 'tts' | 'transcription' + | 'transcription-diarization' | 'video-gen' | 'stateful-interactions' @@ -74,6 +75,7 @@ export const ALL_FEATURES: Feature[] = [ 'sound-effects', 'tts', 'transcription', + 'transcription-diarization', 'video-gen', 'stateful-interactions', ] diff --git a/testing/e2e/src/routes/$provider/$feature.tsx b/testing/e2e/src/routes/$provider/$feature.tsx index ea080c4fc..d236d2c6a 100644 --- a/testing/e2e/src/routes/$provider/$feature.tsx +++ b/testing/e2e/src/routes/$provider/$feature.tsx @@ -44,6 +44,7 @@ const MEDIA_FEATURES = new Set([ 'image-gen', 'tts', 'transcription', + 'transcription-diarization', 'video-gen', 'audio-gen', 'sound-effects', @@ -142,9 +143,11 @@ function MediaFeature({ /> ) case 'transcription': + case 'transcription-diarization': return ( provider: Provider + feature?: Feature testId?: string aimockPort?: number } - const adapter = createTranscriptionAdapter(provider, aimockPort, testId) + const adapter = createTranscriptionAdapter( + provider, + aimockPort, + testId, + feature, + ) try { const stream = generateTranscription({ adapter, audio, language, + responseFormat, + modelOptions, stream: true, }) return toHttpResponse(stream, { abortController }) diff --git a/testing/e2e/src/routes/api.transcription.ts b/testing/e2e/src/routes/api.transcription.ts index 070b29db7..063fe4369 100644 --- a/testing/e2e/src/routes/api.transcription.ts +++ b/testing/e2e/src/routes/api.transcription.ts @@ -1,7 +1,8 @@ import { createFileRoute } from '@tanstack/react-router' import { generateTranscription, toServerSentEventsResponse } from '@tanstack/ai' import { createTranscriptionAdapter } from '@/lib/media-providers' -import type { Provider } from '@/lib/types' +import type { TranscriptionResponseFormat } from '@tanstack/ai' +import type { Feature, Provider } from '@/lib/types' export const Route = createFileRoute('/api/transcription')({ server: { @@ -11,21 +12,40 @@ export const Route = createFileRoute('/api/transcription')({ const abortController = new AbortController() const body = await request.json() const data = body.forwardedProps ?? body.data ?? body - const { audio, language, provider, testId, aimockPort } = data as { + const { + audio, + language, + responseFormat, + modelOptions, + provider, + feature, + testId, + aimockPort, + } = data as { audio: string language?: string + responseFormat?: TranscriptionResponseFormat + modelOptions?: Record provider: Provider + feature?: Feature testId?: string aimockPort?: number } - const adapter = createTranscriptionAdapter(provider, aimockPort, testId) + const adapter = createTranscriptionAdapter( + provider, + aimockPort, + testId, + feature, + ) try { const stream = generateTranscription({ adapter, audio, language, + responseFormat, + modelOptions, stream: true, }) return toServerSentEventsResponse(stream, { abortController }) diff --git a/testing/e2e/tests/transcription.spec.ts b/testing/e2e/tests/transcription.spec.ts index 85822b633..faf1cd0ed 100644 --- a/testing/e2e/tests/transcription.spec.ts +++ b/testing/e2e/tests/transcription.spec.ts @@ -53,3 +53,43 @@ for (const provider of providersFor('transcription')) { }) }) } + +for (const provider of providersFor('transcription-diarization')) { + test.describe(`${provider} -- transcription-diarization`, () => { + for (const mode of ['sse', 'http-stream', 'fetcher'] as const) { + test(`${mode} -- transcribes diarized audio`, async ({ + page, + testId, + aimockPort, + }) => { + await page.goto( + featureUrl( + provider, + 'transcription-diarization', + testId, + aimockPort, + mode, + ), + ) + await clickGenerate(page) + await waitForGenerationComplete(page) + + await expect(page.getByTestId('transcription-text')).toContainText( + 'Fender Stratocaster', + ) + await expect(page.getByTestId('transcription-segments')).toContainText( + 'Welcome to the store', + ) + await expect(page.getByTestId('transcription-segments')).toContainText( + 'I need a Fender Stratocaster', + ) + await expect(page.getByTestId('transcription-speaker-0')).toHaveText( + 'agent', + ) + await expect(page.getByTestId('transcription-speaker-1')).toHaveText( + 'customer', + ) + }) + } + }) +} From 58aa20c1d08dd35d1134b5ace7fc88ed4d5e67b5 Mon Sep 17 00:00:00 2001 From: 8times4 <46720448+8times4@users.noreply.github.com> Date: Sat, 13 Jun 2026 13:08:50 +0200 Subject: [PATCH 4/4] fix coderabbit findings --- examples/ts-react-chat/src/lib/server-fns.ts | 2 +- .../src/routes/api.transcribe.ts | 2 +- .../ai-openai/src/adapters/transcription.ts | 2 +- packages/ai/src/types.ts | 1 + testing/e2e/src/lib/media-providers.ts | 26 +++++++++++++++---- testing/e2e/src/lib/server-functions.ts | 5 +++- .../src/routes/api.transcription.stream.ts | 8 +++--- testing/e2e/src/routes/api.transcription.ts | 8 +++--- 8 files changed, 35 insertions(+), 19 deletions(-) diff --git a/examples/ts-react-chat/src/lib/server-fns.ts b/examples/ts-react-chat/src/lib/server-fns.ts index 1c8109be4..ae37ad3f6 100644 --- a/examples/ts-react-chat/src/lib/server-fns.ts +++ b/examples/ts-react-chat/src/lib/server-fns.ts @@ -82,7 +82,7 @@ const TRANSCRIPTION_PROVIDER_SCHEMA = z .optional() const TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA = z - .enum(['json', 'text', 'srt', 'verbose_json', 'vtt']) + .enum(['json', 'text', 'srt', 'verbose_json', 'vtt', 'diarized_json']) .optional() const AUDIO_PROVIDER_SCHEMA = z diff --git a/examples/ts-react-chat/src/routes/api.transcribe.ts b/examples/ts-react-chat/src/routes/api.transcribe.ts index a26800547..1c04adfac 100644 --- a/examples/ts-react-chat/src/routes/api.transcribe.ts +++ b/examples/ts-react-chat/src/routes/api.transcribe.ts @@ -12,7 +12,7 @@ const TRANSCRIPTION_PROVIDER_SCHEMA = z .optional() const TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA = z - .enum(['json', 'text', 'srt', 'verbose_json', 'vtt']) + .enum(['json', 'text', 'srt', 'verbose_json', 'vtt', 'diarized_json']) .optional() const TRANSCRIBE_BODY_SCHEMA = z.object({ diff --git a/packages/ai-openai/src/adapters/transcription.ts b/packages/ai-openai/src/adapters/transcription.ts index 1e26bb5af..84f249382 100644 --- a/packages/ai-openai/src/adapters/transcription.ts +++ b/packages/ai-openai/src/adapters/transcription.ts @@ -279,7 +279,7 @@ export class OpenAITranscriptionAdapter< const request: OpenAI_SDK.Audio.TranscriptionCreateParamsNonStreaming = { model, file, - ...(modelOptions ?? {}), + ...modelOptions, } if (language !== undefined) { request.language = language diff --git a/packages/ai/src/types.ts b/packages/ai/src/types.ts index 9ed744855..ef817d96c 100644 --- a/packages/ai/src/types.ts +++ b/packages/ai/src/types.ts @@ -1715,6 +1715,7 @@ export type TranscriptionResponseFormat = | 'srt' | 'verbose_json' | 'vtt' + | 'diarized_json' export interface TranscriptionOptions< TProviderOptions extends object = object, diff --git a/testing/e2e/src/lib/media-providers.ts b/testing/e2e/src/lib/media-providers.ts index 20ad722bf..e30ae1590 100644 --- a/testing/e2e/src/lib/media-providers.ts +++ b/testing/e2e/src/lib/media-providers.ts @@ -15,11 +15,17 @@ import { createElevenLabsSpeech, createElevenLabsTranscription, } from '@tanstack/ai-elevenlabs' +import type { TranscriptionResponseFormat } from '@tanstack/ai' import type { Feature, Provider } from '@/lib/types' const LLMOCK_DEFAULT_BASE = process.env.LLMOCK_URL || 'http://127.0.0.1:4010' const DUMMY_KEY = 'sk-e2e-test-dummy-key' +type TranscriptionAdapterOptions = { + responseFormat?: TranscriptionResponseFormat + modelOptions?: Record +} + function llmockBase(aimockPort?: number): string { if (aimockPort) return `http://127.0.0.1:${aimockPort}` return LLMOCK_DEFAULT_BASE @@ -33,6 +39,19 @@ function testHeaders(testId?: string): Record | undefined { return testId ? { 'X-Test-Id': testId } : undefined } +function getOpenaiTranscriptionModel(options: TranscriptionAdapterOptions) { + const modelOptions = options.modelOptions + const isDiarizationRequest = + options.responseFormat === 'diarized_json' || + modelOptions?.response_format === 'diarized_json' || + modelOptions?.diarize === true || + modelOptions?.chunking_strategy !== undefined || + modelOptions?.known_speaker_names !== undefined || + modelOptions?.known_speaker_references !== undefined + + return isDiarizationRequest ? 'gpt-4o-transcribe-diarize' : 'whisper-1' +} + export function createImageAdapter( provider: Provider, aimockPort?: number, @@ -96,13 +115,10 @@ export function createTranscriptionAdapter( provider: Provider, aimockPort?: number, testId?: string, - feature: Feature = 'transcription', + options: TranscriptionAdapterOptions = {}, ) { const headers = testHeaders(testId) - const openaiTranscriptionModel = - feature === 'transcription-diarization' - ? 'gpt-4o-transcribe-diarize' - : 'whisper-1' + const openaiTranscriptionModel = getOpenaiTranscriptionModel(options) const factories: Record any> = { openai: () => createOpenaiTranscription(openaiTranscriptionModel, DUMMY_KEY, { diff --git a/testing/e2e/src/lib/server-functions.ts b/testing/e2e/src/lib/server-functions.ts index d9124bdd6..60144dc81 100644 --- a/testing/e2e/src/lib/server-functions.ts +++ b/testing/e2e/src/lib/server-functions.ts @@ -96,7 +96,10 @@ export const generateTranscriptionFn = createServerFn({ method: 'POST' }) data.provider, data.aimockPort, data.testId, - data.feature, + { + responseFormat: data.responseFormat, + modelOptions: data.modelOptions, + }, ) return generateTranscription({ adapter, diff --git a/testing/e2e/src/routes/api.transcription.stream.ts b/testing/e2e/src/routes/api.transcription.stream.ts index ebbe9824a..3257a4012 100644 --- a/testing/e2e/src/routes/api.transcription.stream.ts +++ b/testing/e2e/src/routes/api.transcription.stream.ts @@ -1,8 +1,8 @@ import { createFileRoute } from '@tanstack/react-router' import { generateTranscription, toHttpResponse } from '@tanstack/ai' -import { createTranscriptionAdapter } from '@/lib/media-providers' import type { TranscriptionResponseFormat } from '@tanstack/ai' -import type { Feature, Provider } from '@/lib/types' +import type { Provider } from '@/lib/types' +import { createTranscriptionAdapter } from '@/lib/media-providers' export const Route = createFileRoute('/api/transcription/stream')({ server: { @@ -18,7 +18,6 @@ export const Route = createFileRoute('/api/transcription/stream')({ responseFormat, modelOptions, provider, - feature, testId, aimockPort, } = data as { @@ -27,7 +26,6 @@ export const Route = createFileRoute('/api/transcription/stream')({ responseFormat?: TranscriptionResponseFormat modelOptions?: Record provider: Provider - feature?: Feature testId?: string aimockPort?: number } @@ -36,7 +34,7 @@ export const Route = createFileRoute('/api/transcription/stream')({ provider, aimockPort, testId, - feature, + { responseFormat, modelOptions }, ) try { diff --git a/testing/e2e/src/routes/api.transcription.ts b/testing/e2e/src/routes/api.transcription.ts index 063fe4369..f18fd6867 100644 --- a/testing/e2e/src/routes/api.transcription.ts +++ b/testing/e2e/src/routes/api.transcription.ts @@ -1,8 +1,8 @@ import { createFileRoute } from '@tanstack/react-router' import { generateTranscription, toServerSentEventsResponse } from '@tanstack/ai' -import { createTranscriptionAdapter } from '@/lib/media-providers' import type { TranscriptionResponseFormat } from '@tanstack/ai' -import type { Feature, Provider } from '@/lib/types' +import type { Provider } from '@/lib/types' +import { createTranscriptionAdapter } from '@/lib/media-providers' export const Route = createFileRoute('/api/transcription')({ server: { @@ -18,7 +18,6 @@ export const Route = createFileRoute('/api/transcription')({ responseFormat, modelOptions, provider, - feature, testId, aimockPort, } = data as { @@ -27,7 +26,6 @@ export const Route = createFileRoute('/api/transcription')({ responseFormat?: TranscriptionResponseFormat modelOptions?: Record provider: Provider - feature?: Feature testId?: string aimockPort?: number } @@ -36,7 +34,7 @@ export const Route = createFileRoute('/api/transcription')({ provider, aimockPort, testId, - feature, + { responseFormat, modelOptions }, ) try {