TanStack · 8times4 · May 27, 2026 · May 28, 2026 · Jun 12, 2026 · Jun 13, 2026
diff --git a/.changeset/openai-transcription-diarization.md b/.changeset/openai-transcription-diarization.md
@@ -0,0 +1,7 @@
+---
+'@tanstack/ai': minor
+'@tanstack/ai-client': minor
+'@tanstack/ai-openai': minor
+---
+
+Add OpenAI transcription diarization support with `diarized_json` output, speaker-labeled segments, diarization model validation, chunking strategy options, and docs.
diff --git a/docs/adapters/openai.md b/docs/adapters/openai.md
@@ -298,17 +298,43 @@ console.log(result.text); // Transcribed text
 const result = await generateTranscription({
   adapter: openaiTranscription("whisper-1"),
   audio: audioFile,
+  responseFormat: "verbose_json",
+  prompt: "Technical terms: API, SDK",
   modelOptions: {
-    response_format: "verbose_json", // Get timestamps
     temperature: 0,
-    prompt: "Technical terms: API, SDK",
+    timestamp_granularities: ["word", "segment"],
   },
 });
 
 // Access segments with timestamps
 console.log(result.segments);
 ```
 
+### Speaker Diarization
+
+Use `gpt-4o-transcribe-diarize` for speaker-labeled transcripts:
+
+```typescript
+const result = await generateTranscription({
+  adapter: openaiTranscription("gpt-4o-transcribe-diarize"),
+  audio: meetingAudioFile,
+  modelOptions: {
+    chunking_strategy: "auto",
+    known_speaker_names: ["agent", "customer"],
+    known_speaker_references: [
+      "data:audio/wav;base64,...",
+      "data:audio/wav;base64,...",
+    ],
+  },
+});
+
+for (const segment of result.segments ?? []) {
+  console.log(segment.speaker, segment.start, segment.end, segment.text);
+}
+```
+
+`gpt-4o-transcribe-diarize` defaults to `modelOptions.response_format: "diarized_json"` and `chunking_strategy: "auto"`. OpenAI does not support `prompt`, `include`, or `timestamp_granularities` with diarized transcription.
+
 ## Environment Variables
 
 Set your API key in environment variables:
@@ -357,7 +383,7 @@ Creates an OpenAI text-to-speech adapter.
 
 ### `openaiTranscription(model, config?)` / `createOpenaiTranscription(model, apiKey, config?)`
 
-Creates an OpenAI transcription adapter (Whisper).
+Creates an OpenAI transcription adapter for Whisper, GPT-4o transcription, and GPT-4o diarized transcription models.
 
 ### `openaiVideo(model, config?)` / `createOpenaiVideo(model, apiKey, config?)`
 

diff --git a/docs/comparison/vercel-ai-sdk.md b/docs/comparison/vercel-ai-sdk.md
@@ -409,7 +409,7 @@ const result = await generateSpeech({
 })
 ```
 
-**Transcription** - `generateTranscription()` supports 5 output formats (json, text, srt, verbose_json, vtt), word-level timestamps with confidence scores, and four providers (OpenAI, Grok, ElevenLabs, fal.ai), with speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model.
+**Transcription** - `generateTranscription()` supports common output formats (json, text, srt, verbose_json, vtt), word-level timestamps with confidence scores, and four providers (OpenAI, Grok, ElevenLabs, fal.ai), with speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model.
 
 ```ts
 import { generateTranscription } from '@tanstack/ai'

diff --git a/docs/media/generation-hooks.md b/docs/media/generation-hooks.md
@@ -214,7 +214,7 @@ The `generate` function accepts a `TranscriptionGenerateInput`:
 | `audio` | `string \| File \| Blob \| ArrayBuffer` | Audio data -- base64 string, File, Blob, or ArrayBuffer (required) |
 | `language` | `string` | Language in ISO-639-1 format (e.g., `"en"`) |
 | `prompt` | `string` | Optional prompt to guide the transcription |
-| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt'` | Output format |
+| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt'` | Common output format |
 | `modelOptions` | `Record<string, any>` | Model-specific options |
 
 ## useSummarize

diff --git a/docs/media/transcription.md b/docs/media/transcription.md
@@ -2,7 +2,7 @@
 title: Transcription
 id: transcription
 order: 4
-description: "Transcribe audio to text with OpenAI Whisper and GPT-4o-transcribe via TanStack AI's generateTranscription() API."
+description: "Transcribe audio to text with OpenAI Whisper and GPT-4o transcription models, including speaker diarization, via TanStack AI's generateTranscription() API."
 keywords:
   - tanstack ai
   - transcription
@@ -22,7 +22,7 @@ TanStack AI provides support for audio transcription (speech-to-text) through de
 Audio transcription is handled by transcription adapters that follow the same tree-shakeable architecture as other adapters in TanStack AI.
 
 Currently supported:
-- **OpenAI**: Whisper-1, GPT-4o-transcribe, GPT-4o-mini-transcribe
+- **OpenAI**: Whisper-1, GPT-4o-transcribe, GPT-4o-mini-transcribe, GPT-4o-transcribe-diarize
 - **fal.ai**: Whisper, Wizper, speech-to-text turbo, ElevenLabs speech-to-text
 
 ## Basic Usage
@@ -104,6 +104,8 @@ for (const segment of result.segments ?? []) {
 |--------|------|-------------|
 | `audio` | `File \| string` | Audio data (File object or base64 string) - required |
 | `language` | `string` | Language code (e.g., "en", "es", "fr") |
+| `prompt` | `string` | Optional prompt to guide transcription style or terms. Not supported with `gpt-4o-transcribe-diarize`. |
+| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt'` | Common output format |
 
 ### Supported Languages
 
@@ -136,15 +138,20 @@ const result = await generateTranscription({
   prompt: 'Technical terms: API, SDK, CLI', // Top-level: guide transcription
   modelOptions: {
     temperature: 0, // Lower = more deterministic (provider option)
+    timestamp_granularities: ['word', 'segment'],
   },
 })
 ```
 
 | Option | Type | Description |
 |--------|------|-------------|
 | `temperature` | `number` | Sampling temperature (0 to 1) |
-| `timestamp_granularities` | `Array<'word' \| 'segment'>` | Timestamp granularity to populate (requires top-level `responseFormat: 'verbose_json'`) |
+| `timestamp_granularities` | `Array<'word' \| 'segment'>` | Timestamp granularity to populate (`whisper-1` only; requires top-level `responseFormat: 'verbose_json'`) |
 | `include` | `string[]` | Additional values to include in the response (e.g., `logprobs`) |
+| `response_format` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt' \| 'diarized_json'` | Raw OpenAI response format. Use `diarized_json` here for speaker-labeled diarization output. |
+| `chunking_strategy` | `'auto' \| { type: 'server_vad', ... } \| null` | Audio chunking strategy for `gpt-4o-transcribe-diarize`; required by OpenAI for diarization inputs longer than 30 seconds |
+| `known_speaker_names` | `string[]` | Up to four speaker labels for diarization |
+| `known_speaker_references` | `string[]` | Audio samples as data URLs, each 2-10 seconds long, one for each entry in `known_speaker_names` |
 
 > `responseFormat` and `prompt` are **top-level** options on `generateTranscription`, not `modelOptions` keys.
 
@@ -158,6 +165,33 @@ const result = await generateTranscription({
 | `verbose_json` | Detailed JSON with timestamps and segments |
 | `vtt` | WebVTT subtitle format |
 
+OpenAI's `gpt-4o-transcribe-diarize` also supports `modelOptions.response_format: 'diarized_json'` for speaker-labeled segments.
+
+### Speaker Diarization
+
+Use `gpt-4o-transcribe-diarize` when you need speaker labels. TanStack AI defaults this model to `modelOptions.response_format: 'diarized_json'` and sends `chunking_strategy: 'auto'` unless you provide a chunking strategy yourself.
+
+```typescript
+const result = await generateTranscription({
+  adapter: openaiTranscription('gpt-4o-transcribe-diarize'),
+  audio: meetingAudioFile,
+  modelOptions: {
+    chunking_strategy: 'auto',
+    known_speaker_names: ['agent', 'customer'],
+    known_speaker_references: [
+      'data:audio/wav;base64,...',
+      'data:audio/wav;base64,...',
+    ],
+  },
+})
+
+for (const segment of result.segments ?? []) {
+  console.log(segment.speaker, segment.start, segment.end, segment.text)
+}
+```
+
+OpenAI accepts up to four known speaker references. The diarization model does not support `prompt`, `include`, or `timestamp_granularities`; the adapter rejects those combinations before making the API request.
+
 ## Response Format
 
 The transcription result includes:
@@ -541,5 +575,6 @@ const adapter = createOpenaiTranscription('whisper-1', 'your-openai-api-key')
 
 5. **Prompting**: Use the `prompt` option to provide context or expected vocabulary (e.g., technical terms, names).
 
-6. **Timestamps**: Request `verbose_json` format and enable `timestamp_granularities: ['word', 'segment']` when you need timing information for captions or synchronization.
+6. **Timestamps**: Request `responseFormat: 'verbose_json'` and set `modelOptions.timestamp_granularities` when you need timing information for captions or synchronization.
 
+7. **Diarization**: Use `gpt-4o-transcribe-diarize` for multi-speaker audio; it defaults to `modelOptions.response_format: 'diarized_json'`, which returns speaker-labeled segments. Keep `chunking_strategy: 'auto'` unless you need custom VAD tuning.
diff --git a/examples/ts-react-chat/src/lib/audio-providers.ts b/examples/ts-react-chat/src/lib/audio-providers.ts
@@ -6,6 +6,8 @@
  * and audio generation flows.
  */
 
+import type { TranscriptionGenerateInput } from '@tanstack/ai-client'
+
 export type SpeechProviderId =
   | 'openai'
   | 'gemini'
@@ -87,13 +89,22 @@ export const SPEECH_PROVIDERS: ReadonlyArray<SpeechProviderConfig> = [
   },
 ]
 
-export type TranscriptionProviderId = 'openai' | 'fal' | 'grok' | 'elevenlabs'
+export type TranscriptionProviderId =
+  | 'openai'
+  | 'openai-diarize'
+  | 'fal'
+  | 'grok'
+  | 'elevenlabs'
 
 export interface TranscriptionProviderConfig {
   id: TranscriptionProviderId
   label: string
   model: string
   description: string
+  transcriptionOptions?: Pick<
+    TranscriptionGenerateInput,
+    'responseFormat' | 'modelOptions'
+  >
 }
 
 export const TRANSCRIPTION_PROVIDERS: ReadonlyArray<TranscriptionProviderConfig> =
@@ -104,6 +115,19 @@ export const TRANSCRIPTION_PROVIDERS: ReadonlyArray<TranscriptionProviderConfig>
       model: 'whisper-1',
       description: 'OpenAI Whisper transcription with optional streaming.',
     },
+    {
+      id: 'openai-diarize',
+      label: 'OpenAI Diarize',
+      model: 'gpt-4o-transcribe-diarize',
+      description:
+        'OpenAI diarized transcription with speaker-labeled segments.',
+      transcriptionOptions: {
+        modelOptions: {
+          response_format: 'diarized_json',
+          chunking_strategy: 'auto',
+        },
+      },
+    },
     {
       id: 'fal',
       label: 'Fal Whisper',

diff --git a/examples/ts-react-chat/src/lib/server-audio-adapters.ts b/examples/ts-react-chat/src/lib/server-audio-adapters.ts
@@ -65,6 +65,8 @@ export function buildTranscriptionAdapter(
   switch (config.id) {
     case 'openai':
       return openaiTranscription(config.model as 'whisper-1')
+    case 'openai-diarize':
+      return openaiTranscription(config.model as 'gpt-4o-transcribe-diarize')
     case 'fal':
       return falTranscription(config.model)
     case 'grok':

diff --git a/examples/ts-react-chat/src/lib/server-fns.ts b/examples/ts-react-chat/src/lib/server-fns.ts
@@ -78,7 +78,11 @@ const SPEECH_PROVIDER_SCHEMA = z
   .optional()
 
 const TRANSCRIPTION_PROVIDER_SCHEMA = z
-  .enum(['openai', 'fal', 'grok', 'elevenlabs'])
+  .enum(['openai', 'openai-diarize', 'fal', 'grok', 'elevenlabs'])
+  .optional()
+
+const TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA = z
+  .enum(['json', 'text', 'srt', 'verbose_json', 'vtt', 'diarized_json'])
   .optional()
 
 const AUDIO_PROVIDER_SCHEMA = z
@@ -144,6 +148,8 @@ export const transcribeFn = createServerFn({ method: 'POST' })
     z.object({
       audio: z.string(),
       language: z.string().optional(),
+      responseFormat: TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA,
+      modelOptions: z.record(z.string(), z.any()).optional(),
       provider: TRANSCRIPTION_PROVIDER_SCHEMA,
     }),
   )
@@ -162,6 +168,8 @@ export const transcribeFn = createServerFn({ method: 'POST' })
       adapter,
       audio: data.audio,
       language: data.language,
+      responseFormat: data.responseFormat,
+      modelOptions: data.modelOptions,
     })
   })
 
@@ -316,6 +324,8 @@ export const transcribeStreamFn = createServerFn({ method: 'POST' })
     z.object({
       audio: z.string(),
       language: z.string().optional(),
+      responseFormat: TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA,
+      modelOptions: z.record(z.string(), z.any()).optional(),
       provider: TRANSCRIPTION_PROVIDER_SCHEMA,
     }),
   )
@@ -335,6 +345,8 @@ export const transcribeStreamFn = createServerFn({ method: 'POST' })
         adapter,
         audio: data.audio,
         language: data.language,
+        responseFormat: data.responseFormat,
+        modelOptions: data.modelOptions,
         stream: true,
       }),
     )

diff --git a/examples/ts-react-chat/src/routes/api.transcribe.ts b/examples/ts-react-chat/src/routes/api.transcribe.ts
@@ -8,12 +8,18 @@ import {
 } from '../lib/server-audio-adapters'
 
 const TRANSCRIPTION_PROVIDER_SCHEMA = z
-  .enum(['openai', 'fal', 'grok', 'elevenlabs'])
+  .enum(['openai', 'openai-diarize', 'fal', 'grok', 'elevenlabs'])
+  .optional()
+
+const TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA = z
+  .enum(['json', 'text', 'srt', 'verbose_json', 'vtt', 'diarized_json'])
   .optional()
 
 const TRANSCRIBE_BODY_SCHEMA = z.object({
   audio: z.string().min(1),
   language: z.string().optional(),
+  responseFormat: TRANSCRIPTION_RESPONSE_FORMAT_SCHEMA,
+  modelOptions: z.record(z.string(), z.any()).optional(),
   provider: TRANSCRIPTION_PROVIDER_SCHEMA,
 })
 
@@ -55,7 +61,8 @@ export const Route = createFileRoute('/api/transcribe')({
           })
         }
 
-        const { audio, language, provider } = parsed.data
+        const { audio, language, responseFormat, modelOptions, provider } =
+          parsed.data
 
         try {
           const adapter = buildTranscriptionAdapter(provider ?? 'openai')
@@ -64,6 +71,8 @@ export const Route = createFileRoute('/api/transcribe')({
             adapter,
             audio,
             language,
+            responseFormat,
+            modelOptions,
             stream: true,
           })
 

diff --git a/examples/ts-react-chat/src/routes/generations.transcription.tsx b/examples/ts-react-chat/src/routes/generations.transcription.tsx
@@ -34,6 +34,8 @@ function TranscriptionForm({
             data: {
               audio: input.audio as string,
               language: input.language,
+              responseFormat: input.responseFormat,
+              modelOptions: input.modelOptions,
               provider: config.id,
             },
           }),
@@ -45,6 +47,8 @@ function TranscriptionForm({
           data: {
             audio: input.audio as string,
             language: input.language,
+            responseFormat: input.responseFormat,
+            modelOptions: input.modelOptions,
             provider: config.id,
           },
         }),
@@ -75,7 +79,11 @@ function TranscriptionUI({
     )
     const dataUrl = `data:${file.type};base64,${base64}`
 
-    await generate({ audio: dataUrl, language: 'en' })
+    await generate({
+      audio: dataUrl,
+      language: 'en',
+      ...config.transcriptionOptions,
+    })
 
     if (fileInputRef.current) {
       fileInputRef.current.value = ''
@@ -159,6 +167,11 @@ function TranscriptionUI({
                     <span className="text-gray-500 font-mono whitespace-nowrap">
                       {seg.start.toFixed(1)}s - {seg.end.toFixed(1)}s
                     </span>
+                    {seg.speaker && (
+                      <span className="text-orange-300 font-medium whitespace-nowrap">
+                        {seg.speaker}
+                      </span>
+                    )}
                     <span className="text-white">{seg.text}</span>
                   </div>
                 ))}

diff --git a/knip.json b/knip.json
@@ -13,7 +13,6 @@
     "packages/ai-openai/live-tests/**",
     "packages/ai-openai/src/**/*.test.ts",
     "packages/ai-openai/src/audio/audio-provider-options.ts",
-    "packages/ai-openai/src/audio/transcribe-provider-options.ts",
     "packages/ai-openai/src/image/image-provider-options.ts",
     "packages/ai-devtools/src/production.ts",
     "codemods/**/__testfixtures__/**"

diff --git a/packages/ai-client/src/generation-types.ts b/packages/ai-client/src/generation-types.ts
@@ -1,4 +1,5 @@
 import type { StreamChunk } from '@tanstack/ai/client'
+import type { TranscriptionResponseFormat } from '@tanstack/ai'
 import type { ConnectConnectionAdapter } from './connection-adapters'
 import type { AIDevtoolsClientMetadata } from './devtools'
 import type {
@@ -265,7 +266,7 @@ export interface TranscriptionGenerateInput {
   /** An optional prompt to guide the transcription */
   prompt?: string
   /** The format of the transcription output */
-  responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt'
+  responseFormat?: TranscriptionResponseFormat
   /** Model-specific options */
   modelOptions?: Record<string, any>
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -409,7 +409,7 @@ const result = await generateSpeech({ @@
     })
     ```
-    **Transcription** - `generateTranscription()` supports 5 output formats (json, text, srt, verbose_json, vtt), word-level timestamps with confidence scores, and four providers (OpenAI, Grok, ElevenLabs, fal.ai), with speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model.
+    **Transcription** - `generateTranscription()` supports common output formats (json, text, srt, verbose_json, vtt), word-level timestamps with confidence scores, and four providers (OpenAI, Grok, ElevenLabs, fal.ai), with speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model.
     ```ts
     import { generateTranscription } from '@tanstack/ai'
@@ Expand Down @@