Error Handling
import { KugelAudio } from 'kugelaudio';
import {
KugelAudioError,
AuthenticationError,
RateLimitError,
InsufficientCreditsError,
ValidationError,
NotFoundError,
ConnectionError,
} from 'kugelaudio';
// Map each SDK error class to a user-facing message. The specific classes are
// tested first; KugelAudioError is tested last as the catch-all for any other
// SDK error.
try {
  const audio = await client.tts.generate({ text: 'Hello!' });
} catch (error) {
  if (error instanceof AuthenticationError) {
    console.error('Invalid API key');
  } else if (error instanceof RateLimitError) {
    console.error('Rate limit exceeded, please wait');
  } else if (error instanceof InsufficientCreditsError) {
    console.error('Not enough credits, please top up');
  } else if (error instanceof NotFoundError) {
    console.error('Voice, model, or dictionary not found');
  } else if (error instanceof ValidationError) {
    console.error(`Invalid request: ${error.message}`);
  } else if (error instanceof ConnectionError) {
    console.error('Failed to connect to server');
  } else if (error instanceof KugelAudioError) {
    console.error(`API error: ${error.message}`);
  } else {
    // Not an SDK error (e.g. a programming bug): rethrow instead of silently
    // swallowing it.
    throw error;
  }
}
All SDK error classes extend KugelAudioError and carry a machine-readable code. The
package also exports the ErrorCodes and WsCloseCodes constant maps for
matching specific codes when you need finer-grained handling than the classes
above.
KugelAudioOptions
/** Region selector accepted by `KugelAudioOptions.region`. */
type Region = 'eu' | 'us' | 'global';

/** Client configuration options. Only `apiKey` is required. */
interface KugelAudioOptions {
  /** Required. Can be prefixed with 'eu-' for EU. */
  apiKey: string;
  /** Treat `apiKey` as a master key (server-side). */
  isMasterKey?: boolean;
  /** Treat `apiKey` as a JWT token. */
  isToken?: boolean;
  /** Org to bill against (token auth). */
  orgId?: number;
  /** 'eu' selects the direct EU endpoint. */
  region?: Region;
  /** Default: https://api.kugelaudio.com */
  apiUrl?: string;
  /** Default: same as `apiUrl`. */
  ttsUrl?: string;
  /** Timeout in milliseconds. Default: 60000. */
  timeout?: number;
  /** Interval in milliseconds. Default: 20000; 0 or null disables. */
  keepalivePingInterval?: number | null;
}
GenerateOptions
/** Options for a single generation request. Only `text` is required. */
interface GenerateOptions {
  /** Required: text to synthesize. */
  text: string;
  /** Default: 'kugel-3'. */
  modelId?: string;
  /** Optional voice ID. */
  voiceId?: number;
  /** Default: 2.0. */
  cfgScale?: number;
  /** Sampling variance, 0.0-1.0; default ~0.5. */
  temperature?: number;
  /** Default: 2048. */
  maxNewTokens?: number;
  /** Default: 24000. */
  sampleRate?: number;
  /** 'pcm_24000' | 'ulaw_8000' | 'alaw_8000' | ... */
  outputFormat?: string;
  /** Enable text normalization. Default: true. */
  normalize?: boolean;
  /** ISO 639-1 code used for normalization (e.g., 'en', 'de'). */
  language?: string;
  /** Request word-level timestamps. Default: false. */
  wordTimestamps?: boolean;
  /** Playback speed, 0.8-1.2 (default 1.0); pitch-preserving. */
  speed?: number;
  /** Project-scoped features (master-key callers). */
  projectId?: number;
  /**
   * Per-request dictionary selection:
   * - omitted: all active dictionaries
   * - `[]`: none
   * - a list: exactly those (incl. inactive), bypassing the language filter
   */
  dictionaryIds?: number[];
}
Using normalize: true without language may cause incorrect normalizations. Always specify language when you know it.
AudioChunk
/** One chunk of streamed audio. */
interface AudioChunk {
  /**
   * Base64-encoded PCM16 audio.
   * NOTE(review): `encoding` can also report G.711 formats; confirm whether
   * this payload is PCM16 in that case.
   */
  audio: string;
  /** 'pcm_s16le' | 'mulaw' | 'alaw' (G.711 when outputFormat is set). */
  encoding: string;
  /** Chunk index (0-based). */
  index: number;
  /** Sample rate (24000). */
  sampleRate: number;
  /** Number of samples in this chunk. */
  samples: number;
}
WordTimestamp
/** Timing and alignment information for a single word. */
interface WordTimestamp {
  /** The word. */
  word: string;
  /** Start time in milliseconds. */
  startMs: number;
  /** End time in milliseconds. */
  endMs: number;
  /** Character start index in the original text. */
  charStart: number;
  /** Character end index in the original text. */
  charEnd: number;
  /** Alignment confidence score (0.0 - 1.0). */
  score: number;
}
AudioResponse
/** Complete (non-streamed) generation result. */
interface AudioResponse {
  /** Complete PCM16 audio. */
  audio: ArrayBuffer;
  /** Sample rate (24000). */
  sampleRate: number;
  /** Total number of samples. */
  samples: number;
  /** Audio duration in milliseconds. */
  durationMs: number;
  /** Generation time in milliseconds. */
  generationMs: number;
  /** Real-time factor. */
  rtf: number;
  /** Per-word timing (when `wordTimestamps: true` was requested). */
  wordTimestamps: WordTimestamp[];
}
GenerationStats
/** Final statistics for a generation; `final` is always the literal `true`. */
interface GenerationStats {
  final: true;
  /** Number of chunks generated. */
  chunks: number;
  /** Total samples generated. */
  totalSamples: number;
  /** Audio duration in ms. */
  durationMs: number;
  /** Generation time in ms. */
  generationMs: number;
  /** Real-time factor. */
  rtf: number;
  /** Per-request usage (audio time + charge); undefined if not reported. */
  usage?: SessionUsage;
}
SessionUsage
Per-conversation usage for billing your own customers. Available on StreamingSession.lastUsage (per session), MultiContextSession.usageFor(...)
and the onContextClosed callback (per context), and GenerationStats.usage
(per one-shot stream() request).
/** Usage record: audio time generated plus the charge, when known. */
interface SessionUsage {
  /** Audio generated, in seconds (the unit billing is based on). */
  audioSeconds: number;
  /** Actual charge in EUR cents; null if undetermined. */
  costCents: number | null;
  /**
   * Currency of `costCents` ('eur').
   * NOTE(review): the source comment is truncated ("set only when costCents
   * is"); presumably set only when `costCents` is non-null - confirm.
   */
  currency?: string;
  /** Input characters; omitted on multi-context per-context usage. */
  characters?: number;
  /** Model that produced the audio. */
  modelId?: string;
  /** true when an authoritative charge was returned. */
  costAvailable: boolean;
}
costCents is null (and costAvailable is false) when the charge cannot
be determined at session end — e.g. a transient billing error or an internal
session. It is never a misleading 0. audioSeconds is always reported.
StreamCallbacks
Used with the one-shot client.tts.stream() endpoint:
/** Optional event handlers for a one-shot stream; every callback may be omitted. */
interface StreamCallbacks {
  onOpen?: () => void;
  /** Receives each audio chunk. */
  onChunk?: (chunk: AudioChunk) => void;
  /** Receives word-level timestamps. */
  onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
  /** Receives the final generation statistics. */
  onFinal?: (stats: GenerationStats) => void;
  onError?: (error: Error) => void;
  onClose?: () => void;
}
StreamConfig
Configuration for client.tts.streamingSession() (LLM integration endpoint):
/** Streaming-session configuration; every field is optional. */
interface StreamConfig {
  voiceId?: number;
  /** Default: 'kugel-3'. */
  modelId?: string;
  cfgScale?: number;
  /** Sampling variance, 0.0-1.0; default ~0.5. */
  temperature?: number;
  maxNewTokens?: number;
  sampleRate?: number;
  /** Combined codec + rate token. */
  outputFormat?: string;
  flushTimeoutMs?: number;
  maxBufferLength?: number;
  normalize?: boolean;
  /** ISO 639-1 code; specify it to avoid auto-detect latency. */
  language?: string;
  wordTimestamps?: boolean;
  /**
   * Minimum buffer sizes (in characters) before each successive chunk is
   * auto-emitted. Smaller values lower TTFA; larger values give better
   * prosody context.
   * Default: [5, 80, 150, 250]
   */
  chunkLengthSchedule?: number[];
  /**
   * When true, generation starts at the very first clean sentence boundary.
   * Equivalent to ElevenLabs auto_mode=true. Lowest possible TTFA.
   */
  autoMode?: boolean;
  /** Playback speed, 0.8-1.2 (default 1.0); pitch-preserving. */
  speed?: number;
  /** Per-session dictionary selection (see GenerateOptions). */
  dictionaryIds?: number[];
}
StreamingSessionCallbacks
/** Optional event handlers for a streaming session; every callback may be omitted. */
interface StreamingSessionCallbacks {
  onChunk?: (chunk: AudioChunk) => void;
  onChunkComplete?: (chunkId: number, audioSeconds: number, genMs: number) => void;
  /**
   * End of audio for the turn (ElevenLabs isFinal equivalent). Fires after
   * the last audio frame, right before onSessionClosed. Not fired on barge-in.
   */
  onFinal?: (
    totalAudioSeconds: number,
    totalTextChunks: number,
    totalAudioChunks: number,
  ) => void;
  onSessionClosed?: (
    totalAudioSeconds: number,
    totalTextChunks: number,
    totalAudioChunks: number,
  ) => void;
  onGenerationStarted?: (chunkId: number, text: string) => void;
  onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
  onError?: (error: Error) => void;
}
Model
/** Description of an available model. */
interface Model {
  /** Model identifier, e.g. 'kugel-3'. */
  id: string;
  /** Human-readable name. */
  name: string;
  /** Model description. */
  description: string;
  /** Parameter-count label (e.g. '7B'). */
  parameters: string;
  /** Maximum number of input characters. */
  maxInputLength: number;
  /** Output sample rate. */
  sampleRate: number;
}
VoiceListResponse
Paginated response from voices.list():
/** One page of voices plus the paging values that produced it. */
interface VoiceListResponse {
  /** Voices on this page. */
  voices: Voice[];
  /** Total number of matching voices. */
  total: number;
  /** Page size used. */
  limit: number;
  /** Offset used. */
  offset: number;
}
Voice
/** Category labels a voice can carry. */
type VoiceCategory =
  | 'premade'
  | 'cloned'
  | 'designed'
  | 'conversational'
  | 'narrative'
  | 'narrative_story'
  | 'characters';

type VoiceSex = 'male' | 'female' | 'neutral';

type VoiceAge = 'young' | 'middle_aged' | 'old';

type VoiceQuality = 'low' | 'mid' | 'high';

/** Summary information about a voice. */
interface Voice {
  /** Voice ID. */
  id: number;
  /** Voice name. */
  name: string;
  /** Description. */
  description?: string;
  category?: VoiceCategory;
  sex?: VoiceSex;
  age?: VoiceAge;
  /** Language codes, e.g. ['en', 'de', ...]. */
  supportedLanguages: string[];
  /** Text used for sample generation. */
  sampleText?: string;
  /** Avatar image URL. */
  avatarUrl?: string;
  /** Sample audio URL. */
  sampleUrl?: string;
  isPublic: boolean;
  verified: boolean;
}
VoiceDetail
/**
 * Extended voice information (returned by create, update, get, publish,
 * generateSample).
 */
interface VoiceDetail {
  id: number;
  name: string;
  description: string;
  generativeVoiceDescription: string;
  supportedLanguages: string[];
  category: string;
  age?: string;
  sex?: string;
  /** 'low' | 'mid' | 'high' */
  quality: string;
  isPublic: boolean;
  verified: boolean;
  pendingVerification: boolean;
  sampleUrl?: string;
  avatarUrl?: string;
  sampleText: string;
}
VoiceReference
/**
 * A reference entry belonging to a voice (linked via `voiceId`).
 * Only `audioUrl` is optional.
 */
interface VoiceReference {
  id: number;
  voiceId: number;
  name: string;
  referenceText: string;
  s3Path: string;
  audioUrl?: string;
  isGenerated: boolean;
}
CreateVoiceOptions
/** Input for creating a voice. `name` and `sex` are required; the rest is optional. */
interface CreateVoiceOptions {
  name: string;
  sex: string;
  description?: string;
  category?: string;
  age?: string;
  quality?: string;
  supportedLanguages?: string[];
  isPublic?: boolean;
  sampleText?: string;
  /** Reference files, supplied as `File` or `Blob` objects. */
  referenceFiles?: Array<File | Blob>;
}
UpdateVoiceOptions
/** Input for updating a voice. Every field is optional. */
interface UpdateVoiceOptions {
  name?: string;
  description?: string;
  category?: string;
  age?: string;
  sex?: string;
  quality?: string;
  supportedLanguages?: string[];
  isPublic?: boolean;
  sampleText?: string;
}
Dictionary types live on the Dictionaries page; multi-context types live on the Streaming Sessions page.