Installation
Add the dependency to your pom.xml:
Copy
<dependency>
<groupId>com.kugelaudio</groupId>
<artifactId>kugelaudio</artifactId>
<version>0.1.0</version>
</dependency>
Copy
implementation 'com.kugelaudio:kugelaudio:0.1.0'
Quick Start
Copy
import com.kugelaudio.sdk.KugelAudio;
import com.kugelaudio.sdk.KugelAudioOptions;
import com.kugelaudio.sdk.GenerateRequest;
import com.kugelaudio.sdk.AudioResponse;
KugelAudio client = new KugelAudio(
KugelAudioOptions.builder("your_api_key").build()
);
AudioResponse audio = client.tts().generate(
GenerateRequest.builder("Hello, world!")
.modelId("kugel-1-turbo")
.language("en")
.build()
);
audio.saveWav(java.nio.file.Path.of("output.wav"));
client.close();
Pre-connecting for Low Latency
By default, new KugelAudio(options) immediately starts a WebSocket connection in the background. This means the connection overhead (~300–500ms) is absorbed at startup rather than on the first request.
Copy
// Connection starts in background automatically (autoConnect = true by default)
KugelAudio client = new KugelAudio(
KugelAudioOptions.builder("your_api_key").build()
);
// If you need to guarantee the connection is ready before the first request:
client.connect();
System.out.println("Connected: " + client.isConnected());
// Or use the blocking factory method:
KugelAudio client = KugelAudio.createConnected(
KugelAudioOptions.builder("your_api_key").build()
);
Without pre-connecting, the first TTS request includes WebSocket connection setup (~300–500ms).
Subsequent requests reuse the connection and are fast (~100–150ms TTFA).
The default autoConnect = true moves this overhead to client construction.
Client Configuration
Copy
import com.kugelaudio.sdk.KugelAudio;
import com.kugelaudio.sdk.KugelAudioOptions;
import java.time.Duration;
// Simple setup — reads KUGELAUDIO_API_KEY environment variable
KugelAudio client = KugelAudio.fromEnv();
// Full configuration
KugelAudio client = new KugelAudio(
KugelAudioOptions.builder("your_api_key")
.apiUrl("https://api.kugelaudio.com") // REST + WebSocket base URL
.timeout(Duration.ofSeconds(60)) // HTTP request timeout
.autoConnect(true) // Pre-connect WebSocket (default: true)
.build()
);
Local Development
Copy
KugelAudio client = new KugelAudio(
KugelAudioOptions.builder("your_api_key")
.apiUrl("http://localhost:8000")
.build()
);
Copy
KugelAudio client = new KugelAudio(
KugelAudioOptions.builder("your_api_key")
.apiUrl("http://localhost:8001") // Backend for REST API
.ttsUrl("http://localhost:8000") // TTS server for WebSocket streaming
.build()
);
## Text-to-Speech
### Basic Generation
Generate complete audio and receive it all at once:
```java
import com.kugelaudio.sdk.GenerateRequest;
import com.kugelaudio.sdk.AudioResponse;
AudioResponse audio = client.tts().generate(
GenerateRequest.builder("Hello, this is a test of the KugelAudio text-to-speech system.")
.modelId("kugel-1-turbo") // 'kugel-1-turbo' (fast) or 'kugel-1' (quality)
.voiceId(123) // Optional: specific voice ID
.cfgScale(2.0) // Guidance scale (1.0-5.0)
.maxNewTokens(2048)
.sampleRate(24000)
.normalize(true) // Enable text normalization (default)
.language("en") // Language for normalization (see below)
.wordTimestamps(false)
.build()
);
// Audio properties
System.out.printf("Duration: %.2fms%n", audio.getDurationMs());
System.out.println("Samples: " + audio.getTotalSamples());
System.out.println("Sample rate: " + audio.getSampleRate() + " Hz");
System.out.printf("Generation time: %.0fms%n", audio.getGenerationMs());
System.out.printf("RTF: %.2f%n", audio.getRtf());
// Save to WAV file
audio.saveWav(java.nio.file.Path.of("output.wav"));
// Get raw PCM16 bytes (signed 16-bit little-endian, mono)
byte[] pcmData = audio.getAudio();
// Get WAV bytes with header (in-memory)
byte[] wavBytes = audio.toWavBytes();
// Get normalised float samples [-1.0, 1.0]
float[] floatData = audio.toFloat32();
```
### Streaming Audio
Receive audio chunks as they are generated for lower latency:
Copy
import com.kugelaudio.sdk.GenerateRequest;
import com.kugelaudio.sdk.StreamCallbacks;
import com.kugelaudio.sdk.AudioChunk;
client.tts().stream(
GenerateRequest.builder("Hello, this is streaming audio.")
.modelId("kugel-1-turbo")
.language("en")
.build(),
new StreamCallbacks() {
@Override
public void onChunk(AudioChunk chunk) {
System.out.printf("Chunk %d: %d bytes, %d samples%n",
chunk.getIndex(), chunk.getAudio().length, chunk.getSamples());
// playAudio(chunk.getAudio());
}
@Override
public void onComplete(AudioResponse response) {
System.out.printf("Total duration: %.0fms%n", response.getDurationMs());
System.out.printf("Generation time: %.0fms%n", response.getGenerationMs());
}
@Override
public void onError(com.kugelaudio.sdk.KugelAudioException error) {
System.err.println("TTS error: " + error.getMessage());
}
}
);
## Text Normalization
Text normalization converts numbers, dates, times, and other non-verbal text into spoken words:
- "I have 3 apples" → "I have three apples"
- "The meeting is at 2:30 PM" → "The meeting is at two thirty PM"
- "€50.99" → "fifty euros and ninety-nine cents"
```java
// With explicit language (recommended - fastest)
AudioResponse audio = client.tts().generate(
GenerateRequest.builder("I bought 3 items for €50.99 on 01/15/2024.")
.normalize(true)
.language("en") // Specify language for best performance
.build()
);
// With auto-detection (may cause incorrect normalizations)
AudioResponse audio = client.tts().generate(
GenerateRequest.builder("Ich habe 3 Artikel für 50,99€ gekauft.")
.normalize(true)
// language not set - will auto-detect
.build()
);
```
### Supported Languages
| Code | Language | Code | Language |
|---|---|---|---|
| de | German | nl | Dutch |
| en | English | pl | Polish |
| fr | French | sv | Swedish |
| es | Spanish | da | Danish |
| it | Italian | no | Norwegian |
| pt | Portuguese | fi | Finnish |
| cs | Czech | hu | Hungarian |
| ro | Romanian | el | Greek |
| uk | Ukrainian | bg | Bulgarian |
| tr | Turkish | vi | Vietnamese |
| ar | Arabic | hi | Hindi |
| zh | Chinese | ja | Japanese |
| ko | Korean | | |
Using .normalize(true) without .language(...) may cause incorrect normalizations, especially for short texts or languages that share similar vocabulary. Always specify language when you know it.
Spell Tags
Use <spell> tags to spell out text letter by letter — useful for email addresses, codes, and acronyms:
Copy
// Spell out an email address
AudioResponse audio = client.tts().generate(
GenerateRequest.builder("Contact me at <spell>kajo@kugelaudio.com</spell>")
.normalize(true)
.language("en")
.build()
);
// Output: "Contact me at K, A, J, O, at, K, U, G, E, L, A, U, D, I, O, dot, C, O, M"
Special Characters: Characters like @, ., and - are translated to language-specific words.
For example, @ becomes “at” in English, “ät” in German, and “arobase” in French.
Model recommendation: For clearer letter-by-letter pronunciation, use .modelId("kugel-1") instead of kugel-1-turbo.
Word Timestamps
Request word-level time alignments alongside audio for subtitle synchronization, lip-sync, or barge-in handling.
With Generate
Copy
import com.kugelaudio.sdk.WordTimestamp;
AudioResponse audio = client.tts().generate(
GenerateRequest.builder("Hello, how are you today?")
.modelId("kugel-1-turbo")
.language("en")
.wordTimestamps(true)
.build()
);
for (WordTimestamp ts : audio.getWordTimestamps()) {
System.out.printf("%s: %dms - %dms (score: %.2f)%n",
ts.getWord(), ts.getStartMs(), ts.getEndMs(), ts.getScore());
}
// Hello: 0ms - 320ms (score: 0.98)
// how: 350ms - 480ms (score: 0.95)
With Streaming
Copy
client.tts().stream(
GenerateRequest.builder("Hello, how are you today?")
.modelId("kugel-1-turbo")
.language("en")
.wordTimestamps(true)
.build(),
new StreamCallbacks() {
@Override
public void onChunk(AudioChunk chunk) {
playAudio(chunk.getAudio());
}
@Override
public void onWordTimestamps(java.util.List<WordTimestamp> timestamps) {
for (WordTimestamp ts : timestamps) {
System.out.printf("%s: %dms-%dms%n",
ts.getWord(), ts.getStartMs(), ts.getEndMs());
}
}
}
);
Word timestamps add no extra audio latency. They arrive ~50–200ms after the corresponding audio chunk.
LLM Integration: Streaming Sessions
For real-time TTS when streaming text from an LLM (such as GPT-4 or Claude):
Copy
import com.kugelaudio.sdk.StreamConfig;
import com.kugelaudio.sdk.StreamCallbacks;
import com.kugelaudio.sdk.StreamingSession;
StreamConfig config = StreamConfig.builder()
.voiceId(123)
.modelId("kugel-1-turbo")
.language("en")
.flushTimeoutMs(500) // Auto-flush after 500ms of no input
.build();
// Simulate LLM token stream
String[] tokens = {"Hello, ", "this ", "is ", "a ", "streamed ", "response."};
try (StreamingSession session = client.streamingSession(config, new StreamCallbacks() {
@Override
public void onChunk(AudioChunk chunk) {
playAudio(chunk.getAudio());
}
@Override
public void onComplete(AudioResponse response) {
System.out.printf("Done: %.0fms%n", response.getDurationMs());
}
})) {
// Send tokens as they arrive from LLM
for (String token : tokens) {
session.send(token);
}
// Flush any remaining text to trigger generation
session.flush();
}
Multi-Context Sessions
Generate audio for multiple speakers or contexts concurrently over a single WebSocket connection:
Copy
import com.kugelaudio.sdk.MultiContextConfig;
import com.kugelaudio.sdk.MultiContextSession;
import com.kugelaudio.sdk.MultiContextCallbacks;
import com.kugelaudio.sdk.CreateContextOptions;
import com.kugelaudio.sdk.AudioChunk;
MultiContextConfig config = MultiContextConfig.builder()
.language("en")
.sampleRate(24000)
.build();
try (MultiContextSession session = client.multiContextSession(config)) {
session.connect(new MultiContextCallbacks() {
@Override
public void onChunk(String contextId, AudioChunk chunk) {
System.out.printf("[%s] Chunk %d: %d samples%n",
contextId, chunk.getIndex(), chunk.getSamples());
playAudio(contextId, chunk.getAudio());
}
@Override
public void onContextComplete(String contextId) {
System.out.println("[" + contextId + "] Generation complete");
}
@Override
public void onError(String contextId, com.kugelaudio.sdk.KugelAudioException error) {
System.err.println("[" + contextId + "] Error: " + error.getMessage());
}
});
// Create contexts for different speakers
session.createContext("speaker-a", CreateContextOptions.builder().voiceId(101).build());
session.createContext("speaker-b", CreateContextOptions.builder().voiceId(202).build());
// Send text to each context independently
session.send("speaker-a", "Hello from speaker A!");
session.send("speaker-b", "And greetings from speaker B!");
session.flush("speaker-a");
session.flush("speaker-b");
}
## Voices
### List Available Voices
```java
import com.kugelaudio.sdk.Voice;
import java.util.List;
// List all available voices
List<Voice> voices = client.voices().list();
for (Voice voice : voices) {
System.out.println(voice.getId() + ": " + voice.getName());
System.out.println(" Sex: " + voice.getSex());
System.out.println(" Language: " + voice.getLanguage());
}
// Filter by language, public status, and limit
List<Voice> germanVoices = client.voices().list("de", true, 10);
```
### Get a Specific Voice
Copy
import com.kugelaudio.sdk.VoiceDetail;
VoiceDetail voice = client.voices().get(123);
System.out.println("Voice: " + voice.getName());
for (var ref : voice.getReferences()) {
System.out.println(" Reference: " + ref.getId());
}
Create a Voice
Copy
import java.nio.file.Path;
import java.util.List;
VoiceDetail voice = client.voices().create(
"My Custom Voice",
"female",
"en",
List.of(Path.of("reference1.wav"), Path.of("reference2.wav"))
);
System.out.println("Created voice: " + voice.getId());
Update a Voice
Copy
import java.util.Map;
VoiceDetail voice = client.voices().update(123, Map.of(
"name", "Updated Name"
));
Delete a Voice
Copy
client.voices().delete(123);
Manage Reference Audio
Copy
import com.kugelaudio.sdk.VoiceReference;
// List references
List<VoiceReference> refs = client.voices().listReferences(123);
for (VoiceReference ref : refs) {
System.out.println(ref.getId() + ": " + ref.getReferenceText());
}
// Add a new reference
VoiceReference ref = client.voices().addReference(
123,
Path.of("new_reference.wav"),
"Optional transcript of the audio."
);
// Delete a reference
client.voices().deleteReference(123, 456);
Publish a Voice
Request that your voice be made publicly available. An admin will verify it before it becomes visible to others.
Copy
VoiceDetail voice = client.voices().publish(123);
System.out.println("Is public: " + voice.isPublic());
Generate Voice Sample
Copy
VoiceDetail voice = client.voices().generateSample(123);
System.out.println("Sample URL: " + voice.getSampleUrl());
Models
List Available Models
Copy
import com.kugelaudio.sdk.Model;
List<Model> models = client.models().list();
for (Model model : models) {
System.out.println(model.getId() + ": " + model.getName());
System.out.println(" Description: " + model.getDescription());
System.out.println(" Parameters: " + model.getParameters());
System.out.println(" Max Input: " + model.getMaxInputLength() + " characters");
System.out.println(" Sample Rate: " + model.getSampleRate() + " Hz");
}
Error Handling
Copy
import com.kugelaudio.sdk.*;
try {
AudioResponse audio = client.tts().generate(
GenerateRequest.builder("Hello!").language("en").build()
);
} catch (AuthenticationException e) {
System.err.println("Invalid API key");
} catch (RateLimitException e) {
System.err.println("Rate limit exceeded, please wait");
} catch (InsufficientCreditsException e) {
System.err.println("Not enough credits, please top up");
} catch (ValidationException e) {
System.err.println("Invalid request: " + e.getMessage());
} catch (ConnectionException e) {
System.err.println("Failed to connect to server");
} catch (KugelAudioException e) {
System.err.println("API error: " + e.getMessage());
}
## Data Models
### AudioChunk
A single audio chunk from streaming:
```java
AudioChunk chunk;
byte[] pcm = chunk.getAudio(); // Raw PCM16 (signed 16-bit little-endian, mono)
int index = chunk.getIndex(); // 0-based chunk index
int rate = chunk.getSampleRate(); // 24000
int n = chunk.getSamples(); // Number of samples in this chunk
float[] f32 = chunk.toFloat32(); // Normalised [-1.0, 1.0]
```
### AudioResponse
Complete audio result from generate():
Copy
AudioResponse audio;
byte[] pcm = audio.getAudio(); // Raw PCM16
int rate = audio.getSampleRate(); // 24000
int samples = audio.getTotalSamples();
double durationMs = audio.getDurationMs();
double genMs = audio.getGenerationMs();
double rtf = audio.getRtf(); // Real-time factor
List<WordTimestamp> timestamps = audio.getWordTimestamps(); // empty unless requested
audio.saveWav(Path.of("output.wav")); // Write WAV file
byte[] wav = audio.toWavBytes(); // WAV bytes with 44-byte header
float[] f32 = audio.toFloat32(); // Normalised [-1.0, 1.0]
WordTimestamp
Word-level time alignment:
Copy
WordTimestamp ts;
String word = ts.getWord(); // The aligned word
long startMs = ts.getStartMs(); // Start time in milliseconds
long endMs = ts.getEndMs(); // End time in milliseconds
int charStart = ts.getCharStart(); // Start character offset in original text
int charEnd = ts.getCharEnd(); // End character offset in original text
double score = ts.getScore(); // Alignment confidence (0.0 – 1.0)
Model
Copy
Model model;
String id = model.getId(); // 'kugel-1-turbo' or 'kugel-1'
String name = model.getName();
String description = model.getDescription();
int maxInput = model.getMaxInputLength();
int sampleRate = model.getSampleRate();
Voice
Copy
Voice voice;
int id = voice.getId();
String name = voice.getName();
String sex = voice.getSex(); // 'male', 'female', 'neutral'
String language = voice.getLanguage();
String sampleUrl = voice.getSampleUrl();
boolean isPublic = voice.isPublic();
Audio Utilities
The AudioFormats utility class provides codec helpers:
Copy
import com.kugelaudio.sdk.AudioFormats;
// Write PCM16 data directly to a WAV file
AudioFormats.writePcm16Wav(Path.of("out.wav"), pcmBytes, 24000, (short) 1);
// Get audio duration in milliseconds
int durationMs = AudioFormats.durationMs(pcmBytes, 24000, 16, 1);
// Convert PCM16 to μ-law (for telephony / Twilio)
byte[] ulaw = AudioFormats.pcm16ToUlaw(pcmBytes);
// Convert μ-law back to PCM16
byte[] pcm = AudioFormats.ulawToPcm16(ulawBytes);
Complete Example
Copy
import com.kugelaudio.sdk.*;
import java.nio.file.Path;
import java.util.List;
public class KugelAudioExample {
public static void main(String[] args) throws Exception {
try (KugelAudio client = new KugelAudio(
KugelAudioOptions.builder("your_api_key").build())) {
// List available models
System.out.println("Available Models:");
for (Model model : client.models().list()) {
System.out.printf(" - %s: %s (%s)%n",
model.getId(), model.getName(), model.getParameters());
}
// List available voices
System.out.println("\nAvailable Voices:");
for (Voice voice : client.voices().list("en", true, 5)) {
System.out.printf(" - %d: %s%n", voice.getId(), voice.getName());
}
// Generate audio
System.out.println("\nGenerating audio...");
AudioResponse audio = client.tts().generate(
GenerateRequest.builder(
"Welcome to KugelAudio. This is an example of high-quality text-to-speech synthesis."
)
.modelId("kugel-1-turbo")
.language("en")
.build()
);
System.out.printf("Generated %.2fs of audio in %.0fms (RTF: %.2f)%n",
audio.getDurationMs() / 1000.0, audio.getGenerationMs(), audio.getRtf());
// Save to file
audio.saveWav(Path.of("example.wav"));
System.out.println("Saved to example.wav");
}
}
}