import asyncio
import base64
import json
import wave
from urllib.parse import urlencode

import websockets

API_KEY = "YOUR_API_KEY"
WS_URL = "wss://api.kugelaudio.com"

# Single source of truth for the sample rate: the same value is requested
# from the server and written into the WAV header, so the two cannot drift.
SAMPLE_RATE = 24000


async def generate_speech(
    text: str,
    voice_id: int = 268,
    *,
    output_path: str = "output.wav",
) -> None:
    """Generate speech via WebSocket and save it to a WAV file.

    Sends one JSON TTS request, collects the base64-encoded audio carried in
    the "audio" field of the replies, and writes the concatenated bytes to
    ``output_path``. Collection stops at the first message whose "final"
    field is truthy, or when the message stream ends.

    Args:
        text: Text to synthesize.
        voice_id: Voice identifier sent in the request.
        output_path: Destination WAV file (overwritten if it exists).

    Raises:
        RuntimeError: If a server message carries a truthy "error" field.
    """
    # URL-encode the key so characters such as "&", "+" or "=" cannot
    # corrupt the query string.
    query = urlencode({"api_key": API_KEY})
    ws_url = f"{WS_URL}/ws/tts?{query}"
    audio_chunks = []

    async with websockets.connect(ws_url) as ws:
        # Send TTS request
        await ws.send(json.dumps({
            "text": text,
            "model_id": "kugel-1-turbo",
            "voice_id": voice_id,
            "cfg_scale": 2.0,
            "sample_rate": SAMPLE_RATE,
        }))

        # Receive audio chunks
        async for msg in ws:
            data = json.loads(msg)

            if data.get("error"):
                # RuntimeError is still caught by `except Exception`.
                raise RuntimeError(data["error"])

            if data.get("audio"):
                audio_chunks.append(base64.b64decode(data["audio"]))
                print(f"Chunk {data['idx']}: {data['samples']} samples")

            if data.get("final"):
                print(f"Done: {data['dur_ms']:.0f}ms audio, generated in {data['gen_ms']:.0f}ms")
                break

    # Save to WAV
    # NOTE(review): the header assumes the decoded chunks are mono 16-bit
    # PCM at SAMPLE_RATE - confirm against the API documentation.
    with wave.open(output_path, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(b"".join(audio_chunks))

    print(f"Saved to {output_path}")


# Run
asyncio.run(generate_speech("Hello, this is a test of the raw API."))
`temperature` controls how much the sampler's output varies across
regenerations of the same text. Lower values are closer to greedy decoding
(stable, repeatable reads); higher values are more expressive but less
consistent.
| Use case | Suggested range |
| --- | --- |
| E-learning, IVR prompts, compliance reads | 0.0 – 0.3 |
| General voiceover, conversational UX | 0.4 – 0.6 (default 0.4) |
| Expressive narration, ads, character voices | 0.7 – 1.0 |
The default of 0.4 tracks the TTS Studio "natural" preset. It was lowered
from 0.5 to reduce intermittent word drops on short trailing sentences with
kugel-2.