Use this file to discover all available pages before exploring further.
The Gemini Live API enables low-latency, bidirectional voice and video interactions with Gemini. The Live API can process text, audio, and video input in real time, providing text and audio output for natural conversational experiences.
Handle different types of responses from the server:
async def handle_server_message(message: str):
    """Dispatch one JSON message received from the Live API server.

    Handles, in order: model output (text and inline audio parts), the
    end-of-turn marker, tool (function) calls, and barge-in interruption.

    Args:
        message: A single JSON-encoded server message.

    Raises:
        json.JSONDecodeError: If ``message`` is not valid JSON.
        KeyError: If a function call has no ``name`` or an inline-data part
            has no ``data``.

    NOTE(review): ``ws``, ``play_audio``, ``execute_function``,
    ``send_tool_response`` and ``stop_audio_playback`` are not defined in
    this snippet; presumably the surrounding application provides them --
    confirm before reuse.
    """
    response = json.loads(message)
    # A missing or null "serverContent" is treated as "no content".
    server_content = response.get("serverContent") or {}

    # Model's text/audio output
    for part in server_content.get("modelTurn", {}).get("parts", []):
        # Text output
        if "text" in part:
            print(f"Model: {part['text']}")

        # Audio output (PCM16 at 24kHz), delivered base64-encoded
        if "inlineData" in part:
            audio_data = base64.b64decode(part["inlineData"]["data"])
            play_audio(audio_data)

    # Check if model finished speaking
    if server_content.get("turnComplete"):
        print("Model finished turn")

    # Model wants to call a function
    for call in response.get("toolCall", {}).get("functionCalls", []):
        function_name = call["name"]
        # A call without arguments may omit "args"; default to no arguments.
        args = call.get("args", {})
        result = execute_function(function_name, args)
        await send_tool_response(ws, function_name, result)

    # Model was interrupted (user started speaking). The flag is looked up
    # inside "serverContent" as well as at the top level, because the Live
    # API reference lists `interrupted` as a serverContent field.
    if response.get("interrupted") or server_content.get("interrupted"):
        print("Barge-in detected - stopping playback")
        stop_audio_playback()
import asyncioimport websocketsimport jsonimport base64import numpy as npfrom IPython.display import Audio, displayasync def text_to_speech(text_input: str): async with websockets.connect(service_url, additional_headers=headers) as ws: # Setup await ws.send(json.dumps(setup)) await ws.recv() # Send text msg = { "client_content": { "turns": [{"role": "user", "parts": [{"text": text_input}]}], "turn_complete": True } } await ws.send(json.dumps(msg)) # Collect audio response audio_data = [] async for message in ws: response = json.loads(message) # Extract audio if "serverContent" in response: parts = response["serverContent"].get("modelTurn", {}).get("parts", []) for part in parts: if "inlineData" in part: pcm_data = base64.b64decode(part["inlineData"]["data"]) audio_data.append(np.frombuffer(pcm_data, dtype=np.int16)) # Check if complete if response.get("serverContent", {}).get("turnComplete"): break # Play audio if audio_data: display(Audio(np.concatenate(audio_data), rate=24000, autoplay=True))# Use itawait text_to_speech("Hello! How are you today?")
Handle Barge-In Properly: Always stop audio playback immediately when you receive a message indicating that the model was interrupted. Continuing to play audio after an interruption creates a poor user experience.
# Barge-in: stop playback as soon as the server reports an interruption.
# The flag is looked up inside "serverContent" as well as at the top level,
# because the Live API reference lists `interrupted` as a serverContent field.
# NOTE(review): `response`, `audio_queue` and `stop_playback` are defined
# outside this snippet -- confirm their exact types against the caller.
if response.get("interrupted") or (response.get("serverContent") or {}).get("interrupted"):
    audio_queue.clear()  # discard whatever is still queued
    stop_playback()