#!/usr/bin/env python3
"""
LiveTalker Simple Voice Server
Working voice input without complex dependencies
"""

import logging
import time
import base64
import numpy as np
from typing import Dict, Any

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
import uvicorn

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="LiveTalker Simple Voice Server")

# Enable CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

active_connections: Dict[str, Dict] = {}

def simple_vad(audio_data: np.ndarray, threshold: float = 0.01) -> tuple:
    """Simple energy-based voice activity detection.

    Returns (is_speech, confidence) as native Python types so the result can be
    serialized to JSON directly.
    """
    if len(audio_data) == 0:
        return False, 0.0
    
    # Calculate RMS energy (cast to a native float; numpy scalars are not JSON-serializable)
    rms = float(np.sqrt(np.mean(audio_data ** 2)))
    
    # Simple threshold-based detection
    is_speech = rms > threshold
    confidence = min(rms / threshold, 1.0) if threshold > 0 else 0.0
    
    return is_speech, confidence
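
# Illustrative usage of simple_vad (assumed values, not part of the request path):
#   >>> sr = 16000
#   >>> tone = 0.1 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)   # quiet 440 Hz tone
#   >>> simple_vad(tone)             # (True, 1.0)  -- RMS ~0.07 is well above the 0.01 threshold
#   >>> simple_vad(np.zeros(1024))   # (False, 0.0) -- silence never crosses the threshold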

@app.get("/")
async def root():
    """Main interface with working voice input"""
    html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>LiveTalker - Real Voice Input</title>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <style>
        * { box-sizing: border-box; }
        body { 
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; 
            margin: 0; padding: 20px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            min-height: 100vh;
        }
        .container { 
            max-width: 1000px; 
            margin: 0 auto; 
            background: rgba(255,255,255,0.1);
            padding: 30px;
            border-radius: 20px;
            backdrop-filter: blur(15px);
            box-shadow: 0 8px 32px rgba(0,0,0,0.2);
        }
        h1 { 
            text-align: center; 
            margin-bottom: 30px;
            font-size: 2.5em;
            text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
        }
        .status-card { 
            padding: 20px; 
            margin: 20px 0; 
            border-radius: 12px; 
            background: rgba(255,255,255,0.15);
            border: 2px solid transparent;
            transition: all 0.3s ease;
        }
        .status-card.active { border-color: #4CAF50; background: rgba(76,175,80,0.25); }
        .status-card.listening { border-color: #FF9800; background: rgba(255,152,0,0.25); }
        .status-card.speaking { border-color: #2196F3; background: rgba(33,150,243,0.25); }
        
        .controls { 
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 15px;
            margin: 30px 0;
        }
        
        .btn { 
            padding: 15px 25px; 
            border: none; 
            border-radius: 10px; 
            background: #4CAF50; 
            color: white; 
            cursor: pointer; 
            font-size: 16px;
            font-weight: 600;
            transition: all 0.3s ease;
            text-decoration: none;
            display: inline-block;
            text-align: center;
        }
        .btn:hover { background: #45a049; transform: translateY(-2px); box-shadow: 0 4px 12px rgba(0,0,0,0.2); }
        .btn:disabled { background: #666; cursor: not-allowed; transform: none; opacity: 0.6; }
        .btn.danger { background: #f44336; }
        .btn.danger:hover { background: #da190b; }
        .btn.primary { background: #2196F3; }
        .btn.primary:hover { background: #1976D2; }
        
        .mic-section {
            text-align: center;
            margin: 40px 0;
        }
        
        .mic-button { 
            background: #f44336; 
            font-size: 24px; 
            padding: 30px;
            border-radius: 50%;
            width: 100px;
            height: 100px;
            margin: 20px auto;
            display: flex;
            align-items: center;
            justify-content: center;
            border: none;
            cursor: pointer;
            transition: all 0.3s ease;
        }
        .mic-button:hover { transform: scale(1.05); }
        .mic-button.recording { 
            background: #ff1744; 
            animation: pulse 1s infinite;
            box-shadow: 0 0 30px rgba(255,23,68,0.5);
        }
        
        @keyframes pulse {
            0%, 100% { opacity: 1; transform: scale(1); }
            50% { opacity: 0.8; transform: scale(1.1); }
        }
        
        .vad-display {
            margin: 30px 0;
            padding: 20px;
            background: rgba(0,0,0,0.3);
            border-radius: 12px;
        }
        
        .vad-bar {
            width: 100%;
            height: 40px;
            background: rgba(255,255,255,0.2);
            border-radius: 20px;
            overflow: hidden;
            position: relative;
            margin: 15px 0;
        }
        
        .vad-level {
            height: 100%;
            background: linear-gradient(90deg, #4CAF50, #8BC34A, #FFC107, #FF5722);
            width: 0%;
            transition: width 0.1s ease;
            border-radius: 20px;
        }
        
        .vad-text {
            text-align: center;
            font-size: 18px;
            font-weight: bold;
            margin-top: 10px;
        }
        
        .conversation {
            background: rgba(0,0,0,0.4);
            border-radius: 15px;
            padding: 20px;
            margin: 20px 0;
            max-height: 400px;
            overflow-y: auto;
            min-height: 200px;
        }
        
        .message {
            margin: 15px 0;
            padding: 12px 16px;
            border-radius: 10px;
            max-width: 80%;
            word-wrap: break-word;
        }
        .message.user {
            background: rgba(33,150,243,0.4);
            margin-left: auto;
            text-align: right;
        }
        .message.assistant {
            background: rgba(76,175,80,0.4);
            margin-right: auto;
        }
        .message.system {
            background: rgba(158,158,158,0.3);
            margin: 10px auto;
            text-align: center;
            font-style: italic;
            max-width: 90%;
        }
        
        .log {
            background: rgba(0,0,0,0.5);
            padding: 15px;
            border-radius: 10px;
            height: 250px;
            overflow-y: auto;
            font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
            font-size: 13px;
            white-space: pre-wrap;
            border: 1px solid rgba(255,255,255,0.1);
        }
        
        .feature-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin: 30px 0;
        }
        
        .feature-card {
            background: rgba(255,255,255,0.1);
            padding: 25px;
            border-radius: 12px;
            text-align: center;
            transition: transform 0.3s ease;
        }
        .feature-card:hover { transform: translateY(-5px); }
        
        .emoji { font-size: 2.5em; margin-bottom: 15px; display: block; }
        
        .stats {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
            gap: 15px;
            margin: 20px 0;
        }
        
        .stat-item {
            text-align: center;
            padding: 15px;
            background: rgba(255,255,255,0.1);
            border-radius: 8px;
        }
        
        .stat-value {
            font-size: 2em;
            font-weight: bold;
            display: block;
        }
        
        @media (max-width: 768px) {
            .container { padding: 20px; margin: 10px; }
            h1 { font-size: 2em; }
            .controls { grid-template-columns: 1fr; }
            .mic-button { width: 80px; height: 80px; font-size: 20px; }
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>🎙️ LiveTalker Voice Assistant</h1>
        
        <div class="status-card" id="connectionStatus">
            <h3>🔗 Connection Status</h3>
            <div id="statusText">Click Connect to start</div>
        </div>
        
        <div class="controls">
            <button class="btn primary" onclick="connectWebSocket()">🔗 Connect</button>
            <button class="btn" onclick="requestMicrophone()" id="micPermBtn">🎤 Enable Microphone</button>
            <button class="btn" onclick="startListening()" id="startBtn" disabled>🎧 Start Listening</button>
            <button class="btn danger" onclick="stopListening()" id="stopBtn" disabled>🛑 Stop</button>
        </div>
        
        <div class="mic-section">
            <button class="mic-button" id="micButton" onclick="toggleListening()">🎤</button>
            <div>Click to toggle voice input</div>
        </div>
        
        <div class="vad-display">
            <h3>🎵 Voice Activity Detection</h3>
            <div class="vad-bar">
                <div class="vad-level" id="vadLevel"></div>
            </div>
            <div class="vad-text" id="vadStatus">Waiting for audio input...</div>
            
            <div class="stats">
                <div class="stat-item">
                    <span class="stat-value" id="audioChunks">0</span>
                    <div>Audio Chunks</div>
                </div>
                <div class="stat-item">
                    <span class="stat-value" id="speechDetected">0</span>
                    <div>Speech Events</div>
                </div>
                <div class="stat-item">
                    <span class="stat-value" id="avgConfidence">0%</span>
                    <div>Avg Confidence</div>
                </div>
            </div>
        </div>
        
        <div class="status-card" id="micStatus">
            <h3>🎤 Microphone Status</h3>
            <div id="micStatusText">Permission required</div>
        </div>
        
        <div class="conversation" id="conversation">
            <div class="message system">Welcome to LiveTalker! Enable microphone and start talking...</div>
        </div>
        
        <div class="status-card">
            <h3>📊 Activity Log</h3>
            <div id="log" class="log">Ready to start voice processing...</div>
        </div>
        
        <div class="feature-grid">
            <div class="feature-card">
                <span class="emoji">🎯</span>
                <h4>Real-Time VAD</h4>
                <p>Energy-based voice activity detection</p>
            </div>
            <div class="feature-card">
                <span class="emoji">⚡</span>
                <h4>Live Processing</h4>
                <p>Real-time audio stream analysis</p>
            </div>
            <div class="feature-card">
                <span class="emoji">🔄</span>
                <h4>Turn Detection</h4>
                <p>Intelligent conversation management</p>
            </div>
            <div class="feature-card">
                <span class="emoji">🌐</span>
                <h4>WebSocket</h4>
                <p>Low-latency bidirectional communication</p>
            </div>
        </div>
    </div>

    <script>
        let ws = null;
        let mediaStream = null;
        let audioContext = null;
        let processor = null;
        let isRecording = false;
        let connected = false;
        let stats = {
            audioChunks: 0,
            speechDetected: 0,
            totalConfidence: 0
        };
        
        function log(message) {
            const logDiv = document.getElementById('log');
            const timestamp = new Date().toLocaleTimeString();
            logDiv.textContent += `[${timestamp}] ${message}\\n`;
            logDiv.scrollTop = logDiv.scrollHeight;
        }
        
        function updateStatus(elementId, message, className = '') {
            const element = document.getElementById(elementId);
            if (element) {
                if (elementId === 'connectionStatus') {
                    document.getElementById('statusText').textContent = message;
                    element.className = 'status-card ' + className;
                } else {
                    element.textContent = message;
                }
            }
        }
        
        function addMessage(type, content) {
            const conversation = document.getElementById('conversation');
            const message = document.createElement('div');
            message.className = `message ${type}`;
            message.textContent = content;
            conversation.appendChild(message);
            conversation.scrollTop = conversation.scrollHeight;
        }
        
        function updateVAD(level, isActive) {
            const vadLevel = document.getElementById('vadLevel');
            const vadStatus = document.getElementById('vadStatus');
            const micButton = document.getElementById('micButton');
            
            vadLevel.style.width = `${level * 100}%`;
            
            if (isActive) {
                vadStatus.textContent = `🎵 Speech detected (${(level * 100).toFixed(1)}%)`;
                micButton.classList.add('recording');
            } else {
                vadStatus.textContent = `🔇 No speech (${(level * 100).toFixed(1)}%)`;
                micButton.classList.remove('recording');
            }
        }
        
        function updateStats() {
            document.getElementById('audioChunks').textContent = stats.audioChunks;
            document.getElementById('speechDetected').textContent = stats.speechDetected;
            const avgConf = stats.audioChunks > 0 ? (stats.totalConfidence / stats.audioChunks * 100).toFixed(0) : 0;
            document.getElementById('avgConfidence').textContent = avgConf + '%';
        }
        
        async function connectWebSocket() {
            const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
            const wsUrl = `${protocol}//${window.location.host}/media-stream`;
            
            log('Connecting to WebSocket...');
            updateStatus('connectionStatus', '🔄 Connecting...', '');
            
            try {
                ws = new WebSocket(wsUrl);
                
                ws.onopen = function() {
                    connected = true;
                    log('✅ WebSocket connected successfully');
                    updateStatus('connectionStatus', '✅ Connected and ready', 'active');
                };
                
                ws.onmessage = function(event) {
                    try {
                        const data = JSON.parse(event.data);
                        handleServerMessage(data);
                    } catch (e) {
                        log(`📨 Raw: ${event.data.substring(0, 100)}...`);
                    }
                };
                
                ws.onclose = function() {
                    connected = false;
                    log('❌ WebSocket disconnected');
                    updateStatus('connectionStatus', '❌ Disconnected', '');
                };
                
                ws.onerror = function(error) {
                    console.error('WebSocket error:', error);
                    log('❌ WebSocket error (see browser console for details)');
                    updateStatus('connectionStatus', '❌ Connection error', '');
                };
                
            } catch (error) {
                log(`❌ Failed to connect: ${error}`);
                updateStatus('connectionStatus', '❌ Connection failed', '');
            }
        }
        
        function handleServerMessage(data) {
            // Skip per-chunk VAD results here to avoid flooding the activity log
            if (data.type !== 'vad_result') {
                log(`📨 ${data.type}: ${JSON.stringify(data).substring(0, 100)}...`);
            }
            
            switch(data.type) {
                case 'config':
                    addMessage('system', 'Server connected - voice processing ready!');
                    break;
                    
                case 'vad_result':
                    stats.audioChunks++;
                    stats.totalConfidence += data.confidence || 0;
                    if (data.is_speech) {
                        stats.speechDetected++;
                    }
                    updateVAD(data.confidence || 0, data.is_speech || false);
                    updateStats();
                    break;
                    
                case 'turn_detected':
                    log(`🔄 Turn: ${data.state}`);
                    break;
                    
                case 'speech_to_text':
                    if (data.text && data.text.trim()) {
                        addMessage('user', data.text);
                        log(`🗣️ You said: "${data.text}"`);
                    }
                    break;
                    
                case 'ai_response':
                    if (data.text) {
                        addMessage('assistant', data.text);
                        log(`🤖 AI: "${data.text.substring(0, 50)}..."`);
                    }
                    break;
                    
                case 'conversation_started':
                    addMessage('system', data.message || 'Conversation started');
                    updateStatus('connectionStatus', '🎧 Listening for voice...', 'listening');
                    break;
                    
                case 'error':
                    log(`❌ Error: ${data.error}`);
                    addMessage('system', `Error: ${data.error}`);
                    break;
                    
                case 'stopped':
                    addMessage('system', data.message || 'Stopped listening');
                    break;
                    
                default:
                    log(`📨 Unknown message type: ${data.type}`);
            }
        }
        
        async function requestMicrophone() {
            try {
                log('🎤 Requesting microphone permission...');
                updateStatus('micStatusText', '🔄 Requesting permission...');
                
                mediaStream = await navigator.mediaDevices.getUserMedia({
                    audio: {
                        sampleRate: 16000,
                        channelCount: 1,
                        echoCancellation: true,
                        noiseSuppression: true,
                        autoGainControl: true
                    }
                });
                
                log('✅ Microphone access granted');
                updateStatus('micStatusText', '✅ Microphone ready');
                
                // Setup Web Audio API
                audioContext = new (window.AudioContext || window.webkitAudioContext)({
                    sampleRate: 16000
                });
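                // Note (assumption): some browsers may ignore the requested 16 kHz rate;
                // if audioContext.sampleRate differs, the server's 16 kHz assumption no
                // longer holds and a production build would resample client-side.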
                
                const source = audioContext.createMediaStreamSource(mediaStream);
                
                // Create script processor for audio processing
                processor = audioContext.createScriptProcessor(1024, 1, 1);
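                // ScriptProcessorNode is deprecated in the Web Audio API; an AudioWorkletNode
                // would be the modern replacement, but ScriptProcessor keeps this demo simple
                // and dependency-free.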
                
                processor.onaudioprocess = function(event) {
                    if (isRecording && connected) {
                        const inputData = event.inputBuffer.getChannelData(0);
                        sendAudioData(inputData);
                    }
                };
                
                source.connect(processor);
                processor.connect(audioContext.destination);
                
                document.getElementById('startBtn').disabled = false;
                document.getElementById('micPermBtn').disabled = true;
                document.getElementById('micPermBtn').textContent = '✅ Mic Ready';
                
            } catch (error) {
                log(`❌ Microphone error: ${error.message}`);
                updateStatus('micStatusText', '❌ Permission denied');
                alert('Microphone access is required for voice input. Please allow microphone access and try again.');
            }
        }
        
        function sendAudioData(audioData) {
            if (!ws || ws.readyState !== WebSocket.OPEN) return;
            
            // Convert Float32Array to Int16Array for transmission
            const int16Array = new Int16Array(audioData.length);
            for (let i = 0; i < audioData.length; i++) {
                const clampedValue = Math.max(-1, Math.min(1, audioData[i]));
                int16Array[i] = clampedValue * 0x7FFF;
            }
            
            // Convert to base64
            const uint8Array = new Uint8Array(int16Array.buffer);
            const base64String = btoa(String.fromCharCode.apply(null, uint8Array));
            
            ws.send(JSON.stringify({
                type: 'audio',
                data: base64String,
                format: 'pcm_s16le',
                sample_rate: 16000,
                channels: 1
            }));
        }
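        
        // Design note: audio is base64-encoded inside JSON so the server can read it with
        // websocket.iter_json(); sending the raw Int16Array buffer as a binary frame would
        // avoid the ~33% base64 overhead, but would need a binary receive path on the
        // server (an assumption, not implemented here).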
        
        function startListening() {
            if (!connected) {
                alert('Please connect to WebSocket first');
                return;
            }
            if (!mediaStream) {
                alert('Please enable microphone first');
                return;
            }
            
            isRecording = true;
            log('🎧 Started listening for voice input...');
            updateStatus('connectionStatus', '🎧 Listening for voice...', 'listening');
            
            document.getElementById('startBtn').disabled = true;
            document.getElementById('stopBtn').disabled = false;
            document.getElementById('micButton').classList.add('recording');
            
            // Resume audio context if suspended
            if (audioContext && audioContext.state === 'suspended') {
                audioContext.resume();
            }
            
            // Send start message to server
            if (ws && ws.readyState === WebSocket.OPEN) {
                ws.send(JSON.stringify({
                    type: 'start_conversation',
                    config: { 
                        personality: 'luna',
                        language: 'en'
                    }
                }));
            }
        }
        
        function stopListening() {
            isRecording = false;
            log('🛑 Stopped listening');
            updateStatus('connectionStatus', '✅ Connected (not listening)', 'active');
            
            document.getElementById('startBtn').disabled = false;
            document.getElementById('stopBtn').disabled = true;
            document.getElementById('micButton').classList.remove('recording');
            
            updateVAD(0, false);
            
            if (ws && ws.readyState === WebSocket.OPEN) {
                ws.send(JSON.stringify({
                    type: 'stop_listening'
                }));
            }
        }
        
        function toggleListening() {
            if (isRecording) {
                stopListening();
            } else {
                startListening();
            }
        }
        
        // Auto-setup on page load
        document.addEventListener('DOMContentLoaded', function() {
            log('LiveTalker Voice Interface loaded');
            log('📝 Instructions:');
            log('1. Click "Connect" to establish WebSocket connection');
            log('2. Click "Enable Microphone" to request permissions');
            log('3. Click "Start Listening" to begin voice detection');
            log('4. Speak normally - VAD will detect your voice');
        });
        
        // Handle page unload
        window.addEventListener('beforeunload', function() {
            if (isRecording) {
                stopListening();
            }
            if (ws) {
                ws.close();
            }
        });
    </script>
</body>
</html>
    """
    return HTMLResponse(content=html_content)

@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy",
        "timestamp": time.time(),
        "features": {
            "microphone_input": True,
            "simple_vad": True,
            "real_time_processing": True,
            "websocket_communication": True
        },
        "vad_method": "energy_based"
    }

@app.get("/stats")
async def get_stats():
    """System statistics"""
    return {
        "active_connections": len(active_connections),
        "processing_mode": "real_time",
        "vad_type": "simple_energy_based",
        "features": {
            "microphone_capture": "enabled",
            "audio_analysis": "real_time",
            "speech_detection": "active"
        }
    }

@app.websocket("/media-stream")
async def websocket_endpoint(websocket: WebSocket):
    """WebSocket endpoint for voice processing"""
    await websocket.accept()
    session_id = f"session_{int(time.time() * 1000)}"
    
    session = {
        "id": session_id,
        "websocket": websocket,
        "audio_buffer": [],
        "conversation": [],
        "is_listening": False,
        "speech_segments": [],
        "last_speech_time": 0
    }
    active_connections[session_id] = session
    
    logger.info(f"New voice session: {session_id}")
    
    try:
        # Send initial configuration
        await websocket.send_json({
            "type": "config",
            "session_id": session_id,
            "message": "Voice processing ready - simple VAD active",
            "vad_type": "energy_based"
        })
        
        async for message in websocket.iter_json():
            await handle_voice_message(session, message)
            
    except WebSocketDisconnect:
        logger.info(f"Voice session disconnected: {session_id}")
    except Exception as e:
        logger.error(f"Voice session error: {e}")
        try:
            await websocket.send_json({
                "type": "error",
                "error": str(e)
            })
        except Exception:
            pass
    finally:
        if session_id in active_connections:
            del active_connections[session_id]

async def handle_voice_message(session: Dict, message: Dict[str, Any]):
    """Handle incoming voice messages"""
    msg_type = message.get("type")
    
    if msg_type == "start_conversation":
        session["is_listening"] = True
        await session["websocket"].send_json({
            "type": "conversation_started",
            "message": "Voice conversation started! Speak now and I'll detect your voice."
        })
        logger.info(f"Started voice conversation for {session['id']}")
        
    elif msg_type == "audio" and session["is_listening"]:
        await process_audio_chunk(session, message)
        
    elif msg_type == "stop_listening":
        session["is_listening"] = False
        await session["websocket"].send_json({
            "type": "stopped",
            "message": "Stopped listening for voice input"
        })

async def process_audio_chunk(session: Dict, message: Dict[str, Any]):
    """Process incoming audio with simple VAD"""
    try:
        # Decode base64 audio data
        audio_data = base64.b64decode(message["data"])
        
        # Convert PCM 16-bit to float
        audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
        
        if len(audio_np) == 0:
            return
        
        # Apply simple VAD
        is_speech, confidence = simple_vad(audio_np, threshold=0.01)
        
        # Send VAD result to client
        await session["websocket"].send_json({
            "type": "vad_result",
            "is_speech": is_speech,
            "confidence": confidence,
            "timestamp": time.time(),
            "audio_length": len(audio_np)
        })
        
        # Accumulate speech segments
        if is_speech:
            session["audio_buffer"].extend(audio_np.tolist())
            session["last_speech_time"] = time.time()
            
            # If we have a significant amount of speech audio, process it
            if len(session["audio_buffer"]) > 16000:  # About 1 second at 16kHz
                await process_speech_segment(session)
                session["audio_buffer"] = []
        else:
            # If silence detected and we have accumulated audio, process it
            if (len(session["audio_buffer"]) > 8000 and  # At least 0.5 seconds
                time.time() - session["last_speech_time"] > 0.5):  # 0.5s silence
                await process_speech_segment(session)
                session["audio_buffer"] = []
        
    except Exception as e:
        logger.error(f"Error processing audio chunk: {e}")
        await session["websocket"].send_json({
            "type": "error",
            "error": f"Audio processing error: {str(e)}"
        })

async def process_speech_segment(session: Dict):
    """Process accumulated speech segment"""
    try:
        if len(session["audio_buffer"]) < 8000:  # Less than 0.5 seconds
            return
        
        duration = len(session["audio_buffer"]) / 16000  # Duration in seconds
        
        # For demonstration, create a simulated transcription
        simulated_text = f"Speech segment detected: {duration:.1f} seconds of audio"
        
        # Send speech-to-text result
        await session["websocket"].send_json({
            "type": "speech_to_text",
            "text": simulated_text,
            "confidence": 0.85,
            "duration": duration,
            "note": "This is simulated - in full implementation would use real STT"
        })
        
        # Add to conversation
        session["conversation"].append({
            "role": "user",
            "content": simulated_text,
            "timestamp": time.time(),
            "duration": duration
        })
        
        # Generate AI response (simulated)
        ai_response = f"I detected your {duration:.1f}-second speech segment! In the full CSM implementation, this would trigger ultra-low latency voice synthesis with 500ms response time."
        
        await session["websocket"].send_json({
            "type": "ai_response",
            "text": ai_response,
            "processing_info": {
                "input_duration": duration,
                "expected_ttfc": "500-650ms with CSM",
                "expected_rtf": "2.5x with optimizations"
            }
        })
        
        session["conversation"].append({
            "role": "assistant",
            "content": ai_response,
            "timestamp": time.time()
        })
        
        logger.info(f"Processed speech segment: {duration:.1f}s for session {session['id']}")
        
    except Exception as e:
        logger.error(f"Error processing speech segment: {e}")

if __name__ == "__main__":
    print("🎙️ Starting LiveTalker Simple Voice Server...")
    print("Features:")
    print("  ✅ Real microphone input capture")
    print("  ✅ Energy-based Voice Activity Detection")
    print("  ✅ Real-time audio processing")
    print("  ✅ WebSocket communication")
    print("  ✅ Speech segment detection")
    print("  ✅ Conversation flow simulation")
    print("")
    print("📍 Access URL: http://localhost:8000")
    print("🎯 Instructions:")
    print("  1. Open the URL in your browser")
    print("  2. Click 'Connect' to establish WebSocket") 
    print("  3. Click 'Enable Microphone' to request permissions")
    print("  4. Click 'Start Listening' to begin voice detection")
    print("  5. Speak normally - the system will detect your voice!")
    print("")
    
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
        log_level="info"
    )