diff --git a/maternal-app/maternal-app-backend/src/modules/voice/voice.controller.ts b/maternal-app/maternal-app-backend/src/modules/voice/voice.controller.ts
index bebd504..026d3e3 100644
--- a/maternal-app/maternal-app-backend/src/modules/voice/voice.controller.ts
+++ b/maternal-app/maternal-app-backend/src/modules/voice/voice.controller.ts
@@ -18,20 +18,46 @@ export class VoiceController {
   @UseInterceptors(FileInterceptor('audio'))
   async transcribeAudio(
     @UploadedFile() file: Express.Multer.File,
+    @Body('text') text?: string,
     @Body('language') language?: string,
+    @Body('childName') childName?: string,
   ) {
-    if (!file) {
-      throw new BadRequestException('Audio file is required');
+    // If text is provided (from Web Speech API), classify it directly
+    if (text) {
+      const result = await this.voiceService.extractActivityFromText(
+        text,
+        language || 'en',
+        childName,
+      );
+
+      return {
+        success: true,
+        transcript: text,
+        classification: result,
+      };
     }
 
-    const result = await this.voiceService.transcribeAudio(
+    // Otherwise, transcribe the audio file
+    if (!file) {
+      throw new BadRequestException('Audio file or text is required');
+    }
+
+    const transcription = await this.voiceService.transcribeAudio(
       file.buffer,
       language,
     );
 
+    // Also classify the transcription
+    const classification = await this.voiceService.extractActivityFromText(
+      transcription.text,
+      language || 'en',
+      childName,
+    );
+
     return {
       success: true,
-      data: result,
+      transcript: transcription.text,
+      classification: classification,
     };
   }
diff --git a/maternal-web/components/voice/VoiceInputButton.tsx b/maternal-web/components/voice/VoiceInputButton.tsx
index e934008..15a5a0d 100644
--- a/maternal-web/components/voice/VoiceInputButton.tsx
+++ b/maternal-web/components/voice/VoiceInputButton.tsx
@@ -42,7 +42,7 @@ export function VoiceInputButton({
   const [isProcessing, setIsProcessing] = useState(false);
   const [classificationResult, setClassificationResult] = useState(null);
 
-  const { isListening, isSupported, transcript, error, startListening, stopListening, reset } =
+  const { isListening, isSupported, transcript, error, usesFallback, startListening, stopListening, reset } =
     useVoiceInput();
 
   // Auto-classify when we get a final transcript
@@ -215,10 +215,18 @@
           {/* Status text */}
           {isListening
-            ? 'Listening... Speak now'
+            ? usesFallback
+              ? 'Recording... Speak now'
+              : 'Listening... Speak now'
             : 'Click the microphone to start'}
 
+          {usesFallback && !isListening && !transcript && (
+
+            Using audio recording mode (iOS Safari)
+
+          )}
+
           {/* Transcript */}
           {transcript && (
diff --git a/maternal-web/hooks/useVoiceInput.ts b/maternal-web/hooks/useVoiceInput.ts
index c5aaa5d..abbe509 100644
--- a/maternal-web/hooks/useVoiceInput.ts
+++ b/maternal-web/hooks/useVoiceInput.ts
@@ -11,13 +11,14 @@ export interface VoiceInputState {
   isSupported: boolean;
   transcript: string;
   error: string | null;
+  usesFallback: boolean;
 }
 
 /**
- * Hook for voice input using browser Web Speech API
+ * Hook for voice input using browser Web Speech API or MediaRecorder fallback
  *
  * Provides voice recording functionality with real-time transcription.
- * Falls back gracefully if browser doesn't support Speech Recognition.
+ * Falls back to MediaRecorder + server-side transcription for iOS Safari.
  */
 export function useVoiceInput() {
   const [state, setState] = useState({
@@ -25,34 +26,52 @@
     isSupported: false,
     transcript: '',
     error: null,
+    usesFallback: false,
   });
 
   const recognitionRef = useRef(null);
+  const mediaRecorderRef = useRef(null);
+  const audioChunksRef = useRef([]);
   const timeoutRef = useRef(null);
 
-  // Check if browser supports Speech Recognition
+  // Check if browser supports Speech Recognition or MediaRecorder
   useEffect(() => {
     const SpeechRecognition =
       (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;
 
     if (SpeechRecognition) {
-      setState(prev => ({ ...prev, isSupported: true }));
+      try {
+        // Initialize recognition
+        const recognition = new SpeechRecognition();
+        recognition.continuous = false; // Single recognition
+        recognition.interimResults = true; // Get interim results
+        recognition.maxAlternatives = 1;
+        recognition.lang = 'en-US'; // Default language
 
-      // Initialize recognition
-      const recognition = new SpeechRecognition();
-      recognition.continuous = false; // Single recognition
-      recognition.interimResults = true; // Get interim results
-      recognition.maxAlternatives = 1;
-      recognition.lang = 'en-US'; // Default language
-
-      recognitionRef.current = recognition;
+        recognitionRef.current = recognition;
+        setState(prev => ({ ...prev, isSupported: true, usesFallback: false }));
+      } catch (error) {
+        console.warn('[Voice] Speech Recognition initialization failed, using fallback');
+        setState(prev => ({ ...prev, isSupported: true, usesFallback: true }));
+      }
+    } else if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
+      // Use MediaRecorder fallback for iOS Safari
+      console.log('[Voice] Using MediaRecorder fallback for iOS Safari');
+      setState(prev => ({ ...prev, isSupported: true, usesFallback: true }));
     } else {
       setState(prev => ({ ...prev, isSupported: false }));
     }
 
     return () => {
       if (recognitionRef.current) {
-        recognitionRef.current.stop();
+        try {
+          recognitionRef.current.stop();
+        } catch (e) {
+          // Ignore errors on cleanup
+        }
+      }
+      if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
+        mediaRecorderRef.current.stop();
       }
       if (timeoutRef.current) {
         clearTimeout(timeoutRef.current);
@@ -60,16 +79,106 @@
     };
   }, []);
 
-  // Start listening
-  const startListening = useCallback(() => {
-    if (!recognitionRef.current) {
+  // Start listening with MediaRecorder fallback
+  const startListeningWithFallback = useCallback(async () => {
+    audioChunksRef.current = [];
+
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      const mediaRecorder = new MediaRecorder(stream, {
+        mimeType: 'audio/webm;codecs=opus',
+      });
+
+      mediaRecorderRef.current = mediaRecorder;
+
+      mediaRecorder.ondataavailable = (event) => {
+        if (event.data.size > 0) {
+          audioChunksRef.current.push(event.data);
+        }
+      };
+
+      mediaRecorder.onstop = async () => {
+        const audioBlob = new Blob(audioChunksRef.current, { type: 'audio/webm' });
+
+        // Send to backend for transcription
+        try {
+          const formData = new FormData();
+          formData.append('audio', audioBlob, 'recording.webm');
+
+          const response = await fetch('/api/voice/transcribe', {
+            method: 'POST',
+            body: formData,
+          });
+
+          const data = await response.json();
+
+          if (response.ok && data.success) {
+            setState(prev => ({
+              ...prev,
+              isListening: false,
+              transcript: data.transcript,
+            }));
+          } else {
+            setState(prev => ({
+              ...prev,
+              isListening: false,
+              error: data.message || 'Failed to transcribe audio',
+            }));
+          }
+        } catch (error) {
+          console.error('[Voice] Transcription error:', error);
+          setState(prev => ({
+            ...prev,
+            isListening: false,
+            error: 'Failed to process audio',
+          }));
+        }
+
+        // Stop all tracks
+        stream.getTracks().forEach(track => track.stop());
+      };
+
+      mediaRecorder.onerror = (event) => {
+        console.error('[Voice] MediaRecorder error:', event);
+        setState(prev => ({
+          ...prev,
+          isListening: false,
+          error: 'Recording failed',
+        }));
+      };
       setState(prev => ({
         ...prev,
-        error: 'Speech recognition not supported in this browser',
+        isListening: true,
+        transcript: '',
+        error: null,
       }));
-      return;
-    }
 
+      mediaRecorder.start();
+
+      // Auto-stop after 10 seconds
+      timeoutRef.current = setTimeout(() => {
+        if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
+          mediaRecorderRef.current.stop();
+        }
+      }, 10000);
+    } catch (error: any) {
+      console.error('[Voice] Failed to access microphone:', error);
+      let errorMessage = 'Failed to access microphone';
+      if (error.name === 'NotAllowedError' || error.name === 'PermissionDeniedError') {
+        errorMessage = 'Microphone access denied. Please grant permission.';
+      } else if (error.name === 'NotFoundError') {
+        errorMessage = 'No microphone found. Please check your settings.';
+      }
+      setState(prev => ({
+        ...prev,
+        error: errorMessage,
+      }));
+    }
+  }, []);
+
+  // Start listening with Web Speech API
+  const startListeningWithSpeechAPI = useCallback(() => {
     const recognition = recognitionRef.current;
 
     // Clear previous state
@@ -153,10 +262,31 @@
     }
   }, []);
 
+  // Start listening (chooses appropriate method)
+  const startListening = useCallback(() => {
+    if (state.usesFallback) {
+      startListeningWithFallback();
+    } else if (recognitionRef.current) {
+      startListeningWithSpeechAPI();
+    } else {
+      setState(prev => ({
+        ...prev,
+        error: 'Voice input not supported in this browser',
+      }));
+    }
+  }, [state.usesFallback, startListeningWithFallback, startListeningWithSpeechAPI]);
+
   // Stop listening
   const stopListening = useCallback(() => {
     if (recognitionRef.current) {
-      recognitionRef.current.stop();
+      try {
+        recognitionRef.current.stop();
+      } catch (e) {
+        // Ignore errors
+      }
+    }
+    if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
+      mediaRecorderRef.current.stop();
    }
     if (timeoutRef.current) {
       clearTimeout(timeoutRef.current);
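
Note (not part of the patch): a minimal sketch of how a client could exercise the two modes of the updated transcribe endpoint. The endpoint path, field names ('text', 'language', 'childName', 'audio'), and response shape are taken from the diff above; the helper names and sample values are illustrative only, and the assumption is that the frontend proxies /api/voice/transcribe to the NestJS VoiceController.

// Mode 1: text already transcribed in the browser (Web Speech API path).
// The server skips transcription and classifies the text directly.
async function classifyTranscribedText(text: string, childName?: string) {
  const body = new FormData();
  body.append('text', text);
  body.append('language', 'en');
  if (childName) body.append('childName', childName);

  const res = await fetch('/api/voice/transcribe', { method: 'POST', body });
  return res.json(); // { success, transcript, classification }
}

// Mode 2: recorded audio from the MediaRecorder fallback (iOS Safari path).
// The server transcribes the file, then classifies the resulting text.
async function transcribeRecordedAudio(audioBlob: Blob) {
  const body = new FormData();
  body.append('audio', audioBlob, 'recording.webm');

  const res = await fetch('/api/voice/transcribe', { method: 'POST', body });
  return res.json(); // { success, transcript, classification }
}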