Add iOS Safari support for voice commands with MediaRecorder fallback

Frontend changes:
- Add MediaRecorder fallback for iOS Safari (no Web Speech API support)
- Automatically detect browser capabilities and use the appropriate method
- Add usesFallback flag to track which method is being used
- Update UI to show "Recording..." vs "Listening..." based on method
- Add iOS-specific indicator text
- Handle microphone permissions and errors properly

Backend changes:
- Update /api/v1/voice/transcribe to accept both audio files and text
- Support text-based classification (from Web Speech API)
- Support audio file transcription + classification (from MediaRecorder)
- Return unified response format with transcript and classification

How it works:
- Chrome/Edge: Uses Web Speech API for real-time transcription
- iOS Safari: Records audio with MediaRecorder, sends to server for transcription
- Fallback is transparent to the user with appropriate UI feedback

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-02 05:59:26 +00:00
parent ff69848ec5
commit 330c776124
3 changed files with 190 additions and 26 deletions
--- a/maternal-app/maternal-app-backend/src/modules/voice/voice.controller.ts
+++ b/maternal-app/maternal-app-backend/src/modules/voice/voice.controller.ts
@@ -18,20 +18,46 @@ export class VoiceController {
  @UseInterceptors(FileInterceptor('audio'))
  async transcribeAudio(
    @UploadedFile() file: Express.Multer.File,
+    @Body('text') text?: string,
    @Body('language') language?: string,
+    @Body('childName') childName?: string,
  ) {
-    if (!file) {
-      throw new BadRequestException('Audio file is required');
+    // If text is provided (from Web Speech API), classify it directly
+    if (text) {
+      const result = await this.voiceService.extractActivityFromText(
+        text,
+        language || 'en',
+        childName,
+      );
+
+      return {
+        success: true,
+        transcript: text,
+        classification: result,
+      };
    }

-    const result = await this.voiceService.transcribeAudio(
+    // Otherwise, transcribe the audio file
+    if (!file) {
+      throw new BadRequestException('Audio file or text is required');
+    }
+
+    const transcription = await this.voiceService.transcribeAudio(
      file.buffer,
      language,
    );

+    // Also classify the transcription
+    const classification = await this.voiceService.extractActivityFromText(
+      transcription.text,
+      language || 'en',
+      childName,
+    );
+
    return {
      success: true,
-      data: result,
+      transcript: transcription.text,
+      classification: classification,
    };
  }

--- a/maternal-web/components/voice/VoiceInputButton.tsx
+++ b/maternal-web/components/voice/VoiceInputButton.tsx
@@ -42,7 +42,7 @@ export function VoiceInputButton({
  const [isProcessing, setIsProcessing] = useState(false);
  const [classificationResult, setClassificationResult] = useState<any>(null);

-  const { isListening, isSupported, transcript, error, startListening, stopListening, reset } =
+  const { isListening, isSupported, transcript, error, usesFallback, startListening, stopListening, reset } =
    useVoiceInput();

  // Auto-classify when we get a final transcript
@@ -215,10 +215,18 @@ export function VoiceInputButton({
            {/* Status text */}
            <Typography variant="body1" color="text.secondary" gutterBottom>
              {isListening
-                ? 'Listening... Speak now'
+                ? usesFallback
+                  ? 'Recording... Speak now'
+                  : 'Listening... Speak now'
                : 'Click the microphone to start'}
            </Typography>

+            {usesFallback && !isListening && !transcript && (
+              <Typography variant="caption" color="text.secondary" sx={{ mt: 1, display: 'block' }}>
+                Using audio recording mode (iOS Safari)
+              </Typography>
+            )}
+
            {/* Transcript */}
            {transcript && (
              <Box sx={{ mt: 3, p: 2, bgcolor: 'grey.100', borderRadius: 1 }}>
--- a/maternal-web/hooks/useVoiceInput.ts
+++ b/maternal-web/hooks/useVoiceInput.ts
@@ -11,13 +11,14 @@ export interface VoiceInputState {
  isSupported: boolean;
  transcript: string;
  error: string | null;
+  usesFallback: boolean;
 }

 /**
- * Hook for voice input using browser Web Speech API
+ * Hook for voice input using browser Web Speech API or MediaRecorder fallback
 *
 * Provides voice recording functionality with real-time transcription.
- * Falls back gracefully if browser doesn't support Speech Recognition.
+ * Falls back to MediaRecorder + server-side transcription for iOS Safari.
 */
 export function useVoiceInput() {
  const [state, setState] = useState<VoiceInputState>({
@@ -25,34 +26,52 @@ export function useVoiceInput() {
    isSupported: false,
    transcript: '',
    error: null,
+    usesFallback: false,
  });

  const recognitionRef = useRef<any>(null);
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const audioChunksRef = useRef<Blob[]>([]);
  const timeoutRef = useRef<NodeJS.Timeout | null>(null);

-  // Check if browser supports Speech Recognition
+  // Check if browser supports Speech Recognition or MediaRecorder
  useEffect(() => {
    const SpeechRecognition =
      (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;

    if (SpeechRecognition) {
-      setState(prev => ({ ...prev, isSupported: true }));
+      try {
+        // Initialize recognition
+        const recognition = new SpeechRecognition();
+        recognition.continuous = false; // Single recognition
+        recognition.interimResults = true; // Get interim results
+        recognition.maxAlternatives = 1;
+        recognition.lang = 'en-US'; // Default language

-      // Initialize recognition
-      const recognition = new SpeechRecognition();
-      recognition.continuous = false; // Single recognition
-      recognition.interimResults = true; // Get interim results
-      recognition.maxAlternatives = 1;
-      recognition.lang = 'en-US'; // Default language
-
-      recognitionRef.current = recognition;
+        recognitionRef.current = recognition;
+        setState(prev => ({ ...prev, isSupported: true, usesFallback: false }));
+      } catch (error) {
+        console.warn('[Voice] Speech Recognition initialization failed, using fallback');
+        setState(prev => ({ ...prev, isSupported: true, usesFallback: true }));
+      }
+    } else if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
+      // Use MediaRecorder fallback for iOS Safari
+      console.log('[Voice] Using MediaRecorder fallback for iOS Safari');
+      setState(prev => ({ ...prev, isSupported: true, usesFallback: true }));
    } else {
      setState(prev => ({ ...prev, isSupported: false }));
    }

    return () => {
      if (recognitionRef.current) {
-        recognitionRef.current.stop();
+        try {
+          recognitionRef.current.stop();
+        } catch (e) {
+          // Ignore errors on cleanup
+        }
+      }
+      if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
+        mediaRecorderRef.current.stop();
      }
      if (timeoutRef.current) {
        clearTimeout(timeoutRef.current);
@@ -60,16 +79,106 @@ export function useVoiceInput() {
    };
  }, []);

-  // Start listening
-  const startListening = useCallback(() => {
-    if (!recognitionRef.current) {
+  // Start listening with MediaRecorder fallback
+  const startListeningWithFallback = useCallback(async () => {
+    audioChunksRef.current = [];
+
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      const mediaRecorder = new MediaRecorder(stream, {
+        mimeType: 'audio/webm;codecs=opus',
+      });
+
+      mediaRecorderRef.current = mediaRecorder;
+
+      mediaRecorder.ondataavailable = (event) => {
+        if (event.data.size > 0) {
+          audioChunksRef.current.push(event.data);
+        }
+      };
+
+      mediaRecorder.onstop = async () => {
+        const audioBlob = new Blob(audioChunksRef.current, { type: 'audio/webm' });
+
+        // Send to backend for transcription
+        try {
+          const formData = new FormData();
+          formData.append('audio', audioBlob, 'recording.webm');
+
+          const response = await fetch('/api/voice/transcribe', {
+            method: 'POST',
+            body: formData,
+          });
+
+          const data = await response.json();
+
+          if (response.ok && data.success) {
+            setState(prev => ({
+              ...prev,
+              isListening: false,
+              transcript: data.transcript,
+            }));
+          } else {
+            setState(prev => ({
+              ...prev,
+              isListening: false,
+              error: data.message || 'Failed to transcribe audio',
+            }));
+          }
+        } catch (error) {
+          console.error('[Voice] Transcription error:', error);
+          setState(prev => ({
+            ...prev,
+            isListening: false,
+            error: 'Failed to process audio',
+          }));
+        }
+
+        // Stop all tracks
+        stream.getTracks().forEach(track => track.stop());
+      };
+
+      mediaRecorder.onerror = (event) => {
+        console.error('[Voice] MediaRecorder error:', event);
+        setState(prev => ({
+          ...prev,
+          isListening: false,
+          error: 'Recording failed',
+        }));
+      };
+
      setState(prev => ({
        ...prev,
-        error: 'Speech recognition not supported in this browser',
+        isListening: true,
+        transcript: '',
+        error: null,
      }));
-      return;
-    }

+      mediaRecorder.start();
+
+      // Auto-stop after 10 seconds
+      timeoutRef.current = setTimeout(() => {
+        if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
+          mediaRecorderRef.current.stop();
+        }
+      }, 10000);
+    } catch (error: any) {
+      console.error('[Voice] Failed to access microphone:', error);
+      let errorMessage = 'Failed to access microphone';
+      if (error.name === 'NotAllowedError' || error.name === 'PermissionDeniedError') {
+        errorMessage = 'Microphone access denied. Please grant permission.';
+      } else if (error.name === 'NotFoundError') {
+        errorMessage = 'No microphone found. Please check your settings.';
+      }
+      setState(prev => ({
+        ...prev,
+        error: errorMessage,
+      }));
+    }
+  }, []);
+
+  // Start listening with Web Speech API
+  const startListeningWithSpeechAPI = useCallback(() => {
    const recognition = recognitionRef.current;

    // Clear previous state
@@ -153,10 +262,31 @@ export function useVoiceInput() {
    }
  }, []);

+  // Start listening (chooses appropriate method)
+  const startListening = useCallback(() => {
+    if (state.usesFallback) {
+      startListeningWithFallback();
+    } else if (recognitionRef.current) {
+      startListeningWithSpeechAPI();
+    } else {
+      setState(prev => ({
+        ...prev,
+        error: 'Voice input not supported in this browser',
+      }));
+    }
+  }, [state.usesFallback, startListeningWithFallback, startListeningWithSpeechAPI]);
+
  // Stop listening
  const stopListening = useCallback(() => {
    if (recognitionRef.current) {
-      recognitionRef.current.stop();
+      try {
+        recognitionRef.current.stop();
+      } catch (e) {
+        // Ignore errors
+      }
+    }
+    if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
+      mediaRecorderRef.current.stop();
    }
    if (timeoutRef.current) {
      clearTimeout(timeoutRef.current);