Add iOS Safari support for voice commands with MediaRecorder fallback
Frontend changes:
- Add a MediaRecorder fallback for iOS Safari (no Web Speech API support)
- Automatically detect browser capabilities and use the appropriate method
- Add a usesFallback flag to track which method is in use
- Update the UI to show "Recording..." vs "Listening..." based on the method
- Add iOS-specific indicator text
- Handle microphone permissions and errors properly

Backend changes:
- Update /api/v1/voice/transcribe to accept both audio files and text
- Support text-based classification (from the Web Speech API)
- Support audio file transcription + classification (from MediaRecorder)
- Return a unified response format with transcript and classification

How it works:
- Chrome/Edge: uses the Web Speech API for real-time transcription
- iOS Safari: records audio with MediaRecorder and sends it to the server for transcription
- The fallback is transparent to the user, with appropriate UI feedback

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
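The detection order described under "How it works" is the heart of the change. The hook in the diff below implements it, but as a rough sketch it reduces to the following (illustrative TypeScript, not the exact code from this commit):

```typescript
// Illustrative sketch of the capability check: prefer the Web Speech API,
// fall back to MediaRecorder + server-side transcription, otherwise report unsupported.
type VoiceMode = 'speech-api' | 'media-recorder' | 'unsupported';

function detectVoiceMode(): VoiceMode {
  const SpeechRecognition =
    (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;
  if (SpeechRecognition) return 'speech-api';                         // Chrome/Edge: real-time transcription
  if (navigator.mediaDevices?.getUserMedia) return 'media-recorder';  // iOS Safari: record, then upload
  return 'unsupported';
}
```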
@@ -18,20 +18,46 @@ export class VoiceController {
   @UseInterceptors(FileInterceptor('audio'))
   async transcribeAudio(
     @UploadedFile() file: Express.Multer.File,
+    @Body('text') text?: string,
     @Body('language') language?: string,
+    @Body('childName') childName?: string,
   ) {
-    if (!file) {
-      throw new BadRequestException('Audio file is required');
+    // If text is provided (from Web Speech API), classify it directly
+    if (text) {
+      const result = await this.voiceService.extractActivityFromText(
+        text,
+        language || 'en',
+        childName,
+      );
+
+      return {
+        success: true,
+        transcript: text,
+        classification: result,
+      };
     }

-    const result = await this.voiceService.transcribeAudio(
+    // Otherwise, transcribe the audio file
+    if (!file) {
+      throw new BadRequestException('Audio file or text is required');
+    }
+
+    const transcription = await this.voiceService.transcribeAudio(
       file.buffer,
       language,
     );

+    // Also classify the transcription
+    const classification = await this.voiceService.extractActivityFromText(
+      transcription.text,
+      language || 'en',
+      childName,
+    );
+
     return {
       success: true,
-      data: result,
+      transcript: transcription.text,
+      classification: classification,
     };
   }

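For reference, a client can exercise the unified endpoint in both modes roughly as follows. This is a sketch only: the `/api/v1` prefix comes from the commit message (the hook further down posts to `/api/voice/transcribe`, so the exact path depends on how requests are proxied), while the field names and response shape match the controller above.

```typescript
// Text mode: the Web Speech API already produced a transcript on the client,
// so the server only classifies it.
async function classifyText(text: string, language = 'en', childName?: string) {
  const form = new FormData();
  form.append('text', text);
  form.append('language', language);
  if (childName) form.append('childName', childName);
  const res = await fetch('/api/v1/voice/transcribe', { method: 'POST', body: form });
  return res.json(); // { success, transcript, classification }
}

// Audio mode: the MediaRecorder fallback uploads the recording for
// server-side transcription plus classification.
async function transcribeRecording(audio: Blob, language = 'en', childName?: string) {
  const form = new FormData();
  form.append('audio', audio, 'recording.webm');
  form.append('language', language);
  if (childName) form.append('childName', childName);
  const res = await fetch('/api/v1/voice/transcribe', { method: 'POST', body: form });
  return res.json(); // { success, transcript, classification }
}
```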
@@ -42,7 +42,7 @@ export function VoiceInputButton({
   const [isProcessing, setIsProcessing] = useState(false);
   const [classificationResult, setClassificationResult] = useState<any>(null);

-  const { isListening, isSupported, transcript, error, startListening, stopListening, reset } =
+  const { isListening, isSupported, transcript, error, usesFallback, startListening, stopListening, reset } =
     useVoiceInput();

   // Auto-classify when we get a final transcript
@@ -215,10 +215,18 @@ export function VoiceInputButton({
           {/* Status text */}
           <Typography variant="body1" color="text.secondary" gutterBottom>
             {isListening
-              ? 'Listening... Speak now'
+              ? usesFallback
+                ? 'Recording... Speak now'
+                : 'Listening... Speak now'
               : 'Click the microphone to start'}
           </Typography>

+          {usesFallback && !isListening && !transcript && (
+            <Typography variant="caption" color="text.secondary" sx={{ mt: 1, display: 'block' }}>
+              Using audio recording mode (iOS Safari)
+            </Typography>
+          )}
+
           {/* Transcript */}
           {transcript && (
             <Box sx={{ mt: 3, p: 2, bgcolor: 'grey.100', borderRadius: 1 }}>
@@ -11,13 +11,14 @@ export interface VoiceInputState {
   isSupported: boolean;
   transcript: string;
   error: string | null;
+  usesFallback: boolean;
 }

 /**
- * Hook for voice input using browser Web Speech API
+ * Hook for voice input using browser Web Speech API or MediaRecorder fallback
  *
  * Provides voice recording functionality with real-time transcription.
- * Falls back gracefully if browser doesn't support Speech Recognition.
+ * Falls back to MediaRecorder + server-side transcription for iOS Safari.
  */
 export function useVoiceInput() {
   const [state, setState] = useState<VoiceInputState>({
@@ -25,34 +26,52 @@ export function useVoiceInput() {
     isSupported: false,
     transcript: '',
     error: null,
+    usesFallback: false,
   });

   const recognitionRef = useRef<any>(null);
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const audioChunksRef = useRef<Blob[]>([]);
   const timeoutRef = useRef<NodeJS.Timeout | null>(null);

-  // Check if browser supports Speech Recognition
+  // Check if browser supports Speech Recognition or MediaRecorder
   useEffect(() => {
     const SpeechRecognition =
       (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;

     if (SpeechRecognition) {
-      setState(prev => ({ ...prev, isSupported: true }));
+      try {
+        // Initialize recognition
+        const recognition = new SpeechRecognition();
+        recognition.continuous = false; // Single recognition
+        recognition.interimResults = true; // Get interim results
+        recognition.maxAlternatives = 1;
+        recognition.lang = 'en-US'; // Default language

-      // Initialize recognition
-      const recognition = new SpeechRecognition();
-      recognition.continuous = false; // Single recognition
-      recognition.interimResults = true; // Get interim results
-      recognition.maxAlternatives = 1;
-      recognition.lang = 'en-US'; // Default language
-
-      recognitionRef.current = recognition;
+        recognitionRef.current = recognition;
+        setState(prev => ({ ...prev, isSupported: true, usesFallback: false }));
+      } catch (error) {
+        console.warn('[Voice] Speech Recognition initialization failed, using fallback');
+        setState(prev => ({ ...prev, isSupported: true, usesFallback: true }));
+      }
+    } else if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
+      // Use MediaRecorder fallback for iOS Safari
+      console.log('[Voice] Using MediaRecorder fallback for iOS Safari');
+      setState(prev => ({ ...prev, isSupported: true, usesFallback: true }));
     } else {
       setState(prev => ({ ...prev, isSupported: false }));
     }

     return () => {
       if (recognitionRef.current) {
-        recognitionRef.current.stop();
+        try {
+          recognitionRef.current.stop();
+        } catch (e) {
+          // Ignore errors on cleanup
+        }
+      }
+      if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
+        mediaRecorderRef.current.stop();
       }
       if (timeoutRef.current) {
         clearTimeout(timeoutRef.current);
@@ -60,16 +79,106 @@ export function useVoiceInput() {
     };
   }, []);

-  // Start listening
-  const startListening = useCallback(() => {
-    if (!recognitionRef.current) {
+  // Start listening with MediaRecorder fallback
+  const startListeningWithFallback = useCallback(async () => {
+    audioChunksRef.current = [];
+
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      const mediaRecorder = new MediaRecorder(stream, {
+        mimeType: 'audio/webm;codecs=opus',
+      });
+
+      mediaRecorderRef.current = mediaRecorder;
+
+      mediaRecorder.ondataavailable = (event) => {
+        if (event.data.size > 0) {
+          audioChunksRef.current.push(event.data);
+        }
+      };
+
+      mediaRecorder.onstop = async () => {
+        const audioBlob = new Blob(audioChunksRef.current, { type: 'audio/webm' });
+
+        // Send to backend for transcription
+        try {
+          const formData = new FormData();
+          formData.append('audio', audioBlob, 'recording.webm');
+
+          const response = await fetch('/api/voice/transcribe', {
+            method: 'POST',
+            body: formData,
+          });
+
+          const data = await response.json();
+
+          if (response.ok && data.success) {
+            setState(prev => ({
+              ...prev,
+              isListening: false,
+              transcript: data.transcript,
+            }));
+          } else {
+            setState(prev => ({
+              ...prev,
+              isListening: false,
+              error: data.message || 'Failed to transcribe audio',
+            }));
+          }
+        } catch (error) {
+          console.error('[Voice] Transcription error:', error);
+          setState(prev => ({
+            ...prev,
+            isListening: false,
+            error: 'Failed to process audio',
+          }));
+        }
+
+        // Stop all tracks
+        stream.getTracks().forEach(track => track.stop());
+      };
+
+      mediaRecorder.onerror = (event) => {
+        console.error('[Voice] MediaRecorder error:', event);
+        setState(prev => ({
+          ...prev,
+          isListening: false,
+          error: 'Recording failed',
+        }));
+      };
+
       setState(prev => ({
         ...prev,
-        error: 'Speech recognition not supported in this browser',
+        isListening: true,
+        transcript: '',
+        error: null,
       }));
-      return;
-    }

+      mediaRecorder.start();
+
+      // Auto-stop after 10 seconds
+      timeoutRef.current = setTimeout(() => {
+        if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
+          mediaRecorderRef.current.stop();
+        }
+      }, 10000);
+    } catch (error: any) {
+      console.error('[Voice] Failed to access microphone:', error);
+      let errorMessage = 'Failed to access microphone';
+      if (error.name === 'NotAllowedError' || error.name === 'PermissionDeniedError') {
+        errorMessage = 'Microphone access denied. Please grant permission.';
+      } else if (error.name === 'NotFoundError') {
+        errorMessage = 'No microphone found. Please check your settings.';
+      }
+      setState(prev => ({
+        ...prev,
+        error: errorMessage,
+      }));
+    }
+  }, []);
+
+  // Start listening with Web Speech API
+  const startListeningWithSpeechAPI = useCallback(() => {
     const recognition = recognitionRef.current;

     // Clear previous state
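One note on the recording setup above: browsers differ in which audio containers MediaRecorder can produce, and `MediaRecorder.isTypeSupported()` can probe this at runtime. A defensive variant (illustrative only, not part of this commit) might look like:

```typescript
// Pick the first container the current browser's MediaRecorder reports as supported;
// iOS Safari may prefer 'audio/mp4' over WebM/Opus.
function createRecorder(stream: MediaStream): MediaRecorder {
  const candidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4'];
  const mimeType = candidates.find(type => MediaRecorder.isTypeSupported(type));
  return mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
}
```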
@@ -153,10 +262,31 @@ export function useVoiceInput() {
     }
   }, []);

+  // Start listening (chooses appropriate method)
+  const startListening = useCallback(() => {
+    if (state.usesFallback) {
+      startListeningWithFallback();
+    } else if (recognitionRef.current) {
+      startListeningWithSpeechAPI();
+    } else {
+      setState(prev => ({
+        ...prev,
+        error: 'Voice input not supported in this browser',
+      }));
+    }
+  }, [state.usesFallback, startListeningWithFallback, startListeningWithSpeechAPI]);
+
   // Stop listening
   const stopListening = useCallback(() => {
     if (recognitionRef.current) {
-      recognitionRef.current.stop();
+      try {
+        recognitionRef.current.stop();
+      } catch (e) {
+        // Ignore errors
+      }
+    }
+    if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
+      mediaRecorderRef.current.stop();
     }
     if (timeoutRef.current) {
       clearTimeout(timeoutRef.current);
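Taken together, a consumer of the hook wires these pieces up roughly like this. A minimal sketch only: the import path and component name are illustrative, and the real consumer is the VoiceInputButton component shown above.

```tsx
import { useVoiceInput } from './useVoiceInput'; // path assumed for illustration

export function VoiceDemo() {
  const { isListening, isSupported, transcript, error, usesFallback, startListening, stopListening } =
    useVoiceInput();

  if (!isSupported) {
    return <p>Voice input is not supported in this browser</p>;
  }

  return (
    <div>
      <button onClick={isListening ? stopListening : startListening}>
        {isListening ? (usesFallback ? 'Recording...' : 'Listening...') : 'Start'}
      </button>
      {transcript && <p>{transcript}</p>}
      {error && <p role="alert">{error}</p>}
    </div>
  );
}
```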