Add iOS Safari support for voice commands with MediaRecorder fallback

Frontend changes:
- Add MediaRecorder fallback for iOS Safari (no Web Speech API support)
- Automatically detect browser capabilities and use appropriate method
- Add usesFallback flag to track which method is being used
- Update UI to show "Recording..." vs "Listening..." based on method (see the usage sketch after this list)
- Add iOS-specific indicator text
- Handle microphone permissions and errors properly
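A minimal usage sketch of how a component might consume the hook, assuming it returns its state fields alongside startListening/stopListening (the component name, import path, and indicator copy below are illustrative, not part of this commit):

import React from 'react';
import { useVoiceInput } from './useVoiceInput'; // import path is illustrative

export function VoiceCommandButton() {
  const { isListening, isSupported, usesFallback, transcript, error, startListening, stopListening } =
    useVoiceInput();

  if (!isSupported) {
    return <p>Voice input is not available in this browser.</p>;
  }

  // "Recording..." for the MediaRecorder fallback (iOS Safari),
  // "Listening..." for the Web Speech API path (Chrome/Edge).
  const activeLabel = usesFallback ? 'Recording...' : 'Listening...';

  return (
    <div>
      <button onClick={isListening ? stopListening : startListening}>
        {isListening ? activeLabel : 'Start voice command'}
      </button>
      {usesFallback && <small>Audio is sent to the server for transcription.</small>}
      {transcript && <p>Heard: {transcript}</p>}
      {error && <p role="alert">{error}</p>}
    </div>
  );
}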

Backend changes:
- Update /api/v1/voice/transcribe to accept both audio files and text
- Support text-based classification (from Web Speech API)
- Support audio file transcription + classification (from MediaRecorder)
- Return unified response format with transcript and classification (request/response sketch after this list)
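A sketch of the request/response shapes implied above, assuming the server reads a multipart form containing either a text field or an audio file. The field name for the text input and the shape of the classification payload are assumptions; only success, transcript, and message are confirmed by the frontend code below. The path is taken from this commit message (the frontend diff posts to /api/voice/transcribe, so the prefix may depend on proxy configuration).

// Unified response format described above; fields beyond `success`,
// `transcript`, and `message` are assumptions.
interface TranscribeResponse {
  success: boolean;
  transcript: string;
  classification?: unknown; // command classification produced by the server
  message?: string;         // error detail when success is false
}

// Text mode: transcript already produced client-side by the Web Speech API.
async function classifyTranscript(text: string): Promise<TranscribeResponse> {
  const form = new FormData();
  form.append('text', text); // the 'text' field name is an assumption
  const res = await fetch('/api/v1/voice/transcribe', { method: 'POST', body: form });
  return res.json();
}

// Audio mode: blob recorded by the MediaRecorder fallback.
async function transcribeRecording(audio: Blob): Promise<TranscribeResponse> {
  const form = new FormData();
  form.append('audio', audio, 'recording.webm');
  const res = await fetch('/api/v1/voice/transcribe', { method: 'POST', body: form });
  return res.json();
}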

How it works:
- Chrome/Edge: Uses the Web Speech API for real-time transcription
- iOS Safari: Records audio with MediaRecorder and sends it to the server for transcription (a mimeType-selection sketch follows this list)
- Fallback is transparent to the user with appropriate UI feedback
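One iOS-specific caveat: Safari's MediaRecorder typically produces audio/mp4 rather than audio/webm, so a defensive recorder setup can probe supported types with MediaRecorder.isTypeSupported before constructing the recorder. A minimal sketch of that approach (not part of this commit's diff, which hard-codes 'audio/webm;codecs=opus'):

// Pick a mimeType the current browser can actually record; returns undefined
// when none of the candidates is reported as supported.
function pickRecordingMimeType(): string | undefined {
  const candidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4'];
  return candidates.find(
    type => typeof MediaRecorder !== 'undefined' && MediaRecorder.isTypeSupported(type)
  );
}

async function createRecorder(): Promise<MediaRecorder> {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const mimeType = pickRecordingMimeType();
  // Fall back to the browser's default container when no candidate matched.
  return mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
}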

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-02 05:59:26 +00:00
parent ff69848ec5
commit 330c776124
3 changed files with 190 additions and 26 deletions


@@ -11,13 +11,14 @@ export interface VoiceInputState {
isSupported: boolean;
transcript: string;
error: string | null;
usesFallback: boolean;
}
/**
* Hook for voice input using browser Web Speech API
* Hook for voice input using browser Web Speech API or MediaRecorder fallback
*
* Provides voice recording functionality with real-time transcription.
* Falls back gracefully if browser doesn't support Speech Recognition.
* Falls back to MediaRecorder + server-side transcription for iOS Safari.
*/
export function useVoiceInput() {
const [state, setState] = useState<VoiceInputState>({
@@ -25,34 +26,52 @@ export function useVoiceInput() {
isSupported: false,
transcript: '',
error: null,
usesFallback: false,
});
const recognitionRef = useRef<any>(null);
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
const audioChunksRef = useRef<Blob[]>([]);
const timeoutRef = useRef<NodeJS.Timeout | null>(null);
// Check if browser supports Speech Recognition
// Check if browser supports Speech Recognition or MediaRecorder
useEffect(() => {
const SpeechRecognition =
(window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;
if (SpeechRecognition) {
setState(prev => ({ ...prev, isSupported: true }));
try {
// Initialize recognition
const recognition = new SpeechRecognition();
recognition.continuous = false; // Single recognition
recognition.interimResults = true; // Get interim results
recognition.maxAlternatives = 1;
recognition.lang = 'en-US'; // Default language
recognitionRef.current = recognition;
setState(prev => ({ ...prev, isSupported: true, usesFallback: false }));
} catch (error) {
console.warn('[Voice] Speech Recognition initialization failed, using fallback');
setState(prev => ({ ...prev, isSupported: true, usesFallback: true }));
}
} else if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
// Use MediaRecorder fallback for iOS Safari
console.log('[Voice] Using MediaRecorder fallback for iOS Safari');
setState(prev => ({ ...prev, isSupported: true, usesFallback: true }));
} else {
setState(prev => ({ ...prev, isSupported: false }));
}
return () => {
if (recognitionRef.current) {
recognitionRef.current.stop();
try {
recognitionRef.current.stop();
} catch (e) {
// Ignore errors on cleanup
}
}
if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
mediaRecorderRef.current.stop();
}
if (timeoutRef.current) {
clearTimeout(timeoutRef.current);
@@ -60,16 +79,106 @@ export function useVoiceInput() {
};
}, []);
// Start listening
const startListening = useCallback(() => {
if (!recognitionRef.current) {
// Start listening with MediaRecorder fallback
const startListeningWithFallback = useCallback(async () => {
audioChunksRef.current = [];
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const mediaRecorder = new MediaRecorder(stream, {
mimeType: 'audio/webm;codecs=opus',
});
mediaRecorderRef.current = mediaRecorder;
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 0) {
audioChunksRef.current.push(event.data);
}
};
mediaRecorder.onstop = async () => {
const audioBlob = new Blob(audioChunksRef.current, { type: 'audio/webm' });
// Send to backend for transcription
try {
const formData = new FormData();
formData.append('audio', audioBlob, 'recording.webm');
const response = await fetch('/api/voice/transcribe', {
method: 'POST',
body: formData,
});
const data = await response.json();
if (response.ok && data.success) {
setState(prev => ({
...prev,
isListening: false,
transcript: data.transcript,
}));
} else {
setState(prev => ({
...prev,
isListening: false,
error: data.message || 'Failed to transcribe audio',
}));
}
} catch (error) {
console.error('[Voice] Transcription error:', error);
setState(prev => ({
...prev,
isListening: false,
error: 'Failed to process audio',
}));
}
// Stop all tracks
stream.getTracks().forEach(track => track.stop());
};
mediaRecorder.onerror = (event) => {
console.error('[Voice] MediaRecorder error:', event);
setState(prev => ({
...prev,
isListening: false,
error: 'Recording failed',
}));
};
setState(prev => ({
...prev,
error: 'Speech recognition not supported in this browser',
}));
return;
}
setState(prev => ({
...prev,
isListening: true,
transcript: '',
error: null,
}));
mediaRecorder.start();
// Auto-stop after 10 seconds
timeoutRef.current = setTimeout(() => {
if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
mediaRecorderRef.current.stop();
}
}, 10000);
} catch (error: any) {
console.error('[Voice] Failed to access microphone:', error);
let errorMessage = 'Failed to access microphone';
if (error.name === 'NotAllowedError' || error.name === 'PermissionDeniedError') {
errorMessage = 'Microphone access denied. Please grant permission.';
} else if (error.name === 'NotFoundError') {
errorMessage = 'No microphone found. Please check your settings.';
}
setState(prev => ({
...prev,
error: errorMessage,
}));
}
}, []);
// Start listening with Web Speech API
const startListeningWithSpeechAPI = useCallback(() => {
const recognition = recognitionRef.current;
// Clear previous state
@@ -153,10 +262,31 @@ export function useVoiceInput() {
}
}, []);
// Start listening (chooses appropriate method)
const startListening = useCallback(() => {
if (state.usesFallback) {
startListeningWithFallback();
} else if (recognitionRef.current) {
startListeningWithSpeechAPI();
} else {
setState(prev => ({
...prev,
error: 'Voice input not supported in this browser',
}));
}
}, [state.usesFallback, startListeningWithFallback, startListeningWithSpeechAPI]);
// Stop listening
const stopListening = useCallback(() => {
if (recognitionRef.current) {
recognitionRef.current.stop();
try {
recognitionRef.current.stop();
} catch (e) {
// Ignore errors
}
}
if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
mediaRecorderRef.current.stop();
}
if (timeoutRef.current) {
clearTimeout(timeoutRef.current);