Add iOS Safari support for voice commands with MediaRecorder fallback
Frontend changes:
- Add a MediaRecorder fallback for iOS Safari (no Web Speech API support)
- Automatically detect browser capabilities and use the appropriate method
- Add a usesFallback flag to track which method is in use
- Update the UI to show "Recording..." vs "Listening..." based on the method
- Add iOS-specific indicator text
- Handle microphone permissions and errors properly

Backend changes:
- Update /api/v1/voice/transcribe to accept both audio files and text
- Support text-based classification (from the Web Speech API)
- Support audio file transcription + classification (from MediaRecorder)
- Return a unified response format with transcript and classification

How it works:
- Chrome/Edge: uses the Web Speech API for real-time transcription
- iOS Safari: records audio with MediaRecorder and sends it to the server for transcription
- The fallback is transparent to the user, with appropriate UI feedback

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
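The detection order described under "How it works" is the heart of the change. The hook in the diff below implements it, but as a rough sketch it reduces to the following (illustrative TypeScript, not the exact code from this commit):

```typescript
// Illustrative sketch of the capability check: prefer the Web Speech API,
// fall back to MediaRecorder + server-side transcription, otherwise report unsupported.
type VoiceMode = 'speech-api' | 'media-recorder' | 'unsupported';

function detectVoiceMode(): VoiceMode {
  const SpeechRecognition =
    (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;
  if (SpeechRecognition) return 'speech-api';                         // Chrome/Edge: real-time transcription
  if (navigator.mediaDevices?.getUserMedia) return 'media-recorder';  // iOS Safari: record, then upload
  return 'unsupported';
}
```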
@@ -18,20 +18,46 @@ export class VoiceController {
   @UseInterceptors(FileInterceptor('audio'))
   async transcribeAudio(
     @UploadedFile() file: Express.Multer.File,
+    @Body('text') text?: string,
     @Body('language') language?: string,
+    @Body('childName') childName?: string,
   ) {
-    if (!file) {
-      throw new BadRequestException('Audio file is required');
+    // If text is provided (from Web Speech API), classify it directly
+    if (text) {
+      const result = await this.voiceService.extractActivityFromText(
+        text,
+        language || 'en',
+        childName,
+      );
+
+      return {
+        success: true,
+        transcript: text,
+        classification: result,
+      };
     }

-    const result = await this.voiceService.transcribeAudio(
+    // Otherwise, transcribe the audio file
+    if (!file) {
+      throw new BadRequestException('Audio file or text is required');
+    }
+
+    const transcription = await this.voiceService.transcribeAudio(
       file.buffer,
       language,
     );

+    // Also classify the transcription
+    const classification = await this.voiceService.extractActivityFromText(
+      transcription.text,
+      language || 'en',
+      childName,
+    );
+
     return {
       success: true,
-      data: result,
+      transcript: transcription.text,
+      classification: classification,
     };
   }

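For reference, a client can exercise the unified endpoint in both modes roughly as follows. This is a sketch only: the `/api/v1` prefix comes from the commit message (the hook further down posts to `/api/voice/transcribe`, so the exact path depends on how requests are proxied), while the field names and response shape match the controller above.

```typescript
// Text mode: the Web Speech API already produced a transcript on the client,
// so the server only classifies it.
async function classifyText(text: string, language = 'en', childName?: string) {
  const form = new FormData();
  form.append('text', text);
  form.append('language', language);
  if (childName) form.append('childName', childName);
  const res = await fetch('/api/v1/voice/transcribe', { method: 'POST', body: form });
  return res.json(); // { success, transcript, classification }
}

// Audio mode: the MediaRecorder fallback uploads the recording for
// server-side transcription plus classification.
async function transcribeRecording(audio: Blob, language = 'en', childName?: string) {
  const form = new FormData();
  form.append('audio', audio, 'recording.webm');
  form.append('language', language);
  if (childName) form.append('childName', childName);
  const res = await fetch('/api/v1/voice/transcribe', { method: 'POST', body: form });
  return res.json(); // { success, transcript, classification }
}
```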
@@ -42,7 +42,7 @@ export function VoiceInputButton({
   const [isProcessing, setIsProcessing] = useState(false);
   const [classificationResult, setClassificationResult] = useState<any>(null);

-  const { isListening, isSupported, transcript, error, startListening, stopListening, reset } =
+  const { isListening, isSupported, transcript, error, usesFallback, startListening, stopListening, reset } =
     useVoiceInput();

   // Auto-classify when we get a final transcript
@@ -215,10 +215,18 @@ export function VoiceInputButton({
           {/* Status text */}
           <Typography variant="body1" color="text.secondary" gutterBottom>
             {isListening
-              ? 'Listening... Speak now'
+              ? usesFallback
+                ? 'Recording... Speak now'
+                : 'Listening... Speak now'
               : 'Click the microphone to start'}
           </Typography>

+          {usesFallback && !isListening && !transcript && (
+            <Typography variant="caption" color="text.secondary" sx={{ mt: 1, display: 'block' }}>
+              Using audio recording mode (iOS Safari)
+            </Typography>
+          )}
+
           {/* Transcript */}
           {transcript && (
             <Box sx={{ mt: 3, p: 2, bgcolor: 'grey.100', borderRadius: 1 }}>
@@ -11,13 +11,14 @@ export interface VoiceInputState {
   isSupported: boolean;
   transcript: string;
   error: string | null;
+  usesFallback: boolean;
 }

 /**
- * Hook for voice input using browser Web Speech API
+ * Hook for voice input using browser Web Speech API or MediaRecorder fallback
  *
  * Provides voice recording functionality with real-time transcription.
- * Falls back gracefully if browser doesn't support Speech Recognition.
+ * Falls back to MediaRecorder + server-side transcription for iOS Safari.
  */
 export function useVoiceInput() {
   const [state, setState] = useState<VoiceInputState>({
@@ -25,34 +26,52 @@ export function useVoiceInput() {
     isSupported: false,
     transcript: '',
     error: null,
+    usesFallback: false,
   });

   const recognitionRef = useRef<any>(null);
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const audioChunksRef = useRef<Blob[]>([]);
   const timeoutRef = useRef<NodeJS.Timeout | null>(null);

-  // Check if browser supports Speech Recognition
+  // Check if browser supports Speech Recognition or MediaRecorder
   useEffect(() => {
     const SpeechRecognition =
       (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;

     if (SpeechRecognition) {
-      setState(prev => ({ ...prev, isSupported: true }));
+      try {
+        // Initialize recognition
+        const recognition = new SpeechRecognition();
+        recognition.continuous = false; // Single recognition
+        recognition.interimResults = true; // Get interim results
+        recognition.maxAlternatives = 1;
+        recognition.lang = 'en-US'; // Default language

-      // Initialize recognition
-      const recognition = new SpeechRecognition();
-      recognition.continuous = false; // Single recognition
-      recognition.interimResults = true; // Get interim results
-      recognition.maxAlternatives = 1;
-      recognition.lang = 'en-US'; // Default language
-
-      recognitionRef.current = recognition;
+        recognitionRef.current = recognition;
+        setState(prev => ({ ...prev, isSupported: true, usesFallback: false }));
+      } catch (error) {
+        console.warn('[Voice] Speech Recognition initialization failed, using fallback');
+        setState(prev => ({ ...prev, isSupported: true, usesFallback: true }));
+      }
+    } else if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
+      // Use MediaRecorder fallback for iOS Safari
+      console.log('[Voice] Using MediaRecorder fallback for iOS Safari');
+      setState(prev => ({ ...prev, isSupported: true, usesFallback: true }));
     } else {
       setState(prev => ({ ...prev, isSupported: false }));
     }

     return () => {
       if (recognitionRef.current) {
-        recognitionRef.current.stop();
+        try {
+          recognitionRef.current.stop();
+        } catch (e) {
+          // Ignore errors on cleanup
+        }
+      }
+      if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
+        mediaRecorderRef.current.stop();
       }
       if (timeoutRef.current) {
         clearTimeout(timeoutRef.current);
@@ -60,16 +79,106 @@ export function useVoiceInput() {
     };
   }, []);

-  // Start listening
-  const startListening = useCallback(() => {
-    if (!recognitionRef.current) {
+  // Start listening with MediaRecorder fallback
+  const startListeningWithFallback = useCallback(async () => {
+    audioChunksRef.current = [];
+
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      const mediaRecorder = new MediaRecorder(stream, {
+        mimeType: 'audio/webm;codecs=opus',
+      });
+
+      mediaRecorderRef.current = mediaRecorder;
+
+      mediaRecorder.ondataavailable = (event) => {
+        if (event.data.size > 0) {
+          audioChunksRef.current.push(event.data);
+        }
+      };
+
+      mediaRecorder.onstop = async () => {
+        const audioBlob = new Blob(audioChunksRef.current, { type: 'audio/webm' });
+
+        // Send to backend for transcription
+        try {
+          const formData = new FormData();
+          formData.append('audio', audioBlob, 'recording.webm');
+
+          const response = await fetch('/api/voice/transcribe', {
+            method: 'POST',
+            body: formData,
+          });
+
+          const data = await response.json();
+
+          if (response.ok && data.success) {
+            setState(prev => ({
+              ...prev,
+              isListening: false,
+              transcript: data.transcript,
+            }));
+          } else {
+            setState(prev => ({
+              ...prev,
+              isListening: false,
+              error: data.message || 'Failed to transcribe audio',
+            }));
+          }
+        } catch (error) {
+          console.error('[Voice] Transcription error:', error);
+          setState(prev => ({
+            ...prev,
+            isListening: false,
+            error: 'Failed to process audio',
+          }));
+        }
+
+        // Stop all tracks
+        stream.getTracks().forEach(track => track.stop());
+      };
+
+      mediaRecorder.onerror = (event) => {
+        console.error('[Voice] MediaRecorder error:', event);
+        setState(prev => ({
+          ...prev,
+          isListening: false,
+          error: 'Recording failed',
+        }));
+      };
+
       setState(prev => ({
         ...prev,
-        error: 'Speech recognition not supported in this browser',
+        isListening: true,
+        transcript: '',
+        error: null,
       }));
-      return;
-    }

+      mediaRecorder.start();
+
+      // Auto-stop after 10 seconds
+      timeoutRef.current = setTimeout(() => {
+        if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
+          mediaRecorderRef.current.stop();
+        }
+      }, 10000);
+    } catch (error: any) {
+      console.error('[Voice] Failed to access microphone:', error);
+      let errorMessage = 'Failed to access microphone';
+      if (error.name === 'NotAllowedError' || error.name === 'PermissionDeniedError') {
+        errorMessage = 'Microphone access denied. Please grant permission.';
+      } else if (error.name === 'NotFoundError') {
+        errorMessage = 'No microphone found. Please check your settings.';
+      }
+      setState(prev => ({
+        ...prev,
+        error: errorMessage,
+      }));
+    }
+  }, []);
+
+  // Start listening with Web Speech API
+  const startListeningWithSpeechAPI = useCallback(() => {
     const recognition = recognitionRef.current;

     // Clear previous state
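One note on the recording setup above: browsers differ in which audio containers MediaRecorder can produce, and `MediaRecorder.isTypeSupported()` can probe this at runtime. A defensive variant (illustrative only, not part of this commit) might look like:

```typescript
// Pick the first container the current browser's MediaRecorder reports as supported;
// iOS Safari may prefer 'audio/mp4' over WebM/Opus.
function createRecorder(stream: MediaStream): MediaRecorder {
  const candidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4'];
  const mimeType = candidates.find(type => MediaRecorder.isTypeSupported(type));
  return mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
}
```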
@@ -153,10 +262,31 @@ export function useVoiceInput() {
     }
   }, []);

+  // Start listening (chooses appropriate method)
+  const startListening = useCallback(() => {
+    if (state.usesFallback) {
+      startListeningWithFallback();
+    } else if (recognitionRef.current) {
+      startListeningWithSpeechAPI();
+    } else {
+      setState(prev => ({
+        ...prev,
+        error: 'Voice input not supported in this browser',
+      }));
+    }
+  }, [state.usesFallback, startListeningWithFallback, startListeningWithSpeechAPI]);
+
   // Stop listening
   const stopListening = useCallback(() => {
     if (recognitionRef.current) {
-      recognitionRef.current.stop();
+      try {
+        recognitionRef.current.stop();
+      } catch (e) {
+        // Ignore errors
+      }
+    }
+    if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
+      mediaRecorderRef.current.stop();
     }
     if (timeoutRef.current) {
       clearTimeout(timeoutRef.current);
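Taken together, a consumer of the hook wires these pieces up roughly like this. A minimal sketch only: the import path and component name are illustrative, and the real consumer is the VoiceInputButton component shown above.

```tsx
import { useVoiceInput } from './useVoiceInput'; // path assumed for illustration

export function VoiceDemo() {
  const { isListening, isSupported, transcript, error, usesFallback, startListening, stopListening } =
    useVoiceInput();

  if (!isSupported) {
    return <p>Voice input is not supported in this browser</p>;
  }

  return (
    <div>
      <button onClick={isListening ? stopListening : startListening}>
        {isListening ? (usesFallback ? 'Recording...' : 'Listening...') : 'Start'}
      </button>
      {transcript && <p>{transcript}</p>}
      {error && <p role="alert">{error}</p>}
    </div>
  );
}
```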