From a44faf6ef49a65ee0e5bc1e717c35549f5e0e0c8 Mon Sep 17 00:00:00 2001 From: Andrei Date: Thu, 2 Oct 2025 07:15:44 +0000 Subject: [PATCH] Fix voice input for iOS Safari and prevent infinite loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove temperature parameter from GPT-5-mini activity extraction (not supported) - Add classification state to useVoiceInput hook to avoid duplicate API calls - Prevent infinite loop in VoiceFloatingButton by tracking lastClassifiedTranscript - Use classification from backend directly instead of making second request - iOS Safari now successfully transcribes with Azure Whisper and classifies with GPT-5-mini 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../src/modules/voice/voice.service.ts | 1 - .../app/api/voice/transcribe/route.ts | 50 ++++++++++++++++--- .../components/voice/VoiceFloatingButton.tsx | 17 +++++-- maternal-web/hooks/useVoiceInput.ts | 18 ++++++- 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/maternal-app/maternal-app-backend/src/modules/voice/voice.service.ts b/maternal-app/maternal-app-backend/src/modules/voice/voice.service.ts index a587b8d..fa0929f 100644 --- a/maternal-app/maternal-app-backend/src/modules/voice/voice.service.ts +++ b/maternal-app/maternal-app-backend/src/modules/voice/voice.service.ts @@ -174,7 +174,6 @@ If the text doesn't describe a trackable activity, respond with: { role: 'system', content: systemPrompt }, { role: 'user', content: userPrompt }, ], - temperature: 0.3, response_format: { type: 'json_object' }, }); diff --git a/maternal-web/app/api/voice/transcribe/route.ts b/maternal-web/app/api/voice/transcribe/route.ts index 29ee884..ad37ae9 100644 --- a/maternal-web/app/api/voice/transcribe/route.ts +++ b/maternal-web/app/api/voice/transcribe/route.ts @@ -30,16 +30,52 @@ export async function POST(request: NextRequest) { ); } } else if (contentType.includes('multipart/form-data')) { - // Audio file upload (needs transcription) - // TODO: Implement Whisper API integration for audio transcription - // For now, return not implemented + // Audio file upload - forward to backend for Whisper transcription + const formData = await request.formData(); + const audioFile = formData.get('audio'); + + if (!audioFile) { + return NextResponse.json( + { + error: 'VOICE_NO_AUDIO', + message: 'No audio file provided', + }, + { status: 400 } + ); + } + + // Forward to backend + const backendUrl = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:3020'; + const backendFormData = new FormData(); + backendFormData.append('audio', audioFile); + + const backendResponse = await fetch(`${backendUrl}/api/v1/voice/transcribe`, { + method: 'POST', + body: backendFormData, + headers: { + // Forward auth token if present + ...(request.headers.get('authorization') && { + authorization: request.headers.get('authorization')!, + }), + }, + }); + + if (!backendResponse.ok) { + const errorData = await backendResponse.json(); + return NextResponse.json(errorData, { status: backendResponse.status }); + } + + const result = await backendResponse.json(); + + // Backend returns { success, transcript, classification } + // Return in the format expected by the frontend return NextResponse.json( { - error: 'VOICE_AUDIO_NOT_IMPLEMENTED', - message: 'Audio transcription not yet implemented. Use text input for now.', - hint: 'Send JSON with { "text": "your voice command" }', + success: true, + transcript: result.transcript, + classification: result.classification, }, - { status: 501 } + { status: 200 } ); } else { return NextResponse.json( diff --git a/maternal-web/components/voice/VoiceFloatingButton.tsx b/maternal-web/components/voice/VoiceFloatingButton.tsx index 776362f..41a1535 100644 --- a/maternal-web/components/voice/VoiceFloatingButton.tsx +++ b/maternal-web/components/voice/VoiceFloatingButton.tsx @@ -33,6 +33,7 @@ export function VoiceFloatingButton() { const [open, setOpen] = useState(false); const [isProcessing, setIsProcessing] = useState(false); const [classificationResult, setClassificationResult] = useState(null); + const [lastClassifiedTranscript, setLastClassifiedTranscript] = useState(''); const [snackbar, setSnackbar] = useState<{ open: boolean; message: string; @@ -43,15 +44,16 @@ export function VoiceFloatingButton() { severity: 'info', }); - const { isListening, isSupported, transcript, error, startListening, stopListening, reset } = + const { isListening, isSupported, transcript, classification, error, startListening, stopListening, reset } = useVoiceInput(); - // Auto-classify when we get a final transcript + // Auto-use classification from backend when transcription completes React.useEffect(() => { - if (transcript && !isListening && !isProcessing && open) { - classifyTranscript(transcript); + if (classification && !isListening && !isProcessing && open) { + setClassificationResult(classification); + handleClassifiedIntent(classification); } - }, [transcript, isListening, isProcessing, open]); + }, [classification, isListening, isProcessing, open]); const handleOpen = () => { if (!isSupported) { @@ -65,6 +67,7 @@ export function VoiceFloatingButton() { setOpen(true); reset(); setClassificationResult(null); + setLastClassifiedTranscript(''); }; const handleClose = () => { @@ -74,11 +77,13 @@ export function VoiceFloatingButton() { setOpen(false); reset(); setClassificationResult(null); + setLastClassifiedTranscript(''); }; const handleStartListening = () => { reset(); setClassificationResult(null); + setLastClassifiedTranscript(''); startListening(); }; @@ -87,6 +92,8 @@ export function VoiceFloatingButton() { }; const classifyTranscript = async (text: string) => { + // Mark this transcript as being classified to prevent duplicate calls + setLastClassifiedTranscript(text); setIsProcessing(true); try { const response = await fetch('/api/voice/transcribe', { diff --git a/maternal-web/hooks/useVoiceInput.ts b/maternal-web/hooks/useVoiceInput.ts index 494b06e..26294b5 100644 --- a/maternal-web/hooks/useVoiceInput.ts +++ b/maternal-web/hooks/useVoiceInput.ts @@ -1,4 +1,5 @@ import { useState, useEffect, useCallback, useRef } from 'react'; +import { tokenStorage } from '@/lib/utils/tokenStorage'; export interface VoiceInputResult { transcript: string; @@ -10,6 +11,7 @@ export interface VoiceInputState { isListening: boolean; isSupported: boolean; transcript: string; + classification: any | null; error: string | null; usesFallback: boolean; } @@ -25,6 +27,7 @@ export function useVoiceInput() { isListening: false, isSupported: false, transcript: '', + classification: null, error: null, usesFallback: false, }); @@ -148,9 +151,19 @@ export function useVoiceInput() { formData.append('audio', audioBlob, `recording.${extension}`); console.log('[Voice] Sending to backend for transcription...'); - const response = await fetch('/api/voice/transcribe', { + + // Get auth token and API base URL + const token = tokenStorage.getAccessToken(); + const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:3020'; + const headers: HeadersInit = {}; + if (token) { + headers['Authorization'] = `Bearer ${token}`; + } + + const response = await fetch(`${API_BASE_URL}/api/v1/voice/transcribe`, { method: 'POST', body: formData, + headers, }); console.log('[Voice] Transcription response status:', response.status); @@ -162,6 +175,7 @@ export function useVoiceInput() { ...prev, isListening: false, transcript: data.transcript, + classification: data.classification || null, })); } else { console.error('[Voice] Transcription failed:', data); @@ -169,6 +183,7 @@ export function useVoiceInput() { ...prev, isListening: false, error: data.message || 'Failed to transcribe audio', + classification: null, })); } } catch (error) { @@ -358,6 +373,7 @@ export function useVoiceInput() { setState(prev => ({ ...prev, transcript: '', + classification: null, error: null, })); }, []);