# Voice Input Processing Guide - Maternal Organization App

## Voice Processing Architecture

### Overview

Voice input enables hands-free logging during childcare activities. The system processes natural language in five languages (English, Spanish, French, Portuguese, and Mandarin Chinese), extracting structured data from casual speech patterns.

### Processing Pipeline

```
Audio Input → Speech Recognition → Language Detection → Intent Classification → Entity Extraction → Action Execution → Confirmation Feedback
```

---

## Whisper API Integration

### Configuration

```typescript
// services/whisperService.ts
import OpenAI, { toFile } from 'openai';

class WhisperService {
  private client: OpenAI;

  constructor() {
    this.client = new OpenAI({
      apiKey: process.env.OPENAI_API_KEY,
    });
  }

  async transcribeAudio(audioBuffer: Buffer, language?: string): Promise<TranscriptionResult> {
    try {
      const response = await this.client.audio.transcriptions.create({
        // The API expects a file upload, not a raw Buffer; toFile wraps it
        file: await toFile(audioBuffer, 'audio.webm'),
        model: 'whisper-1',
        language: language || 'en', // ISO-639-1 code
        response_format: 'verbose_json',
        timestamp_granularities: ['word'],
      });

      return {
        text: response.text,
        language: response.language,
        confidence: this.calculateConfidence(response),
        words: response.words,
      };
    } catch (error) {
      return this.handleTranscriptionError(error);
    }
  }
}
```

### Audio Preprocessing

```typescript
// utils/audioPreprocessing.ts
export const preprocessAudio = async (audioFile: File): Promise<File> => {
  // Validate format
  const validFormats = ['wav', 'mp3', 'm4a', 'webm'];
  if (!validFormats.includes(getFileExtension(audioFile))) {
    throw new Error('Unsupported audio format');
  }

  // Check file size (max 25MB for Whisper)
  if (audioFile.size > 25 * 1024 * 1024) {
    // Compress or chunk the audio
    return await compressAudio(audioFile);
  }

  // Noise reduction for better accuracy
  return await reduceNoise(audioFile);
};
```
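When compression alone cannot bring a recording under the 25 MB limit, the audio can be split and transcribed in segments. The sketch below is a minimal illustration, not part of the guide's API: `transcribeLargeAudio` and the `Transcriber` interface are hypothetical names, and the naive byte-offset split is only safe for raw PCM/WAV; compressed formats would need to be cut on frame or silence boundaries instead.

```typescript
// utils/chunkedTranscription.ts (hypothetical module)
const MAX_UPLOAD_BYTES = 25 * 1024 * 1024; // Whisper's per-file limit

// Structural type matching WhisperService.transcribeAudio above
interface Transcriber {
  transcribeAudio(audio: Buffer, language?: string): Promise<{ text: string }>;
}

export async function transcribeLargeAudio(
  audioBuffer: Buffer,
  whisper: Transcriber,
  language?: string
): Promise<string> {
  const parts: string[] = [];

  // Transcribe sequentially so the segments come back in order
  for (let offset = 0; offset < audioBuffer.length; offset += MAX_UPLOAD_BYTES) {
    const chunk = audioBuffer.subarray(offset, offset + MAX_UPLOAD_BYTES);
    const result = await whisper.transcribeAudio(chunk, language);
    parts.push(result.text);
  }

  return parts.join(' ');
}
```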
---

## Natural Language Command Patterns

### Intent Classification

```typescript
enum VoiceIntent {
  LOG_FEEDING = 'LOG_FEEDING',
  LOG_SLEEP = 'LOG_SLEEP',
  LOG_DIAPER = 'LOG_DIAPER',
  LOG_MEDICATION = 'LOG_MEDICATION',
  START_TIMER = 'START_TIMER',
  STOP_TIMER = 'STOP_TIMER',
  ASK_QUESTION = 'ASK_QUESTION',
  CHECK_STATUS = 'CHECK_STATUS',
  CANCEL = 'CANCEL'
}

interface IntentPattern {
  intent: VoiceIntent;
  patterns: RegExp[];
  requiredEntities: string[]; // a trailing '?' marks the entity as optional
  examples: string[];
}
```

### English Language Patterns

```typescript
const englishPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:baby |she |he )?(?:fed|ate|drank|had|nursed)/i,
      /(?:bottle|breast|nursing|feeding)/i,
      /(?:finished|done) (?:eating|feeding|nursing)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Baby fed 4 ounces",
      "Just nursed for 15 minutes on the left",
      "She had 120ml of formula at 3pm",
      "Finished feeding, both sides, 20 minutes total"
    ]
  },
  {
    intent: VoiceIntent.LOG_SLEEP,
    patterns: [
      /(?:went|going) (?:to )?(?:sleep|bed|nap)/i,
      /(?:woke|wake|waking) up/i,
      /(?:nap|sleep)(?:ping|ed)? (?:for|since)/i,
      /(?:fell) asleep/i,
    ],
    requiredEntities: ['time?', 'duration?'],
    examples: [
      "Down for a nap",
      "Woke up from nap",
      "Sleeping since 2pm",
      "Just fell asleep in the stroller"
    ]
  },
  {
    intent: VoiceIntent.LOG_DIAPER,
    patterns: [
      /(?:chang|dirty|wet|soil|poop|pee)/i,
      /diaper/i,
      /(?:number|#) (?:one|two|1|2)/i,
    ],
    requiredEntities: ['type?'],
    examples: [
      "Changed wet diaper",
      "Dirty diaper with rash",
      "Just changed a poopy one",
      "Diaper change, both wet and dirty"
    ]
  }
];
```

### Multi-Language Patterns

```typescript
// Spanish patterns
const spanishPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:comió|tomó|bebió|amamanté)/i,
      /(?:biberón|pecho|lactancia)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Tomó 120ml de fórmula",              // "Had 120ml of formula"
      "Amamanté 15 minutos lado izquierdo", // "Nursed 15 minutes, left side"
      "Ya comió papilla"                    // "Already ate baby food"
    ]
  }
];

// French patterns
const frenchPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:mangé|bu|allaité|nourri)/i,
      /(?:biberon|sein|tétée)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Biberon de 120ml",               // "120ml bottle"
      "Allaité 15 minutes côté gauche", // "Nursed 15 minutes, left side"
      "A mangé sa purée"                // "Ate her purée"
    ]
  }
];

// Portuguese patterns
const portuguesePatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:comeu|tomou|bebeu|amamentei)/i,
      /(?:mamadeira|peito|amamentação)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Tomou 120ml de fórmula",            // "Had 120ml of formula"
      "Amamentei 15 minutos lado esquerdo" // "Nursed 15 minutes, left side"
    ]
  }
];

// Chinese patterns
const chinesePatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:喂|吃|喝|哺乳)/,
      /(?:奶瓶|母乳|配方奶)/,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "喝了120毫升配方奶", // "Drank 120ml of formula"
      "母乳喂养15分钟",    // "Breastfed for 15 minutes"
      "吃了辅食"           // "Ate solids"
    ]
  }
];
```

---

## Entity Extraction

### Entity Types

```typescript
interface ExtractedEntities {
  amount?: {
    value: number;
    unit: 'oz' | 'ml' | 'minutes';
  };
  time?: {
    value: Date;
    precision: 'exact' | 'approximate';
  };
  duration?: {
    value: number;
    unit: 'minutes' | 'hours';
  };
  side?: 'left' | 'right' | 'both';
  type?: 'breast' | 'bottle' | 'solid' | 'wet' | 'dirty' | 'both';
  location?: string;
  notes?: string;
}
```

### Extraction Logic

```typescript
class EntityExtractor {
  extractAmount(text: string): ExtractedEntities['amount'] {
    // Numeric amounts with units
    const amountPattern = /(\d+(?:\.\d+)?)\s*(oz|ounce|ml|milliliter|minute|min)/i;
    const match = text.match(amountPattern);

    if (match) {
      return {
        value: parseFloat(match[1]),
        unit: this.normalizeUnit(match[2])
      };
    }

    // Word numbers
    const wordNumbers = {
      'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
      'ten': 10, 'fifteen': 15, 'twenty': 20, 'thirty': 30,
    };

    for (const [word, value] of Object.entries(wordNumbers)) {
      if (text.includes(word)) {
        return { value, unit: this.inferUnit(text) };
      }
    }

    return undefined;
  }

  extractTime(text: string, timezone: string): ExtractedEntities['time'] {
    const now = new Date();

    // Relative times
    if (/just|now|right now/i.test(text)) {
      return { value: now, precision: 'exact' };
    }

    if (/ago/i.test(text)) {
      const minutesAgo = this.extractMinutesAgo(text);
      return {
        value: new Date(now.getTime() - minutesAgo * 60000),
        precision: 'approximate'
      };
    }

    // Clock times (run amount extraction first: a bare "4" in "4 ounces" would also match)
    const timePattern = /(\d{1,2}):?(\d{2})?\s*(am|pm)?/i;
    const match = text.match(timePattern);
    if (match) {
      return {
        value: this.parseClockTime(match, timezone),
        precision: 'exact'
      };
    }

    return { value: now, precision: 'approximate' };
  }

  extractSide(text: string): ExtractedEntities['side'] {
    // Stems cover gendered forms (izquierdo/izquierda, esquerdo/esquerda, ...)
    if (/left|izquierd|gauche|esquerd|左/i.test(text)) return 'left';
    if (/right|derech|droit|direit|右/i.test(text)) return 'right';
    if (/both|ambos|les deux|两/i.test(text)) return 'both';
    return undefined;
  }
}
```
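`extractAmount` relies on `normalizeUnit` and `inferUnit`, which this guide references but never shows. Below is a minimal sketch of what they might look like; the names match the calls above, but the bodies are assumptions:

```typescript
type AmountUnit = 'oz' | 'ml' | 'minutes';

// Map a matched unit token ("oz", "ounce", "ml", "milliliter", "minute", "min")
// onto the canonical units used by ExtractedEntities.
function normalizeUnit(raw: string): AmountUnit {
  const unit = raw.toLowerCase();
  if (unit.startsWith('oz') || unit.startsWith('ounce')) return 'oz';
  if (unit.startsWith('ml') || unit.startsWith('milli')) return 'ml';
  return 'minutes'; // 'minute' / 'min'
}

// Guess the unit for word numbers ("nursed for fifteen minutes") from context.
function inferUnit(text: string): AmountUnit {
  if (/minute|min\b/i.test(text)) return 'minutes';
  if (/ml|milliliter/i.test(text)) return 'ml';
  return 'oz'; // plausible default for feeding amounts in English speech
}
```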
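For orientation, this is how the extractor behaves on one of the English examples above (the inline results assume the helper sketches just shown):

```typescript
const extractor = new EntityExtractor();
const text = 'Just nursed for 15 minutes on the left';

extractor.extractAmount(text); // { value: 15, unit: 'minutes' }
extractor.extractSide(text);   // 'left'
extractor.extractTime(text, 'America/New_York'); // { value: <now>, precision: 'exact' } ("just" matches)
```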
---

## Intent Processing Engine

### Main Processing Flow

```typescript
class VoiceCommandProcessor {
  async processVoiceInput(
    audioBuffer: Buffer,
    context: UserContext
  ): Promise<ProcessedCommand> {
    // 1. Transcribe audio
    const transcription = await this.whisperService.transcribeAudio(
      audioBuffer,
      context.language
    );

    if (transcription.confidence < 0.5) {
      return this.handleLowConfidence(transcription);
    }

    // 2. Detect intent
    const intent = await this.detectIntent(
      transcription.text,
      context.language
    );

    // 3. Extract entities
    const entities = await this.extractEntities(
      transcription.text,
      intent,
      context
    );

    // 4. Validate command
    const validation = this.validateCommand(intent, entities);
    if (!validation.isValid) {
      return this.requestClarification(validation.missingInfo);
    }

    // 5. Execute action
    return this.executeCommand(intent, entities, context);
  }

  private async detectIntent(
    text: string,
    language: string
  ): Promise<VoiceIntent> {
    const patterns = this.getPatternsByLanguage(language);

    for (const pattern of patterns) {
      for (const regex of pattern.patterns) {
        if (regex.test(text)) {
          return pattern.intent;
        }
      }
    }

    // Fallback to AI intent detection
    return this.detectIntentWithAI(text, language);
  }
}
```
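The guide does not show `detectIntentWithAI`. One plausible implementation is a zero-temperature chat completion that picks a label from the `VoiceIntent` enum; the model name, prompt wording, and standalone-function shape below are all assumptions, not the app's confirmed design:

```typescript
import OpenAI from 'openai';

async function detectIntentWithAI(
  client: OpenAI,
  text: string,
  language: string
): Promise<VoiceIntent> {
  const labels = Object.values(VoiceIntent);

  const response = await client.chat.completions.create({
    model: 'gpt-4o-mini', // assumed; any chat-capable model works
    temperature: 0,
    messages: [
      {
        role: 'system',
        content: `Classify the utterance (language: ${language}) as exactly one of: ${labels.join(', ')}. Reply with the label only.`,
      },
      { role: 'user', content: text },
    ],
  });

  const label = response.choices[0].message.content?.trim() ?? '';

  // Treat anything outside the enum as an open question rather than guessing
  return labels.includes(label as VoiceIntent)
    ? (label as VoiceIntent)
    : VoiceIntent.ASK_QUESTION;
}
```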
---

## Error Recovery

### Common Recognition Errors

```typescript
interface RecognitionError {
  type: 'LOW_CONFIDENCE' | 'AMBIGUOUS' | 'MISSING_DATA' | 'INVALID_VALUE';
  originalText: string;
  suggestions?: string[];
}

class ErrorRecovery {
  handleLowConfidence(transcription: TranscriptionResult): ProcessedCommand {
    // Check for common misheard phrases
    const corrections = this.checkCommonMishears(transcription.text);
    if (corrections.confidence > 0.7) {
      return this.retryWithCorrection(corrections.text);
    }

    return {
      success: false,
      action: 'CONFIRM',
      message: `Did you say "${transcription.text}"?`,
      alternatives: this.getSimilarPhrases(transcription.text)
    };
  }

  checkCommonMishears(text: string): CorrectionResult {
    const corrections = {
      'for ounces': 'four ounces',
      'to ounces': 'two ounces',
      'write side': 'right side',
      'laugh side': 'left side',
      'wedding dirty': 'wet and dirty',
    };

    for (const [misheard, correct] of Object.entries(corrections)) {
      if (text.includes(misheard)) {
        return {
          text: text.replace(misheard, correct),
          confidence: 0.8
        };
      }
    }

    return { text, confidence: 0.3 };
  }
}
```

### Clarification Prompts

```typescript
const clarificationPrompts = {
  MISSING_AMOUNT: {
    en: "How much did baby eat?",
    es: "¿Cuánto comió el bebé?",
    fr: "Combien a mangé bébé?",
    pt: "Quanto o bebê comeu?",
    zh: "宝宝吃了多少?"
  },
  MISSING_TIME: {
    en: "When did this happen?",
    es: "¿Cuándo ocurrió esto?",
    fr: "Quand cela s'est-il passé?",
    pt: "Quando isso aconteceu?",
    zh: "这是什么时候发生的?"
  },
  AMBIGUOUS_INTENT: {
    en: "What would you like to log?",
    es: "¿Qué te gustaría registrar?",
    fr: "Que souhaitez-vous enregistrer?",
    pt: "O que você gostaria de registrar?",
    zh: "您想记录什么?"
  }
};
```

---

## Offline Voice Processing

### Fallback Strategy

```typescript
import { Platform } from 'react-native';

class OfflineVoiceProcessor {
  async processOffline(audioBuffer: Buffer): Promise<TranscriptionResult> {
    // Use the device's native speech recognition
    if (Platform.OS === 'ios') {
      return this.useiOSSpeechRecognition(audioBuffer);
    } else if (Platform.OS === 'android') {
      return this.useAndroidSpeechRecognition(audioBuffer);
    }

    // Queue for later processing
    return this.queueForOnlineProcessing(audioBuffer);
  }

  private async useiOSSpeechRecognition(audio: Buffer) {
    // SFSpeechRecognizer, exposed through a native module bridge
    const recognizer = new SFSpeechRecognizer();
    return recognizer.recognize(audio);
  }

  private async useAndroidSpeechRecognition(audio: Buffer) {
    // Android SpeechRecognizer, exposed through a native module bridge
    const recognizer = new AndroidSpeechRecognizer();
    return recognizer.recognize(audio);
  }
}
```

---

## Confirmation & Feedback

### Voice Feedback System

```typescript
interface VoiceConfirmation {
  text: string;
  speech: string; // plain text or SSML for TTS
  visual: {
    icon: string;
    color: string;
    animation: string;
  };
  haptic?: 'success' | 'warning' | 'error';
}

const confirmations = {
  FEEDING_LOGGED: {
    text: "Feeding logged",
    speech: "Got it! Logged 4 ounces.",
    visual: {
      icon: 'check_circle',
      color: 'success',
      animation: 'bounce'
    },
    haptic: 'success'
  }
};
```

---

## Testing Voice Commands

### Test Scenarios

```typescript
const voiceTestCases = [
  // English
  { input: "Baby ate 4 ounces",
    expected: { intent: 'LOG_FEEDING', amount: 4, unit: 'oz' }},
  { input: "Nursed for fifteen minutes on the left",
    expected: { intent: 'LOG_FEEDING', duration: 15, side: 'left' }},

  // Spanish
  { input: "Tomó 120 mililitros",
    expected: { intent: 'LOG_FEEDING', amount: 120, unit: 'ml' }},

  // Edge cases
  { input: "Fed... um... about 4 or 5 ounces",
    expected: { intent: 'LOG_FEEDING', amount: 4, confidence: 'low' }},
  { input: "Changed a really dirty diaper",
    expected: { intent: 'LOG_DIAPER', type: 'dirty', notes: 'really dirty' }},
];
```

---

## Performance Optimization

### Audio Streaming

```typescript
class StreamingVoiceProcessor {
  private audioChunks: Buffer[] = [];
  private isProcessing = false;

  async processStream(chunk: Buffer) {
    this.audioChunks.push(chunk);

    if (!this.isProcessing && this.hasEnoughAudio()) {
      this.isProcessing = true;
      const result = await this.processChunks();
      this.isProcessing = false;
      return result;
    }
  }

  private hasEnoughAudio(): boolean {
    // Need at least 0.5 seconds of audio
    const totalSize = this.audioChunks.reduce((sum, chunk) => sum + chunk.length, 0);
    return totalSize > 16000; // ~0.5s at 16kHz, 16-bit mono (32,000 bytes/s)
  }
}
```

### Caching Common Commands

```typescript
import { LRUCache } from 'lru-cache';

const commandCache = new LRUCache<string, ProcessedCommand>({
  max: 100,
  ttl: 1000 * 60 * 60, // 1 hour
});

// Cache exact matches for common phrases
const cachedPhrases = [
  "wet diaper",
  "dirty diaper",
  "just nursed",
  "bottle feeding done",
  "down for a nap",
  "woke up"
];
```
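How the cache sits in front of the full pipeline is not shown above; the sketch below is one option, with `normalizePhrase` and `processWithCache` as hypothetical names. Lookups are restricted to the fixed phrase list so that time-sensitive utterances ("at 3pm", "just now") are never served from cache.

```typescript
// Lowercase and strip punctuation so "Wet diaper!" and "wet diaper" share a key
function normalizePhrase(text: string): string {
  return text.toLowerCase().replace(/[^\p{L}\p{N}\s]/gu, '').trim();
}

async function processWithCache(
  text: string,
  process: (text: string) => Promise<ProcessedCommand>
): Promise<ProcessedCommand> {
  const key = normalizePhrase(text);

  const cached = commandCache.get(key);
  if (cached) return cached;

  const result = await process(text);

  // Only cache known, unambiguous phrases that executed successfully
  if (result.success && cachedPhrases.includes(key)) {
    commandCache.set(key, result);
  }
  return result;
}
```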