# Voice Input Processing Guide - Maternal Organization App

## Voice Processing Architecture

### Overview

Voice input enables hands-free logging during childcare activities. The system processes natural language in five languages (English, Spanish, French, Portuguese, and Chinese), extracting structured data from casual speech patterns.
### Processing Pipeline

```text
Audio Input → Speech Recognition → Language Detection →
Intent Classification → Entity Extraction → Action Execution →
Confirmation Feedback
```
## Whisper API Integration

### Configuration

```typescript
// services/whisperService.ts
import OpenAI, { toFile } from 'openai';

export interface TranscriptionResult {
  text: string;
  language: string;
  confidence: number; // 0..1, derived from segment log-probabilities
  words?: Array<{ word: string; start: number; end: number }>;
}

export class WhisperService {
  private client: OpenAI;

  constructor() {
    this.client = new OpenAI({
      apiKey: process.env.OPENAI_API_KEY,
    });
  }

  async transcribeAudio(audioBuffer: Buffer, language?: string): Promise<TranscriptionResult> {
    try {
      const response = await this.client.audio.transcriptions.create({
        // The SDK expects a file-like object, not a raw Buffer
        file: await toFile(audioBuffer, 'audio.webm'),
        model: 'whisper-1',
        language: language || 'en', // ISO-639-1 code
        response_format: 'verbose_json',
        timestamp_granularities: ['word'],
      });

      return {
        text: response.text,
        language: response.language,
        confidence: this.calculateConfidence(response),
        words: response.words,
      };
    } catch (error) {
      return this.handleTranscriptionError(error);
    }
  }
}
```
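The two helpers referenced above are app code, not part of the SDK. A minimal sketch of both, assuming the `verbose_json` response's per-segment `avg_logprob` is an acceptable confidence signal and that a failed transcription should surface as a zero-confidence result:

```typescript
// services/whisperService.ts (continued) - hypothetical helper implementations

private calculateConfidence(response: any): number {
  // verbose_json returns segments carrying avg_logprob (average token log-probability)
  const segments = response.segments ?? [];
  if (segments.length === 0) return 0.5; // no signal; assume middling confidence
  const avgLogProb =
    segments.reduce((sum: number, s: any) => sum + s.avg_logprob, 0) / segments.length;
  // Map the log-probability onto a rough 0..1 score
  return Math.min(1, Math.max(0, Math.exp(avgLogProb)));
}

private handleTranscriptionError(error: unknown): TranscriptionResult {
  console.error('Whisper transcription failed', error);
  return { text: '', language: 'unknown', confidence: 0 };
}
```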
### Audio Preprocessing

```typescript
// utils/audioPreprocessing.ts
export const preprocessAudio = async (audioFile: File): Promise<Buffer> => {
  // Validate format
  const validFormats = ['wav', 'mp3', 'm4a', 'webm'];
  if (!validFormats.includes(getFileExtension(audioFile))) {
    throw new Error('Unsupported audio format');
  }

  // Check file size (Whisper's upload limit is 25 MB)
  if (audioFile.size > 25 * 1024 * 1024) {
    // Compress or chunk the audio before upload
    return await compressAudio(audioFile);
  }

  // Noise reduction for better accuracy
  return await reduceNoise(audioFile);
};
```
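`getFileExtension`, `compressAudio`, and `reduceNoise` are app-level helpers rather than library calls. A minimal sketch of the extension check, assuming the file name is the only available signal:

```typescript
// utils/audioPreprocessing.ts (continued) - hypothetical helper
const getFileExtension = (file: File): string => {
  const name = file.name.toLowerCase();
  const dot = name.lastIndexOf('.');
  return dot === -1 ? '' : name.slice(dot + 1);
};
```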
## Natural Language Command Patterns

### Intent Classification

```typescript
enum VoiceIntent {
  LOG_FEEDING = 'LOG_FEEDING',
  LOG_SLEEP = 'LOG_SLEEP',
  LOG_DIAPER = 'LOG_DIAPER',
  LOG_MEDICATION = 'LOG_MEDICATION',
  START_TIMER = 'START_TIMER',
  STOP_TIMER = 'STOP_TIMER',
  ASK_QUESTION = 'ASK_QUESTION',
  CHECK_STATUS = 'CHECK_STATUS',
  CANCEL = 'CANCEL'
}

interface IntentPattern {
  intent: VoiceIntent;
  patterns: RegExp[];
  requiredEntities: string[]; // a trailing '?' marks an entity as optional
  examples: string[];
}
```
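The `'name?'` convention in `requiredEntities` needs interpreting at validation time. A small sketch of how that might look; `parseEntityRequirement` is a hypothetical helper, not from the original:

```typescript
// Hypothetical helper: interpret the "name?" convention used in requiredEntities
interface EntityRequirement {
  name: string;
  optional: boolean;
}

const parseEntityRequirement = (spec: string): EntityRequirement => {
  const optional = spec.endsWith('?');
  return { name: optional ? spec.slice(0, -1) : spec, optional };
};

// parseEntityRequirement('amount?') → { name: 'amount', optional: true }
```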
### English Language Patterns

```typescript
const englishPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:baby |she |he )?(?:fed|ate|drank|had|nursed)/i,
      /(?:bottle|breast|nursing|feeding)/i,
      /(?:finished|done) (?:eating|feeding|nursing)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Baby fed 4 ounces",
      "Just nursed for 15 minutes on the left",
      "She had 120ml of formula at 3pm",
      "Finished feeding, both sides, 20 minutes total"
    ]
  },
  {
    intent: VoiceIntent.LOG_SLEEP,
    patterns: [
      /(?:went|going) (?:to )?(?:sleep|bed|nap)/i,
      /(?:woke|wake|waking) up/i,
      /(?:nap|sleep)(?:ping|ed)? (?:for|since)/i,
      /fell asleep/i,
    ],
    requiredEntities: ['time?', 'duration?'],
    examples: [
      "Down for a nap",
      "Woke up from nap",
      "Sleeping since 2pm",
      "Just fell asleep in the stroller"
    ]
  },
  {
    intent: VoiceIntent.LOG_DIAPER,
    patterns: [
      /(?:chang|dirty|wet|soil|poop|pee)/i,
      /diaper/i,
      /(?:number|#) (?:one|two|1|2)/i,
    ],
    requiredEntities: ['type?'],
    examples: [
      "Changed wet diaper",
      "Dirty diaper with rash",
      "Just changed a poopy one",
      "Diaper change, both wet and dirty"
    ]
  }
];
```
### Multi-Language Patterns

```typescript
// Spanish patterns
const spanishPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:comió|tomó|bebió|amamanté)/i,
      /(?:biberón|pecho|lactancia)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Tomó 120ml de fórmula",              // "Had 120ml of formula"
      "Amamanté 15 minutos lado izquierdo", // "Nursed 15 minutes, left side"
      "Ya comió papilla"                    // "Already ate purée"
    ]
  }
];

// French patterns
const frenchPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:mangé|bu|allaité|nourri)/i,
      /(?:biberon|sein|tétée)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Biberon de 120ml",               // "120ml bottle"
      "Allaité 15 minutes côté gauche", // "Nursed 15 minutes, left side"
      "A mangé sa purée"                // "Ate her purée"
    ]
  }
];

// Portuguese patterns
const portuguesePatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:comeu|tomou|bebeu|amamentei)/i,
      /(?:mamadeira|peito|amamentação)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Tomou 120ml de fórmula",            // "Had 120ml of formula"
      "Amamentei 15 minutos lado esquerdo" // "Nursed 15 minutes, left side"
    ]
  }
];

// Chinese patterns
const chinesePatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:喂|吃|喝|哺乳)/,
      /(?:奶瓶|母乳|配方奶)/,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "喝了120毫升配方奶", // "Drank 120ml of formula"
      "母乳喂养15分钟",    // "Breastfed for 15 minutes"
      "吃了辅食"           // "Ate solid food"
    ]
  }
];
```
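The processor later in this guide looks patterns up via `getPatternsByLanguage`. A minimal registry sketch, keyed by ISO-639-1 code with English as the fallback; the registry name is an assumption:

```typescript
// Hypothetical registry mapping ISO-639-1 codes to their pattern sets
const patternsByLanguage: Record<string, IntentPattern[]> = {
  en: englishPatterns,
  es: spanishPatterns,
  fr: frenchPatterns,
  pt: portuguesePatterns,
  zh: chinesePatterns,
};

const getPatternsByLanguage = (language: string): IntentPattern[] =>
  patternsByLanguage[language] ?? patternsByLanguage.en;
```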
## Entity Extraction

### Entity Types

```typescript
interface ExtractedEntities {
  amount?: {
    value: number;
    unit: 'oz' | 'ml' | 'minutes';
  };
  time?: {
    value: Date;
    precision: 'exact' | 'approximate';
  };
  duration?: {
    value: number;
    unit: 'minutes' | 'hours';
  };
  side?: 'left' | 'right' | 'both';
  type?: 'breast' | 'bottle' | 'solid' | 'wet' | 'dirty' | 'both';
  location?: string;
  notes?: string;
}
```
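For illustration, the entities extracted from the earlier example "She had 120ml of formula at 3pm" would look roughly like this (the date is a placeholder):

```typescript
// Entities for: "She had 120ml of formula at 3pm"
const example: ExtractedEntities = {
  amount: { value: 120, unit: 'ml' },
  time: { value: new Date('2025-01-15T15:00:00'), precision: 'exact' }, // placeholder date
  type: 'bottle', // "formula" implies bottle feeding
};
```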
### Extraction Logic

```typescript
class EntityExtractor {
  extractAmount(text: string): ExtractedEntities['amount'] {
    // Numeric amounts with units
    const amountPattern = /(\d+(?:\.\d+)?)\s*(oz|ounce|ml|milliliter|minute|min)/i;
    const match = text.match(amountPattern);
    if (match) {
      return {
        value: parseFloat(match[1]),
        unit: this.normalizeUnit(match[2])
      };
    }

    // Spelled-out numbers
    const wordNumbers: Record<string, number> = {
      'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
      'ten': 10, 'fifteen': 15, 'twenty': 20, 'thirty': 30,
    };
    for (const [word, value] of Object.entries(wordNumbers)) {
      if (text.includes(word)) {
        return { value, unit: this.inferUnit(text) };
      }
    }

    return undefined;
  }

  extractTime(text: string, timezone: string): ExtractedEntities['time'] {
    const now = new Date();

    // Relative times
    if (/just|now|right now/i.test(text)) {
      return { value: now, precision: 'exact' };
    }
    if (/ago/i.test(text)) {
      const minutesAgo = this.extractMinutesAgo(text);
      return {
        value: new Date(now.getTime() - minutesAgo * 60000),
        precision: 'approximate'
      };
    }

    // Clock times (e.g. "3pm", "14:30")
    const timePattern = /(\d{1,2}):?(\d{2})?\s*(am|pm)?/i;
    const match = text.match(timePattern);
    if (match) {
      return {
        value: this.parseClockTime(match, timezone),
        precision: 'exact'
      };
    }

    // Default: assume the event just happened
    return { value: now, precision: 'approximate' };
  }

  extractSide(text: string): ExtractedEntities['side'] {
    if (/left|izquierdo|gauche|esquerdo|左/i.test(text)) return 'left';
    if (/right|derecho|droit|direito|右/i.test(text)) return 'right';
    if (/both|ambos|deux|两/i.test(text)) return 'both';
    return undefined;
  }
}
```
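`normalizeUnit`, `inferUnit`, `extractMinutesAgo`, and `parseClockTime` are referenced above but not defined in this guide. Minimal sketches, assuming they live on the same class; the time handling is deliberately naive (it uses the device clock rather than the passed-in `timezone`):

```typescript
// EntityExtractor (continued) - hypothetical helper implementations

private normalizeUnit(raw: string): 'oz' | 'ml' | 'minutes' {
  const unit = raw.toLowerCase();
  if (unit.startsWith('oz') || unit.startsWith('ounce')) return 'oz';
  if (unit.startsWith('ml') || unit.startsWith('milliliter')) return 'ml';
  return 'minutes';
}

private inferUnit(text: string): 'oz' | 'ml' | 'minutes' {
  // Nursing phrases imply a duration; otherwise assume a volume
  if (/nurs|minute|min\b/i.test(text)) return 'minutes';
  if (/ml|milliliter/i.test(text)) return 'ml';
  return 'oz';
}

private extractMinutesAgo(text: string): number {
  const minutes = text.match(/(\d+)\s*(?:minutes?|mins?)\s+ago/i);
  if (minutes) return parseInt(minutes[1], 10);
  const hours = text.match(/(\d+)\s*hours?\s+ago/i);
  return hours ? parseInt(hours[1], 10) * 60 : 0; // 0 falls back to "now"
}

private parseClockTime(match: RegExpMatchArray, timezone: string): Date {
  let hours = parseInt(match[1], 10);
  const minutes = match[2] ? parseInt(match[2], 10) : 0;
  const meridiem = (match[3] || '').toLowerCase();
  if (meridiem === 'pm' && hours < 12) hours += 12;
  if (meridiem === 'am' && hours === 12) hours = 0;
  const result = new Date(); // naive: device-local time; a real app would use `timezone`
  result.setHours(hours, minutes, 0, 0);
  return result;
}
```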
## Intent Processing Engine

### Main Processing Flow

```typescript
class VoiceCommandProcessor {
  constructor(private whisperService: WhisperService) {}

  async processVoiceInput(
    audioBuffer: Buffer,
    context: UserContext
  ): Promise<ProcessedCommand> {
    // 1. Transcribe audio
    const transcription = await this.whisperService.transcribeAudio(
      audioBuffer,
      context.language
    );
    if (transcription.confidence < 0.5) {
      return this.handleLowConfidence(transcription);
    }

    // 2. Detect intent
    const intent = await this.detectIntent(
      transcription.text,
      context.language
    );

    // 3. Extract entities
    const entities = await this.extractEntities(
      transcription.text,
      intent,
      context
    );

    // 4. Validate command
    const validation = this.validateCommand(intent, entities);
    if (!validation.isValid) {
      return this.requestClarification(validation.missingInfo);
    }

    // 5. Execute action
    return this.executeCommand(intent, entities, context);
  }

  private async detectIntent(
    text: string,
    language: string
  ): Promise<VoiceIntent> {
    const patterns = this.getPatternsByLanguage(language);
    for (const pattern of patterns) {
      for (const regex of pattern.patterns) {
        if (regex.test(text)) {
          return pattern.intent;
        }
      }
    }
    // Fallback to AI intent detection
    return this.detectIntentWithAI(text, language);
  }
}
```
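The AI fallback is not specified in this guide. One possible sketch, assuming the processor holds an OpenAI client as `this.openai` and constrains the reply to JSON; the model choice and prompt wording are assumptions:

```typescript
// VoiceCommandProcessor (continued) - hypothetical AI fallback for unmatched utterances
private async detectIntentWithAI(text: string, language: string): Promise<VoiceIntent> {
  const response = await this.openai.chat.completions.create({
    model: 'gpt-4o-mini', // assumed model choice
    response_format: { type: 'json_object' },
    messages: [
      {
        role: 'system',
        content:
          'Classify a caregiver voice command into one of these intents: ' +
          Object.values(VoiceIntent).join(', ') +
          '. Reply with JSON like {"intent": "LOG_FEEDING"}.',
      },
      { role: 'user', content: `Language: ${language}\nUtterance: ${text}` },
    ],
  });

  const parsed = JSON.parse(response.choices[0].message.content ?? '{}');
  // Fall back to ASK_QUESTION when the model returns an unknown label
  return Object.values(VoiceIntent).includes(parsed.intent)
    ? parsed.intent
    : VoiceIntent.ASK_QUESTION;
}
```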
## Error Recovery

### Common Recognition Errors

```typescript
interface RecognitionError {
  type: 'LOW_CONFIDENCE' | 'AMBIGUOUS' | 'MISSING_DATA' | 'INVALID_VALUE';
  originalText: string;
  suggestions?: string[];
}

interface CorrectionResult {
  text: string;
  confidence: number;
}

class ErrorRecovery {
  handleLowConfidence(transcription: TranscriptionResult): ProcessedCommand {
    // Check for common misheard phrases
    const corrections = this.checkCommonMishears(transcription.text);
    if (corrections.confidence > 0.7) {
      return this.retryWithCorrection(corrections.text);
    }

    return {
      success: false,
      action: 'CONFIRM',
      message: `Did you say "${transcription.text}"?`,
      alternatives: this.getSimilarPhrases(transcription.text)
    };
  }

  checkCommonMishears(text: string): CorrectionResult {
    // Homophones and near-homophones speech recognition commonly produces
    const corrections: Record<string, string> = {
      'for ounces': 'four ounces',
      'to ounces': 'two ounces',
      'write side': 'right side',
      'laugh side': 'left side',
      'wedding dirty': 'wet and dirty',
    };
    for (const [misheard, correct] of Object.entries(corrections)) {
      if (text.includes(misheard)) {
        return {
          text: text.replace(misheard, correct),
          confidence: 0.8
        };
      }
    }
    return { text, confidence: 0.3 };
  }
}
```
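`getSimilarPhrases` and `retryWithCorrection` are referenced but not defined. A sketch of the former, assuming fuzzy matching against a small set of known phrases with a Levenshtein distance from the `fastest-levenshtein` package:

```typescript
// ErrorRecovery (continued) - hypothetical fuzzy suggestion helper.
// Assumes at module top: import { distance } from 'fastest-levenshtein';
private getSimilarPhrases(text: string): string[] {
  const knownPhrases = [
    'wet diaper', 'dirty diaper', 'just nursed',
    'bottle feeding done', 'down for a nap', 'woke up',
  ];
  return knownPhrases
    .map(phrase => ({ phrase, d: distance(text.toLowerCase(), phrase) }))
    .sort((a, b) => a.d - b.d)
    .slice(0, 3)
    .map(entry => entry.phrase);
}
```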
### Clarification Prompts

```typescript
const clarificationPrompts = {
  MISSING_AMOUNT: {
    en: "How much did baby eat?",
    es: "¿Cuánto comió el bebé?",
    fr: "Combien a mangé bébé?",
    pt: "Quanto o bebê comeu?",
    zh: "宝宝吃了多少?"
  },
  MISSING_TIME: {
    en: "When did this happen?",
    es: "¿Cuándo ocurrió esto?",
    fr: "Quand cela s'est-il passé?",
    pt: "Quando isso aconteceu?",
    zh: "这是什么时候发生的?"
  },
  AMBIGUOUS_INTENT: {
    en: "What would you like to log?",
    es: "¿Qué te gustaría registrar?",
    fr: "Que souhaitez-vous enregistrer?",
    pt: "O que você gostaria de registrar?",
    zh: "您想记录什么?"
  }
};
```
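A small lookup helper, assuming unknown languages fall back to English; the function name is an assumption:

```typescript
// Hypothetical prompt lookup with English fallback
type PromptKey = keyof typeof clarificationPrompts;

const getClarificationPrompt = (key: PromptKey, language: string): string => {
  const prompts = clarificationPrompts[key] as Record<string, string>;
  return prompts[language] ?? prompts.en;
};

// getClarificationPrompt('MISSING_AMOUNT', 'pt') → "Quanto o bebê comeu?"
```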
## Offline Voice Processing

### Fallback Strategy

```typescript
import { Platform } from 'react-native';

class OfflineVoiceProcessor {
  async processOffline(audioBuffer: Buffer): Promise<BasicTranscription> {
    // Prefer the device's native on-device speech recognition
    if (Platform.OS === 'ios') {
      return this.useiOSSpeechRecognition(audioBuffer);
    } else if (Platform.OS === 'android') {
      return this.useAndroidSpeechRecognition(audioBuffer);
    }
    // No on-device recognizer available: queue for later processing
    return this.queueForOnlineProcessing(audioBuffer);
  }

  private async useiOSSpeechRecognition(audio: Buffer) {
    // SFSpeechRecognizer is a native iOS API, reached through a native module bridge
    const recognizer = new SFSpeechRecognizer();
    return recognizer.recognize(audio);
  }

  private async useAndroidSpeechRecognition(audio: Buffer) {
    // Android's SpeechRecognizer, likewise wrapped by a native module
    const recognizer = new AndroidSpeechRecognizer();
    return recognizer.recognize(audio);
  }
}
```
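`queueForOnlineProcessing` is left undefined above. A minimal in-memory sketch, assuming `BasicTranscription` carries a text and a pending flag; a production version would persist the queue to disk so recordings survive app restarts:

```typescript
// OfflineVoiceProcessor (continued) - hypothetical queueing sketch
private pendingAudio: Buffer[] = [];

private async queueForOnlineProcessing(audioBuffer: Buffer): Promise<BasicTranscription> {
  this.pendingAudio.push(audioBuffer);
  // Return an empty transcription so callers can show a "saved for later" state
  return { text: '', pending: true };
}

// Drain the queue once connectivity returns
async flushQueue(whisper: WhisperService): Promise<void> {
  while (this.pendingAudio.length > 0) {
    const audio = this.pendingAudio.shift()!;
    await whisper.transcribeAudio(audio);
  }
}
```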
Confirmation & Feedback
Voice Feedback System
interface VoiceConfirmation {
text: string;
speech: string; // SSML for TTS
visual: {
icon: string;
color: string;
animation: string;
};
haptic?: 'success' | 'warning' | 'error';
}
const confirmations = {
FEEDING_LOGGED: {
text: "Feeding logged",
speech: "<speak>Got it! <break time='200ms'/> Logged <say-as interpret-as='cardinal'>4</say-as> ounces.</speak>",
visual: {
icon: 'check_circle',
color: 'success',
animation: 'bounce'
},
haptic: 'success'
}
};
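How a confirmation might be dispatched across the three channels; `ui`, `haptics`, and `tts` are placeholders for whatever UI, haptics, and TTS modules the app actually uses, and SSML support varies by TTS engine:

```typescript
// Placeholder modules; a real app would wire these to actual libraries
declare const ui: { showToast(text: string, visual: VoiceConfirmation['visual']): void };
declare const haptics: { trigger(kind: 'success' | 'warning' | 'error'): void };
declare const tts: { supportsSSML: boolean; speak(text: string): Promise<void> };

async function playConfirmation(confirmation: VoiceConfirmation): Promise<void> {
  // Visual feedback first, so the screen reacts immediately
  ui.showToast(confirmation.text, confirmation.visual);

  // Haptic pulse, when the confirmation defines one
  if (confirmation.haptic) {
    haptics.trigger(confirmation.haptic);
  }

  // Spoken feedback last; engines without SSML support get the plain text
  await tts.speak(tts.supportsSSML ? confirmation.speech : confirmation.text);
}
```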
## Testing Voice Commands

### Test Scenarios

```typescript
const voiceTestCases = [
  // English
  { input: "Baby ate 4 ounces", expected: { intent: 'LOG_FEEDING', amount: 4, unit: 'oz' }},
  { input: "Nursed for fifteen minutes on the left", expected: { intent: 'LOG_FEEDING', duration: 15, side: 'left' }},
  // Spanish
  { input: "Tomó 120 mililitros", expected: { intent: 'LOG_FEEDING', amount: 120, unit: 'ml' }},
  // Edge cases
  { input: "Fed... um... about 4 or 5 ounces", expected: { intent: 'LOG_FEEDING', amount: 4, confidence: 'low' }},
  { input: "Changed a really dirty diaper", expected: { intent: 'LOG_DIAPER', type: 'dirty', notes: 'really dirty' }},
];
```
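These cases can be driven through the text stages of the pipeline (skipping transcription) with a standard Jest loop. A sketch, assuming a test-only entry point `detectIntentForTest` that exposes pattern plus AI intent detection:

```typescript
// Hypothetical Jest harness exercising intent detection on the text inputs
import { describe, it, expect } from '@jest/globals';

// Assumed test-only wrapper around VoiceCommandProcessor's detectIntent
declare function detectIntentForTest(text: string): Promise<string>;

describe('voice command intent detection', () => {
  voiceTestCases.forEach(({ input, expected }) => {
    it(`classifies "${input}" as ${expected.intent}`, async () => {
      const intent = await detectIntentForTest(input);
      expect(intent).toBe(expected.intent);
    });
  });
});
```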
## Performance Optimization

### Audio Streaming

```typescript
class StreamingVoiceProcessor {
  private audioChunks: Buffer[] = [];
  private isProcessing = false;

  async processStream(chunk: Buffer) {
    this.audioChunks.push(chunk);
    if (!this.isProcessing && this.hasEnoughAudio()) {
      this.isProcessing = true;
      const result = await this.processChunks();
      this.isProcessing = false;
      return result;
    }
  }

  private hasEnoughAudio(): boolean {
    // Need at least 0.5 seconds of audio
    const totalSize = this.audioChunks.reduce((sum, chunk) => sum + chunk.length, 0);
    return totalSize > 16000; // ~0.5s at 16kHz, 16-bit mono (32,000 bytes/s)
  }
}
```
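`processChunks` is not defined above. A sketch that concatenates the buffered chunks and hands them to the Whisper service, clearing the buffer afterward; it assumes the processor holds a `whisperService` reference:

```typescript
// StreamingVoiceProcessor (continued) - hypothetical chunk processing
private async processChunks(): Promise<TranscriptionResult> {
  const audio = Buffer.concat(this.audioChunks);
  this.audioChunks = []; // reset the buffer for the next utterance
  return this.whisperService.transcribeAudio(audio);
}
```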
### Caching Common Commands

```typescript
import { LRUCache } from 'lru-cache';

const commandCache = new LRUCache<string, ProcessedCommand>({
  max: 100,
  ttl: 1000 * 60 * 60, // 1 hour
});

// Cache exact matches for common phrases
const cachedPhrases = [
  "wet diaper",
  "dirty diaper",
  "just nursed",
  "bottle feeding done",
  "down for a nap",
  "woke up"
];
```
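Cache lookup might wrap command processing like this; `normalize` and the wrapper name are assumptions:

```typescript
// Hypothetical cache-aware wrapper around full command processing
const normalize = (text: string): string => text.trim().toLowerCase();

async function processWithCache(
  text: string,
  process: (text: string) => Promise<ProcessedCommand>
): Promise<ProcessedCommand> {
  const key = normalize(text);
  const cached = commandCache.get(key);
  if (cached) return cached;

  const result = await process(text);
  // Only cache the short, unambiguous phrases listed above
  if (cachedPhrases.includes(key)) {
    commandCache.set(key, result);
  }
  return result;
}
```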