# Voice Input Processing Guide - Maternal Organization App

## Voice Processing Architecture

### Overview

Voice input enables hands-free logging during childcare activities. The system processes natural language in 5 languages, extracting structured data from casual speech patterns.

### Processing Pipeline
```
Audio Input → Speech Recognition → Language Detection →
Intent Classification → Entity Extraction → Action Execution →
Confirmation Feedback
```

---

## Whisper API Integration

### Configuration

```typescript
|
|
// services/whisperService.ts
|
|
import OpenAI from 'openai';
|
|
|
|
class WhisperService {
|
|
private client: OpenAI;
|
|
|
|
constructor() {
|
|
this.client = new OpenAI({
|
|
apiKey: process.env.OPENAI_API_KEY,
|
|
});
|
|
}
|
|
|
|
async transcribeAudio(audioBuffer: Buffer, language?: string): Promise<TranscriptionResult> {
|
|
try {
|
|
const response = await this.client.audio.transcriptions.create({
|
|
file: audioBuffer,
|
|
model: 'whisper-1',
|
|
language: language || 'en', // ISO-639-1 code
|
|
response_format: 'verbose_json',
|
|
timestamp_granularities: ['word'],
|
|
});
|
|
|
|
return {
|
|
text: response.text,
|
|
language: response.language,
|
|
confidence: this.calculateConfidence(response),
|
|
words: response.words,
|
|
};
|
|
} catch (error) {
|
|
return this.handleTranscriptionError(error);
|
|
}
|
|
}
|
|
}
|
|
```

### Audio Preprocessing

```typescript
|
|
// utils/audioPreprocessing.ts
|
|
export const preprocessAudio = async (audioFile: File): Promise<Buffer> => {
|
|
// Validate format
|
|
const validFormats = ['wav', 'mp3', 'm4a', 'webm'];
|
|
if (!validFormats.includes(getFileExtension(audioFile))) {
|
|
throw new Error('Unsupported audio format');
|
|
}
|
|
|
|
// Check file size (max 25MB for Whisper)
|
|
if (audioFile.size > 25 * 1024 * 1024) {
|
|
// Compress or chunk the audio
|
|
return await compressAudio(audioFile);
|
|
}
|
|
|
|
// Noise reduction for better accuracy
|
|
return await reduceNoise(audioFile);
|
|
};
|
|
```

---

## Natural Language Command Patterns

### Intent Classification

```typescript
|
|
enum VoiceIntent {
|
|
LOG_FEEDING = 'LOG_FEEDING',
|
|
LOG_SLEEP = 'LOG_SLEEP',
|
|
LOG_DIAPER = 'LOG_DIAPER',
|
|
LOG_MEDICATION = 'LOG_MEDICATION',
|
|
START_TIMER = 'START_TIMER',
|
|
STOP_TIMER = 'STOP_TIMER',
|
|
ASK_QUESTION = 'ASK_QUESTION',
|
|
CHECK_STATUS = 'CHECK_STATUS',
|
|
CANCEL = 'CANCEL'
|
|
}
|
|
|
|
interface IntentPattern {
|
|
intent: VoiceIntent;
|
|
patterns: RegExp[];
|
|
requiredEntities: string[];
|
|
examples: string[];
|
|
}
|
|
```

### English Language Patterns

```typescript
|
|
const englishPatterns: IntentPattern[] = [
|
|
{
|
|
intent: VoiceIntent.LOG_FEEDING,
|
|
patterns: [
|
|
/(?:baby |she |he )?(?:fed|ate|drank|had|nursed)/i,
|
|
/(?:bottle|breast|nursing|feeding)/i,
|
|
/(?:finished|done) (?:eating|feeding|nursing)/i,
|
|
],
|
|
requiredEntities: ['amount?', 'time?', 'type?'],
|
|
examples: [
|
|
"Baby fed 4 ounces",
|
|
"Just nursed for 15 minutes on the left",
|
|
"She had 120ml of formula at 3pm",
|
|
"Finished feeding, both sides, 20 minutes total"
|
|
]
|
|
},
|
|
{
|
|
intent: VoiceIntent.LOG_SLEEP,
|
|
patterns: [
|
|
/(?:went|going) (?:to )?(?:sleep|bed|nap)/i,
|
|
/(?:woke|wake|waking) up/i,
|
|
/(?:nap|sleep)(?:ping|ed)? (?:for|since)/i,
|
|
/(?:fell) asleep/i,
|
|
],
|
|
requiredEntities: ['time?', 'duration?'],
|
|
examples: [
|
|
"Down for a nap",
|
|
"Woke up from nap",
|
|
"Sleeping since 2pm",
|
|
"Just fell asleep in the stroller"
|
|
]
|
|
},
|
|
{
|
|
intent: VoiceIntent.LOG_DIAPER,
|
|
patterns: [
|
|
/(?:chang|dirty|wet|soil|poop|pee)/i,
|
|
/diaper/i,
|
|
/(?:number|#) (?:one|two|1|2)/i,
|
|
],
|
|
requiredEntities: ['type?'],
|
|
examples: [
|
|
"Changed wet diaper",
|
|
"Dirty diaper with rash",
|
|
"Just changed a poopy one",
|
|
"Diaper change, both wet and dirty"
|
|
]
|
|
}
|
|
];
|
|
```

### Multi-Language Patterns

```typescript
|
|
// Spanish patterns
|
|
const spanishPatterns: IntentPattern[] = [
|
|
{
|
|
intent: VoiceIntent.LOG_FEEDING,
|
|
patterns: [
|
|
/(?:comió|tomó|bebió|amamanté)/i,
|
|
/(?:biberón|pecho|lactancia)/i,
|
|
],
|
|
examples: [
|
|
"Tomó 120ml de fórmula",
|
|
"Amamanté 15 minutos lado izquierdo",
|
|
"Ya comió papilla"
|
|
]
|
|
}
|
|
];
|
|
|
|
// French patterns
|
|
const frenchPatterns: IntentPattern[] = [
|
|
{
|
|
intent: VoiceIntent.LOG_FEEDING,
|
|
patterns: [
|
|
/(?:mangé|bu|allaité|nourri)/i,
|
|
/(?:biberon|sein|tétée)/i,
|
|
],
|
|
examples: [
|
|
"Biberon de 120ml",
|
|
"Allaité 15 minutes côté gauche",
|
|
"A mangé sa purée"
|
|
]
|
|
}
|
|
];
|
|
|
|
// Portuguese patterns
|
|
const portuguesePatterns: IntentPattern[] = [
|
|
{
|
|
intent: VoiceIntent.LOG_FEEDING,
|
|
patterns: [
|
|
/(?:comeu|tomou|bebeu|amamentei)/i,
|
|
/(?:mamadeira|peito|amamentação)/i,
|
|
],
|
|
examples: [
|
|
"Tomou 120ml de fórmula",
|
|
"Amamentei 15 minutos lado esquerdo"
|
|
]
|
|
}
|
|
];
|
|
|
|
// Chinese patterns
|
|
const chinesePatterns: IntentPattern[] = [
|
|
{
|
|
intent: VoiceIntent.LOG_FEEDING,
|
|
patterns: [
|
|
/(?:喂|吃|喝|哺乳)/,
|
|
/(?:奶瓶|母乳|配方奶)/,
|
|
],
|
|
examples: [
|
|
"喝了120毫升配方奶",
|
|
"母乳喂养15分钟",
|
|
"吃了辅食"
|
|
]
|
|
}
|
|
];
|
|
```

---

## Entity Extraction

### Entity Types

```typescript
|
|
interface ExtractedEntities {
|
|
amount?: {
|
|
value: number;
|
|
unit: 'oz' | 'ml' | 'minutes';
|
|
};
|
|
time?: {
|
|
value: Date;
|
|
precision: 'exact' | 'approximate';
|
|
};
|
|
duration?: {
|
|
value: number;
|
|
unit: 'minutes' | 'hours';
|
|
};
|
|
side?: 'left' | 'right' | 'both';
|
|
type?: 'breast' | 'bottle' | 'solid' | 'wet' | 'dirty' | 'both';
|
|
location?: string;
|
|
notes?: string;
|
|
}
|
|
```

### Extraction Logic

```typescript
|
|
class EntityExtractor {
|
|
extractAmount(text: string): ExtractedEntities['amount'] {
|
|
// Numeric amounts with units
|
|
const amountPattern = /(\d+(?:\.\d+)?)\s*(oz|ounce|ml|milliliter|minute|min)/i;
|
|
const match = text.match(amountPattern);
|
|
|
|
if (match) {
|
|
return {
|
|
value: parseFloat(match[1]),
|
|
unit: this.normalizeUnit(match[2])
|
|
};
|
|
}
|
|
|
|
// Word numbers
|
|
const wordNumbers = {
|
|
'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
|
|
'ten': 10, 'fifteen': 15, 'twenty': 20, 'thirty': 30,
|
|
};
|
|
|
|
for (const [word, value] of Object.entries(wordNumbers)) {
|
|
if (text.includes(word)) {
|
|
return { value, unit: this.inferUnit(text) };
|
|
}
|
|
}
|
|
|
|
return undefined;
|
|
}
|
|
|
|
extractTime(text: string, timezone: string): ExtractedEntities['time'] {
|
|
const now = new Date();
|
|
|
|
// Relative times
|
|
if (/just|now|right now/i.test(text)) {
|
|
return { value: now, precision: 'exact' };
|
|
}
|
|
|
|
if (/ago/i.test(text)) {
|
|
const minutesAgo = this.extractMinutesAgo(text);
|
|
return {
|
|
value: new Date(now.getTime() - minutesAgo * 60000),
|
|
precision: 'approximate'
|
|
};
|
|
}
|
|
|
|
// Clock times
|
|
const timePattern = /(\d{1,2}):?(\d{2})?\s*(am|pm)?/i;
|
|
const match = text.match(timePattern);
|
|
|
|
if (match) {
|
|
return {
|
|
value: this.parseClockTime(match, timezone),
|
|
precision: 'exact'
|
|
};
|
|
}
|
|
|
|
return { value: now, precision: 'approximate' };
|
|
}
|
|
|
|
extractSide(text: string): ExtractedEntities['side'] {
|
|
if (/left|izquierdo|gauche|esquerdo|左/i.test(text)) return 'left';
|
|
if (/right|derecho|droit|direito|右/i.test(text)) return 'right';
|
|
if (/both|ambos|deux|ambos|两|両/i.test(text)) return 'both';
|
|
return undefined;
|
|
}
|
|
}
|
|
```

---

## Intent Processing Engine

### Main Processing Flow

```typescript
|
|
class VoiceCommandProcessor {
|
|
async processVoiceInput(
|
|
audioBuffer: Buffer,
|
|
context: UserContext
|
|
): Promise<ProcessedCommand> {
|
|
// 1. Transcribe audio
|
|
const transcription = await this.whisperService.transcribeAudio(
|
|
audioBuffer,
|
|
context.language
|
|
);
|
|
|
|
if (transcription.confidence < 0.5) {
|
|
return this.handleLowConfidence(transcription);
|
|
}
|
|
|
|
// 2. Detect intent
|
|
const intent = await this.detectIntent(
|
|
transcription.text,
|
|
context.language
|
|
);
|
|
|
|
// 3. Extract entities
|
|
const entities = await this.extractEntities(
|
|
transcription.text,
|
|
intent,
|
|
context
|
|
);
|
|
|
|
// 4. Validate command
|
|
const validation = this.validateCommand(intent, entities);
|
|
|
|
if (!validation.isValid) {
|
|
return this.requestClarification(validation.missingInfo);
|
|
}
|
|
|
|
// 5. Execute action
|
|
return this.executeCommand(intent, entities, context);
|
|
}
|
|
|
|
private async detectIntent(
|
|
text: string,
|
|
language: string
|
|
): Promise<VoiceIntent> {
|
|
const patterns = this.getPatternsByLanguage(language);
|
|
|
|
for (const pattern of patterns) {
|
|
for (const regex of pattern.patterns) {
|
|
if (regex.test(text)) {
|
|
return pattern.intent;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fallback to AI intent detection
|
|
return this.detectIntentWithAI(text, language);
|
|
}
|
|
}
|
|
```

---

## Error Recovery

### Common Recognition Errors

```typescript
|
|
interface RecognitionError {
|
|
type: 'LOW_CONFIDENCE' | 'AMBIGUOUS' | 'MISSING_DATA' | 'INVALID_VALUE';
|
|
originalText: string;
|
|
suggestions?: string[];
|
|
}
|
|
|
|
class ErrorRecovery {
|
|
handleLowConfidence(transcription: TranscriptionResult): ProcessedCommand {
|
|
// Check for common misheard phrases
|
|
const corrections = this.checkCommonMishears(transcription.text);
|
|
|
|
if (corrections.confidence > 0.7) {
|
|
return this.retryWithCorrection(corrections.text);
|
|
}
|
|
|
|
return {
|
|
success: false,
|
|
action: 'CONFIRM',
|
|
message: `Did you say "${transcription.text}"?`,
|
|
alternatives: this.getSimilarPhrases(transcription.text)
|
|
};
|
|
}
|
|
|
|
checkCommonMishears(text: string): CorrectionResult {
|
|
const corrections = {
|
|
'for ounces': 'four ounces',
|
|
'to ounces': 'two ounces',
|
|
'write side': 'right side',
|
|
'laugh side': 'left side',
|
|
'wet and dirty': 'wet and dirty',
|
|
'wedding dirty': 'wet and dirty',
|
|
};
|
|
|
|
for (const [misheard, correct] of Object.entries(corrections)) {
|
|
if (text.includes(misheard)) {
|
|
return {
|
|
text: text.replace(misheard, correct),
|
|
confidence: 0.8
|
|
};
|
|
}
|
|
}
|
|
|
|
return { text, confidence: 0.3 };
|
|
}
|
|
}
|
|
```

### Clarification Prompts

```typescript
|
|
const clarificationPrompts = {
|
|
MISSING_AMOUNT: {
|
|
en: "How much did baby eat?",
|
|
es: "¿Cuánto comió el bebé?",
|
|
fr: "Combien a mangé bébé?",
|
|
pt: "Quanto o bebê comeu?",
|
|
zh: "宝宝吃了多少?"
|
|
},
|
|
MISSING_TIME: {
|
|
en: "When did this happen?",
|
|
es: "¿Cuándo ocurrió esto?",
|
|
fr: "Quand cela s'est-il passé?",
|
|
pt: "Quando isso aconteceu?",
|
|
zh: "这是什么时候发生的?"
|
|
},
|
|
AMBIGUOUS_INTENT: {
|
|
en: "What would you like to log?",
|
|
es: "¿Qué te gustaría registrar?",
|
|
fr: "Que souhaitez-vous enregistrer?",
|
|
pt: "O que você gostaria de registrar?",
|
|
zh: "您想记录什么?"
|
|
}
|
|
};
|
|
```

---

## Offline Voice Processing

### Fallback Strategy

```typescript
|
|
class OfflineVoiceProcessor {
|
|
async processOffline(audioBuffer: Buffer): Promise<BasicTranscription> {
|
|
// Use device's native speech recognition
|
|
if (Platform.OS === 'ios') {
|
|
return this.useiOSSpeechRecognition(audioBuffer);
|
|
} else if (Platform.OS === 'android') {
|
|
return this.useAndroidSpeechRecognition(audioBuffer);
|
|
}
|
|
|
|
// Queue for later processing
|
|
return this.queueForOnlineProcessing(audioBuffer);
|
|
}
|
|
|
|
private async useiOSSpeechRecognition(audio: Buffer) {
|
|
// Use SFSpeechRecognizer
|
|
const recognizer = new SFSpeechRecognizer();
|
|
return recognizer.recognize(audio);
|
|
}
|
|
|
|
private async useAndroidSpeechRecognition(audio: Buffer) {
|
|
// Use Android SpeechRecognizer
|
|
const recognizer = new AndroidSpeechRecognizer();
|
|
return recognizer.recognize(audio);
|
|
}
|
|
}
|
|
```

---

## Confirmation & Feedback

### Voice Feedback System

```typescript
|
|
interface VoiceConfirmation {
|
|
text: string;
|
|
speech: string; // SSML for TTS
|
|
visual: {
|
|
icon: string;
|
|
color: string;
|
|
animation: string;
|
|
};
|
|
haptic?: 'success' | 'warning' | 'error';
|
|
}
|
|
|
|
const confirmations = {
|
|
FEEDING_LOGGED: {
|
|
text: "Feeding logged",
|
|
speech: "<speak>Got it! <break time='200ms'/> Logged <say-as interpret-as='cardinal'>4</say-as> ounces.</speak>",
|
|
visual: {
|
|
icon: 'check_circle',
|
|
color: 'success',
|
|
animation: 'bounce'
|
|
},
|
|
haptic: 'success'
|
|
}
|
|
};
|
|
```

---

## Testing Voice Commands

### Test Scenarios

```typescript
|
|
const voiceTestCases = [
|
|
// English
|
|
{ input: "Baby ate 4 ounces", expected: { intent: 'LOG_FEEDING', amount: 4, unit: 'oz' }},
|
|
{ input: "Nursed for fifteen minutes on the left", expected: { intent: 'LOG_FEEDING', duration: 15, side: 'left' }},
|
|
|
|
// Spanish
|
|
{ input: "Tomó 120 mililitros", expected: { intent: 'LOG_FEEDING', amount: 120, unit: 'ml' }},
|
|
|
|
// Edge cases
|
|
{ input: "Fed... um... about 4 or 5 ounces", expected: { intent: 'LOG_FEEDING', amount: 4, confidence: 'low' }},
|
|
{ input: "Changed a really dirty diaper", expected: { intent: 'LOG_DIAPER', type: 'dirty', notes: 'really dirty' }},
|
|
];
|
|
```

---

## Performance Optimization

### Audio Streaming

```typescript
|
|
class StreamingVoiceProcessor {
|
|
private audioChunks: Buffer[] = [];
|
|
private isProcessing = false;
|
|
|
|
async processStream(chunk: Buffer) {
|
|
this.audioChunks.push(chunk);
|
|
|
|
if (!this.isProcessing && this.hasEnoughAudio()) {
|
|
this.isProcessing = true;
|
|
const result = await this.processChunks();
|
|
this.isProcessing = false;
|
|
return result;
|
|
}
|
|
}
|
|
|
|
private hasEnoughAudio(): boolean {
|
|
// Need at least 0.5 seconds of audio
|
|
const totalSize = this.audioChunks.reduce((sum, chunk) => sum + chunk.length, 0);
|
|
return totalSize > 8000; // ~0.5s at 16kHz
|
|
}
|
|
}
|
|
```

### Caching Common Commands

```typescript
|
|
const commandCache = new LRUCache<string, ProcessedCommand>({
|
|
max: 100,
|
|
ttl: 1000 * 60 * 60, // 1 hour
|
|
});
|
|
|
|
// Cache exact matches for common phrases
|
|
const cachedPhrases = [
|
|
"wet diaper",
|
|
"dirty diaper",
|
|
"just nursed",
|
|
"bottle feeding done",
|
|
"down for a nap",
|
|
"woke up"
|
|
];
|
|
```