Add comprehensive .gitignore
This commit is contained in:
590
docs/maternal-app-voice-processing.md
Normal file
590
docs/maternal-app-voice-processing.md
Normal file
@@ -0,0 +1,590 @@
|
||||
# Voice Input Processing Guide - Maternal Organization App
|
||||
|
||||
## Voice Processing Architecture
|
||||
|
||||
### Overview
|
||||
Voice input enables hands-free logging during childcare activities. The system processes natural language in five languages (English, Spanish, French, Portuguese, and Chinese), extracting structured data from casual speech patterns.
|
||||
|
||||
### Processing Pipeline
|
||||
```
|
||||
Audio Input → Speech Recognition → Language Detection →
|
||||
Intent Classification → Entity Extraction → Action Execution →
|
||||
Confirmation Feedback
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Whisper API Integration
|
||||
|
||||
### Configuration
|
||||
```typescript
|
||||
// services/whisperService.ts
|
||||
import OpenAI from 'openai';
|
||||
|
||||
/**
 * Thin wrapper around the OpenAI audio transcription endpoint (Whisper).
 *
 * The API key is read from `process.env.OPENAI_API_KEY` at construction time.
 */
class WhisperService {
  private client: OpenAI;

  constructor() {
    this.client = new OpenAI({
      apiKey: process.env.OPENAI_API_KEY,
    });
  }

  /**
   * Transcribes one recorded utterance.
   *
   * @param audioBuffer Raw audio to transcribe.
   * @param language Optional ISO-639-1 code used as a hint. When omitted, no
   *   hint is sent so the service can detect the spoken language itself
   *   instead of being forced to English.
   * @returns The transcript, the language reported by the service, a
   *   confidence score from `calculateConfidence`, and word-level timestamps.
   *   Failures are routed through `handleTranscriptionError`.
   */
  async transcribeAudio(audioBuffer: Buffer, language?: string): Promise<TranscriptionResult> {
    try {
      const response = await this.client.audio.transcriptions.create({
        // NOTE(review): the SDK documents `file` as a file-like upload; confirm a
        // bare Buffer is accepted or wrap it (e.g. with the SDK's `toFile` helper).
        file: audioBuffer,
        model: 'whisper-1',
        // Only send a language hint when the caller actually supplied one.
        ...(language ? { language } : {}),
        response_format: 'verbose_json',
        timestamp_granularities: ['word'],
      });

      return {
        text: response.text,
        language: response.language,
        confidence: this.calculateConfidence(response),
        words: response.words,
      };
    } catch (error) {
      return this.handleTranscriptionError(error);
    }
  }
}
|
||||
```
|
||||
|
||||
### Audio Preprocessing
|
||||
```typescript
|
||||
// utils/audioPreprocessing.ts
|
||||
/**
 * Prepares a recorded audio file for upload to Whisper.
 *
 * Rejects unsupported extensions, hands files above the 25MB Whisper limit to
 * `compressAudio`, and runs everything else through `reduceNoise`.
 *
 * @throws Error when the file extension is not wav, mp3, m4a or webm.
 */
export const preprocessAudio = async (audioFile: File): Promise<Buffer> => {
  const supportedExtensions = ['wav', 'mp3', 'm4a', 'webm'];
  const extension = getFileExtension(audioFile);
  const isSupported = supportedExtensions.includes(extension);

  if (!isSupported) {
    throw new Error('Unsupported audio format');
  }

  // Whisper accepts at most 25MB per upload.
  const whisperLimitBytes = 25 * 1024 * 1024;
  const isOversized = audioFile.size > whisperLimitBytes;

  // Oversized files are compressed (or chunked); the rest get noise reduction.
  return isOversized ? await compressAudio(audioFile) : await reduceNoise(audioFile);
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Natural Language Command Patterns
|
||||
|
||||
### Intent Classification
|
||||
```typescript
|
||||
/**
 * Intents a transcribed voice command can be classified as.
 *
 * String enum: every member's runtime value is identical to its name.
 */
enum VoiceIntent {
  // Logging
  LOG_FEEDING = 'LOG_FEEDING',
  LOG_SLEEP = 'LOG_SLEEP',
  LOG_DIAPER = 'LOG_DIAPER',
  LOG_MEDICATION = 'LOG_MEDICATION',

  // Timers
  START_TIMER = 'START_TIMER',
  STOP_TIMER = 'STOP_TIMER',

  // Queries and control
  ASK_QUESTION = 'ASK_QUESTION',
  CHECK_STATUS = 'CHECK_STATUS',
  CANCEL = 'CANCEL',
}
|
||||
|
||||
/**
 * Declarative description of how one intent is recognised in one language.
 */
interface IntentPattern {
  /** Intent reported when any entry of `patterns` matches. */
  intent: VoiceIntent;

  /** Regular expressions tested against the transcript. */
  patterns: Array<RegExp>;

  /** Entity names for this intent; a trailing `?` marks an optional entity. */
  requiredEntities: Array<string>;

  /** Sample utterances that illustrate the intent. */
  examples: Array<string>;
}
|
||||
```
|
||||
|
||||
### English Language Patterns
|
||||
```typescript
|
||||
/**
 * English intent patterns.
 *
 * Every alternation is anchored on word boundaries so that a keyword cannot
 * fire from inside an unrelated word (e.g. "ate" inside "late"). Entries are
 * evaluated in array order by the intent detector, so an utterance that
 * mentions several activities resolves to the first matching entry.
 */
const englishPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /\b(?:fed|ate|drank|nursed)\b/i,
      // "had" alone is ambiguous ("had a wet diaper"), so it only counts when
      // a volume or a food word follows it.
      /\bhad\b(?=.*(?:\d\s*(?:oz|ounces?|ml|milliliters?)\b|\b(?:formula|milk|bottle|solids?)\b))/i,
      /\b(?:bottle|breast(?:fed|feeding|milk)?|nursing|feeding)\b/i,
      /\b(?:finished|done)\s+(?:eating|feeding|nursing)\b/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Baby fed 4 ounces",
      "Just nursed for 15 minutes on the left",
      "She had 120ml of formula at 3pm",
      "Finished feeding, both sides, 20 minutes total",
    ],
  },
  {
    intent: VoiceIntent.LOG_SLEEP,
    patterns: [
      /\b(?:went|going)\s+(?:to\s+)?(?:sleep|bed|nap)\b/i,
      /\b(?:woke|wake|waking)\s+up\b/i,
      // Spelled-out inflections: the previous "(nap|sleep)(ping|ed)?" form
      // could not match "sleeping" or "napped".
      /\b(?:nap(?:ping|ped)?|sleep(?:ing)?|slept)\s+(?:for|since)\b/i,
      /\bfell\s+asleep\b/i,
      // Covers the "Down for a nap" example below.
      /\bdown\s+for\s+(?:(?:a|the|her|his)\s+)?(?:nap|night)\b/i,
    ],
    requiredEntities: ['time?', 'duration?'],
    examples: [
      "Down for a nap",
      "Woke up from nap",
      "Sleeping since 2pm",
      "Just fell asleep in the stroller",
    ],
  },
  {
    intent: VoiceIntent.LOG_DIAPER,
    patterns: [
      /\b(?:chang(?:e|ed|es|ing)|dirty|wet|soil(?:ed)?|poop(?:y|ed|s)?|pee(?:d|s)?)\b/i,
      /\bdiapers?\b/i,
      /(?:\bnumber|#)\s*(?:one|two|1|2)\b/i,
    ],
    requiredEntities: ['type?'],
    examples: [
      "Changed wet diaper",
      "Dirty diaper with rash",
      "Just changed a poopy one",
      "Diaper change, both wet and dirty",
    ],
  },
];
|
||||
```
|
||||
|
||||
### Multi-Language Patterns
|
||||
```typescript
|
||||
// Spanish patterns
|
||||
/**
 * Spanish intent patterns (feeding only).
 *
 * `requiredEntities` is mandatory on `IntentPattern`; it mirrors the English
 * feeding entry so the entry type-checks and validates the same way.
 */
const spanishPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:comió|tomó|bebió|amamanté)/i,
      /(?:biberón|pecho|lactancia)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Tomó 120ml de fórmula",
      "Amamanté 15 minutos lado izquierdo",
      "Ya comió papilla",
    ],
  },
];
|
||||
|
||||
// French patterns
|
||||
/**
 * French intent patterns (feeding only).
 *
 * Keywords are delimited with Unicode-aware lookarounds rather than `\b`:
 * `\b` treats accented letters as non-word characters, and short stems such
 * as "bu" or "sein" would otherwise match inside unrelated words ("début",
 * "dessein"). Optional `e`/`s` endings cover agreement forms ("allaitée").
 *
 * `requiredEntities` is mandatory on `IntentPattern`; it mirrors the English
 * feeding entry.
 */
const frenchPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?<!\p{L})(?:mangé|bu|allaité|nourri)e?s?(?!\p{L})/iu,
      /(?<!\p{L})(?:biberons?|seins?|tétées?)(?!\p{L})/iu,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Biberon de 120ml",
      "Allaité 15 minutes côté gauche",
      "A mangé sa purée",
    ],
  },
];
|
||||
|
||||
// Portuguese patterns
|
||||
/**
 * Portuguese intent patterns (feeding only).
 *
 * `requiredEntities` is mandatory on `IntentPattern`; it mirrors the English
 * feeding entry so the entry type-checks and validates the same way.
 */
const portuguesePatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:comeu|tomou|bebeu|amamentei)/i,
      /(?:mamadeira|peito|amamentação)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Tomou 120ml de fórmula",
      "Amamentei 15 minutos lado esquerdo",
    ],
  },
];
|
||||
|
||||
// Chinese patterns
|
||||
/**
 * Chinese intent patterns (feeding only).
 *
 * `requiredEntities` is mandatory on `IntentPattern`; it mirrors the English
 * feeding entry so the entry type-checks and validates the same way.
 */
const chinesePatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:喂|吃|喝|哺乳)/,
      /(?:奶瓶|母乳|配方奶)/,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "喝了120毫升配方奶",
      "母乳喂养15分钟",
      "吃了辅食",
    ],
  },
];
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Entity Extraction
|
||||
|
||||
### Entity Types
|
||||
```typescript
|
||||
/**
 * Structured values pulled out of a transcript. Every field is optional
 * because an utterance may mention any subset of them.
 */
interface ExtractedEntities {
  /** Quantity with its unit. */
  amount?: { value: number; unit: 'oz' | 'ml' | 'minutes' };

  /** Point in time, flagged as exact or approximate. */
  time?: { value: Date; precision: 'exact' | 'approximate' };

  /** Length of time with its unit. */
  duration?: { value: number; unit: 'minutes' | 'hours' };

  side?: 'left' | 'right' | 'both';

  type?: 'breast' | 'bottle' | 'solid' | 'wet' | 'dirty' | 'both';

  location?: string;

  notes?: string;
}
|
||||
```
|
||||
|
||||
### Extraction Logic
|
||||
```typescript
|
||||
/**
 * Pulls structured entities (amount, time, side) out of a transcript.
 *
 * Relies on the helpers `normalizeUnit`, `inferUnit`, `extractMinutesAgo` and
 * `parseClockTime`, which are defined elsewhere.
 */
class EntityExtractor {
  /**
   * Extracts a quantity.
   *
   * A digit-based amount with a unit wins. Otherwise the first spelled-out
   * number in the text is used and its unit comes from `inferUnit`. Number
   * words are matched as whole words, case-insensitively, so "Done" no longer
   * yields 1 and "Fifteen" is recognised.
   *
   * @returns The amount, or `undefined` when the text contains none.
   */
  extractAmount(text: string): ExtractedEntities['amount'] {
    // Numeric amounts with units
    const amountPattern = /(\d+(?:\.\d+)?)\s*(oz|ounce|ml|milliliter|minute|min)/i;
    const match = text.match(amountPattern);

    if (match) {
      return {
        value: parseFloat(match[1]),
        unit: this.normalizeUnit(match[2]),
      };
    }

    // Word numbers
    const wordNumbers: Record<string, number> = {
      one: 1, two: 2, three: 3, four: 4, five: 5,
      ten: 10, fifteen: 15, twenty: 20, thirty: 30,
    };
    const wordPattern = new RegExp(`\\b(${Object.keys(wordNumbers).join('|')})\\b`, 'i');
    const wordMatch = text.match(wordPattern);

    if (wordMatch) {
      return {
        value: wordNumbers[wordMatch[1].toLowerCase()],
        unit: this.inferUnit(text),
      };
    }

    return undefined;
  }

  /**
   * Extracts when the event happened.
   *
   * Precedence, most specific first: an explicit clock time, then an
   * "... ago" offset, then "just"/"now", and finally the current time marked
   * as approximate.
   *
   * A clock time must carry either `:mm` or an am/pm suffix, so plain
   * quantities such as "4 ounces" or "15 minutes" are not read as times.
   *
   * @param timezone Passed through to `parseClockTime`.
   */
  extractTime(text: string, timezone: string): ExtractedEntities['time'] {
    const now = new Date();

    // Clock times. Groups: 1 = hour, 2 = minutes (optional), 3 = am/pm (optional).
    const timePattern = /\b(\d{1,2})(?::(\d{2})(?!\d)|(?=\s*[ap]m\b))(?:\s*(am|pm)\b)?/i;
    const match = text.match(timePattern);

    if (match) {
      return {
        value: this.parseClockTime(match, timezone),
        precision: 'exact',
      };
    }

    // Relative offsets ("ten minutes ago"); whole word so "Chicago" is ignored.
    if (/\bago\b/i.test(text)) {
      const minutesAgo = this.extractMinutesAgo(text);
      return {
        value: new Date(now.getTime() - minutesAgo * 60000),
        precision: 'approximate',
      };
    }

    // "just" / "now" / "right now"; whole words so "know" is ignored.
    if (/\b(?:just|now)\b/i.test(text)) {
      return { value: now, precision: 'exact' };
    }

    return { value: now, precision: 'approximate' };
  }

  /**
   * Extracts the side mentioned in English, Spanish, French, Portuguese or
   * Chinese.
   *
   * - Latin-script keywords are delimited with Unicode-aware lookarounds.
   * - "right now" / "right away" are not treated as a side.
   * - Masculine and feminine forms are accepted ("izquierdo" / "izquierda").
   * - "deux" and "两" only count inside a "both sides" phrase, because on
   *   their own they simply mean "two".
   * - An utterance naming left and right together is reported as 'both'.
   *
   * @returns The side, or `undefined` when none is mentioned.
   */
  extractSide(text: string): ExtractedEntities['side'] {
    const leftPattern = /(?<!\p{L})(?:left|izquierd[oa]|gauche|esquerd[oa])(?!\p{L})|左/iu;
    const rightPattern =
      /(?<!\p{L})(?:right(?!\s+(?:now|away))|derech[oa]|droite?|direit[oa])(?!\p{L})|右/iu;
    const bothPattern =
      /(?<!\p{L})(?:both|amb[oa]s|les\s+deux|deux\s+(?:côtés|seins))(?!\p{L})|两[边侧]|双[边侧]|両[方側]/iu;

    const mentionsLeft = leftPattern.test(text);
    const mentionsRight = rightPattern.test(text);

    if (bothPattern.test(text) || (mentionsLeft && mentionsRight)) return 'both';
    if (mentionsLeft) return 'left';
    if (mentionsRight) return 'right';
    return undefined;
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Intent Processing Engine
|
||||
|
||||
### Main Processing Flow
|
||||
```typescript
|
||||
/**
 * Orchestrates the voice pipeline: transcription, intent detection, entity
 * extraction, validation and execution.
 */
class VoiceCommandProcessor {
  /**
   * Runs one recorded utterance through the whole pipeline.
   *
   * Returns early with the low-confidence handler's result when the
   * transcription confidence is below 0.5, and with a clarification request
   * when validation reports the command as invalid.
   */
  async processVoiceInput(
    audioBuffer: Buffer,
    context: UserContext
  ): Promise<ProcessedCommand> {
    const { language } = context;

    // Step 1: speech to text.
    const transcription = await this.whisperService.transcribeAudio(audioBuffer, language);
    if (transcription.confidence < 0.5) {
      return this.handleLowConfidence(transcription);
    }

    // Step 2: classify the utterance.
    const spokenText = transcription.text;
    const intent = await this.detectIntent(spokenText, language);

    // Step 3: pull out structured values.
    const entities = await this.extractEntities(spokenText, intent, context);

    // Step 4: make sure the command is complete.
    const validation = this.validateCommand(intent, entities);
    if (!validation.isValid) {
      return this.requestClarification(validation.missingInfo);
    }

    // Step 5: carry it out.
    return this.executeCommand(intent, entities, context);
  }

  /**
   * Resolves the intent of `text`: the first language-specific entry with any
   * matching regex wins; when nothing matches, the AI-based detector is used.
   */
  private async detectIntent(
    text: string,
    language: string
  ): Promise<VoiceIntent> {
    const candidates = this.getPatternsByLanguage(language);
    const firstHit = candidates.find((candidate) =>
      candidate.patterns.some((expression) => expression.test(text))
    );

    if (firstHit) {
      return firstHit.intent;
    }

    // No regex matched: fall back to AI intent detection.
    return this.detectIntentWithAI(text, language);
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Error Recovery
|
||||
|
||||
### Common Recognition Errors
|
||||
```typescript
|
||||
/** A recognition problem together with the text that produced it. */
interface RecognitionError {
  /** Category of the problem. */
  type:
    | 'LOW_CONFIDENCE'
    | 'AMBIGUOUS'
    | 'MISSING_DATA'
    | 'INVALID_VALUE';

  /** Transcript as it was recognised. */
  originalText: string;

  /** Optional alternative phrasings. */
  suggestions?: Array<string>;
}
|
||||
|
||||
/**
 * Recovery strategies for transcripts the recogniser was unsure about.
 */
class ErrorRecovery {
  /**
   * Handles a low-confidence transcript.
   *
   * When a known mishearing can be corrected with confidence above 0.7, the
   * corrected text is retried; otherwise the user is asked to confirm what
   * was heard and offered similar phrases.
   */
  handleLowConfidence(transcription: TranscriptionResult): ProcessedCommand {
    // Check for common misheard phrases
    const corrections = this.checkCommonMishears(transcription.text);

    if (corrections.confidence > 0.7) {
      return this.retryWithCorrection(corrections.text);
    }

    return {
      success: false,
      action: 'CONFIRM',
      message: `Did you say "${transcription.text}"?`,
      alternatives: this.getSimilarPhrases(transcription.text),
    };
  }

  /**
   * Rewrites commonly misheard phrases.
   *
   * Every known mishearing is replaced, at every occurrence, ignoring case
   * and only on whole-word boundaries.
   *
   * @returns The rewritten text with confidence 0.8 when something actually
   *   changed; the untouched text with confidence 0.3 otherwise. Text that is
   *   already correct (e.g. "wet and dirty") is never reported as corrected,
   *   so an unchanged transcript is not retried.
   */
  checkCommonMishears(text: string): CorrectionResult {
    const corrections: ReadonlyArray<readonly [RegExp, string]> = [
      [/\bfor ounces\b/gi, 'four ounces'],
      [/\bto ounces\b/gi, 'two ounces'],
      [/\bwrite side\b/gi, 'right side'],
      [/\blaugh side\b/gi, 'left side'],
      [/\bwedding dirty\b/gi, 'wet and dirty'],
    ];

    let corrected = text;
    for (const [misheard, replacement] of corrections) {
      corrected = corrected.replace(misheard, replacement);
    }

    if (corrected !== text) {
      return { text: corrected, confidence: 0.8 };
    }

    return { text, confidence: 0.3 };
  }
}
|
||||
```
|
||||
|
||||
### Clarification Prompts
|
||||
```typescript
|
||||
/**
 * Follow-up questions asked when a command is incomplete or unclear, keyed by
 * problem and then by language code (en, es, fr, pt, zh).
 */
const clarificationPrompts = {
  // The amount is missing.
  MISSING_AMOUNT: {
    en: "How much did baby eat?",
    es: "¿Cuánto comió el bebé?",
    fr: "Combien a mangé bébé?",
    pt: "Quanto o bebê comeu?",
    zh: "宝宝吃了多少?",
  },

  // The time is missing.
  MISSING_TIME: {
    en: "When did this happen?",
    es: "¿Cuándo ocurrió esto?",
    fr: "Quand cela s'est-il passé?",
    pt: "Quando isso aconteceu?",
    zh: "这是什么时候发生的?",
  },

  // The intent could not be determined.
  AMBIGUOUS_INTENT: {
    en: "What would you like to log?",
    es: "¿Qué te gustaría registrar?",
    fr: "Que souhaitez-vous enregistrer?",
    pt: "O que você gostaria de registrar?",
    zh: "您想记录什么?",
  },
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Offline Voice Processing
|
||||
|
||||
### Fallback Strategy
|
||||
```typescript
|
||||
/**
 * Offline fallback: uses the device's native speech recognition where one is
 * available and queues the audio for later online processing otherwise.
 */
class OfflineVoiceProcessor {
  /** Dispatches on `Platform.OS`; any platform other than iOS/Android is queued. */
  async processOffline(audioBuffer: Buffer): Promise<BasicTranscription> {
    switch (Platform.OS) {
      case 'ios':
        return this.useiOSSpeechRecognition(audioBuffer);
      case 'android':
        return this.useAndroidSpeechRecognition(audioBuffer);
      default:
        // No native recogniser: queue for later processing.
        return this.queueForOnlineProcessing(audioBuffer);
    }
  }

  /** Recognises speech with SFSpeechRecognizer. */
  private async useiOSSpeechRecognition(audio: Buffer) {
    const iosRecognizer = new SFSpeechRecognizer();
    const transcription = iosRecognizer.recognize(audio);
    return transcription;
  }

  /** Recognises speech with the Android SpeechRecognizer. */
  private async useAndroidSpeechRecognition(audio: Buffer) {
    const androidRecognizer = new AndroidSpeechRecognizer();
    const transcription = androidRecognizer.recognize(audio);
    return transcription;
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Confirmation & Feedback
|
||||
|
||||
### Voice Feedback System
|
||||
```typescript
|
||||
/** Multi-channel confirmation given to the user after a command is handled. */
interface VoiceConfirmation {
  /** Plain-text message. */
  text: string;

  /** SSML markup for text-to-speech. */
  speech: string;

  /** On-screen presentation. */
  visual: { icon: string; color: string; animation: string };

  /** Optional haptic pattern. */
  haptic?: 'success' | 'warning' | 'error';
}
|
||||
|
||||
/** Ready-made confirmations, keyed by the event they acknowledge. */
const confirmations = {
  FEEDING_LOGGED: {
    text: "Feeding logged",
    // SSML: short pause, then the amount read out as a cardinal number.
    speech: "<speak>Got it! <break time='200ms'/> Logged <say-as interpret-as='cardinal'>4</say-as> ounces.</speak>",
    visual: { icon: 'check_circle', color: 'success', animation: 'bounce' },
    haptic: 'success',
  },
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Testing Voice Commands
|
||||
|
||||
### Test Scenarios
|
||||
```typescript
|
||||
/** Sample utterances paired with the values the pipeline is expected to produce. */
const voiceTestCases = [
  // English
  {
    input: "Baby ate 4 ounces",
    expected: { intent: 'LOG_FEEDING', amount: 4, unit: 'oz' },
  },
  {
    input: "Nursed for fifteen minutes on the left",
    expected: { intent: 'LOG_FEEDING', duration: 15, side: 'left' },
  },

  // Spanish
  {
    input: "Tomó 120 mililitros",
    expected: { intent: 'LOG_FEEDING', amount: 120, unit: 'ml' },
  },

  // Edge cases
  {
    input: "Fed... um... about 4 or 5 ounces",
    expected: { intent: 'LOG_FEEDING', amount: 4, confidence: 'low' },
  },
  {
    input: "Changed a really dirty diaper",
    expected: { intent: 'LOG_DIAPER', type: 'dirty', notes: 'really dirty' },
  },
];
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
### Audio Streaming
|
||||
```typescript
|
||||
/**
 * Buffers incoming audio chunks and processes them once enough audio has
 * accumulated, never running more than one processing pass at a time.
 */
class StreamingVoiceProcessor {
  private audioChunks: Buffer[] = [];
  private isProcessing = false;

  /**
   * Appends `chunk` to the buffer and, when no pass is running and enough
   * audio is buffered, processes the buffered chunks.
   *
   * @returns The result of `processChunks()` when a pass ran, otherwise
   *   `undefined`. A rejection from `processChunks()` propagates to the
   *   caller, but the busy flag is always cleared so later chunks can still
   *   trigger processing.
   */
  async processStream(chunk: Buffer) {
    this.audioChunks.push(chunk);

    if (!this.isProcessing && this.hasEnoughAudio()) {
      this.isProcessing = true;
      try {
        return await this.processChunks();
      } finally {
        // Without this, one failed pass would block the stream permanently.
        this.isProcessing = false;
      }
    }
  }

  /**
   * Reports whether more than 8000 bytes are buffered.
   *
   * NOTE(review): the original comment equates this with ~0.5s at 16kHz; that
   * only holds for one byte per sample. Confirm the sample format.
   */
  private hasEnoughAudio(): boolean {
    // Need at least 0.5 seconds of audio
    const totalSize = this.audioChunks.reduce((sum, chunk) => sum + chunk.length, 0);
    return totalSize > 8000; // ~0.5s at 16kHz
  }
}
|
||||
```
|
||||
|
||||
### Caching Common Commands
|
||||
```typescript
|
||||
/** LRU cache of processed commands: at most 100 entries, each kept for one hour. */
const commandCache = new LRUCache<string, ProcessedCommand>({
  max: 100,
  ttl: 60 * 60 * 1000, // one hour, in milliseconds
});
|
||||
|
||||
// Cache exact matches for common phrases
|
||||
/** Common phrases whose exact matches are worth caching. */
const cachedPhrases = [
  // Diapers
  "wet diaper",
  "dirty diaper",
  // Feeding
  "just nursed",
  "bottle feeding done",
  // Sleep
  "down for a nap",
  "woke up",
];
|
||||
```
|
||||
Reference in New Issue
Block a user