Voice Input Processing Guide - Maternal Organization App

Voice Processing Architecture

Overview

Voice input enables hands-free logging during childcare activities. The system processes natural language in five languages (English, Spanish, French, Portuguese, and Chinese), extracting structured data from casual speech patterns.

Processing Pipeline

Audio Input → Speech Recognition → Language Detection → 
Intent Classification → Entity Extraction → Action Execution → 
Confirmation Feedback
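Each stage can be modeled as an async function over the previous stage's output. A minimal sketch of the stage contracts (the names here are illustrative, not the actual module API; the referenced types are defined later in this guide):

// types/pipeline.ts -- illustrative stage contracts for the pipeline above
type Stage<I, O> = (input: I) => Promise<O>;

interface PipelineStages {
  recognize: Stage<Buffer, string>;                     // Audio Input → transcript
  detectLanguage: Stage<string, string>;                // transcript → ISO-639-1 code
  classifyIntent: Stage<string, VoiceIntent>;           // transcript → intent
  extractEntities: Stage<string, ExtractedEntities>;    // transcript → structured data
  execute: Stage<ExtractedEntities, ProcessedCommand>;  // structured data → logged entry
}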

Whisper API Integration

Configuration

// services/whisperService.ts
import OpenAI, { toFile } from 'openai';

interface TranscriptionResult {
  text: string;
  language: string;
  confidence: number;
  words?: { word: string; start: number; end: number }[];
}

class WhisperService {
  private client: OpenAI;

  constructor() {
    this.client = new OpenAI({
      apiKey: process.env.OPENAI_API_KEY,
    });
  }

  async transcribeAudio(audioBuffer: Buffer, language?: string): Promise<TranscriptionResult> {
    try {
      // The SDK expects a file-like object, not a raw Buffer, so wrap it with toFile
      const response = await this.client.audio.transcriptions.create({
        file: await toFile(audioBuffer, 'audio.webm'),
        model: 'whisper-1',
        language: language || 'en', // ISO-639-1 code
        response_format: 'verbose_json',
        timestamp_granularities: ['word'],
      });

      return {
        text: response.text,
        language: response.language,
        confidence: this.calculateConfidence(response),
        words: response.words,
      };
    } catch (error) {
      return this.handleTranscriptionError(error);
    }
  }
}
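Whisper does not return a single confidence value. One way calculateConfidence could derive a rough 0–1 score is from the per-segment avg_logprob and no_speech_prob fields in the verbose_json response (include 'segment' in timestamp_granularities if segment data is needed alongside words); this scoring is an assumption, not part of the Whisper API:

// Possible calculateConfidence: fold per-segment token probability and
// speech-presence probability into a single 0–1 score
function calculateConfidence(response: {
  segments?: { avg_logprob: number; no_speech_prob: number }[];
}): number {
  if (!response.segments?.length) return 0;
  const scores = response.segments.map(
    (s) => Math.exp(s.avg_logprob) * (1 - s.no_speech_prob)
  );
  return scores.reduce((sum, s) => sum + s, 0) / scores.length;
}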

Audio Preprocessing

// utils/audioPreprocessing.ts
export const preprocessAudio = async (audioFile: File): Promise<Buffer> => {
  // Validate format
  const validFormats = ['wav', 'mp3', 'm4a', 'webm'];
  if (!validFormats.includes(getFileExtension(audioFile))) {
    throw new Error('Unsupported audio format');
  }
  
  // Check file size (max 25MB for Whisper)
  if (audioFile.size > 25 * 1024 * 1024) {
    // Compress or chunk the audio
    return await compressAudio(audioFile);
  }
  
  // Noise reduction for better accuracy
  return await reduceNoise(audioFile);
};
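The helpers above (getFileExtension, compressAudio, reduceNoise) are elided; getFileExtension, for instance, might be as simple as the following sketch, assuming the extension comes from the file name:

// Hypothetical getFileExtension helper used by preprocessAudio
const getFileExtension = (file: File): string =>
  file.name.split('.').pop()?.toLowerCase() ?? '';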

Natural Language Command Patterns

Intent Classification

enum VoiceIntent {
  LOG_FEEDING = 'LOG_FEEDING',
  LOG_SLEEP = 'LOG_SLEEP',
  LOG_DIAPER = 'LOG_DIAPER',
  LOG_MEDICATION = 'LOG_MEDICATION',
  START_TIMER = 'START_TIMER',
  STOP_TIMER = 'STOP_TIMER',
  ASK_QUESTION = 'ASK_QUESTION',
  CHECK_STATUS = 'CHECK_STATUS',
  CANCEL = 'CANCEL'
}

interface IntentPattern {
  intent: VoiceIntent;
  patterns: RegExp[];
  requiredEntities: string[];
  examples: string[];
}
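The '?' suffix in requiredEntities marks an entity as optional. A small hypothetical helper showing how a validator might interpret that convention:

// Interpret the '?' suffix convention: 'amount?' is optional, 'amount' is required
const splitRequiredEntities = (requiredEntities: string[]) => {
  const required: string[] = [];
  const optional: string[] = [];
  for (const name of requiredEntities) {
    if (name.endsWith('?')) optional.push(name.slice(0, -1));
    else required.push(name);
  }
  return { required, optional };
};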

English Language Patterns

const englishPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:baby |she |he )?(?:fed|ate|drank|had|nursed)/i,
      /(?:bottle|breast|nursing|feeding)/i,
      /(?:finished|done) (?:eating|feeding|nursing)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Baby fed 4 ounces",
      "Just nursed for 15 minutes on the left",
      "She had 120ml of formula at 3pm",
      "Finished feeding, both sides, 20 minutes total"
    ]
  },
  {
    intent: VoiceIntent.LOG_SLEEP,
    patterns: [
      /(?:went|going) (?:to )?(?:sleep|bed|nap)/i,
      /(?:woke|wake|waking) up/i,
      /(?:nap|sleep)(?:ping|ed)? (?:for|since)/i,
      /(?:fell) asleep/i,
    ],
    requiredEntities: ['time?', 'duration?'],
    examples: [
      "Down for a nap",
      "Woke up from nap",
      "Sleeping since 2pm",
      "Just fell asleep in the stroller"
    ]
  },
  {
    intent: VoiceIntent.LOG_DIAPER,
    patterns: [
      /(?:chang|dirty|wet|soil|poop|pee)/i,
      /diaper/i,
      /(?:number|#) (?:one|two|1|2)/i,
    ],
    requiredEntities: ['type?'],
    examples: [
      "Changed wet diaper",
      "Dirty diaper with rash",
      "Just changed a poopy one",
      "Diaper change, both wet and dirty"
    ]
  }
];

Multi-Language Patterns

// Spanish patterns
const spanishPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:comió|tomó|bebió|amamanté)/i,
      /(?:biberón|pecho|lactancia)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Tomó 120ml de fórmula",
      "Amamanté 15 minutos lado izquierdo",
      "Ya comió papilla"
    ]
  }
];

// French patterns
const frenchPatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:mangé|bu|allaité|nourri)/i,
      /(?:biberon|sein|tétée)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Biberon de 120ml",
      "Allaité 15 minutes côté gauche",
      "A mangé sa purée"
    ]
  }
];

// Portuguese patterns
const portuguesePatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:comeu|tomou|bebeu|amamentei)/i,
      /(?:mamadeira|peito|amamentação)/i,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "Tomou 120ml de fórmula",
      "Amamentei 15 minutos lado esquerdo"
    ]
  }
];

// Chinese patterns
const chinesePatterns: IntentPattern[] = [
  {
    intent: VoiceIntent.LOG_FEEDING,
    patterns: [
      /(?:喂|吃|喝|哺乳)/,
      /(?:奶瓶|母乳|配方奶)/,
    ],
    requiredEntities: ['amount?', 'time?', 'type?'],
    examples: [
      "喝了120毫升配方奶",
      "母乳喂养15分钟",
      "吃了辅食"
    ]
  }
];
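The processing engine below resolves a pattern set from the user's language via getPatternsByLanguage; a minimal registry sketch, assuming ISO-639-1 keys with an English fallback:

// Registry mapping ISO-639-1 codes to their pattern sets
const patternsByLanguage: Record<string, IntentPattern[]> = {
  en: englishPatterns,
  es: spanishPatterns,
  fr: frenchPatterns,
  pt: portuguesePatterns,
  zh: chinesePatterns,
};

const getPatternsByLanguage = (language: string): IntentPattern[] =>
  patternsByLanguage[language] ?? patternsByLanguage.en;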

Entity Extraction

Entity Types

interface ExtractedEntities {
  amount?: {
    value: number;
    unit: 'oz' | 'ml' | 'minutes';
  };
  time?: {
    value: Date;
    precision: 'exact' | 'approximate';
  };
  duration?: {
    value: number;
    unit: 'minutes' | 'hours';
  };
  side?: 'left' | 'right' | 'both';
  type?: 'breast' | 'bottle' | 'solid' | 'wet' | 'dirty' | 'both';
  location?: string;
  notes?: string;
}

Extraction Logic

class EntityExtractor {
  extractAmount(text: string): ExtractedEntities['amount'] {
    // Numeric amounts with units
    const amountPattern = /(\d+(?:\.\d+)?)\s*(oz|ounce|ml|milliliter|minute|min)/i;
    const match = text.match(amountPattern);
    
    if (match) {
      return {
        value: parseFloat(match[1]),
        unit: this.normalizeUnit(match[2])
      };
    }
    
    // Word numbers
    const wordNumbers = {
      'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
      'ten': 10, 'fifteen': 15, 'twenty': 20, 'thirty': 30,
    };
    
    for (const [word, value] of Object.entries(wordNumbers)) {
      // Word-boundary match avoids false hits like 'ten' inside 'often'
      if (new RegExp(`\\b${word}\\b`, 'i').test(text)) {
        return { value, unit: this.inferUnit(text) };
      }
    }
    
    return undefined;
  }

  extractTime(text: string, timezone: string): ExtractedEntities['time'] {
    const now = new Date();
    
    // Relative times
    if (/just|now|right now/i.test(text)) {
      return { value: now, precision: 'exact' };
    }
    
    if (/ago/i.test(text)) {
      const minutesAgo = this.extractMinutesAgo(text);
      return {
        value: new Date(now.getTime() - minutesAgo * 60000),
        precision: 'approximate'
      };
    }
    
    // Clock times: require a colon or an am/pm marker so bare quantities
    // like the "120" in "120ml" are not misread as times
    const timePattern = /\b(\d{1,2}):(\d{2})\s*(am|pm)?|\b(\d{1,2})\s*(am|pm)\b/i;
    const match = text.match(timePattern);
    
    if (match) {
      return {
        value: this.parseClockTime(match, timezone),
        precision: 'exact'
      };
    }
    
    return { value: now, precision: 'approximate' };
  }

  extractSide(text: string): ExtractedEntities['side'] {
    // Covers the five supported languages (en / es / fr / pt / zh)
    if (/left|izquierdo|gauche|esquerdo|左/i.test(text)) return 'left';
    if (/right|derecho|droit|direito|右/i.test(text)) return 'right';
    if (/both|ambos|les deux|两/i.test(text)) return 'both';
    return undefined;
  }
}
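normalizeUnit and inferUnit are left undefined above; a plausible normalizeUnit, mapping the strings captured by amountPattern onto the canonical units in ExtractedEntities:

// One way normalizeUnit could canonicalize the captured unit string
function normalizeUnit(raw: string): 'oz' | 'ml' | 'minutes' {
  const unit = raw.toLowerCase();
  if (unit.startsWith('oz') || unit.startsWith('ounce')) return 'oz';
  if (unit.startsWith('ml') || unit.startsWith('milliliter')) return 'ml';
  return 'minutes'; // matches 'minute' or 'min'
}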

Intent Processing Engine

Main Processing Flow

class VoiceCommandProcessor {
  async processVoiceInput(
    audioBuffer: Buffer,
    context: UserContext
  ): Promise<ProcessedCommand> {
    // 1. Transcribe audio
    const transcription = await this.whisperService.transcribeAudio(
      audioBuffer,
      context.language
    );
    
    if (transcription.confidence < 0.5) {
      return this.handleLowConfidence(transcription);
    }
    
    // 2. Detect intent
    const intent = await this.detectIntent(
      transcription.text,
      context.language
    );
    
    // 3. Extract entities
    const entities = await this.extractEntities(
      transcription.text,
      intent,
      context
    );
    
    // 4. Validate command
    const validation = this.validateCommand(intent, entities);
    
    if (!validation.isValid) {
      return this.requestClarification(validation.missingInfo);
    }
    
    // 5. Execute action
    return this.executeCommand(intent, entities, context);
  }

  private async detectIntent(
    text: string,
    language: string
  ): Promise<VoiceIntent> {
    const patterns = this.getPatternsByLanguage(language);
    
    for (const pattern of patterns) {
      for (const regex of pattern.patterns) {
        if (regex.test(text)) {
          return pattern.intent;
        }
      }
    }
    
    // Fallback to AI intent detection
    return this.detectIntentWithAI(text, language);
  }
}
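validateCommand is referenced but not shown. A minimal sketch, checking the intent's required entities (those without the '?' optional suffix) against what was extracted:

// Sketch of validateCommand: every entity listed without a '?' suffix
// must be present in the extraction result
function validateCommand(
  pattern: IntentPattern,
  entities: ExtractedEntities
): { isValid: boolean; missingInfo: string[] } {
  const missingInfo = pattern.requiredEntities
    .filter((name) => !name.endsWith('?'))
    .filter((name) => entities[name as keyof ExtractedEntities] === undefined);
  return { isValid: missingInfo.length === 0, missingInfo };
}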

Error Recovery

Common Recognition Errors

interface RecognitionError {
  type: 'LOW_CONFIDENCE' | 'AMBIGUOUS' | 'MISSING_DATA' | 'INVALID_VALUE';
  originalText: string;
  suggestions?: string[];
}

class ErrorRecovery {
  handleLowConfidence(transcription: TranscriptionResult): ProcessedCommand {
    // Check for common misheard phrases
    const corrections = this.checkCommonMishears(transcription.text);
    
    if (corrections.confidence > 0.7) {
      return this.retryWithCorrection(corrections.text);
    }
    
    return {
      success: false,
      action: 'CONFIRM',
      message: `Did you say "${transcription.text}"?`,
      alternatives: this.getSimilarPhrases(transcription.text)
    };
  }

  checkCommonMishears(text: string): CorrectionResult {
    const corrections = {
      'for ounces': 'four ounces',
      'to ounces': 'two ounces',
      'write side': 'right side',
      'laugh side': 'left side',
      'wedding dirty': 'wet and dirty',
    };
    
    for (const [misheard, correct] of Object.entries(corrections)) {
      if (text.includes(misheard)) {
        return {
          text: text.replace(misheard, correct),
          confidence: 0.8
        };
      }
    }
    
    return { text, confidence: 0.3 };
  }
}
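getSimilarPhrases is not shown; one illustrative approach ranks a known-phrase list by shared-word overlap with the transcription (the phrase list and scoring here are assumptions):

// Illustrative getSimilarPhrases: rank candidate phrases by word overlap
function getSimilarPhrases(text: string, knownPhrases: string[], limit = 3): string[] {
  const words = new Set(text.toLowerCase().split(/\s+/));
  return knownPhrases
    .map((phrase) => ({
      phrase,
      score: phrase.toLowerCase().split(/\s+/).filter((w) => words.has(w)).length,
    }))
    .filter((p) => p.score > 0)
    .sort((a, b) => b.score - a.score)
    .slice(0, limit)
    .map((p) => p.phrase);
}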

Clarification Prompts

const clarificationPrompts = {
  MISSING_AMOUNT: {
    en: "How much did baby eat?",
    es: "¿Cuánto comió el bebé?",
    fr: "Combien a mangé bébé?",
    pt: "Quanto o bebê comeu?",
    zh: "宝宝吃了多少?"
  },
  MISSING_TIME: {
    en: "When did this happen?",
    es: "¿Cuándo ocurrió esto?",
    fr: "Quand cela s'est-il passé?",
    pt: "Quando isso aconteceu?",
    zh: "这是什么时候发生的?"
  },
  AMBIGUOUS_INTENT: {
    en: "What would you like to log?",
    es: "¿Qué te gustaría registrar?",
    fr: "Que souhaitez-vous enregistrer?",
    pt: "O que você gostaria de registrar?",
    zh: "您想记录什么?"
  }
};
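Resolving a prompt then reduces to indexing by error type and language, with English as the fallback:

// Look up a clarification prompt, defaulting to English for unknown languages
const getClarificationPrompt = (
  errorType: keyof typeof clarificationPrompts,
  language: string
): string => {
  const prompts = clarificationPrompts[errorType];
  return prompts[language as keyof typeof prompts] ?? prompts.en;
};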

Offline Voice Processing

Fallback Strategy

class OfflineVoiceProcessor {
  async processOffline(audioBuffer: Buffer): Promise<BasicTranscription> {
    // Use device's native speech recognition
    if (Platform.OS === 'ios') {
      return this.useiOSSpeechRecognition(audioBuffer);
    } else if (Platform.OS === 'android') {
      return this.useAndroidSpeechRecognition(audioBuffer);
    }
    
    // Queue for later processing
    return this.queueForOnlineProcessing(audioBuffer);
  }

  private async useiOSSpeechRecognition(audio: Buffer) {
    // Placeholder for a native-module bridge to iOS SFSpeechRecognizer;
    // the on-device recognizer is not directly callable from JavaScript
    const recognizer = new SFSpeechRecognizer();
    return recognizer.recognize(audio);
  }

  private async useAndroidSpeechRecognition(audio: Buffer) {
    // Placeholder for a native-module bridge to Android's SpeechRecognizer
    const recognizer = new AndroidSpeechRecognizer();
    return recognizer.recognize(audio);
  }
}
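queueForOnlineProcessing is not shown above; a minimal sketch of the class members involved, assuming an in-memory queue and a pending flag on the result (both assumptions; a production queue would persist to disk):

// Sketch of queueForOnlineProcessing: hold the clip and mark the result
// pending so it can be re-submitted to Whisper once connectivity returns
private pendingQueue: Buffer[] = [];

private async queueForOnlineProcessing(audio: Buffer): Promise<BasicTranscription> {
  this.pendingQueue.push(audio);
  return { text: '', pending: true } as BasicTranscription;
}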

Confirmation & Feedback

Voice Feedback System

interface VoiceConfirmation {
  text: string;
  speech: string; // SSML for TTS
  visual: {
    icon: string;
    color: string;
    animation: string;
  };
  haptic?: 'success' | 'warning' | 'error';
}

const confirmations = {
  FEEDING_LOGGED: {
    text: "Feeding logged",
    speech: "<speak>Got it! <break time='200ms'/> Logged <say-as interpret-as='cardinal'>4</say-as> ounces.</speak>",
    visual: {
      icon: 'check_circle',
      color: 'success',
      animation: 'bounce'
    },
    haptic: 'success'
  }
};
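Confirmations with variable values should be built from a template rather than hardcoded; an illustrative builder (the function name and signature are assumptions):

// Interpolate the logged amount into both the display text and the SSML
const buildFeedingConfirmation = (value: number, unit: string): VoiceConfirmation => ({
  text: `Feeding logged: ${value} ${unit}`,
  speech: `<speak>Got it! <break time='200ms'/> Logged <say-as interpret-as='cardinal'>${value}</say-as> ${unit}.</speak>`,
  visual: { icon: 'check_circle', color: 'success', animation: 'bounce' },
  haptic: 'success',
});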

Testing Voice Commands

Test Scenarios

const voiceTestCases = [
  // English
  { input: "Baby ate 4 ounces", expected: { intent: 'LOG_FEEDING', amount: 4, unit: 'oz' }},
  { input: "Nursed for fifteen minutes on the left", expected: { intent: 'LOG_FEEDING', duration: 15, side: 'left' }},
  
  // Spanish
  { input: "Tomó 120 mililitros", expected: { intent: 'LOG_FEEDING', amount: 120, unit: 'ml' }},
  
  // Edge cases
  { input: "Fed... um... about 4 or 5 ounces", expected: { intent: 'LOG_FEEDING', amount: 4, confidence: 'low' }},
  { input: "Changed a really dirty diaper", expected: { intent: 'LOG_DIAPER', type: 'dirty', notes: 'really dirty' }},
];
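These cases can drive a table-based Jest test; a sketch assuming a text-level entry point on the processor (processText, defaultContext, and the result shape are hypothetical):

// __tests__/voiceCommands.test.ts -- table-driven run over voiceTestCases
test.each(voiceTestCases)('parses "$input"', async ({ input, expected }) => {
  const result = await processor.processText(input, defaultContext);
  expect(result.intent).toBe(expected.intent);
});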

Performance Optimization

Audio Streaming

class StreamingVoiceProcessor {
  private audioChunks: Buffer[] = [];
  private isProcessing = false;

  async processStream(chunk: Buffer) {
    this.audioChunks.push(chunk);
    
    if (!this.isProcessing && this.hasEnoughAudio()) {
      this.isProcessing = true;
      const result = await this.processChunks();
      this.isProcessing = false;
      return result;
    }
  }

  private hasEnoughAudio(): boolean {
    // Need at least 0.5 seconds of audio:
    // 16kHz * 2 bytes per sample (16-bit mono) * 0.5s = 16000 bytes
    const totalSize = this.audioChunks.reduce((sum, chunk) => sum + chunk.length, 0);
    return totalSize > 16000;
  }
}

Caching Common Commands

import { LRUCache } from 'lru-cache'; // npm package: lru-cache (v10+ named export)

const commandCache = new LRUCache<string, ProcessedCommand>({
  max: 100,
  ttl: 1000 * 60 * 60, // 1 hour
});

// Cache exact matches for common phrases
const cachedPhrases = [
  "wet diaper",
  "dirty diaper", 
  "just nursed",
  "bottle feeding done",
  "down for a nap",
  "woke up"
];
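Lookups then happen on the normalized transcript before the full pipeline runs; for example (processText is the same hypothetical text-level entry point as in the test sketch, and lowercase-and-trim normalization is an assumption):

// Serve common phrases from the cache; fall back to full processing otherwise
const processWithCache = async (text: string, context: UserContext): Promise<ProcessedCommand> => {
  const key = text.toLowerCase().trim();
  const cached = commandCache.get(key);
  if (cached) return cached;

  const result = await processor.processText(key, context);
  if (cachedPhrases.includes(key)) commandCache.set(key, result);
  return result;
};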