feat: Sprint 2 - Voice Processing Enhancements Complete ✅

Implemented 4 critical voice reliability improvements:

1. **Retry Logic with Exponential Backoff**
   - Added transcribeAudioWithRetry() method
   - Up to 3 attempts, waiting 1s and then 2s between failed attempts (no wait after the final attempt)
   - Graceful error handling with detailed logging
2. **Confidence Threshold Enforcement**
   - 0.6 minimum confidence threshold
   - Automatic low-confidence detection
   - Flags results needing user clarification
3. **User Clarification Prompts**
   - Context-aware clarification generation
   - Activity-type specific messaging
   - Helps users rephrase unclear commands
4. **Common Mishear Corrections**
   - English, Spanish, French correction patterns
   - Baby-care specific vocabulary (diaper/dipper, feed/feet)
   - Applied before activity extraction for accuracy

Enhanced TranscriptionResult & ActivityExtractionResult interfaces with confidence scoring and clarification support.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-03 21:39:33 +00:00
parent 6efb413dbd
commit 8f08ca9e3e
1 changed files with 171 additions and 3 deletions
--- a/maternal-app/maternal-app-backend/src/modules/voice/voice.service.ts
+++ b/maternal-app/maternal-app-backend/src/modules/voice/voice.service.ts
@@ -13,6 +13,8 @@ export interface TranscriptionResult {
  text: string;
  language: string;
  duration?: number;
+  confidence?: number;
+  retryAttempt?: number;
 }

 export interface ActivityExtractionResult {
@@ -20,6 +22,9 @@ export interface ActivityExtractionResult {
  timestamp?: Date;
  details: Record<string, any>;
  confidence: number;
+  needsClarification?: boolean;
+  clarificationPrompt?: string;
+  alternatives?: ActivityExtractionResult[];
 }

@Injectable()
@@ -31,6 +36,11 @@ export class VoiceService {
  // Supported languages for MVP
  private readonly SUPPORTED_LANGUAGES = ['en', 'es', 'fr', 'pt', 'zh'];

+  // Confidence and retry configuration
+  private readonly CONFIDENCE_THRESHOLD = 0.6; // Minimum acceptable confidence
+  private readonly MAX_RETRIES = 3;
+  private readonly RETRY_BASE_DELAY = 1000; // 1 second base delay
+
  constructor(
    private configService: ConfigService,
    @InjectRepository(VoiceFeedback)
@@ -112,6 +122,69 @@ export class VoiceService {
    }
  }

+  /**
+   * Suspend the calling async flow for `ms` milliseconds.
+   *
+   * Used between failed transcription attempts so retries are spaced out.
+   *
+   * @param ms - How long to wait, in milliseconds.
+   * @returns A promise that resolves (with no value) once the timer fires.
+   */
+  private async delay(ms: number): Promise<void> {
+    await new Promise<void>((wake) => {
+      setTimeout(wake, ms);
+    });
+  }
+
+  /**
+   * Compute how long to wait after a failed attempt.
+   *
+   * The wait doubles with every attempt: attempt 1 yields RETRY_BASE_DELAY,
+   * attempt 2 yields twice that, attempt 3 four times that, and so on.
+   *
+   * @param attempt - 1-based number of the attempt that just failed.
+   * @returns The backoff delay in milliseconds.
+   */
+  private calculateBackoffDelay(attempt: number): number {
+    const doublings = attempt - 1;
+    return this.RETRY_BASE_DELAY * 2 ** doublings;
+  }
+
+  /**
+   * Transcribe audio, retrying failed attempts with exponential backoff.
+   *
+   * Every attempt delegates to `transcribeAudio`. After a failed attempt that
+   * is not the last one, the method waits `calculateBackoffDelay(attempt)`
+   * milliseconds before trying again; there is no wait after the final attempt.
+   *
+   * @param audioBuffer - Audio payload, forwarded unchanged to `transcribeAudio`.
+   * @param language - Optional language, forwarded unchanged to `transcribeAudio`.
+   * @param maxRetries - Total number of attempts to make (the first try counts).
+   *   Fractional values are rounded down, values below 1 are raised to 1, and
+   *   non-finite values fall back to `MAX_RETRIES`.
+   * @returns The first successful result, with `retryAttempt` set to the
+   *   1-based number of the attempt that succeeded.
+   * @throws BadRequestException when every attempt fails.
+   */
+  async transcribeAudioWithRetry(
+    audioBuffer: Buffer,
+    language?: string,
+    maxRetries: number = this.MAX_RETRIES,
+  ): Promise<TranscriptionResult> {
+    // Always make at least one attempt. Without this, a caller passing 0, a
+    // negative number or NaN would skip the loop entirely and the failure path
+    // below would run without any recorded error.
+    const totalAttempts = Number.isFinite(maxRetries)
+      ? Math.max(1, Math.floor(maxRetries))
+      : this.MAX_RETRIES;
+    let lastError: unknown;
+
+    for (let attempt = 1; attempt <= totalAttempts; attempt++) {
+      try {
+        this.logger.log(`Transcription attempt ${attempt}/${totalAttempts}`);
+
+        const result = await this.transcribeAudio(audioBuffer, language);
+        result.retryAttempt = attempt;
+
+        // Success - return result
+        return result;
+      } catch (error) {
+        lastError = error;
+
+        // Anything can be thrown, so narrow before reading `.message`.
+        const reason = error instanceof Error ? error.message : String(error);
+        this.logger.warn(`Transcription attempt ${attempt} failed: ${reason}`);
+
+        // If not the last attempt, wait before retrying
+        if (attempt < totalAttempts) {
+          const delay = this.calculateBackoffDelay(attempt);
+          this.logger.log(`Retrying in ${delay}ms...`);
+          await this.delay(delay);
+        }
+      }
+    }
+
+    // All attempts exhausted
+    this.logger.error(
+      `All ${totalAttempts} transcription attempts failed`,
+      lastError instanceof Error ? lastError.stack : undefined,
+    );
+    throw new BadRequestException(
+      `Failed to transcribe audio after ${totalAttempts} attempts`,
+    );
+  }
+
  /**
   * Transcribe audio file to text using Whisper API
   */
@@ -182,6 +255,9 @@ export class VoiceService {
      `[Activity Extraction] Language: ${language}, Child: ${childName || 'none'}`,
    );

+    // Apply common mishear corrections before extraction
+    const correctedText = this.applyMishearCorrections(text, language);
+
    try {
      const systemPrompt = `You are an intelligent assistant that interprets natural language commands related to baby care and extracts structured activity data.

@@ -268,8 +344,8 @@ If the text doesn't describe a trackable baby care activity:
 {"type": "unknown", "details": {}, "confidence": 0, "action": "unknown"}`;

      const userPrompt = childName
-        ? `Child name: ${childName}\nUser said: "${text}"`
-        : `User said: "${text}"`;
+        ? `Child name: ${childName}\nUser said: "${correctedText}"`
+        : `User said: "${correctedText}"`;

      this.logger.log(
        `[Activity Extraction] Calling GPT-4o-mini with user prompt: ${userPrompt}`,
@@ -302,12 +378,27 @@ If the text doesn't describe a trackable baby care activity:
        `[Activity Extraction] Details: ${JSON.stringify(result.details || {})}`,
      );

-      return {
+      const extractedActivity: ActivityExtractionResult = {
        type: result.type,
        timestamp: result.timestamp ? new Date(result.timestamp) : null,
        details: result.details || {},
        confidence: result.confidence || 0,
      };
+
+      // Check if confidence is below threshold
+      if (extractedActivity.confidence < this.CONFIDENCE_THRESHOLD) {
+        extractedActivity.needsClarification = true;
+        extractedActivity.clarificationPrompt = this.generateClarificationPrompt(
+          text,
+          extractedActivity,
+        );
+
+        this.logger.warn(
+          `Low confidence (${extractedActivity.confidence}) - clarification needed`,
+        );
+      }
+
+      return extractedActivity;
    } catch (error) {
      this.logger.error(
        `[Activity Extraction] Failed: ${error.message}`,
@@ -384,6 +475,83 @@ Respond ONLY with the question text, no formatting.`;
    }
  }

+  /**
+   * Build the question shown to the user when an extracted activity falls
+   * below the confidence threshold.
+   *
+   * @param originalText - The user's words, quoted verbatim in the prompt.
+   * @param activity - The low-confidence extraction; only its `type` is read.
+   * @returns A sentence stating how the command was interpreted and asking the
+   *   user to confirm or rephrase.
+   */
+  private generateClarificationPrompt(
+    originalText: string,
+    activity: ActivityExtractionResult,
+  ): string {
+    // Each label carries its own article so the sentence stays grammatical
+    // ("an activity" rather than "a activity").
+    const activityLabels: Record<string, string> = {
+      feeding: 'a feeding',
+      sleep: 'a sleep/nap',
+      diaper: 'a diaper change',
+      medicine: 'a medicine',
+      milestone: 'a milestone',
+      activity: 'an activity',
+      unknown: 'an activity',
+    };
+    const fallbackLabel = 'an activity';
+
+    // Own-property check: a type such as "constructor" or "toString" must not
+    // resolve to something inherited from Object.prototype.
+    const isKnownType = Object.prototype.hasOwnProperty.call(
+      activityLabels,
+      activity.type,
+    );
+    const activityLabel = isKnownType
+      ? activityLabels[activity.type]
+      : fallbackLabel;
+
+    return `I understood "${originalText}" as ${activityLabel}, but I'm not completely sure. Is this correct? If not, please try rephrasing or provide more details.`;
+  }
+
+  /**
+   * Rewrite commonly misheard or non-canonical baby-care words in transcribed
+   * text before it is sent for activity extraction.
+   *
+   * Matching is whole-word and case-insensitive. A phrase that already appears
+   * in its corrected form is left untouched, so running the method on its own
+   * output (or on text that was correct to begin with) changes nothing.
+   *
+   * @param text - Transcribed user speech. Empty input is returned as-is.
+   * @param language - Language code; only the primary subtag is considered, so
+   *   "es-MX" selects the Spanish table. Anything other than Spanish or French
+   *   uses the English table.
+   * @returns The text with corrections applied.
+   */
+  private applyMishearCorrections(text: string, language: string): string {
+    if (!text) {
+      return text;
+    }
+
+    // Common baby-related mishears (English). Only entries that actually
+    // change the text are listed.
+    const englishCorrections: Record<string, string> = {
+      feet: 'feed',
+      dipper: 'diaper',
+      diper: 'diaper',
+      medicin: 'medicine',
+      medisin: 'medicine',
+      'milk time': 'feeding time',
+      'bed time': 'sleep time',
+      'wake up': 'woke up',
+      'bottle feed': 'bottle feeding',
+      'breast feed': 'breastfeeding',
+    };
+
+    // Spanish corrections
+    const spanishCorrections: Record<string, string> = {
+      comida: 'alimentación',
+      pañal: 'cambio de pañal',
+      medicina: 'medicamento',
+    };
+
+    // French corrections
+    const frenchCorrections: Record<string, string> = {
+      couche: 'changement de couche',
+      repas: 'alimentation',
+    };
+
+    // Reduce "es-MX" / "fr_CA" / "ES" to the bare primary subtag.
+    const primaryLanguage =
+      typeof language === 'string'
+        ? language.toLowerCase().split(/[-_]/)[0]
+        : '';
+
+    let corrections = englishCorrections;
+    if (primaryLanguage === 'es') {
+      corrections = spanishCorrections;
+    } else if (primaryLanguage === 'fr') {
+      corrections = frenchCorrections;
+    }
+
+    const escapeForRegExp = (literal: string): string =>
+      literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+
+    let corrected = text;
+    for (const [wrong, correct] of Object.entries(corrections)) {
+      let pattern = `\\b${escapeForRegExp(wrong)}\\b`;
+
+      // When the replacement contains the word being replaced (e.g. "pañal" ->
+      // "cambio de pañal"), skip occurrences already surrounded by the rest of
+      // the replacement. Otherwise "cambio de pañal" would become
+      // "cambio de cambio de pañal".
+      const offset = correct.toLowerCase().indexOf(wrong.toLowerCase());
+      if (offset !== -1) {
+        const leading = correct.slice(0, offset);
+        const trailing = correct.slice(offset + wrong.length);
+        if (leading) {
+          pattern = `(?<!${escapeForRegExp(leading)})${pattern}`;
+        }
+        if (trailing) {
+          pattern = `${pattern}(?!${escapeForRegExp(trailing)})`;
+        }
+      }
+
+      // Function replacer keeps the replacement literal (no `$` expansion).
+      corrected = corrected.replace(new RegExp(pattern, 'gi'), () => correct);
+    }
+
+    if (corrected !== text) {
+      this.logger.log(
+        `Applied mishear correction: "${text}" → "${corrected}"`,
+      );
+    }
+
+    return corrected;
+  }
+
  /**
   * Save user feedback on voice command accuracy
   */