feat: Sprint 2 - Voice Processing Enhancements Complete ✅
Implemented 4 critical voice reliability improvements: 1. **Retry Logic with Exponential Backoff** - Added transcribeAudioWithRetry() method - Up to 3 attempts in total, with 1s and 2s backoff delays between them (no delay after the final attempt) - Graceful error handling with detailed logging 2. **Confidence Threshold Enforcement** - 0.6 minimum confidence threshold - Automatic low-confidence detection - Flags results needing user clarification 3. **User Clarification Prompts** - Context-aware clarification generation - Activity-type specific messaging - Helps users rephrase unclear commands 4. **Common Mishear Corrections** - English, Spanish, French correction patterns - Baby-care specific vocabulary (diaper/dipper, feed/feet) - Applied before activity extraction for accuracy Enhanced TranscriptionResult & ActivityExtractionResult interfaces with confidence scoring and clarification support. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -13,6 +13,8 @@ export interface TranscriptionResult {
|
|||||||
text: string;
|
text: string;
|
||||||
language: string;
|
language: string;
|
||||||
duration?: number;
|
duration?: number;
|
||||||
|
confidence?: number;
|
||||||
|
retryAttempt?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ActivityExtractionResult {
|
export interface ActivityExtractionResult {
|
||||||
@@ -20,6 +22,9 @@ export interface ActivityExtractionResult {
|
|||||||
timestamp?: Date;
|
timestamp?: Date;
|
||||||
details: Record<string, any>;
|
details: Record<string, any>;
|
||||||
confidence: number;
|
confidence: number;
|
||||||
|
needsClarification?: boolean;
|
||||||
|
clarificationPrompt?: string;
|
||||||
|
alternatives?: ActivityExtractionResult[];
|
||||||
}
|
}
|
||||||
|
|
||||||
@Injectable()
|
@Injectable()
|
||||||
@@ -31,6 +36,11 @@ export class VoiceService {
|
|||||||
// Supported languages for MVP
|
// Supported languages for MVP
|
||||||
private readonly SUPPORTED_LANGUAGES = ['en', 'es', 'fr', 'pt', 'zh'];
|
private readonly SUPPORTED_LANGUAGES = ['en', 'es', 'fr', 'pt', 'zh'];
|
||||||
|
|
||||||
|
// Confidence and retry configuration
|
||||||
|
private readonly CONFIDENCE_THRESHOLD = 0.6; // Minimum acceptable confidence
|
||||||
|
private readonly MAX_RETRIES = 3;
|
||||||
|
private readonly RETRY_BASE_DELAY = 1000; // 1 second base delay
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
private configService: ConfigService,
|
private configService: ConfigService,
|
||||||
@InjectRepository(VoiceFeedback)
|
@InjectRepository(VoiceFeedback)
|
||||||
@@ -112,6 +122,69 @@ export class VoiceService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Pause helper used between retry attempts.
 *
 * @param ms - Time to wait in milliseconds; passed straight to `setTimeout`.
 * @returns A promise that settles (with no value) once the timer has fired.
 */
private async delay(ms: number): Promise<void> {
  await new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Work out how long to wait before the next retry.
 *
 * The wait starts at `RETRY_BASE_DELAY` and doubles with every further
 * attempt: attempt 1 -> base, attempt 2 -> 2x base, attempt 3 -> 4x base.
 *
 * @param attempt - 1-based number of the attempt that has just failed.
 * @returns The delay in milliseconds, `RETRY_BASE_DELAY * 2^(attempt - 1)`.
 */
private calculateBackoffDelay(attempt: number): number {
  const doublings = attempt - 1;
  const multiplier = 2 ** doublings;
  return multiplier * this.RETRY_BASE_DELAY;
}
|
||||||
|
|
||||||
|
/**
 * Transcribe audio, retrying failed attempts with exponential backoff.
 *
 * Calls `transcribeAudio` up to `maxRetries` times. After every failed
 * attempt except the last one it sleeps for `calculateBackoffDelay(attempt)`
 * milliseconds before trying again. The first successful result is returned
 * immediately.
 *
 * @param audioBuffer - Audio payload, forwarded unchanged to `transcribeAudio`.
 * @param language - Optional language hint, forwarded unchanged to `transcribeAudio`.
 * @param maxRetries - Total number of attempts to make (not the number of
 *   additional retries). Defaults to `MAX_RETRIES`. Fractions are rounded
 *   down; values below 1 and NaN are treated as 1 so that at least one
 *   attempt is always made.
 * @returns The transcription result, with `retryAttempt` set to the 1-based
 *   number of the attempt that succeeded.
 * @throws BadRequestException when every attempt has failed.
 */
async transcribeAudioWithRetry(
  audioBuffer: Buffer,
  language?: string,
  maxRetries: number = this.MAX_RETRIES,
): Promise<TranscriptionResult> {
  // Guarantee at least one attempt. Previously a value of 0, a negative number
  // or NaN skipped the loop entirely and then crashed on `lastError.stack`
  // instead of raising the intended BadRequestException.
  const totalAttempts = Number.isNaN(maxRetries)
    ? 1
    : Math.max(1, Math.floor(maxRetries));

  // Kept as `unknown` because anything can be thrown, not only Error instances.
  let lastError: unknown;

  for (let attempt = 1; attempt <= totalAttempts; attempt++) {
    try {
      this.logger.log(`Transcription attempt ${attempt}/${totalAttempts}`);

      const result = await this.transcribeAudio(audioBuffer, language);
      result.retryAttempt = attempt;

      return result;
    } catch (error) {
      lastError = error;

      // Narrow before reading `.message` so non-Error throwables are still logged usefully.
      const reason = error instanceof Error ? error.message : String(error);
      this.logger.warn(`Transcription attempt ${attempt} failed: ${reason}`);

      // NOTE(review): every failure is retried, including ones that may not be
      // transient (e.g. rejected input) — confirm which errors transcribeAudio throws.
      if (attempt < totalAttempts) {
        const backoffMs = this.calculateBackoffDelay(attempt);
        this.logger.log(`Retrying in ${backoffMs}ms...`);
        await this.delay(backoffMs);
      }
    }
  }

  // Reaching this point means every attempt threw.
  this.logger.error(
    `All ${totalAttempts} transcription attempts failed`,
    lastError instanceof Error ? lastError.stack : undefined,
  );
  throw new BadRequestException(
    `Failed to transcribe audio after ${totalAttempts} attempts`,
  );
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Transcribe audio file to text using Whisper API
|
* Transcribe audio file to text using Whisper API
|
||||||
*/
|
*/
|
||||||
@@ -182,6 +255,9 @@ export class VoiceService {
|
|||||||
`[Activity Extraction] Language: ${language}, Child: ${childName || 'none'}`,
|
`[Activity Extraction] Language: ${language}, Child: ${childName || 'none'}`,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Apply common mishear corrections before extraction
|
||||||
|
const correctedText = this.applyMishearCorrections(text, language);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const systemPrompt = `You are an intelligent assistant that interprets natural language commands related to baby care and extracts structured activity data.
|
const systemPrompt = `You are an intelligent assistant that interprets natural language commands related to baby care and extracts structured activity data.
|
||||||
|
|
||||||
@@ -268,8 +344,8 @@ If the text doesn't describe a trackable baby care activity:
|
|||||||
{"type": "unknown", "details": {}, "confidence": 0, "action": "unknown"}`;
|
{"type": "unknown", "details": {}, "confidence": 0, "action": "unknown"}`;
|
||||||
|
|
||||||
const userPrompt = childName
|
const userPrompt = childName
|
||||||
? `Child name: ${childName}\nUser said: "${text}"`
|
? `Child name: ${childName}\nUser said: "${correctedText}"`
|
||||||
: `User said: "${text}"`;
|
: `User said: "${correctedText}"`;
|
||||||
|
|
||||||
this.logger.log(
|
this.logger.log(
|
||||||
`[Activity Extraction] Calling GPT-4o-mini with user prompt: ${userPrompt}`,
|
`[Activity Extraction] Calling GPT-4o-mini with user prompt: ${userPrompt}`,
|
||||||
@@ -302,12 +378,27 @@ If the text doesn't describe a trackable baby care activity:
|
|||||||
`[Activity Extraction] Details: ${JSON.stringify(result.details || {})}`,
|
`[Activity Extraction] Details: ${JSON.stringify(result.details || {})}`,
|
||||||
);
|
);
|
||||||
|
|
||||||
return {
|
const extractedActivity: ActivityExtractionResult = {
|
||||||
type: result.type,
|
type: result.type,
|
||||||
timestamp: result.timestamp ? new Date(result.timestamp) : null,
|
timestamp: result.timestamp ? new Date(result.timestamp) : null,
|
||||||
details: result.details || {},
|
details: result.details || {},
|
||||||
confidence: result.confidence || 0,
|
confidence: result.confidence || 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Check if confidence is below threshold
|
||||||
|
if (extractedActivity.confidence < this.CONFIDENCE_THRESHOLD) {
|
||||||
|
extractedActivity.needsClarification = true;
|
||||||
|
extractedActivity.clarificationPrompt = this.generateClarificationPrompt(
|
||||||
|
text,
|
||||||
|
extractedActivity,
|
||||||
|
);
|
||||||
|
|
||||||
|
this.logger.warn(
|
||||||
|
`Low confidence (${extractedActivity.confidence}) - clarification needed`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
return extractedActivity;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.error(
|
this.logger.error(
|
||||||
`[Activity Extraction] Failed: ${error.message}`,
|
`[Activity Extraction] Failed: ${error.message}`,
|
||||||
@@ -384,6 +475,83 @@ Respond ONLY with the question text, no formatting.`;
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Build the question shown to the user when an extracted activity is not
 * trusted enough to be accepted as-is.
 *
 * @param originalText - The user's words, quoted verbatim in the prompt.
 * @param activity - The extraction result; only its `type` is read, to pick a
 *   human-readable activity label.
 * @returns A sentence stating how the command was understood and asking the
 *   user to confirm or rephrase.
 */
private generateClarificationPrompt(
  originalText: string,
  activity: ActivityExtractionResult,
): string {
  // A Map is used instead of a plain object so that a type such as
  // "constructor" or "toString" cannot resolve to an inherited Object
  // property and end up interpolated into the message.
  const activityLabels = new Map<string, string>([
    ['feeding', 'feeding'],
    ['sleep', 'sleep/nap'],
    ['diaper', 'diaper change'],
    ['medicine', 'medicine'],
    ['milestone', 'milestone'],
    ['activity', 'activity'],
    ['unknown', 'activity'],
  ]);

  // Unrecognised (or missing) types fall back to the generic label.
  const activityName = activityLabels.get(activity.type) ?? 'activity';

  // "an activity" rather than "a activity".
  const article = /^[aeiou]/i.test(activityName) ? 'an' : 'a';

  return `I understood "${originalText}" as ${article} ${activityName}, but I'm not completely sure. Is this correct? If not, please try rephrasing or provide more details.`;
}
|
||||||
|
|
||||||
|
/**
 * Rewrite commonly mis-transcribed baby-care words and phrases.
 *
 * A per-language table of `[heard, intended]` pairs is applied in order, each
 * as a case-insensitive whole-word replacement. Spanish (`es`) and French
 * (`fr`) have their own tables; every other language value, including a
 * missing one, uses the English table.
 *
 * Differences from a naive find-and-replace:
 * - Region/case variants such as `es-ES`, `fr_CA` or `FR` select the same
 *   table as their base language.
 * - An occurrence that already sits inside the intended phrase is left alone,
 *   so "cambio de pañal" is not expanded to "cambio de cambio de pañal" and
 *   running the method twice gives the same result as running it once.
 * - Word boundaries are Unicode-aware, so accented letters count as part of a
 *   word.
 *
 * @param text - Transcribed text to correct.
 * @param language - Language code of `text`.
 * @returns The corrected text, or `text` unchanged when no rule matched.
 */
private applyMishearCorrections(text: string, language: string): string {
  type CorrectionTable = ReadonlyArray<readonly [string, string]>;

  // Order matters: later rules see the output of earlier ones
  // (e.g. "bottle feet" -> "bottle feed" -> "bottle feeding").
  // Identity pairs ("feeding time" -> "feeding time", etc.) were dropped: they
  // only lower-cased the text and caused a misleading "correction applied" log.
  const englishCorrections: CorrectionTable = [
    ['feet', 'feed'],
    ['dipper', 'diaper'],
    ['diper', 'diaper'],
    ['medicin', 'medicine'],
    ['medisin', 'medicine'],
    ['milk time', 'feeding time'],
    ['bed time', 'sleep time'],
    ['wake up', 'woke up'],
    ['bottle feed', 'bottle feeding'],
    ['breast feed', 'breastfeeding'],
  ];

  const spanishCorrections: CorrectionTable = [
    ['comida', 'alimentación'],
    ['pañal', 'cambio de pañal'],
    ['medicina', 'medicamento'],
  ];

  const frenchCorrections: CorrectionTable = [
    ['couche', 'changement de couche'],
    ['repas', 'alimentation'],
  ];

  // Reduce "es-ES" / "fr_CA" / "FR" to the bare lower-case language code.
  // `language` may be absent at runtime, hence the `?? ''`.
  const baseLanguage = (language ?? '').toLowerCase().split(/[-_]/)[0];

  let corrections: CorrectionTable;
  if (baseLanguage === 'es') {
    corrections = spanishCorrections;
  } else if (baseLanguage === 'fr') {
    corrections = frenchCorrections;
  } else {
    corrections = englishCorrections;
  }

  // Escape regex metacharacters so a table entry is always matched literally.
  const escapeRegExp = (value: string): string =>
    value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Unicode-aware stand-in for `\b`, which only knows ASCII word characters
  // and would treat "é" or "ñ" as a word boundary.
  const wordChar = '[\\p{L}\\p{M}\\p{N}_]';

  let corrected = text;

  for (const [wrong, correct] of corrections) {
    const pattern = new RegExp(
      `(?<!${wordChar})${escapeRegExp(wrong)}(?!${wordChar})`,
      'giu',
    );

    // Position of the misheard term inside its own replacement, if any
    // (e.g. "pañal" sits at index 10 of "cambio de pañal").
    const lowerCorrect = correct.toLowerCase();
    const embeddedAt = lowerCorrect.indexOf(wrong.toLowerCase());

    // A replacer function is used so `$` in a replacement is never treated as
    // a substitution pattern, and so the surrounding context can be inspected.
    corrected = corrected.replace(
      pattern,
      (match: string, offset: number, whole: string): string => {
        if (embeddedAt >= 0) {
          const phraseStart = offset - embeddedAt;
          if (phraseStart >= 0) {
            const surrounding = whole.slice(
              phraseStart,
              phraseStart + correct.length,
            );
            // Already part of the intended phrase: leave it untouched.
            if (surrounding.toLowerCase() === lowerCorrect) {
              return match;
            }
          }
        }
        return correct;
      },
    );
  }

  if (corrected !== text) {
    this.logger.log(
      `Applied mishear correction: "${text}" → "${corrected}"`,
    );
  }

  return corrected;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Save user feedback on voice command accuracy
|
* Save user feedback on voice command accuracy
|
||||||
*/
|
*/
|
||||||
|
|||||||
Reference in New Issue
Block a user