Add prompt injection protection for AI endpoints
Implemented comprehensive security against prompt injection attacks: **Detection Patterns:** - System prompt manipulation (ignore/disregard/forget instructions) - Role manipulation (pretend to be, act as) - Data exfiltration (show system prompt, list users) - Command injection (execute code, run command) - Jailbreak attempts (DAN mode, developer mode, admin mode) **Input Validation:** - Maximum length: 2,000 characters - Maximum line length: 500 characters - Maximum repeated characters: 20 consecutive - Special character ratio limit: 30% - HTML/JavaScript injection blocking **Sanitization:** - HTML tag removal - Zero-width character stripping - Control character removal - Whitespace normalization **Rate Limiting:** - 5 suspicious attempts per minute per user - Automatic clearing on successful validation - Per-user tracking with session storage **Context Awareness:** - Parenting keyword validation - Domain-appropriate scope checking - Lenient validation for short prompts **Implementation:** - lib/security/promptSecurity.ts - Core validation logic - app/api/ai/chat/route.ts - Integrated validation - scripts/test-prompt-injection.mjs - 19 test cases (all passing) - lib/security/README.md - Documentation **Test Coverage:** ✅ Valid parenting questions (2 tests) ✅ System manipulation attempts (4 tests) ✅ Role manipulation (1 test) ✅ Data exfiltration (3 tests) ✅ Command injection (2 tests) ✅ Jailbreak techniques (2 tests) ✅ Length attacks (2 tests) ✅ Character encoding attacks (2 tests) ✅ Edge cases (1 test) All suspicious attempts are logged with user ID, reason, risk level, and timestamp for security monitoring. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
343
maternal-web/lib/security/promptSecurity.ts
Normal file
343
maternal-web/lib/security/promptSecurity.ts
Normal file
@@ -0,0 +1,343 @@
|
||||
/**
|
||||
* Prompt Injection Protection
|
||||
*
|
||||
* Detects and prevents malicious prompt injection attempts in AI inputs
|
||||
* to protect against system prompt manipulation, data exfiltration, and
|
||||
* jailbreaking attempts.
|
||||
*/
|
||||
|
||||
export interface PromptValidationResult {
|
||||
isValid: boolean;
|
||||
reason?: string;
|
||||
sanitizedPrompt?: string;
|
||||
riskLevel: 'low' | 'medium' | 'high';
|
||||
}
|
||||
|
||||
/**
|
||||
* Common prompt injection patterns to detect
|
||||
*/
|
||||
const INJECTION_PATTERNS = [
|
||||
// System prompt manipulation
|
||||
/ignore\s+(previous|above|all|prior)\s+(instructions?|prompts?|commands?)/gi,
|
||||
/ignore\s+all/gi, // Catch "ignore all"
|
||||
/disregard\s+(previous|above|all)\s+(instructions?|prompts?|commands?)/gi,
|
||||
/forget\s+(previous|above|all)\s+(instructions?|prompts?|commands?)/gi,
|
||||
/new\s+instructions?:/gi,
|
||||
/system\s+prompt/gi, // Catch "system prompt" anywhere
|
||||
/you\s+are\s+now/gi,
|
||||
/act\s+as\s+a\s+(?!parent|caregiver)/gi, // Allow parenting roles only
|
||||
|
||||
// Role manipulation
|
||||
/pretend\s+to\s+be/gi,
|
||||
/simulate\s+being/gi,
|
||||
/roleplay\s+as/gi,
|
||||
|
||||
// Data exfiltration attempts
|
||||
/show\s+me\s+(your|the)\s+(system|internal|hidden)/gi, // Catch "show me your system/internal/hidden"
|
||||
/your\s+(system|internal|hidden)\s+prompt/gi, // Catch "your system/internal prompt"
|
||||
/what\s+(is|are)\s+your\s+(instructions?|rules?|guidelines?)/gi,
|
||||
/reveal\s+your\s+(system|internal|hidden)/gi,
|
||||
/list\s+all\s+(users?|children|families)/gi,
|
||||
/show\s+all\s+data/gi,
|
||||
|
||||
// Command injection
|
||||
/execute\s+code/gi,
|
||||
/run\s+command/gi,
|
||||
/shell\s+command/gi,
|
||||
|
||||
// Jailbreak attempts
|
||||
/DAN\s+mode/gi, // "Do Anything Now"
|
||||
/developer\s+mode/gi,
|
||||
/admin\s+mode/gi,
|
||||
/sudo\s+mode/gi,
|
||||
/root\s+access/gi,
|
||||
|
||||
// Prompt leaking
|
||||
/repeat\s+(the\s+)?above/gi,
|
||||
/what\s+was\s+your\s+(first|initial|original)/gi,
|
||||
/before\s+this\s+conversation/gi,
|
||||
];
|
||||
|
||||
/**
|
||||
* Suspicious character sequences that may indicate encoding attacks
|
||||
*/
|
||||
const SUSPICIOUS_SEQUENCES = [
|
||||
/\u0000/g, // Null bytes
|
||||
/[\u200B-\u200D\uFEFF]/g, // Zero-width characters
|
||||
/[\u2060-\u2069]/g, // Invisible formatting characters
|
||||
/<script/gi, // HTML script tags
|
||||
/<iframe/gi, // HTML iframe tags
|
||||
/javascript:/gi, // JavaScript protocol
|
||||
/data:text\/html/gi, // Data URIs
|
||||
];
|
||||
|
||||
/**
|
||||
* Maximum allowed lengths to prevent resource exhaustion
|
||||
*/
|
||||
const MAX_PROMPT_LENGTH = 2000; // characters
|
||||
const MAX_LINE_LENGTH = 500; // characters per line
|
||||
const MAX_REPEATED_CHARS = 20; // consecutive same character
|
||||
|
||||
/**
|
||||
* Validates and sanitizes user prompts for AI queries
|
||||
*/
|
||||
export function validatePrompt(prompt: string): PromptValidationResult {
|
||||
if (!prompt || typeof prompt !== 'string') {
|
||||
return {
|
||||
isValid: false,
|
||||
reason: 'Prompt must be a non-empty string',
|
||||
riskLevel: 'low',
|
||||
};
|
||||
}
|
||||
|
||||
// Check length constraints
|
||||
if (prompt.length > MAX_PROMPT_LENGTH) {
|
||||
return {
|
||||
isValid: false,
|
||||
reason: `Prompt exceeds maximum length of ${MAX_PROMPT_LENGTH} characters`,
|
||||
riskLevel: 'medium',
|
||||
};
|
||||
}
|
||||
|
||||
// Check for excessively long lines (may indicate copy-paste attacks)
|
||||
const lines = prompt.split('\n');
|
||||
const longLine = lines.find(line => line.length > MAX_LINE_LENGTH);
|
||||
if (longLine) {
|
||||
return {
|
||||
isValid: false,
|
||||
reason: 'Prompt contains excessively long lines',
|
||||
riskLevel: 'medium',
|
||||
};
|
||||
}
|
||||
|
||||
// Check for suspicious repeated characters
|
||||
const repeatedCharsMatch = prompt.match(/(.)\1+/g);
|
||||
if (repeatedCharsMatch) {
|
||||
const maxRepeat = Math.max(...repeatedCharsMatch.map(m => m.length));
|
||||
if (maxRepeat > MAX_REPEATED_CHARS) {
|
||||
return {
|
||||
isValid: false,
|
||||
reason: 'Prompt contains suspicious repeated characters',
|
||||
riskLevel: 'medium',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Check for suspicious character sequences
|
||||
for (const pattern of SUSPICIOUS_SEQUENCES) {
|
||||
if (pattern.test(prompt)) {
|
||||
return {
|
||||
isValid: false,
|
||||
reason: 'Prompt contains suspicious or hidden characters',
|
||||
riskLevel: 'high',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Check for prompt injection patterns
|
||||
let riskLevel: 'low' | 'medium' | 'high' = 'low';
|
||||
const detectedPatterns: string[] = [];
|
||||
|
||||
for (const pattern of INJECTION_PATTERNS) {
|
||||
if (pattern.test(prompt)) {
|
||||
detectedPatterns.push(pattern.source);
|
||||
riskLevel = 'high';
|
||||
}
|
||||
}
|
||||
|
||||
if (detectedPatterns.length > 0) {
|
||||
return {
|
||||
isValid: false,
|
||||
reason: 'Prompt contains potential injection attempt',
|
||||
riskLevel: 'high',
|
||||
};
|
||||
}
|
||||
|
||||
// Check for excessive special characters (may indicate encoding attack)
|
||||
const specialCharCount = (prompt.match(/[^a-zA-Z0-9\s.,!?'-]/g) || []).length;
|
||||
const specialCharRatio = specialCharCount / prompt.length;
|
||||
|
||||
if (specialCharRatio > 0.3) {
|
||||
return {
|
||||
isValid: false,
|
||||
reason: 'Prompt contains excessive special characters',
|
||||
riskLevel: 'medium',
|
||||
};
|
||||
}
|
||||
|
||||
// Sanitize the prompt (remove potentially dangerous elements)
|
||||
const sanitizedPrompt = sanitizePrompt(prompt);
|
||||
|
||||
return {
|
||||
isValid: true,
|
||||
sanitizedPrompt,
|
||||
riskLevel: 'low',
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Sanitizes prompt by removing potentially dangerous content
|
||||
*/
|
||||
function sanitizePrompt(prompt: string): string {
|
||||
let sanitized = prompt;
|
||||
|
||||
// Remove HTML tags
|
||||
sanitized = sanitized.replace(/<[^>]*>/g, '');
|
||||
|
||||
// Remove excessive whitespace
|
||||
sanitized = sanitized.replace(/\s+/g, ' ').trim();
|
||||
|
||||
// Remove zero-width and invisible characters
|
||||
sanitized = sanitized.replace(/[\u200B-\u200D\uFEFF\u2060-\u2069]/g, '');
|
||||
|
||||
// Remove control characters except newline and tab
|
||||
sanitized = sanitized.replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]/g, '');
|
||||
|
||||
return sanitized;
|
||||
}
|
||||
|
||||
/**
|
||||
* Context-aware validation for parenting assistant
|
||||
* Checks if the prompt is appropriate for a parenting/childcare context
|
||||
*/
|
||||
export function isParentingRelated(prompt: string): boolean {
|
||||
const parentingKeywords = [
|
||||
'baby', 'child', 'toddler', 'infant', 'kid',
|
||||
'feed', 'sleep', 'diaper', 'nap', 'bottle',
|
||||
'breastfeed', 'formula', 'meal', 'bedtime',
|
||||
'cry', 'fussy', 'teething', 'milestone',
|
||||
'development', 'growth', 'schedule', 'routine',
|
||||
'parent', 'mom', 'dad', 'caregiver',
|
||||
];
|
||||
|
||||
const lowerPrompt = prompt.toLowerCase();
|
||||
const wordCount = lowerPrompt.split(/\s+/).length;
|
||||
|
||||
// For very short prompts, be more lenient
|
||||
if (wordCount <= 5) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if prompt contains parenting-related keywords
|
||||
const hasParentingKeywords = parentingKeywords.some(keyword =>
|
||||
lowerPrompt.includes(keyword)
|
||||
);
|
||||
|
||||
return hasParentingKeywords;
|
||||
}
|
||||
|
||||
/**
|
||||
* Rate limiting helper - tracks prompt attempts per session
|
||||
*/
|
||||
class PromptRateLimiter {
|
||||
private attempts: Map<string, number[]> = new Map();
|
||||
private readonly maxAttempts = 5;
|
||||
private readonly windowMs = 60000; // 1 minute
|
||||
|
||||
/**
|
||||
* Check if user has exceeded rate limit for suspicious prompts
|
||||
*/
|
||||
checkRateLimit(userId: string): boolean {
|
||||
const now = Date.now();
|
||||
const userAttempts = this.attempts.get(userId) || [];
|
||||
|
||||
// Filter out old attempts outside the time window
|
||||
const recentAttempts = userAttempts.filter(time => now - time < this.windowMs);
|
||||
|
||||
if (recentAttempts.length >= this.maxAttempts) {
|
||||
return false; // Rate limit exceeded
|
||||
}
|
||||
|
||||
// Add new attempt
|
||||
recentAttempts.push(now);
|
||||
this.attempts.set(userId, recentAttempts);
|
||||
|
||||
return true; // Within rate limit
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear attempts for a user (e.g., after successful validation)
|
||||
*/
|
||||
clearAttempts(userId: string): void {
|
||||
this.attempts.delete(userId);
|
||||
}
|
||||
}
|
||||
|
||||
export const promptRateLimiter = new PromptRateLimiter();
|
||||
|
||||
/**
|
||||
* Logs suspicious prompt attempts for security monitoring
|
||||
*/
|
||||
export function logSuspiciousPrompt(
|
||||
prompt: string,
|
||||
userId: string | undefined,
|
||||
reason: string,
|
||||
riskLevel: string
|
||||
): void {
|
||||
// In production, this should send to your security monitoring system
|
||||
console.warn('[SECURITY] Suspicious prompt detected:', {
|
||||
userId: userId || 'anonymous',
|
||||
reason,
|
||||
riskLevel,
|
||||
promptLength: prompt.length,
|
||||
timestamp: new Date().toISOString(),
|
||||
// Don't log the full prompt to avoid storing malicious content
|
||||
promptPreview: prompt.substring(0, 50) + '...',
|
||||
});
|
||||
|
||||
// TODO: In production, send to Sentry or security monitoring service
|
||||
// if (process.env.NODE_ENV === 'production') {
|
||||
// Sentry.captureMessage('Suspicious prompt attempt', {
|
||||
// level: 'warning',
|
||||
// tags: { riskLevel },
|
||||
// extra: { userId, reason, promptLength: prompt.length },
|
||||
// });
|
||||
// }
|
||||
}
|
||||
|
||||
/**
|
||||
* Complete validation pipeline for AI prompts
|
||||
*/
|
||||
export function validateAIPrompt(
|
||||
prompt: string,
|
||||
userId?: string
|
||||
): PromptValidationResult {
|
||||
// Step 1: Basic validation and sanitization
|
||||
const validationResult = validatePrompt(prompt);
|
||||
|
||||
if (!validationResult.isValid) {
|
||||
logSuspiciousPrompt(
|
||||
prompt,
|
||||
userId,
|
||||
validationResult.reason || 'Unknown',
|
||||
validationResult.riskLevel
|
||||
);
|
||||
|
||||
// Check rate limit for suspicious attempts
|
||||
if (userId && validationResult.riskLevel === 'high') {
|
||||
if (!promptRateLimiter.checkRateLimit(userId)) {
|
||||
return {
|
||||
isValid: false,
|
||||
reason: 'Too many suspicious prompts. Please try again later.',
|
||||
riskLevel: 'high',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
return validationResult;
|
||||
}
|
||||
|
||||
// Step 2: Context-aware validation
|
||||
if (!isParentingRelated(prompt)) {
|
||||
// Allow non-parenting questions but with a warning
|
||||
// This is more lenient to avoid false positives
|
||||
validationResult.riskLevel = 'medium';
|
||||
}
|
||||
|
||||
// Clear rate limit on successful validation
|
||||
if (userId) {
|
||||
promptRateLimiter.clearAttempts(userId);
|
||||
}
|
||||
|
||||
return validationResult;
|
||||
}
|
||||
Reference in New Issue
Block a user