feat(phase-6): Bulk CSV processing and background worker implementation

- Add BulkJob model to Prisma schema with relations
- Implement BulkProcessorService for CSV parsing and job management
- Create BulkTrackingWorker for background processing with BullMQ
- Add comprehensive bulk API routes (upload, jobs, progress, export)
- Integrate multer for CSV file uploads with validation
- Add job progress tracking and estimation
- Implement CSV export functionality for results
- Add queue statistics and cleanup endpoints
- Create shared types for bulk processing
- Add comprehensive test suite for all bulk functionality
- Implement graceful worker shutdown and error handling
- Add rate limiting and authentication for all bulk endpoints

Backward compatibility: Maintained for /api/track and /api/v1/track
This commit is contained in:
Andrei
2025-08-18 14:18:13 +00:00
parent 8c8300780f
commit 9626863917
13 changed files with 2309 additions and 64 deletions

View File

@@ -18,7 +18,10 @@
"axios": "^1.6.7",
"playwright": "^1.40.1",
"dotenv": "^16.3.1",
"winston": "^3.11.0"
"winston": "^3.11.0",
"undici": "^6.2.1",
"zod": "^3.22.4",
"csv-writer": "^1.6.0"
},
"devDependencies": {
"@types/node": "^20.10.0",

View File

@@ -1,67 +1,68 @@
/**
* Background Worker for Redirect Intelligence v2
* Worker Service Entry Point for Redirect Intelligence v2
*
* Handles bulk jobs, monitoring, and other background tasks
* Handles bulk URL tracking jobs and background processing
*/
import 'dotenv/config';
import { Worker, Queue } from 'bullmq';
import IORedis from 'ioredis';
import dotenv from 'dotenv';
import { logger } from './lib/logger';
import { BulkTrackingWorker } from './workers/bulk-tracking.worker';
const redis = new IORedis(process.env.REDIS_URL || 'redis://localhost:6379');
// Load environment variables
dotenv.config();
console.log('🔄 Redirect Intelligence v2 Worker starting...');
async function startWorker() {
try {
logger.info('🔄 Starting Redirect Intelligence Worker Service...');
// Initialize bulk tracking worker
const bulkWorker = new BulkTrackingWorker();
await bulkWorker.start();
logger.info('🚀 Worker service started successfully', {
environment: process.env.NODE_ENV || 'development',
redisHost: process.env.REDIS_HOST || 'localhost',
concurrency: process.env.BULK_WORKER_CONCURRENCY || '3',
});
// Health check logging
setInterval(() => {
const health = bulkWorker.getHealthStatus();
logger.debug('Worker health check', health);
}, 30000); // Every 30 seconds
// Graceful shutdown handlers
const shutdown = async (signal: string) => {
logger.info(`🛑 Received ${signal}, shutting down gracefully...`);
try {
await bulkWorker.stop();
logger.info('✅ Worker shutdown completed');
process.exit(0);
} catch (error) {
logger.error('❌ Error during shutdown:', error);
process.exit(1);
}
};
process.on('SIGTERM', () => shutdown('SIGTERM'));
process.on('SIGINT', () => shutdown('SIGINT'));
// Unhandled error handlers
process.on('uncaughtException', (error) => {
logger.error('💥 Uncaught Exception:', error);
process.exit(1);
});
process.on('unhandledRejection', (reason, promise) => {
logger.error('💥 Unhandled Rejection at:', promise, 'reason:', reason);
process.exit(1);
});
} catch (error) {
logger.error('❌ Failed to start worker service:', error);
process.exit(1);
}
}
// Placeholder worker - will be implemented in later phases
const bulkQueue = new Queue('bulk-checks', { connection: redis });
const monitoringQueue = new Queue('monitoring', { connection: redis });
const bulkWorker = new Worker('bulk-checks', async (job) => {
console.log('Processing bulk job:', job.id);
// Bulk processing logic will be implemented in Phase 6
return { status: 'completed', message: 'Bulk job processing not yet implemented' };
}, { connection: redis });
const monitoringWorker = new Worker('monitoring', async (job) => {
console.log('Processing monitoring job:', job.id);
// Monitoring logic will be implemented in Phase 10
return { status: 'completed', message: 'Monitoring not yet implemented' };
}, { connection: redis });
bulkWorker.on('completed', (job) => {
console.log(`✅ Bulk job ${job.id} completed`);
});
bulkWorker.on('failed', (job, err) => {
console.error(`❌ Bulk job ${job?.id} failed:`, err);
});
monitoringWorker.on('completed', (job) => {
console.log(`✅ Monitoring job ${job.id} completed`);
});
monitoringWorker.on('failed', (job, err) => {
console.error(`❌ Monitoring job ${job?.id} failed:`, err);
});
// Graceful shutdown
process.on('SIGTERM', async () => {
console.log('🛑 Shutting down worker...');
await bulkWorker.close();
await monitoringWorker.close();
await redis.quit();
process.exit(0);
});
process.on('SIGINT', async () => {
console.log('🛑 Shutting down worker...');
await bulkWorker.close();
await monitoringWorker.close();
await redis.quit();
process.exit(0);
});
console.log('🚀 Worker is ready to process jobs');
console.log(`📡 Connected to Redis: ${process.env.REDIS_URL || 'redis://localhost:6379'}`);
export { bulkQueue, monitoringQueue };
// Start the worker
startWorker();

View File

@@ -0,0 +1,24 @@
/**
* Logger for Worker Service
*/
import winston from 'winston';
const logger = winston.createLogger({
level: process.env.NODE_ENV === 'development' ? 'debug' : 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.json()
),
transports: [
new winston.transports.Console({
format: winston.format.combine(
winston.format.colorize(),
winston.format.simple()
)
}),
],
});
export { logger };

View File

@@ -0,0 +1,11 @@
/**
* Prisma Client for Worker Service
*/
import { PrismaClient } from '@prisma/client';
const prisma = new PrismaClient({
log: process.env.NODE_ENV === 'development' ? ['query', 'error', 'warn'] : ['error'],
});
export { prisma };

View File

@@ -0,0 +1,235 @@
/**
* Redirect Tracker Service for Worker
*
* Simplified version for background processing
*/
import axios from 'axios';
import { z } from 'zod';
import { logger } from '../lib/logger';
import { prisma } from '../lib/prisma';
const TrackRequest = z.object({
url: z.string().url(),
method: z.enum(['GET', 'POST', 'HEAD']).default('GET'),
userAgent: z.string().optional(),
headers: z.record(z.string()).optional(),
maxHops: z.number().min(1).max(20).default(10),
timeout: z.number().min(1000).max(30000).default(15000),
enableSSLAnalysis: z.boolean().default(true),
enableSEOAnalysis: z.boolean().default(true),
enableSecurityAnalysis: z.boolean().default(true),
projectId: z.string().optional(),
});
type TrackRequest = z.infer<typeof TrackRequest>;
export interface TrackResult {
check: {
id: string;
inputUrl: string;
method: string;
status: string;
totalTimeMs: number;
redirectCount: number;
finalUrl?: string;
};
}
export class RedirectTrackerService {
async trackUrl(request: TrackRequest, userId: string): Promise<TrackResult> {
const validatedRequest = TrackRequest.parse(request);
const startTime = Date.now();
try {
// Create check record
const check = await prisma.check.create({
data: {
inputUrl: validatedRequest.url,
method: validatedRequest.method,
status: 'OK',
startedAt: new Date(),
projectId: validatedRequest.projectId,
},
});
// Perform redirect tracking
const hops = await this.followRedirects(validatedRequest);
const totalTime = Date.now() - startTime;
const finalUrl = hops.length > 0 ? hops[hops.length - 1].url : validatedRequest.url;
// Update check with results
await prisma.check.update({
where: { id: check.id },
data: {
status: 'OK',
finishedAt: new Date(),
totalTimeMs: totalTime,
finalUrl,
},
});
// Save hops
if (hops.length > 0) {
await prisma.hop.createMany({
data: hops.map((hop, index) => ({
checkId: check.id,
hopIndex: index,
url: hop.url,
statusCode: hop.statusCode,
redirectType: hop.redirectType,
latencyMs: hop.latencyMs,
contentType: hop.contentType,
responseHeadersJson: hop.responseHeaders || {},
})),
});
}
logger.info(`URL tracking completed: ${check.id}`, {
userId,
url: validatedRequest.url,
redirectCount: hops.length,
totalTime,
});
return {
check: {
id: check.id,
inputUrl: check.inputUrl,
method: check.method,
status: check.status,
totalTimeMs: totalTime,
redirectCount: hops.length,
finalUrl,
},
};
} catch (error) {
logger.error('URL tracking failed:', {
userId,
url: validatedRequest.url,
error: error instanceof Error ? error.message : 'Unknown error',
});
throw error;
}
}
private async followRedirects(request: TrackRequest): Promise<Array<{
url: string;
statusCode?: number;
redirectType: string;
latencyMs?: number;
contentType?: string;
responseHeaders?: Record<string, string>;
}>> {
const hops: Array<{
url: string;
statusCode?: number;
redirectType: string;
latencyMs?: number;
contentType?: string;
responseHeaders?: Record<string, string>;
}> = [];
let currentUrl = request.url;
let hopCount = 0;
const visitedUrls = new Set<string>();
while (hopCount < request.maxHops) {
if (visitedUrls.has(currentUrl)) {
// Loop detected
hops.push({
url: currentUrl,
redirectType: 'LOOP',
});
break;
}
visitedUrls.add(currentUrl);
const hopStartTime = Date.now();
try {
const response = await axios({
method: request.method,
url: currentUrl,
timeout: request.timeout,
maxRedirects: 0, // Handle redirects manually
validateStatus: () => true, // Accept all status codes
headers: {
'User-Agent': request.userAgent || 'RedirectIntelligence/2.0',
...request.headers,
},
});
const latency = Date.now() - hopStartTime;
const headers: Record<string, string> = {};
// Convert headers to plain object
Object.entries(response.headers).forEach(([key, value]) => {
if (typeof value === 'string') {
headers[key.toLowerCase()] = value;
}
});
hops.push({
url: currentUrl,
statusCode: response.status,
redirectType: this.getRedirectType(response.status),
latencyMs: latency,
contentType: headers['content-type']?.split(';')[0] || undefined,
responseHeaders: headers,
});
// Check if this is a redirect
if (response.status >= 300 && response.status < 400) {
const location = headers.location;
if (location) {
currentUrl = new URL(location, currentUrl).href;
hopCount++;
continue;
}
}
// Not a redirect, we're done
break;
} catch (error) {
const latency = Date.now() - hopStartTime;
hops.push({
url: currentUrl,
redirectType: 'ERROR',
latencyMs: latency,
});
break;
}
}
// Add final hop if we stopped due to max hops
if (hopCount >= request.maxHops && hops.length > 0) {
const lastHop = hops[hops.length - 1];
if (lastHop.statusCode && lastHop.statusCode >= 300 && lastHop.statusCode < 400) {
hops.push({
url: currentUrl,
redirectType: 'FINAL',
});
}
}
return hops;
}
private getRedirectType(statusCode: number): string {
switch (statusCode) {
case 301: return 'HTTP_301';
case 302: return 'HTTP_302';
case 307: return 'HTTP_307';
case 308: return 'HTTP_308';
case 200: return 'FINAL';
default: return statusCode >= 300 && statusCode < 400 ? 'OTHER' : 'FINAL';
}
}
}

View File

@@ -0,0 +1,336 @@
/**
* Bulk Tracking Worker for Redirect Intelligence v2
*
* Processes bulk URL tracking jobs using BullMQ
*/
import { Worker, Job } from 'bullmq';
import IORedis from 'ioredis';
import { logger } from '../lib/logger';
import { prisma } from '../lib/prisma';
import { RedirectTrackerService } from '../services/redirect-tracker.service';
interface BulkTrackingJobData {
jobId: string;
userId: string;
organizationId?: string;
urls: Array<{
url: string;
label?: string;
metadata?: Record<string, any>;
}>;
options: {
method: 'GET' | 'POST' | 'HEAD';
userAgent?: string;
maxHops: number;
timeout: number;
enableSSLAnalysis: boolean;
enableSEOAnalysis: boolean;
enableSecurityAnalysis: boolean;
headers?: Record<string, string>;
};
}
interface JobResult {
url: string;
label?: string;
checkId?: string;
status: 'success' | 'failed';
error?: string;
timing: {
startedAt: Date;
finishedAt?: Date;
durationMs?: number;
};
}
export class BulkTrackingWorker {
private worker: Worker;
private redis: IORedis;
private redirectTracker: RedirectTrackerService;
constructor() {
this.redis = new IORedis({
host: process.env.REDIS_HOST || 'localhost',
port: parseInt(process.env.REDIS_PORT || '6379'),
retryDelayOnFailover: 100,
enableReadyCheck: false,
maxRetriesPerRequest: null,
});
this.redirectTracker = new RedirectTrackerService();
this.worker = new Worker(
'bulk-tracking',
this.processJob.bind(this),
{
connection: this.redis,
concurrency: parseInt(process.env.BULK_WORKER_CONCURRENCY || '3'),
removeOnComplete: 100,
removeOnFail: 50,
}
);
this.setupEventHandlers();
}
/**
* Setup worker event handlers
*/
private setupEventHandlers(): void {
this.worker.on('ready', () => {
logger.info('Bulk tracking worker is ready');
});
this.worker.on('active', (job: Job) => {
logger.info(`Processing bulk job: ${job.id}`, {
jobId: job.data.jobId,
userId: job.data.userId,
urlCount: job.data.urls.length,
});
});
this.worker.on('completed', (job: Job, result: any) => {
logger.info(`Bulk job completed: ${job.id}`, {
jobId: job.data.jobId,
userId: job.data.userId,
successful: result.successful,
failed: result.failed,
total: result.total,
});
});
this.worker.on('failed', (job: Job | undefined, error: Error) => {
logger.error(`Bulk job failed: ${job?.id}`, {
jobId: job?.data?.jobId,
userId: job?.data?.userId,
error: error.message,
});
});
this.worker.on('error', (error: Error) => {
logger.error('Bulk tracking worker error:', error);
});
this.worker.on('stalled', (jobId: string) => {
logger.warn(`Bulk job stalled: ${jobId}`);
});
}
/**
* Process a bulk tracking job
*/
private async processJob(job: Job<BulkTrackingJobData>): Promise<any> {
const { jobId, userId, organizationId, urls, options } = job.data;
try {
logger.info(`Starting bulk job processing: ${jobId}`, {
userId,
urlCount: urls.length,
});
// Update job status to processing
await this.updateJobStatus(jobId, 'processing', { startedAt: new Date() });
const results: JobResult[] = [];
let processed = 0;
let successful = 0;
let failed = 0;
// Process URLs one by one to avoid overwhelming the system
for (const urlData of urls) {
const startTime = new Date();
try {
// Update progress
await job.updateProgress(Math.round((processed / urls.length) * 100));
logger.debug(`Processing URL: ${urlData.url}`, { jobId, userId });
// Track the URL using our existing service
const trackRequest = {
url: urlData.url,
method: options.method,
userAgent: options.userAgent,
headers: options.headers,
maxHops: options.maxHops,
timeout: options.timeout,
enableSSLAnalysis: options.enableSSLAnalysis,
enableSEOAnalysis: options.enableSEOAnalysis,
enableSecurityAnalysis: options.enableSecurityAnalysis,
// Map metadata to request if available
projectId: urlData.metadata?.projectId,
};
const result = await this.redirectTracker.trackUrl(trackRequest, userId);
const finishTime = new Date();
const duration = finishTime.getTime() - startTime.getTime();
results.push({
url: urlData.url,
label: urlData.label,
checkId: result.check.id,
status: 'success',
timing: {
startedAt: startTime,
finishedAt: finishTime,
durationMs: duration,
},
});
successful++;
} catch (error) {
const finishTime = new Date();
const duration = finishTime.getTime() - startTime.getTime();
logger.warn(`Failed to process URL: ${urlData.url}`, {
jobId,
userId,
error: error instanceof Error ? error.message : 'Unknown error',
});
results.push({
url: urlData.url,
label: urlData.label,
status: 'failed',
error: error instanceof Error ? error.message : 'Unknown error',
timing: {
startedAt: startTime,
finishedAt: finishTime,
durationMs: duration,
},
});
failed++;
}
processed++;
// Update database progress periodically
if (processed % 10 === 0 || processed === urls.length) {
await this.updateJobProgress(jobId, {
processedUrls: processed,
successfulUrls: successful,
failedUrls: failed,
resultsJson: JSON.stringify(results),
});
}
// Small delay between requests to be respectful
if (processed < urls.length) {
await new Promise(resolve => setTimeout(resolve, 100));
}
}
// Final update
await this.updateJobStatus(jobId, 'completed', {
finishedAt: new Date(),
processedUrls: processed,
successfulUrls: successful,
failedUrls: failed,
resultsJson: JSON.stringify(results),
});
logger.info(`Bulk job completed: ${jobId}`, {
userId,
total: urls.length,
successful,
failed,
});
return {
jobId,
total: urls.length,
successful,
failed,
results,
};
} catch (error) {
logger.error(`Bulk job processing failed: ${jobId}`, {
userId,
error: error instanceof Error ? error.message : 'Unknown error',
});
// Update job status to failed
await this.updateJobStatus(jobId, 'failed', {
finishedAt: new Date(),
});
throw error;
}
}
/**
* Update job status in database
*/
private async updateJobStatus(
jobId: string,
status: string,
updates: Record<string, any> = {}
): Promise<void> {
try {
await prisma.bulkJob.update({
where: { id: jobId },
data: {
status,
...updates,
},
});
} catch (error) {
logger.error(`Failed to update job status: ${jobId}`, error);
}
}
/**
* Update job progress in database
*/
private async updateJobProgress(
jobId: string,
updates: Record<string, any>
): Promise<void> {
try {
await prisma.bulkJob.update({
where: { id: jobId },
data: updates,
});
} catch (error) {
logger.error(`Failed to update job progress: ${jobId}`, error);
}
}
/**
* Start the worker
*/
async start(): Promise<void> {
logger.info('Starting bulk tracking worker...');
// Worker starts automatically when instantiated
}
/**
* Stop the worker gracefully
*/
async stop(): Promise<void> {
logger.info('Stopping bulk tracking worker...');
await this.worker.close();
await this.redis.disconnect();
}
/**
* Get worker health status
*/
getHealthStatus(): {
isRunning: boolean;
isHealthy: boolean;
concurrency: number;
} {
return {
isRunning: !this.worker.closing,
isHealthy: !this.worker.closing,
concurrency: this.worker.opts.concurrency || 1,
};
}
}