feat(phase-6): Bulk CSV processing and background worker implementation

- Add BulkJob model to Prisma schema with relations - Implement BulkProcessorService for CSV parsing and job management - Create BulkTrackingWorker for background processing with BullMQ - Add comprehensive bulk API routes (upload, jobs, progress, export) - Integrate multer for CSV file uploads with validation - Add job progress tracking and estimation - Implement CSV export functionality for results - Add queue statistics and cleanup endpoints - Create shared types for bulk processing - Add comprehensive test suite for all bulk functionality - Implement graceful worker shutdown and error handling - Add rate limiting and authentication for all bulk endpoints Backward compatibility: Maintained for /api/track and /api/v1/track
2025-08-18 14:18:13 +00:00
parent 8c8300780f
commit 9626863917
13 changed files with 2309 additions and 64 deletions
--- a/apps/api/src/services/bulk-processor.service.ts
+++ b/apps/api/src/services/bulk-processor.service.ts
@@ -0,0 +1,603 @@
+/**
+ * Bulk Processing Service for Redirect Intelligence v2
+ * 
+ * Manages CSV upload, parsing, and bulk redirect analysis jobs
+ */
+
+import fs from 'fs/promises';
+import path from 'path';
+import { Queue, Job } from 'bullmq';
+import IORedis from 'ioredis';
+import csvParser from 'csv-parser';
+import { createObjectCsvWriter } from 'csv-writer';
+import { z } from 'zod';
+import { logger } from '../lib/logger';
+import { prisma } from '../lib/prisma';
+
+// Job types and data structures
+export interface BulkTrackingJob {
+  id: string;
+  userId: string;
+  organizationId?: string;
+  projectId?: string;
+  urls: Array<{
+    url: string;
+    label?: string;
+    metadata?: Record<string, any>;
+  }>;
+  options: {
+    method: 'GET' | 'POST' | 'HEAD';
+    userAgent?: string;
+    maxHops: number;
+    timeout: number;
+    enableSSLAnalysis: boolean;
+    enableSEOAnalysis: boolean;
+    enableSecurityAnalysis: boolean;
+    headers?: Record<string, string>;
+  };
+  status: 'pending' | 'processing' | 'completed' | 'failed' | 'cancelled';
+  progress: {
+    total: number;
+    processed: number;
+    successful: number;
+    failed: number;
+  };
+  results?: Array<{
+    url: string;
+    label?: string;
+    checkId?: string;
+    status: 'success' | 'failed';
+    error?: string;
+    timing: {
+      startedAt: Date;
+      finishedAt?: Date;
+      durationMs?: number;
+    };
+  }>;
+  createdAt: Date;
+  startedAt?: Date;
+  finishedAt?: Date;
+  estimatedCompletionAt?: Date;
+}
+
+// Validation schemas
+const BulkJobCreateSchema = z.object({
+  projectId: z.string().optional(),
+  urls: z.array(z.object({
+    url: z.string().url('Invalid URL format'),
+    label: z.string().optional(),
+    metadata: z.record(z.any()).optional(),
+  })).min(1, 'At least one URL is required').max(1000, 'Maximum 1000 URLs per job'),
+  options: z.object({
+    method: z.enum(['GET', 'POST', 'HEAD']).default('GET'),
+    userAgent: z.string().optional(),
+    maxHops: z.number().min(1).max(20).default(10),
+    timeout: z.number().min(1000).max(30000).default(15000),
+    enableSSLAnalysis: z.boolean().default(true),
+    enableSEOAnalysis: z.boolean().default(true),
+    enableSecurityAnalysis: z.boolean().default(true),
+    headers: z.record(z.string()).optional(),
+  }).default({}),
+});
+
+const CsvRowSchema = z.object({
+  url: z.string().min(1, 'URL is required'),
+  label: z.string().optional(),
+  method: z.enum(['GET', 'POST', 'HEAD']).optional(),
+  user_agent: z.string().optional(),
+  max_hops: z.string().optional(),
+  timeout: z.string().optional(),
+  enable_ssl: z.string().optional(),
+  enable_seo: z.string().optional(),
+  enable_security: z.string().optional(),
+});
+
+export type BulkJobCreateRequest = z.infer<typeof BulkJobCreateSchema>;
+export type CsvRow = z.infer<typeof CsvRowSchema>;
+
+export class BulkProcessorService {
+  private redis: IORedis;
+  private trackingQueue: Queue;
+  private readonly uploadsDir: string;
+
+  constructor() {
+    this.redis = new IORedis({
+      host: process.env.REDIS_HOST || 'localhost',
+      port: parseInt(process.env.REDIS_PORT || '6379'),
+      retryDelayOnFailover: 100,
+      enableReadyCheck: false,
+      maxRetriesPerRequest: null,
+    });
+
+    this.trackingQueue = new Queue('bulk-tracking', {
+      connection: this.redis,
+      defaultJobOptions: {
+        removeOnComplete: 100, // Keep last 100 completed jobs
+        removeOnFail: 50,      // Keep last 50 failed jobs
+        attempts: 3,
+        backoff: {
+          type: 'exponential',
+          delay: 2000,
+        },
+      },
+    });
+
+    this.uploadsDir = path.join(process.cwd(), 'uploads');
+    this.ensureUploadsDirectory();
+  }
+
+  /**
+   * Ensure uploads directory exists
+   */
+  private async ensureUploadsDirectory(): Promise<void> {
+    try {
+      await fs.mkdir(this.uploadsDir, { recursive: true });
+    } catch (error) {
+      logger.error('Failed to create uploads directory:', error);
+    }
+  }
+
+  /**
+   * Parse CSV file and extract URL data
+   */
+  async parseCsvFile(filePath: string): Promise<Array<{
+    url: string;
+    label?: string;
+    metadata?: Record<string, any>;
+  }>> {
+    const results: Array<{ url: string; label?: string; metadata?: Record<string, any> }> = [];
+    
+    return new Promise((resolve, reject) => {
+      const stream = require('fs').createReadStream(filePath)
+        .pipe(csvParser())
+        .on('data', (row: any) => {
+          try {
+            // Validate and parse each row
+            const validatedRow = CsvRowSchema.parse(row);
+            
+            // Normalize URL
+            let url = validatedRow.url.trim();
+            if (!url.startsWith('http://') && !url.startsWith('https://')) {
+              url = `https://${url}`;
+            }
+
+            const parsedRow = {
+              url,
+              label: validatedRow.label?.trim() || undefined,
+              metadata: {
+                // Store additional CSV columns as metadata
+                method: validatedRow.method || 'GET',
+                userAgent: validatedRow.user_agent?.trim(),
+                maxHops: validatedRow.max_hops ? parseInt(validatedRow.max_hops) : undefined,
+                timeout: validatedRow.timeout ? parseInt(validatedRow.timeout) : undefined,
+                enableSSL: this.parseBoolean(validatedRow.enable_ssl),
+                enableSEO: this.parseBoolean(validatedRow.enable_seo),
+                enableSecurity: this.parseBoolean(validatedRow.enable_security),
+              },
+            };
+
+            results.push(parsedRow);
+          } catch (error) {
+            logger.warn('Invalid CSV row skipped:', { row, error: error instanceof Error ? error.message : 'Unknown error' });
+          }
+        })
+        .on('end', () => {
+          logger.info(`CSV parsing completed: ${results.length} valid URLs found`);
+          resolve(results);
+        })
+        .on('error', (error: Error) => {
+          logger.error('CSV parsing failed:', error);
+          reject(error);
+        });
+    });
+  }
+
+  /**
+   * Parse boolean values from CSV
+   */
+  private parseBoolean(value?: string): boolean | undefined {
+    if (!value) return undefined;
+    const normalized = value.toLowerCase().trim();
+    if (normalized === 'true' || normalized === '1' || normalized === 'yes') return true;
+    if (normalized === 'false' || normalized === '0' || normalized === 'no') return false;
+    return undefined;
+  }
+
+  /**
+   * Create a new bulk tracking job
+   */
+  async createBulkJob(
+    userId: string,
+    organizationId: string | undefined,
+    jobData: BulkJobCreateRequest
+  ): Promise<BulkTrackingJob> {
+    try {
+      // Validate input
+      const validatedData = BulkJobCreateSchema.parse(jobData);
+      
+      const jobId = `bulk_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
+      
+      // Create job record in database
+      const bulkJob = await prisma.bulkJob.create({
+        data: {
+          id: jobId,
+          userId,
+          organizationId,
+          projectId: validatedData.projectId,
+          status: 'pending',
+          totalUrls: validatedData.urls.length,
+          processedUrls: 0,
+          successfulUrls: 0,
+          failedUrls: 0,
+          configJson: JSON.stringify(validatedData.options),
+          urlsJson: JSON.stringify(validatedData.urls),
+        },
+      });
+
+      // Queue the job for processing
+      await this.trackingQueue.add(
+        'process-bulk-tracking',
+        {
+          jobId,
+          userId,
+          organizationId,
+          urls: validatedData.urls,
+          options: validatedData.options,
+        },
+        {
+          jobId,
+          delay: 0, // Start immediately
+        }
+      );
+
+      const job: BulkTrackingJob = {
+        id: jobId,
+        userId,
+        organizationId,
+        projectId: validatedData.projectId,
+        urls: validatedData.urls,
+        options: validatedData.options,
+        status: 'pending',
+        progress: {
+          total: validatedData.urls.length,
+          processed: 0,
+          successful: 0,
+          failed: 0,
+        },
+        createdAt: bulkJob.createdAt,
+      };
+
+      logger.info(`Bulk tracking job created: ${jobId}`, {
+        userId,
+        urlCount: validatedData.urls.length,
+        organizationId,
+      });
+
+      return job;
+    } catch (error) {
+      logger.error('Failed to create bulk job:', error);
+      throw new Error(`Failed to create bulk job: ${error instanceof Error ? error.message : 'Unknown error'}`);
+    }
+  }
+
+  /**
+   * Create bulk job from CSV file
+   */
+  async createBulkJobFromCsv(
+    userId: string,
+    organizationId: string | undefined,
+    filePath: string,
+    options: Partial<BulkJobCreateRequest['options']> = {}
+  ): Promise<BulkTrackingJob> {
+    try {
+      // Parse CSV file
+      const urls = await this.parseCsvFile(filePath);
+      
+      if (urls.length === 0) {
+        throw new Error('No valid URLs found in CSV file');
+      }
+
+      // Create job with parsed URLs
+      const jobData: BulkJobCreateRequest = {
+        urls,
+        options: {
+          method: 'GET',
+          maxHops: 10,
+          timeout: 15000,
+          enableSSLAnalysis: true,
+          enableSEOAnalysis: true,
+          enableSecurityAnalysis: true,
+          ...options,
+        },
+      };
+
+      const job = await this.createBulkJob(userId, organizationId, jobData);
+      
+      // Clean up uploaded file
+      await fs.unlink(filePath).catch(() => {});
+      
+      return job;
+    } catch (error) {
+      // Clean up uploaded file on error
+      await fs.unlink(filePath).catch(() => {});
+      throw error;
+    }
+  }
+
+  /**
+   * Get bulk job status and progress
+   */
+  async getBulkJob(jobId: string, userId: string): Promise<BulkTrackingJob | null> {
+    try {
+      const bulkJob = await prisma.bulkJob.findFirst({
+        where: {
+          id: jobId,
+          userId,
+        },
+      });
+
+      if (!bulkJob) {
+        return null;
+      }
+
+      // Get job progress from queue
+      const queueJob = await this.trackingQueue.getJob(jobId);
+      const progress = queueJob?.progress || 0;
+      
+      const job: BulkTrackingJob = {
+        id: bulkJob.id,
+        userId: bulkJob.userId,
+        organizationId: bulkJob.organizationId || undefined,
+        projectId: bulkJob.projectId || undefined,
+        urls: JSON.parse(bulkJob.urlsJson as string),
+        options: JSON.parse(bulkJob.configJson as string),
+        status: bulkJob.status as BulkTrackingJob['status'],
+        progress: {
+          total: bulkJob.totalUrls,
+          processed: bulkJob.processedUrls,
+          successful: bulkJob.successfulUrls,
+          failed: bulkJob.failedUrls,
+        },
+        results: bulkJob.resultsJson ? JSON.parse(bulkJob.resultsJson as string) : undefined,
+        createdAt: bulkJob.createdAt,
+        startedAt: bulkJob.startedAt || undefined,
+        finishedAt: bulkJob.finishedAt || undefined,
+        estimatedCompletionAt: this.calculateEstimatedCompletion(bulkJob),
+      };
+
+      return job;
+    } catch (error) {
+      logger.error('Failed to get bulk job:', error);
+      return null;
+    }
+  }
+
+  /**
+   * Calculate estimated completion time
+   */
+  private calculateEstimatedCompletion(bulkJob: any): Date | undefined {
+    if (!bulkJob.startedAt || bulkJob.status === 'completed' || bulkJob.status === 'failed') {
+      return undefined;
+    }
+
+    const elapsed = Date.now() - bulkJob.startedAt.getTime();
+    const processed = bulkJob.processedUrls;
+    const remaining = bulkJob.totalUrls - processed;
+
+    if (processed === 0) {
+      return undefined;
+    }
+
+    const avgTimePerUrl = elapsed / processed;
+    const estimatedRemainingTime = avgTimePerUrl * remaining;
+    
+    return new Date(Date.now() + estimatedRemainingTime);
+  }
+
+  /**
+   * Cancel a bulk job
+   */
+  async cancelBulkJob(jobId: string, userId: string): Promise<boolean> {
+    try {
+      // Update database status
+      await prisma.bulkJob.updateMany({
+        where: {
+          id: jobId,
+          userId,
+        },
+        data: {
+          status: 'cancelled',
+          finishedAt: new Date(),
+        },
+      });
+
+      // Remove job from queue
+      const queueJob = await this.trackingQueue.getJob(jobId);
+      if (queueJob) {
+        await queueJob.remove();
+      }
+
+      logger.info(`Bulk job cancelled: ${jobId}`, { userId });
+      return true;
+    } catch (error) {
+      logger.error('Failed to cancel bulk job:', error);
+      return false;
+    }
+  }
+
+  /**
+   * Get user's bulk jobs
+   */
+  async getUserBulkJobs(
+    userId: string,
+    limit = 20,
+    offset = 0
+  ): Promise<BulkTrackingJob[]> {
+    try {
+      const bulkJobs = await prisma.bulkJob.findMany({
+        where: { userId },
+        orderBy: { createdAt: 'desc' },
+        take: limit,
+        skip: offset,
+      });
+
+      return Promise.all(
+        bulkJobs.map(async (bulkJob) => {
+          const job: BulkTrackingJob = {
+            id: bulkJob.id,
+            userId: bulkJob.userId,
+            organizationId: bulkJob.organizationId || undefined,
+            projectId: bulkJob.projectId || undefined,
+            urls: JSON.parse(bulkJob.urlsJson as string),
+            options: JSON.parse(bulkJob.configJson as string),
+            status: bulkJob.status as BulkTrackingJob['status'],
+            progress: {
+              total: bulkJob.totalUrls,
+              processed: bulkJob.processedUrls,
+              successful: bulkJob.successfulUrls,
+              failed: bulkJob.failedUrls,
+            },
+            results: bulkJob.resultsJson ? JSON.parse(bulkJob.resultsJson as string) : undefined,
+            createdAt: bulkJob.createdAt,
+            startedAt: bulkJob.startedAt || undefined,
+            finishedAt: bulkJob.finishedAt || undefined,
+            estimatedCompletionAt: this.calculateEstimatedCompletion(bulkJob),
+          };
+          return job;
+        })
+      );
+    } catch (error) {
+      logger.error('Failed to get user bulk jobs:', error);
+      return [];
+    }
+  }
+
+  /**
+   * Export bulk job results to CSV
+   */
+  async exportResultsToCsv(jobId: string, userId: string): Promise<string> {
+    try {
+      const job = await this.getBulkJob(jobId, userId);
+      if (!job || !job.results) {
+        throw new Error('Job not found or no results available');
+      }
+
+      const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
+      const fileName = `bulk-results-${jobId}-${timestamp}.csv`;
+      const filePath = path.join(this.uploadsDir, fileName);
+
+      const csvWriter = createObjectCsvWriter({
+        path: filePath,
+        header: [
+          { id: 'url', title: 'URL' },
+          { id: 'label', title: 'Label' },
+          { id: 'status', title: 'Status' },
+          { id: 'checkId', title: 'Check ID' },
+          { id: 'error', title: 'Error' },
+          { id: 'startedAt', title: 'Started At' },
+          { id: 'finishedAt', title: 'Finished At' },
+          { id: 'durationMs', title: 'Duration (ms)' },
+        ],
+      });
+
+      const records = job.results.map(result => ({
+        url: result.url,
+        label: result.label || '',
+        status: result.status,
+        checkId: result.checkId || '',
+        error: result.error || '',
+        startedAt: result.timing.startedAt.toISOString(),
+        finishedAt: result.timing.finishedAt?.toISOString() || '',
+        durationMs: result.timing.durationMs || '',
+      }));
+
+      await csvWriter.writeRecords(records);
+      
+      logger.info(`Results exported to CSV: ${filePath}`, { jobId, userId });
+      return filePath;
+    } catch (error) {
+      logger.error('Failed to export results to CSV:', error);
+      throw new Error(`Failed to export results: ${error instanceof Error ? error.message : 'Unknown error'}`);
+    }
+  }
+
+  /**
+   * Clean up old bulk jobs and files
+   */
+  async cleanupOldJobs(maxAgeHours = 72): Promise<void> {
+    try {
+      const cutoff = new Date(Date.now() - (maxAgeHours * 60 * 60 * 1000));
+      
+      // Delete old jobs from database
+      const result = await prisma.bulkJob.deleteMany({
+        where: {
+          createdAt: {
+            lt: cutoff,
+          },
+          status: {
+            in: ['completed', 'failed', 'cancelled'],
+          },
+        },
+      });
+
+      // Clean up old files
+      try {
+        const files = await fs.readdir(this.uploadsDir);
+        for (const file of files) {
+          const filePath = path.join(this.uploadsDir, file);
+          const stats = await fs.stat(filePath);
+          
+          if (stats.mtime < cutoff) {
+            await fs.unlink(filePath);
+            logger.info(`Cleaned up old file: ${file}`);
+          }
+        }
+      } catch (error) {
+        logger.warn('Failed to cleanup old files:', error);
+      }
+
+      logger.info(`Cleaned up ${result.count} old bulk jobs`);
+    } catch (error) {
+      logger.error('Failed to cleanup old jobs:', error);
+    }
+  }
+
+  /**
+   * Get queue statistics
+   */
+  async getQueueStats(): Promise<{
+    waiting: number;
+    active: number;
+    completed: number;
+    failed: number;
+    delayed: number;
+  }> {
+    try {
+      const [waiting, active, completed, failed, delayed] = await Promise.all([
+        this.trackingQueue.getWaiting(),
+        this.trackingQueue.getActive(),
+        this.trackingQueue.getCompleted(),
+        this.trackingQueue.getFailed(),
+        this.trackingQueue.getDelayed(),
+      ]);
+
+      return {
+        waiting: waiting.length,
+        active: active.length,
+        completed: completed.length,
+        failed: failed.length,
+        delayed: delayed.length,
+      };
+    } catch (error) {
+      logger.error('Failed to get queue stats:', error);
+      return {
+        waiting: 0,
+        active: 0,
+        completed: 0,
+        failed: 0,
+        delayed: 0,
+      };
+    }
+  }
+}
+