url_tracker_tool/apps/api/src/services/bulk-processor.service.ts
Andrei 76f3cf22d1 fix: resolve production deployment issues and switch to in-memory rate limiting
- Fix CORS configuration to use CORS_ORIGIN env variable
- Switch from Redis-based to in-memory rate limiting for stability
- Fix frontend authentication error handling for public API
- Disable problematic trackingRateLimit middleware
- Update environment configuration for production

This resolves hanging issues with the tracking API and enables
frontend forms to work properly in production.
2025-08-19 18:25:48 +00:00
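
A minimal sketch of the in-memory rate limiting the commit switches to (hypothetical shape; the actual middleware and its limits are not shown in this file):

    // Fixed-window, in-memory limiter: per-key counters reset every windowMs.
    // State lives in process memory, so it needs no Redis but is per-instance.
    const hits = new Map<string, { count: number; resetAt: number }>();
    function allowRequest(key: string, limit = 100, windowMs = 60_000): boolean {
      const now = Date.now();
      const entry = hits.get(key);
      if (!entry || now >= entry.resetAt) {
        hits.set(key, { count: 1, resetAt: now + windowMs });
        return true;
      }
      entry.count += 1;
      return entry.count <= limit;
    }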

608 lines
18 KiB
TypeScript

/**
 * Bulk Processing Service for Redirect Intelligence v2
 *
 * Manages CSV upload, parsing, and bulk redirect analysis jobs
 */
import fs from 'fs/promises';
import { createReadStream } from 'fs';
import path from 'path';
import { Queue } from 'bullmq';
import IORedis from 'ioredis';
import csvParser from 'csv-parser';
import { createObjectCsvWriter } from 'csv-writer';
import { z } from 'zod';
import { logger } from '../lib/logger';
import { prisma } from '../lib/prisma';

// Job types and data structures
export interface BulkTrackingJob {
  id: string;
  userId: string;
  organizationId?: string;
  projectId?: string;
  urls: Array<{
    url: string;
    label?: string;
    metadata?: Record<string, any>;
  }>;
  options: {
    method: 'GET' | 'POST' | 'HEAD';
    userAgent?: string;
    maxHops: number;
    timeout: number;
    enableSSLAnalysis: boolean;
    enableSEOAnalysis: boolean;
    enableSecurityAnalysis: boolean;
    headers?: Record<string, string>;
  };
  status: 'PENDING' | 'QUEUED' | 'RUNNING' | 'COMPLETED' | 'FAILED' | 'CANCELLED' | 'ERROR';
  progress: {
    total: number;
    processed: number;
    successful: number;
    failed: number;
  };
  results?: Array<{
    url: string;
    label?: string;
    checkId?: string;
    status: 'success' | 'failed';
    error?: string;
    timing: {
      startedAt: Date;
      finishedAt?: Date;
      durationMs?: number;
    };
  }>;
  createdAt: Date;
  startedAt?: Date;
  finishedAt?: Date;
  estimatedCompletionAt?: Date;
}

// Validation schemas
const BulkJobCreateSchema = z.object({
  projectId: z.string().optional(),
  urls: z.array(z.object({
    url: z.string().url('Invalid URL format'),
    label: z.string().optional(),
    metadata: z.record(z.any()).optional(),
  })).min(1, 'At least one URL is required').max(1000, 'Maximum 1000 URLs per job'),
  options: z.object({
    method: z.enum(['GET', 'POST', 'HEAD']).default('GET'),
    userAgent: z.string().optional(),
    maxHops: z.number().min(1).max(20).default(10),
    timeout: z.number().min(1000).max(30000).default(15000),
    enableSSLAnalysis: z.boolean().default(true),
    enableSEOAnalysis: z.boolean().default(true),
    enableSecurityAnalysis: z.boolean().default(true),
    headers: z.record(z.string()).optional(),
  }).default({}),
});

const CsvRowSchema = z.object({
  url: z.string().min(1, 'URL is required'),
  label: z.string().optional(),
  method: z.enum(['GET', 'POST', 'HEAD']).optional(),
  user_agent: z.string().optional(),
  max_hops: z.string().optional(),
  timeout: z.string().optional(),
  enable_ssl: z.string().optional(),
  enable_seo: z.string().optional(),
  enable_security: z.string().optional(),
});

export type BulkJobCreateRequest = z.infer<typeof BulkJobCreateSchema>;
export type CsvRow = z.infer<typeof CsvRowSchema>;
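
// Illustrative payload for BulkJobCreateSchema (a sketch, not from this file):
// fields omitted from `options` fall back to the schema defaults above
// (GET, maxHops 10, timeout 15000, all analyses enabled).
//
//   const parsed = BulkJobCreateSchema.parse({
//     urls: [{ url: 'https://example.com', label: 'homepage' }],
//     options: { method: 'HEAD', maxHops: 5 },
//   });
//   // parsed.options.timeout === 15000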

export class BulkProcessorService {
  // Both are optional while the Redis-backed queue is disabled (see the
  // constructor); every access below is guarded against `undefined`.
  private redis?: IORedis;
  private trackingQueue?: Queue;
  private readonly uploadsDir: string;

  constructor() {
    // TEMPORARY: Disable Redis for bulk processing to avoid hangs
    // this.redis = new IORedis({
    //   host: process.env.REDIS_HOST || 'localhost',
    //   port: parseInt(process.env.REDIS_PORT || '6379', 10),
    //   enableReadyCheck: false,
    //   maxRetriesPerRequest: null,
    // });
    // this.trackingQueue = new Queue('bulk-tracking', {
    //   connection: this.redis,
    //   defaultJobOptions: {
    //     removeOnComplete: 100, // Keep last 100 completed jobs
    //     removeOnFail: 50, // Keep last 50 failed jobs
    //     attempts: 3,
    //     backoff: {
    //       type: 'exponential',
    //       delay: 2000,
    //     },
    //   },
    // });
    this.uploadsDir = path.join(process.cwd(), 'uploads');
    // Fire-and-forget: constructors cannot await, and a missing directory is
    // only logged, not fatal
    void this.ensureUploadsDirectory();
  }

  /**
   * Ensure uploads directory exists
   */
  private async ensureUploadsDirectory(): Promise<void> {
    try {
      await fs.mkdir(this.uploadsDir, { recursive: true });
    } catch (error) {
      logger.error('Failed to create uploads directory:', error);
    }
  }

  /**
   * Parse CSV file and extract URL data
   */
  async parseCsvFile(filePath: string): Promise<Array<{
    url: string;
    label?: string;
    metadata?: Record<string, any>;
  }>> {
    const results: Array<{ url: string; label?: string; metadata?: Record<string, any> }> = [];
    return new Promise((resolve, reject) => {
      createReadStream(filePath)
        .pipe(csvParser())
        .on('data', (row: any) => {
          try {
            // Validate and parse each row; invalid rows are skipped, not fatal
            const validatedRow = CsvRowSchema.parse(row);
            // Normalize URL: assume https:// when no scheme is given
            let url = validatedRow.url.trim();
            if (!url.startsWith('http://') && !url.startsWith('https://')) {
              url = `https://${url}`;
            }
            const parsedRow = {
              url,
              label: validatedRow.label?.trim() || undefined,
              metadata: {
                // Store additional CSV columns as metadata
                method: validatedRow.method || 'GET',
                userAgent: validatedRow.user_agent?.trim(),
                maxHops: validatedRow.max_hops ? parseInt(validatedRow.max_hops, 10) : undefined,
                timeout: validatedRow.timeout ? parseInt(validatedRow.timeout, 10) : undefined,
                enableSSL: this.parseBoolean(validatedRow.enable_ssl),
                enableSEO: this.parseBoolean(validatedRow.enable_seo),
                enableSecurity: this.parseBoolean(validatedRow.enable_security),
              },
            };
            results.push(parsedRow);
          } catch (error) {
            logger.warn('Invalid CSV row skipped:', { row, error: error instanceof Error ? error.message : 'Unknown error' });
          }
        })
        .on('end', () => {
          logger.info(`CSV parsing completed: ${results.length} valid URLs found`);
          resolve(results);
        })
        .on('error', (error: Error) => {
          logger.error('CSV parsing failed:', error);
          reject(error);
        });
    });
  }
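
  // Illustrative CSV input (a sketch of the shape CsvRowSchema accepts; only
  // the `url` column is required):
  //
  //   url,label,method,max_hops,enable_ssl
  //   example.com,Homepage,GET,10,true
  //   https://old.example.com/promo,Legacy promo,HEAD,5,false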

  /**
   * Parse boolean values from CSV
   */
  private parseBoolean(value?: string): boolean | undefined {
    if (!value) return undefined;
    const normalized = value.toLowerCase().trim();
    if (normalized === 'true' || normalized === '1' || normalized === 'yes') return true;
    if (normalized === 'false' || normalized === '0' || normalized === 'no') return false;
    return undefined;
  }

  /**
   * Create a new bulk tracking job
   */
  async createBulkJob(
    userId: string,
    organizationId: string | undefined,
    jobData: BulkJobCreateRequest,
    filePath?: string
  ): Promise<BulkTrackingJob> {
    try {
      // Validate input
      const validatedData = BulkJobCreateSchema.parse(jobData);
      const jobId = `bulk_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
      // Create job record in database
      const bulkJob = await prisma.bulkJob.create({
        data: {
          id: jobId,
          userId,
          organizationId: organizationId || null,
          projectId: validatedData.projectId || 'default-project',
          uploadPath: filePath || 'api',
          status: 'PENDING' as any,
          totalUrls: validatedData.urls.length,
          processedUrls: 0,
          successfulUrls: 0,
          failedUrls: 0,
          configJson: JSON.stringify(validatedData.options),
          urlsJson: JSON.stringify(validatedData.urls),
        } as any,
      });
      // Queue the job for processing (skipped while the Redis-backed queue
      // is disabled; the job record stays PENDING in the database)
      if (this.trackingQueue) {
        await this.trackingQueue.add(
          'process-bulk-tracking',
          {
            jobId,
            userId,
            organizationId,
            urls: validatedData.urls,
            options: validatedData.options,
          },
          {
            jobId,
            delay: 0, // Start immediately
          }
        );
      }
      const job: BulkTrackingJob = {
        id: jobId,
        userId,
        organizationId,
        projectId: validatedData.projectId,
        urls: validatedData.urls as any,
        options: validatedData.options as any,
        status: 'PENDING',
        progress: {
          total: validatedData.urls.length,
          processed: 0,
          successful: 0,
          failed: 0,
        },
        createdAt: bulkJob.createdAt,
      };
      logger.info(`Bulk tracking job created: ${jobId}`, {
        userId,
        urlCount: validatedData.urls.length,
        organizationId,
      });
      return job;
    } catch (error) {
      logger.error('Failed to create bulk job:', error);
      throw new Error(`Failed to create bulk job: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Create bulk job from CSV file
   */
  async createBulkJobFromCsv(
    userId: string,
    organizationId: string | undefined,
    filePath: string,
    projectId: string,
    options: Partial<BulkJobCreateRequest['options']> = {}
  ): Promise<BulkTrackingJob> {
    try {
      // Parse CSV file
      const urls = await this.parseCsvFile(filePath);
      if (urls.length === 0) {
        throw new Error('No valid URLs found in CSV file');
      }
      // Create job with parsed URLs
      const jobData: BulkJobCreateRequest = {
        urls,
        options: {
          method: 'GET',
          maxHops: 10,
          timeout: 15000,
          enableSSLAnalysis: true,
          enableSEOAnalysis: true,
          enableSecurityAnalysis: true,
          ...options,
        },
        projectId,
      };
      const job = await this.createBulkJob(userId, organizationId, jobData, filePath);
      // Clean up uploaded file
      await fs.unlink(filePath).catch(() => {});
      return job;
    } catch (error) {
      // Clean up uploaded file on error
      await fs.unlink(filePath).catch(() => {});
      throw error;
    }
  }

  /**
   * Get bulk job status and progress
   */
  async getBulkJob(jobId: string, userId: string): Promise<BulkTrackingJob | null> {
    try {
      const bulkJob = await prisma.bulkJob.findFirst({
        where: {
          id: jobId,
          userId,
        },
      });
      if (!bulkJob) {
        return null;
      }
      const job: BulkTrackingJob = {
        id: bulkJob.id,
        userId: bulkJob.userId,
        ...(bulkJob.organizationId ? { organizationId: bulkJob.organizationId } : {}),
        ...(bulkJob.projectId ? { projectId: bulkJob.projectId } : {}),
        urls: JSON.parse(bulkJob.urlsJson as string),
        options: JSON.parse(bulkJob.configJson as string),
        status: bulkJob.status as BulkTrackingJob['status'],
        // Progress comes from the database columns; the queue itself is not
        // consulted here (it may be disabled entirely)
        progress: {
          total: bulkJob.totalUrls,
          processed: bulkJob.processedUrls,
          successful: bulkJob.successfulUrls,
          failed: bulkJob.failedUrls,
        },
        results: bulkJob.resultsJson ? JSON.parse(bulkJob.resultsJson as string) : undefined,
        createdAt: bulkJob.createdAt,
        startedAt: bulkJob.startedAt || undefined,
        finishedAt: bulkJob.finishedAt || undefined,
        estimatedCompletionAt: this.calculateEstimatedCompletion(bulkJob),
      };
      return job;
    } catch (error) {
      logger.error('Failed to get bulk job:', error);
      return null;
    }
  }

  /**
   * Calculate estimated completion time
   */
  private calculateEstimatedCompletion(bulkJob: any): Date | undefined {
    if (!bulkJob.startedAt || bulkJob.status === 'COMPLETED' || bulkJob.status === 'FAILED') {
      return undefined;
    }
    const elapsed = Date.now() - bulkJob.startedAt.getTime();
    const processed = bulkJob.processedUrls;
    const remaining = bulkJob.totalUrls - processed;
    if (processed === 0) {
      return undefined;
    }
    const avgTimePerUrl = elapsed / processed;
    const estimatedRemainingTime = avgTimePerUrl * remaining;
    return new Date(Date.now() + estimatedRemainingTime);
  }
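
  // Worked example (illustrative numbers): a job started 100 s ago with
  // 200 of 500 URLs processed averages 0.5 s per URL, so the remaining
  // 300 URLs are estimated at 150 s, i.e. completion ~150 s from now.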

  /**
   * Cancel a bulk job
   */
  async cancelBulkJob(jobId: string, userId: string): Promise<boolean> {
    try {
      // Update database status
      await prisma.bulkJob.updateMany({
        where: {
          id: jobId,
          userId,
        },
        data: {
          status: 'CANCELLED',
          finishedAt: new Date(),
        },
      });
      // Remove job from queue, if a queue is configured
      const queueJob = this.trackingQueue ? await this.trackingQueue.getJob(jobId) : undefined;
      if (queueJob) {
        await queueJob.remove();
      }
      logger.info(`Bulk job cancelled: ${jobId}`, { userId });
      return true;
    } catch (error) {
      logger.error('Failed to cancel bulk job:', error);
      return false;
    }
  }

  /**
   * Get user's bulk jobs
   */
  async getUserBulkJobs(
    userId: string,
    limit = 20,
    offset = 0
  ): Promise<BulkTrackingJob[]> {
    try {
      const bulkJobs = await prisma.bulkJob.findMany({
        where: { userId },
        orderBy: { createdAt: 'desc' },
        take: limit,
        skip: offset,
      });
      // The mapping is synchronous, so no Promise.all is needed
      return bulkJobs.map((bulkJob) => {
        const job: BulkTrackingJob = {
          id: bulkJob.id,
          userId: bulkJob.userId,
          ...(bulkJob.organizationId ? { organizationId: bulkJob.organizationId } : {}),
          ...(bulkJob.projectId ? { projectId: bulkJob.projectId } : {}),
          urls: JSON.parse(bulkJob.urlsJson as string),
          options: JSON.parse(bulkJob.configJson as string),
          status: bulkJob.status as BulkTrackingJob['status'],
          progress: {
            total: bulkJob.totalUrls,
            processed: bulkJob.processedUrls,
            successful: bulkJob.successfulUrls,
            failed: bulkJob.failedUrls,
          },
          results: bulkJob.resultsJson ? JSON.parse(bulkJob.resultsJson as string) : undefined,
          createdAt: bulkJob.createdAt,
          startedAt: bulkJob.startedAt || undefined,
          finishedAt: bulkJob.finishedAt || undefined,
          estimatedCompletionAt: this.calculateEstimatedCompletion(bulkJob),
        };
        return job;
      });
    } catch (error) {
      logger.error('Failed to get user bulk jobs:', error);
      return [];
    }
  }

  /**
   * Export bulk job results to CSV
   */
  async exportResultsToCsv(jobId: string, userId: string): Promise<string> {
    try {
      const job = await this.getBulkJob(jobId, userId);
      if (!job || !job.results) {
        throw new Error('Job not found or no results available');
      }
      const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
      const fileName = `bulk-results-${jobId}-${timestamp}.csv`;
      const filePath = path.join(this.uploadsDir, fileName);
      const csvWriter = createObjectCsvWriter({
        path: filePath,
        header: [
          { id: 'url', title: 'URL' },
          { id: 'label', title: 'Label' },
          { id: 'status', title: 'Status' },
          { id: 'checkId', title: 'Check ID' },
          { id: 'error', title: 'Error' },
          { id: 'startedAt', title: 'Started At' },
          { id: 'finishedAt', title: 'Finished At' },
          { id: 'durationMs', title: 'Duration (ms)' },
        ],
      });
      // Timing values come from JSON.parse, so at runtime they are ISO strings
      // rather than Date instances; re-wrap in Date before calling toISOString()
      const records = job.results.map(result => ({
        url: result.url,
        label: result.label || '',
        status: result.status,
        checkId: result.checkId || '',
        error: result.error || '',
        startedAt: new Date(result.timing.startedAt).toISOString(),
        finishedAt: result.timing.finishedAt ? new Date(result.timing.finishedAt).toISOString() : '',
        durationMs: result.timing.durationMs || '',
      }));
      await csvWriter.writeRecords(records);
      logger.info(`Results exported to CSV: ${filePath}`, { jobId, userId });
      return filePath;
    } catch (error) {
      logger.error('Failed to export results to CSV:', error);
      throw new Error(`Failed to export results: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }
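
  // Illustrative download handler (a sketch; the Express `res` object and the
  // surrounding route are assumptions, not part of this service):
  //
  //   const filePath = await bulkProcessor.exportResultsToCsv(jobId, userId);
  //   res.download(filePath);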

  /**
   * Clean up old bulk jobs and files
   */
  async cleanupOldJobs(maxAgeHours = 72): Promise<void> {
    try {
      const cutoff = new Date(Date.now() - maxAgeHours * 60 * 60 * 1000);
      // Delete old jobs from database
      const result = await prisma.bulkJob.deleteMany({
        where: {
          createdAt: {
            lt: cutoff,
          },
          status: {
            in: ['COMPLETED', 'FAILED', 'CANCELLED'],
          },
        },
      });
      // Clean up old files
      try {
        const files = await fs.readdir(this.uploadsDir);
        for (const file of files) {
          const filePath = path.join(this.uploadsDir, file);
          const stats = await fs.stat(filePath);
          if (stats.mtime < cutoff) {
            await fs.unlink(filePath);
            logger.info(`Cleaned up old file: ${file}`);
          }
        }
      } catch (error) {
        logger.warn('Failed to cleanup old files:', error);
      }
      logger.info(`Cleaned up ${result.count} old bulk jobs`);
    } catch (error) {
      logger.error('Failed to cleanup old jobs:', error);
    }
  }
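
  // Illustrative scheduling (a sketch; where and how often this runs is an
  // assumption, not shown in this file):
  //
  //   // During app bootstrap, prune completed jobs and files once an hour
  //   setInterval(() => bulkProcessor.cleanupOldJobs(72), 60 * 60 * 1000);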

  /**
   * Get queue statistics
   */
  async getQueueStats(): Promise<{
    waiting: number;
    active: number;
    completed: number;
    failed: number;
    delayed: number;
  }> {
    const emptyStats = { waiting: 0, active: 0, completed: 0, failed: 0, delayed: 0 };
    // While the Redis-backed queue is disabled, report empty stats instead of
    // dereferencing an undefined queue
    if (!this.trackingQueue) {
      return emptyStats;
    }
    try {
      const [waiting, active, completed, failed, delayed] = await Promise.all([
        this.trackingQueue.getWaiting(),
        this.trackingQueue.getActive(),
        this.trackingQueue.getCompleted(),
        this.trackingQueue.getFailed(),
        this.trackingQueue.getDelayed(),
      ]);
      return {
        waiting: waiting.length,
        active: active.length,
        completed: completed.length,
        failed: failed.length,
        delayed: delayed.length,
      };
    } catch (error) {
      logger.error('Failed to get queue stats:', error);
      return emptyStats;
    }
  }
}
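
// Example wiring (a sketch; the route shape, `req.file`, and `user` are
// assumptions about the surrounding API, not part of this file):
//
//   const bulkProcessor = new BulkProcessorService();
//
//   // POST /bulk/csv — multer puts the uploaded CSV on req.file
//   const job = await bulkProcessor.createBulkJobFromCsv(
//     user.id,
//     user.organizationId,
//     req.file.path,
//     'default-project',
//     { maxHops: 5 }
//   );
//   res.json({ jobId: job.id, status: job.status, total: job.progress.total });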