/**
 * Retry Failed URLs Script (Direct Convex Version)
 *
 * This script finds URLs that failed during historical processing and retries them.
 * It calls Convex actions DIRECTLY, bypassing the Vercel API to avoid timeout issues.
 *
 * Usage:
 *   node retry-failed-urls.js --count      (count how many need retrying)
 *   node retry-failed-urls.js              (start retrying)
 *   node retry-failed-urls.js --status     (check progress)
 *   node retry-failed-urls.js --reset      (start over, keeps the cached URL list)
 *   node retry-failed-urls.js --reset-all  (start over and rebuild the URL list)
 *   node retry-failed-urls.js --test       (process up to 5 URLs as a smoke test)
 *
 * Optional environment overrides (fall back to the values below when unset):
 *   RETRY_SQL_PASSWORD  SQL Server password
 *   RETRY_API_KEY       API key sent in the X-API-Key header
 */
const sql = require('mssql');
const fs = require('fs');
const path = require('path');

// Prefix stripped from URLs when printing them in progress lines.
const ARTICLE_URL_PREFIX = 'https://proshare.co/articles/';

// Configuration
// NOTE(review): the SQL password and API key are committed in source. They are
// kept as fallbacks so existing invocations keep working, but they should be
// rotated and supplied through the environment variables instead.
const CONFIG = {
  sqlServer: {
    server: 'TYAURNE8ZS',
    port: 1433,
    database: 'ProshareErpdb',
    user: 'AskProshare',
    password: process.env.RETRY_SQL_PASSWORD ?? 'p*7Cn8h4rck@<-wC',
    options: {
      encrypt: false,
      trustServerCertificate: true,
    },
  },
  // Direct Convex HTTP endpoint URL (uses .convex.site for HTTP actions)
  convexUrl: 'https://pastel-hummingbird-582.convex.site',
  // Check endpoint still uses Vercel (lightweight check only)
  checkEndpoint: 'https://proshare-ask-proshare.vercel.app/api/sync/check-indexed',
  apiKey:
    process.env.RETRY_API_KEY ??
    '517f6402e63260bda41524c118103d1ca0912f91d357a92c667227fb06542a96',
  batchSize: 5, // URLs per batch (process sequentially)
  delayBetweenUrls: 2000, // 2 seconds between each URL
  delayBetweenBatches: 3000, // 3 seconds between batches
  progressFile: path.join(__dirname, '.retry-progress.json'),
  failedUrlsFile: path.join(__dirname, 'failed-urls.json'),
  completionMarker: path.join(__dirname, '.retry-complete'),
};

/**
 * Resolve after the given number of milliseconds.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Build a fresh progress record with every counter at zero.
 * @returns {object} Progress record in the shape persisted to CONFIG.progressFile.
 */
function createEmptyProgress() {
  const now = new Date().toISOString();
  return {
    totalToRetry: 0,
    retried: 0,
    succeeded: 0,
    stillFailing: 0,
    lastIndex: 0,
    startedAt: now,
    lastUpdated: now,
    errorReasons: {
      timeout: 0,
      cloudflare: 0,
      notFound: 0,
      contentTooShort: 0,
      serverError: 0,
      networkError: 0,
      convexError: 0,
      other: 0,
    },
  };
}

/**
 * Load saved progress from disk, or return a fresh record when none is usable.
 *
 * Saved values are merged over the defaults so that a progress file written
 * without some fields (for example `errorReasons`) cannot make later counter
 * updates throw.
 * @returns {object} Progress record.
 */
function loadProgress() {
  const defaults = createEmptyProgress();
  try {
    if (fs.existsSync(CONFIG.progressFile)) {
      const saved = JSON.parse(fs.readFileSync(CONFIG.progressFile, 'utf8'));
      return {
        ...defaults,
        ...saved,
        errorReasons: { ...defaults.errorReasons, ...saved.errorReasons },
      };
    }
  } catch (error) {
    console.log('[Retry] Could not load progress, starting fresh');
  }
  return defaults;
}

/**
 * Map an error message onto one of the `errorReasons` counter keys.
 *
 * Matching is by case-insensitive substring and the first matching category
 * wins, so the order of the checks below is significant.
 * @param {string} errorMsg - Error text; falsy values map to 'other'.
 * @returns {string} One of: timeout, cloudflare, notFound, contentTooShort,
 *   serverError, networkError, convexError, other.
 */
function categorizeError(errorMsg) {
  if (!errorMsg) return 'other';
  const msg = String(errorMsg).toLowerCase();
  if (msg.includes('timeout') || msg.includes('504') || msg.includes('timed out') || msg.includes('aborted')) {
    return 'timeout';
  }
  if (msg.includes('cloudflare') || msg.includes('just a moment') || msg.includes('captcha')) {
    return 'cloudflare';
  }
  if (msg.includes('404') || msg.includes('not found')) {
    return 'notFound';
  }
  if (msg.includes('content too short') || msg.includes('insufficient content') || msg.includes('too short')) {
    return 'contentTooShort';
  }
  if (msg.includes('500') || msg.includes('502') || msg.includes('503') || msg.includes('server error') || msg.includes('5xx')) {
    return 'serverError';
  }
  if (msg.includes('network') || msg.includes('econnrefused') || msg.includes('enotfound') || msg.includes('fetch failed')) {
    return 'networkError';
  }
  if (msg.includes('convex') || msg.includes('action') || msg.includes('function')) {
    return 'convexError';
  }
  return 'other';
}

/**
 * Stamp `lastUpdated` on the progress record and write it to CONFIG.progressFile.
 * @param {object} progress - Progress record; mutated to update `lastUpdated`.
 */
function saveProgress(progress) {
  progress.lastUpdated = new Date().toISOString();
  fs.writeFileSync(CONFIG.progressFile, JSON.stringify(progress, null, 2));
}

/** Write the completion marker file that makes later runs exit early. */
function markComplete() {
  fs.writeFileSync(
    CONFIG.completionMarker,
    JSON.stringify(
      {
        completedAt: new Date().toISOString(),
        message: 'Retry complete!',
      },
      null,
      2,
    ),
  );
}

/**
 * @returns {boolean} True when the completion marker file exists.
 */
function isComplete() {
  return fs.existsSync(CONFIG.completionMarker);
}

/**
 * Turn an arbitrary error value into a non-empty string.
 * @param {*} value - Error value (string, Error, object, or nothing).
 * @param {string} fallback - Text used when `value` carries no message.
 * @returns {string}
 */
function toErrorMessage(value, fallback) {
  if (!value) return fallback;
  if (typeof value === 'string') return value;
  if (value instanceof Error) return value.message || fallback;
  try {
    return JSON.stringify(value) ?? String(value);
  } catch {
    return String(value);
  }
}

/**
 * Return the list of URLs that need retrying.
 *
 * Uses the cached list in CONFIG.failedUrlsFile when it exists. Otherwise it
 * reads every article URL from SQL Server, asks the check endpoint which ones
 * are not indexed, and caches the result. A batch whose check request fails is
 * treated as entirely not indexed.
 * @returns {Promise<string[]>} URLs to retry.
 */
async function getFailedUrls() {
  // Check if we have a cached list
  if (fs.existsSync(CONFIG.failedUrlsFile)) {
    console.log('[Retry] Loading cached failed URLs list...');
    return JSON.parse(fs.readFileSync(CONFIG.failedUrlsFile, 'utf8'));
  }

  console.log('[Retry] Building failed URLs list from SQL Server...');
  console.log('[Retry] This compares SQL URLs with indexed documents in Convex...');

  // Connect to SQL Server
  const pool = await sql.connect(CONFIG.sqlServer);
  let allUrls;
  try {
    // Get all URLs from SQL Server
    console.log('[Retry] Fetching URLs from SQL Server...');
    const result = await pool.request().query(`
      SELECT DISTINCT loc as Url
      FROM dbo.vw_PublishedArticleUrls
      WHERE loc IS NOT NULL AND loc LIKE 'https://proshare.co/articles/%'
      ORDER BY loc
    `);
    allUrls = result.recordset.map((row) => row.Url);
  } finally {
    // Release the connection even when the query throws.
    await pool.close();
  }
  console.log(`[Retry] Found ${allUrls.length.toLocaleString()} URLs in SQL Server`);

  // Check which URLs are NOT indexed in Convex
  const failedUrls = [];
  const checkBatchSize = 100;
  console.log('[Retry] Checking which URLs are not indexed in Convex...');

  for (let i = 0; i < allUrls.length; i += checkBatchSize) {
    const batch = allUrls.slice(i, i + checkBatchSize);
    try {
      const response = await fetch(CONFIG.checkEndpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'X-API-Key': CONFIG.apiKey,
        },
        body: JSON.stringify({ urls: batch }),
      });
      if (response.ok) {
        const data = await response.json();
        failedUrls.push(...(data.notIndexed || []));
      } else {
        console.warn(`[Retry] Index check returned HTTP ${response.status}; treating ${batch.length} URLs as not indexed`);
        failedUrls.push(...batch);
      }
    } catch (error) {
      console.warn(`[Retry] Index check failed (${toErrorMessage(error, 'unknown error')}); treating ${batch.length} URLs as not indexed`);
      failedUrls.push(...batch);
    }

    if ((i + checkBatchSize) % 1000 === 0) {
      console.log(`[Retry] Checked ${Math.min(i + checkBatchSize, allUrls.length).toLocaleString()}/${allUrls.length.toLocaleString()} URLs...`);
    }
    await sleep(100);
  }

  console.log(`[Retry] Found ${failedUrls.length.toLocaleString()} URLs that need retrying`);

  // Cache the list
  fs.writeFileSync(CONFIG.failedUrlsFile, JSON.stringify(failedUrls, null, 2));
  return failedUrls;
}

/**
 * Process a single URL directly via the Convex HTTP endpoint.
 *
 * Never throws: every failure is reported through the returned object, and
 * `error` is always a string so callers can truncate and categorize it.
 * @param {string} url - Article URL to process.
 * @returns {Promise<{success: boolean, chunkCount?: number, skipped?: boolean, error?: string}>}
 */
async function processUrlDirect(url) {
  const convexActionUrl = `${CONFIG.convexUrl}/processUrl`;
  try {
    const response = await fetch(convexActionUrl, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'X-API-Key': CONFIG.apiKey,
      },
      body: JSON.stringify({
        url,
        lastmod: new Date().toISOString(),
      }),
    });

    if (!response.ok) {
      const text = await response.text();
      return {
        success: false,
        error: `HTTP ${response.status}: ${text.substring(0, 200)}`,
      };
    }

    const result = await response.json();
    if (result.status === 'processed') {
      return { success: true, chunkCount: result.chunkCount || 0 };
    }
    if (result.status === 'skipped') {
      return { success: true, skipped: true };
    }
    return {
      success: false,
      // The payload's error field is not guaranteed to be a string.
      error: toErrorMessage(result.error, 'Unknown processing error'),
    };
  } catch (error) {
    return {
      success: false,
      error: toErrorMessage(error, 'Network error'),
    };
  }
}

/**
 * Shorten an article URL for single-line progress output.
 * @param {string} url - Full article URL.
 * @returns {string} URL without the article prefix, capped at 50 characters.
 */
function shortenUrl(url) {
  return url.replace(ARTICLE_URL_PREFIX, '').substring(0, 50);
}

/**
 * Process a batch of URLs sequentially, updating the shared progress counters.
 * @param {string[]} urls - URLs in this batch.
 * @param {number} batchNumber - 1-based batch number, used for logging only.
 * @param {object} progress - Progress record; its counters are mutated in place.
 * @returns {Promise<{succeeded: number, failed: number, errors: string[]}>}
 */
async function processBatch(urls, batchNumber, progress) {
  console.log(`\n[Retry] ========== Batch ${batchNumber} (${urls.length} URLs) ==========`);

  let batchSucceeded = 0;
  let batchFailed = 0;
  const batchErrors = [];

  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    process.stdout.write(`[Retry] ${i + 1}/${urls.length}: ${shortenUrl(url)}... `);

    const result = await processUrlDirect(url);
    if (result.success) {
      if (result.skipped) {
        console.log('SKIPPED (already exists)');
      } else {
        console.log(`OK (${result.chunkCount} chunks)`);
      }
      batchSucceeded++;
      progress.succeeded++;
    } else {
      const errorText = String(result.error);
      console.log(`FAILED: ${errorText.substring(0, 50)}`);
      batchFailed++;
      progress.stillFailing++;
      batchErrors.push(`${url}: ${errorText}`);

      // Categorize the error
      const category = categorizeError(errorText);
      progress.errorReasons[category] = (progress.errorReasons[category] || 0) + 1;
    }
    progress.retried++;

    // Delay between URLs
    if (i < urls.length - 1) {
      await sleep(CONFIG.delayBetweenUrls);
    }
  }

  return {
    succeeded: batchSucceeded,
    failed: batchFailed,
    errors: batchErrors,
  };
}

/**
 * Format a duration for display.
 * @param {number} seconds - Duration in seconds.
 * @returns {string} e.g. '42s', '5m', '2h 10m'; 'unknown' for non-finite input.
 */
function formatTime(seconds) {
  if (!Number.isFinite(seconds)) return 'unknown';
  if (seconds < 60) return `${Math.round(seconds)}s`;
  if (seconds < 3600) return `${Math.round(seconds / 60)}m`;
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.round((seconds % 3600) / 60);
  return `${hours}h ${minutes}m`;
}

/**
 * Success percentage with one decimal place.
 * @param {object} progress - Progress record.
 * @returns {string} e.g. '87.5'; '0.0' when nothing has been retried yet.
 */
function formatSuccessRate(progress) {
  if (!(progress.retried > 0)) return '0.0';
  return ((progress.succeeded / progress.retried) * 100).toFixed(1);
}

/**
 * Print the per-category failure counts.
 * @param {object} errorReasons - Counter map keyed by error category.
 * @param {string} cloudflareLabel - Label text for the Cloudflare row.
 */
function printFailureBreakdown(errorReasons, cloudflareLabel) {
  const rows = [
    [' - Timeouts:', 'timeout'],
    [` - ${cloudflareLabel}:`, 'cloudflare'],
    [' - 404 Not Found:', 'notFound'],
    [' - Content too short:', 'contentTooShort'],
    [' - Server errors:', 'serverError'],
    [' - Network errors:', 'networkError'],
    [' - Convex errors:', 'convexError'],
    [' - Other:', 'other'],
  ];
  console.log('');
  console.log('Failure Breakdown:');
  for (const [label, key] of rows) {
    console.log(label, errorReasons[key]?.toLocaleString() || 0);
  }
}

/** Delete a file when it exists; do nothing otherwise. */
function removeIfExists(filePath) {
  if (fs.existsSync(filePath)) fs.unlinkSync(filePath);
}

/** Handle --status: print the saved progress counters. */
function printStatus() {
  const progress = loadProgress();
  const complete = isComplete();

  console.log('\n========================================');
  console.log('[Retry] Retry Status');
  console.log('========================================');
  console.log('Status:', complete ? 'COMPLETE' : 'In Progress');
  console.log('Total to retry:', progress.totalToRetry.toLocaleString());
  console.log('Retried:', progress.retried.toLocaleString());
  console.log('Succeeded:', progress.succeeded.toLocaleString());
  console.log('Still failing:', progress.stillFailing.toLocaleString());
  if (progress.retried > 0) {
    console.log('Success rate:', formatSuccessRate(progress) + '%');
  }
  console.log('Started:', progress.startedAt);
  console.log('Last updated:', progress.lastUpdated);
  if (progress.errorReasons && progress.stillFailing > 0) {
    printFailureBreakdown(progress.errorReasons, 'Cloudflare');
  }
  console.log('========================================\n');
}

/** Handle --test: process the first few failed URLs and report the outcome. */
async function runTestMode() {
  console.log('\n[Retry] TEST MODE - Processing 5 URLs to verify endpoint works\n');
  const failedUrls = await getFailedUrls();
  if (failedUrls.length === 0) {
    console.log('[Retry] No URLs to test!');
    return;
  }

  const testUrls = failedUrls.slice(0, 5);
  let succeeded = 0;
  let failed = 0;

  for (let i = 0; i < testUrls.length; i++) {
    const url = testUrls[i];
    // Use the real sample size: the list may hold fewer than 5 URLs.
    process.stdout.write(`[Test] ${i + 1}/${testUrls.length}: ${shortenUrl(url)}... `);

    const result = await processUrlDirect(url);
    if (result.success) {
      console.log(result.skipped ? 'SKIPPED' : `OK (${result.chunkCount} chunks)`);
      succeeded++;
    } else {
      console.log(`FAILED: ${result.error}`);
      failed++;
    }

    if (i < testUrls.length - 1) {
      await sleep(2000);
    }
  }

  console.log('\n========================================');
  console.log(`[Test] Results: ${succeeded} OK, ${failed} failed`);
  console.log('========================================\n');
}

/** Handle --count: print how many URLs need retrying. */
async function printCount() {
  console.log('\n[Retry] Counting failed URLs...\n');
  const failedUrls = await getFailedUrls();
  console.log('\n========================================');
  console.log(`[Retry] Total URLs needing retry: ${failedUrls.length.toLocaleString()}`);
  console.log('========================================\n');
}

/** Default mode: retry every failed URL in batches, resuming from saved progress. */
async function runRetry() {
  // Check if already complete
  if (isComplete()) {
    console.log('[Retry] Already complete. Use --reset to start over.');
    return;
  }

  console.log('========================================');
  console.log('[Retry] Starting Failed URL Retry (Direct Convex Mode)');
  console.log('[Retry] Time:', new Date().toISOString());
  console.log('[Retry] Batch size:', CONFIG.batchSize, 'URLs');
  console.log('[Retry] Delay between URLs:', CONFIG.delayBetweenUrls / 1000, 'seconds');
  console.log('========================================\n');

  // Get failed URLs
  const failedUrls = await getFailedUrls();
  if (failedUrls.length === 0) {
    console.log('[Retry] No URLs to retry!');
    markComplete();
    return;
  }

  const progress = loadProgress();
  progress.totalToRetry = failedUrls.length;
  saveProgress(progress);

  const startTime = Date.now();
  // `progress.retried` includes earlier runs; remember where this run started
  // so the throughput used for the ETA reflects this run only.
  const retriedBeforeRun = progress.retried;
  let batchNumber = Math.floor(progress.lastIndex / CONFIG.batchSize);

  // Process in batches
  for (let i = progress.lastIndex; i < failedUrls.length; i += CONFIG.batchSize) {
    batchNumber++;
    const batch = failedUrls.slice(i, i + CONFIG.batchSize);
    const result = await processBatch(batch, batchNumber, progress);

    // Update progress
    progress.lastIndex = i + batch.length;
    saveProgress(progress);

    // Print summary
    const elapsed = (Date.now() - startTime) / 1000;
    const percent = ((progress.lastIndex / failedUrls.length) * 100).toFixed(1);
    const rate = (progress.retried - retriedBeforeRun) / (elapsed || 1);
    const remaining = failedUrls.length - progress.lastIndex;
    const eta = rate > 0 ? remaining / rate : Infinity;
    const successRate = formatSuccessRate(progress);

    console.log(`\n[Retry] ---------- Batch ${batchNumber} Summary ----------`);
    console.log(`[Retry] Progress: ${percent}% (${progress.lastIndex.toLocaleString()}/${failedUrls.length.toLocaleString()})`);
    console.log(`[Retry] Total: ${progress.succeeded.toLocaleString()} succeeded | ${progress.stillFailing.toLocaleString()} failed | ${successRate}% success`);
    console.log(`[Retry] This batch: +${result.succeeded} OK, +${result.failed} failed`);
    console.log(`[Retry] Errors: T/O=${progress.errorReasons.timeout || 0} | 404=${progress.errorReasons.notFound || 0} | Short=${progress.errorReasons.contentTooShort || 0} | Net=${progress.errorReasons.networkError || 0} | CF=${progress.errorReasons.cloudflare || 0}`);
    console.log(`[Retry] ETA: ${formatTime(eta)} | Elapsed: ${formatTime(elapsed)}`);

    // Delay between batches
    if (progress.lastIndex < failedUrls.length) {
      console.log(`[Retry] Waiting ${CONFIG.delayBetweenBatches / 1000}s before next batch...`);
      await sleep(CONFIG.delayBetweenBatches);
    }
  }

  // Mark complete
  markComplete();

  console.log('\n========================================');
  console.log('[Retry] COMPLETE!');
  console.log('========================================');
  console.log('Total retried:', progress.retried.toLocaleString());
  console.log('Succeeded:', progress.succeeded.toLocaleString());
  console.log('Still failing:', progress.stillFailing.toLocaleString());
  console.log('Success rate:', formatSuccessRate(progress) + '%');
  console.log('Time taken:', formatTime((Date.now() - startTime) / 1000));
  printFailureBreakdown(progress.errorReasons, 'Cloudflare blocked');
  console.log('========================================\n');
}

/**
 * Entry point: dispatch on the first matching command-line flag, in the order
 * --status, --reset, --reset-all, --test, --count; with no flag, run the retry.
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);

  // Handle --status flag
  if (args.includes('--status')) {
    printStatus();
    return;
  }

  // Handle --reset flag (keeps the cached URL list)
  if (args.includes('--reset')) {
    removeIfExists(CONFIG.progressFile);
    removeIfExists(CONFIG.completionMarker);
    console.log('[Retry] Progress reset. Cached URL list preserved.');
    return;
  }

  // Handle --reset-all flag (deletes everything including cached URLs)
  if (args.includes('--reset-all')) {
    removeIfExists(CONFIG.progressFile);
    removeIfExists(CONFIG.failedUrlsFile);
    removeIfExists(CONFIG.completionMarker);
    console.log('[Retry] Everything reset. Will rebuild failed URLs list on next run.');
    return;
  }

  // Handle --test flag (process just 5 URLs to verify it works)
  if (args.includes('--test')) {
    await runTestMode();
    return;
  }

  // Handle --count flag
  if (args.includes('--count')) {
    await printCount();
    return;
  }

  await runRetry();
}

main().catch((error) => {
  console.error(error);
  // Report failure to the calling shell or scheduler instead of exiting 0.
  process.exitCode = 1;
});