1
2import { Actor } from 'apify';
3import { CheerioCrawler } from 'crawlee';
4
5
/**
 * Prints the end-of-run usage report to the console.
 *
 * @param {number} eventsProcessed - Number of pages handled (billable events).
 * @param {number} videosFound - Number of video links recorded.
 * @param {number} failedRequests - Number of requests that failed.
 * @param {number} startTime - Run start as a `Date.now()` timestamp (ms).
 * @param {number} maxPages - Configured page/event limit for the run.
 * @returns {void}
 */
function logUsageSummary(eventsProcessed, videosFound, failedRequests, startTime, maxPages) {
    const duration = Math.round((Date.now() - startTime) / 1000);

    // videosFound can be non-zero while eventsProcessed is still 0 (videos are
    // also counted from failed requests), so the divisor must be guarded or
    // the report would print "Infinity".
    const efficiency = eventsProcessed > 0 && videosFound > 0
        ? (videosFound / eventsProcessed).toFixed(2)
        : '0';

    console.log('\n🔍 === FINAL USAGE SUMMARY ===');
    console.log(`📊 Total Events Processed: ${eventsProcessed} / ${maxPages}`);
    console.log(`🎥 Videos Found: ${videosFound}`);
    console.log(`❌ Failed Requests: ${failedRequests}`);
    console.log(`⏱️ Duration: ${duration} seconds`);
    console.log(`🎯 Efficiency: ${efficiency} videos per event`);
    console.log(`💰 Billable Events: ${eventsProcessed}`);

    if (eventsProcessed < maxPages) {
        console.log(`💡 You used ${maxPages - eventsProcessed} fewer events than your limit`);
    }

    console.log('==============================\n');
}
25
26
/**
 * Starts a periodic console report of how much of the page budget is used.
 *
 * @param {number} maxPages - Configured page/event limit for the run.
 * @param {() => number} getEventsProcessed - Returns the current event count.
 * @param {number} [intervalMs=30000] - Delay between reports in milliseconds.
 * @returns {NodeJS.Timeout} Interval handle; pass it to `clearInterval` to stop.
 */
function startUsageMonitor(maxPages, getEventsProcessed, intervalMs = 30000) {
    const monitorInterval = setInterval(() => {
        const eventsProcessed = getEventsProcessed();
        // Stay silent until at least one event has been processed.
        if (eventsProcessed > 0) {
            const percentage = Math.round((eventsProcessed / maxPages) * 100);
            console.log(`📈 Usage: ${eventsProcessed}/${maxPages} events (${percentage}%)`);
        }
    }, intervalMs);

    // The monitor is purely informational: it must never be the only thing
    // keeping the process alive if the caller fails before clearing it.
    monitorInterval.unref?.();

    return monitorInterval;
}
38
39
// Actor entry point: reads the input, crawls from the start URL with a
// CheerioCrawler, stores every video link in the default dataset, then exports
// the results and prints a usage summary.
await Actor.main(async () => {
    // Run-wide counters shared by all request handlers.
    let eventsProcessed = 0;
    let videosFound = 0;
    let failedRequests = 0;
    const startTime = Date.now();

    const input = await Actor.getInput();

    if (!input || !input.startUrl) {
        throw new Error('Start URL is required');
    }

    const {
        startUrl,
        linkRegex = '.*',
        videoRegex = '\\.(mp4|avi|mov|mkv|webm|m4v)$',
        maxCrawlDepth = 3,
        maxPages = 100,
        outputFormat = 'JSON',
    } = input;

    // Validate user-supplied patterns up front so a typo fails the run
    // immediately instead of failing every single request.
    for (const [name, pattern] of [['linkRegex', linkRegex], ['videoRegex', videoRegex]]) {
        try {
            new RegExp(pattern, 'i');
        } catch (err) {
            throw new Error(`Invalid ${name}: ${pattern}`, { cause: err });
        }
    }

    // Compiled once for the failed-request handler (no `g` flag, so stateless).
    const videoRegexPattern = new RegExp(videoRegex, 'i');

    // Exactly one event number triggers the "approaching limit" warning.
    const warningThreshold = Math.ceil(maxPages * 0.8);

    console.log('=== USAGE TRACKING ===');
    console.log(`Maximum pages to crawl: ${maxPages}`);
    console.log(`Each page = 1 billable event`);
    console.log(`Estimated maximum cost: ${maxPages} events`);
    console.log('=====================');

    console.log('Starting Video Download Link Crawler...');
    console.log(`Start URL: ${startUrl}`);
    console.log(`Video Regex: ${videoRegex}`);
    console.log(`Max Depth: ${maxCrawlDepth}`);
    console.log(`Max Pages: ${maxPages}`);

    const requestQueue = await Actor.openRequestQueue();

    await requestQueue.addRequest({
        url: startUrl,
        userData: { depth: 0 },
    });

    const dataset = await Actor.openDataset();

    const crawler = new CheerioCrawler({
        requestQueue,
        maxRequestsPerCrawl: maxPages,
        async requestHandler({ request, $ }) {
            const { url } = request;
            const { depth } = request.userData;

            // Claim this handler's event number synchronously. Handlers run
            // concurrently, so re-reading the shared counter after an `await`
            // could yield a number that belongs to another page.
            const eventNumber = ++eventsProcessed;

            console.log(`Event #${eventNumber}: Processing ${url} (depth: ${depth})`);

            if (eventNumber % 10 === 0) {
                console.log(`📊 PROGRESS: ${eventNumber}/${maxPages} events processed`);
            }

            if (eventNumber === warningThreshold) {
                console.log(`⚠️ WARNING: Approaching usage limit (${eventNumber}/${maxPages} events)`);
            }

            const videoLinks = await extractVideoLinks($, url, videoRegex);

            console.log(`Found ${videoLinks.length} video links on ${url}`);
            videosFound += videoLinks.length;

            // Store all records of this page in a single dataset call.
            if (videoLinks.length > 0) {
                await dataset.pushData(videoLinks.map((videoLink) => ({
                    sourceUrl: url,
                    videoUrl: videoLink.url,
                    title: videoLink.title,
                    fileSize: videoLink.fileSize,
                    format: videoLink.format,
                    foundAt: new Date().toISOString(),
                    depth,
                    eventNumber,
                })));
            }

            if (depth < maxCrawlDepth) {
                const links = await extractLinks($, url, linkRegex, videoRegex);

                console.log(`Found ${links.length} links to follow from ${url}`);

                for (const link of links) {
                    await requestQueue.addRequest({
                        url: link,
                        // sourceUrl lets the failed-request handler report
                        // where a link was discovered.
                        userData: { depth: depth + 1, sourceUrl: url },
                    });
                }
            }
        },
        async failedRequestHandler({ request }) {
            failedRequests++;
            console.error(`❌ Request failed (not counted as event): ${request.url}`);
            console.log(`Failed requests so far: ${failedRequests}`);

            // A failed URL that matches the video pattern is still recorded
            // as a video link.
            if (videoRegexPattern.test(request.url)) {
                console.log(`🎥 Recording failed request as video link: ${request.url}`);

                const urlParts = request.url.split('/');
                const filename = urlParts[urlParts.length - 1];
                const format = getVideoFormat(request.url);

                videosFound++;

                await dataset.pushData({
                    sourceUrl: request.userData.sourceUrl ?? 'Unknown',
                    videoUrl: request.url,
                    title: filename.replace(/\.[^/.]+$/, ''),
                    fileSize: null,
                    format,
                    foundAt: new Date().toISOString(),
                    depth: request.userData.depth ?? 0,
                    note: 'Found as direct video link',
                    eventNumber: 'N/A (failed request)',
                });
            }
        },
    });

    const usageMonitor = startUsageMonitor(maxPages, () => eventsProcessed);

    // run() resolves with the crawler's final statistics; always stop the
    // monitor, even when the crawl throws.
    let finalStats;
    try {
        finalStats = (await crawler.run()) ?? {};
    } finally {
        clearInterval(usageMonitor);
    }

    console.log(`Debug: eventsProcessed = ${eventsProcessed}, failedRequests = ${failedRequests}`);

    console.log('Crawler stats for reference:', {
        requestsFinished: finalStats.requestsFinished,
        requestsFailed: finalStats.requestsFailed,
        requestsTotal: finalStats.requestsTotal,
    });

    const results = await dataset.getData();
    await exportResults(results.items, outputFormat, eventsProcessed, videosFound);

    logUsageSummary(eventsProcessed, videosFound, failedRequests, startTime, maxPages);

    console.log(`✅ Crawling completed! Found ${results.items.length} video links.`);
});
209
210
/**
 * Collects video links from a parsed page.
 *
 * Anchors, `<video>`/`<source>` elements and iframe/embed elements are
 * scanned; every URL is resolved against `baseUrl` and kept only when it
 * matches `videoRegex` (case-insensitive). The first occurrence of each URL
 * wins.
 *
 * @param {Function} $ - Cheerio root function for the page.
 * @param {string} baseUrl - URL of the page, used to resolve relative links.
 * @param {string} videoRegex - Pattern source that a video URL must match.
 * @returns {Promise<Array<{url: string, title: string, fileSize: null, format: string}>>}
 *   Unique video links in discovery order.
 */
async function extractVideoLinks($, baseUrl, videoRegex) {
    const regex = new RegExp(videoRegex, 'i');

    // One entry per kind of element that may carry a video URL.
    const rules = [
        {
            selector: 'a[href]',
            attribute: 'href',
            warning: 'Invalid URL',
            getTitle: (el) => el.text().trim() || el.attr('title') || 'Unknown',
        },
        {
            selector: 'video source[src], video[src]',
            attribute: 'src',
            warning: 'Invalid video URL',
            // Use the title of the <video> this source belongs to, not the
            // first <video> on the page.
            getTitle: (el) => el.closest('video').attr('title') || 'Video',
        },
        {
            // Every anchor matched here is also matched by `a[href]` above,
            // so after de-duplication this rule adds no new entries.
            selector: 'a[href*="download"], a[href*="sample"], a[href*="video"]',
            attribute: 'href',
            warning: 'Invalid video link URL',
            getTitle: (el) => el.text().trim() || el.attr('title') || el.attr('alt') || 'Video File',
        },
        {
            selector: 'iframe[src*="video"], embed[src*="video"]',
            attribute: 'src',
            warning: 'Invalid embedded video URL',
            getTitle: (el) => el.attr('title') || 'Embedded Video',
        },
    ];

    // Map keyed by URL keeps insertion order and drops duplicates.
    const videosByUrl = new Map();

    try {
        for (const { selector, attribute, warning, getTitle } of rules) {
            $(selector).each((index, element) => {
                const el = $(element);
                const rawUrl = el.attr(attribute);
                if (!rawUrl) return;

                let absoluteUrl;
                try {
                    absoluteUrl = new URL(rawUrl, baseUrl).href;
                } catch {
                    console.warn(`${warning}: ${rawUrl}`);
                    return;
                }

                if (!regex.test(absoluteUrl) || videosByUrl.has(absoluteUrl)) return;

                videosByUrl.set(absoluteUrl, {
                    url: absoluteUrl,
                    title: getTitle(el),
                    fileSize: null,
                    format: getVideoFormat(absoluteUrl),
                });
            });
        }
    } catch (error) {
        // Best effort: report the problem and return what was collected.
        console.error('Error extracting video links:', error);
    }

    return [...videosByUrl.values()];
}
320
321
/**
 * Gathers the links on a page that the crawler should follow next.
 *
 * Each anchor's href is resolved against `baseUrl`. Links matching
 * `videoRegex` are logged and skipped, as are links that do not match
 * `linkRegex`, do not start with "http", or whose path ends in a known
 * download extension (pdf, zip, exe, dmg, pkg, deb, rpm).
 *
 * @param {Function} $ - Cheerio root function for the page.
 * @param {string} baseUrl - URL of the page, used to resolve relative links.
 * @param {string} linkRegex - Pattern source a followable URL must match.
 * @param {string} videoRegex - Pattern source identifying direct video URLs.
 * @returns {Promise<string[]>} Unique absolute URLs in discovery order.
 */
async function extractLinks($, baseUrl, linkRegex, videoRegex) {
    const followPattern = new RegExp(linkRegex, 'i');
    const videoPattern = new RegExp(videoRegex, 'i');
    const collected = new Set();

    // Resolves one href and records it when it qualifies for crawling.
    const considerHref = (href) => {
        const target = new URL(href, baseUrl);
        const absoluteUrl = target.href;

        if (videoPattern.test(absoluteUrl)) {
            console.log(`Found direct video link (not crawling): ${absoluteUrl}`);
            return;
        }

        const isCrawlable = followPattern.test(absoluteUrl)
            && absoluteUrl.startsWith('http')
            && !/\.(pdf|zip|exe|dmg|pkg|deb|rpm)$/i.test(target.pathname);

        if (isCrawlable) {
            collected.add(absoluteUrl);
        }
    };

    try {
        $('a[href]').each((position, anchor) => {
            const href = $(anchor).attr('href');
            if (href) {
                try {
                    considerHref(href);
                } catch {
                    console.warn(`Invalid link URL: ${href}`);
                }
            }
        });
    } catch (error) {
        console.error('Error extracting links:', error);
    }

    return [...collected];
}
360
361
/**
 * Derives the lower-cased file extension of a video URL.
 *
 * Only the last path segment is inspected, so host names, query strings and
 * fragments never leak into the result.
 *
 * @param {string} url - Absolute URL, or a relative path / file name.
 * @returns {string} The extension without the dot, or 'unknown' if none.
 */
function getVideoFormat(url) {
    let path;
    try {
        path = new URL(url).pathname;
    } catch {
        // Not an absolute URL: strip any query string / fragment by hand.
        path = String(url).split(/[?#]/, 1)[0];
    }

    const lastSegment = path.slice(path.lastIndexOf('/') + 1);
    const dotIndex = lastSegment.lastIndexOf('.');

    // No dot, or a trailing dot, means there is no extension to report.
    if (dotIndex === -1 || dotIndex === lastSegment.length - 1) {
        return 'unknown';
    }

    return lastSegment.slice(dotIndex + 1).toLowerCase();
}
366
367
/**
 * Saves the crawl results and a usage summary to the key-value store.
 *
 * The results are written as OUTPUT.csv / OUTPUT.html / OUTPUT.xml for the
 * matching format (case-insensitive) and as OUTPUT.json for anything else.
 * USAGE_SUMMARY.json is written for every format. Errors are logged, not
 * rethrown.
 *
 * @param {object[]} results - Dataset items to export.
 * @param {string} format - Requested format: 'JSON', 'CSV', 'HTML' or 'XML'.
 * @param {number} eventsProcessed - Number of pages handled during the run.
 * @param {number} videosFound - Number of video links counted during the run.
 * @returns {Promise<void>}
 */
async function exportResults(results, format, eventsProcessed, videosFound) {
    try {
        const requested = String(format ?? 'JSON').toUpperCase();
        // Anything unrecognised is exported as JSON; report what was written.
        const effectiveFormat = ['CSV', 'HTML', 'XML'].includes(requested) ? requested : 'JSON';

        const metadata = {
            totalEvents: eventsProcessed,
            totalVideos: results.length,
            videosFound,
            exportedAt: new Date().toISOString(),
            format: effectiveFormat,
        };

        // String payloads need an explicit content type; without one the
        // store serialises the value as JSON.
        switch (effectiveFormat) {
            case 'CSV':
                await Actor.setValue('OUTPUT.csv', convertToCSV(results), {
                    contentType: 'text/csv; charset=utf-8',
                });
                break;
            case 'HTML':
                await Actor.setValue('OUTPUT.html', convertToHTML(results, metadata), {
                    contentType: 'text/html; charset=utf-8',
                });
                break;
            case 'XML':
                await Actor.setValue('OUTPUT.xml', convertToXML(results, metadata), {
                    contentType: 'application/xml; charset=utf-8',
                });
                break;
            default:
                await Actor.setValue('OUTPUT.json', results);
        }

        // Saved for every format so the log line below is always true.
        await Actor.setValue('USAGE_SUMMARY.json', metadata);

        console.log(`✅ Results exported in ${effectiveFormat} format`);
        console.log(`📊 Usage summary saved as USAGE_SUMMARY.json`);
    } catch (error) {
        console.error('Error exporting results:', error);
    }
}
400
401
/**
 * Serialises an array of flat records as CSV text.
 *
 * The header row is the union of all keys in first-seen order, so a column
 * that only appears in later rows is not dropped. Every cell is quoted and
 * embedded quotes are doubled; `null`/`undefined` become empty cells while
 * `0` and `false` are preserved.
 *
 * @param {object[]} data - Records to serialise.
 * @returns {string} CSV text, or '' when there are no records.
 */
function convertToCSV(data) {
    if (!Array.isArray(data) || data.length === 0) return '';

    const headers = [...new Set(data.flatMap((row) => Object.keys(row)))];

    // `??` rather than `||` so falsy-but-real values such as depth 0 survive.
    const toCell = (value) => `"${String(value ?? '').replace(/"/g, '""')}"`;

    const lines = [
        headers.join(','),
        ...data.map((row) => headers.map((header) => toCell(row[header])).join(',')),
    ];

    return lines.join('\n');
}
415
416
/**
 * Renders the results as a standalone HTML report.
 *
 * All record and metadata values are HTML-escaped because they come from
 * scraped pages. URLs are rendered as links only when they use http(s);
 * anything else is shown as plain text.
 *
 * @param {object[]} data - Video records (title, videoUrl, sourceUrl, ...).
 * @param {object} metadata - Usage summary (totalEvents, videosFound,
 *   exportedAt, format).
 * @returns {string} Complete HTML document.
 */
function convertToHTML(data, metadata) {
    // Escapes the characters that are significant in HTML text and attributes.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    // Renders a URL as a link, refusing schemes such as `javascript:`.
    const renderLink = (url) => {
        const safeText = escapeHtml(url ?? '');
        if (!/^https?:\/\//i.test(String(url))) {
            return safeText;
        }
        return `<a href="${safeText}" target="_blank" rel="noopener noreferrer">${safeText}</a>`;
    };

    const rows = data.map((item) => `
            <tr>
                <td>${escapeHtml(item.eventNumber ?? 'N/A')}</td>
                <td>${escapeHtml(item.title || 'Unknown')}</td>
                <td>${renderLink(item.videoUrl)}</td>
                <td>${renderLink(item.sourceUrl)}</td>
                <td>${escapeHtml(item.format ?? '')}</td>
                <td>${escapeHtml(item.fileSize ?? 'Unknown')}</td>
                <td>${escapeHtml(item.foundAt ?? '')}</td>
                <td>${escapeHtml(item.depth ?? '')}</td>
                <td>${escapeHtml(item.note ?? '')}</td>
            </tr>
        `).join('');

    const htmlContent = `
<!DOCTYPE html>
<html>
<head>
    <title>Video Download Links</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .metadata { background: #f5f5f5; padding: 15px; border-radius: 5px; margin-bottom: 20px; }
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #f2f2f2; }
        a { color: #0066cc; text-decoration: none; }
        a:hover { text-decoration: underline; }
    </style>
</head>
<body>
    <h1>Video Download Links</h1>

    <div class="metadata">
        <h3>Usage Summary</h3>
        <p><strong>Total Events:</strong> ${escapeHtml(metadata.totalEvents)}</p>
        <p><strong>Videos Found:</strong> ${escapeHtml(metadata.videosFound)}</p>
        <p><strong>Export Date:</strong> ${escapeHtml(metadata.exportedAt)}</p>
        <p><strong>Format:</strong> ${escapeHtml(metadata.format)}</p>
    </div>

    <p>Total videos found: ${data.length}</p>
    <table>
        <thead>
            <tr>
                <th>Event #</th>
                <th>Title</th>
                <th>Video URL</th>
                <th>Source URL</th>
                <th>Format</th>
                <th>File Size</th>
                <th>Found At</th>
                <th>Depth</th>
                <th>Notes</th>
            </tr>
        </thead>
        <tbody>
            ${rows}
        </tbody>
    </table>
</body>
</html>
    `;

    return htmlContent;
}
481
482
/**
 * Renders the results as an XML document.
 *
 * Free-text fields (title, URLs, note) are wrapped in CDATA sections; all
 * other values are entity-escaped so the document stays well-formed
 * whatever the scraped data contains.
 *
 * @param {object[]} data - Video records (title, videoUrl, sourceUrl, ...).
 * @param {object} metadata - Usage summary (totalEvents, videosFound,
 *   exportedAt, format).
 * @returns {string} XML document text.
 */
function convertToXML(data, metadata) {
    // Escapes the characters that are significant in XML text and attributes.
    const escapeXml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&apos;');

    // A literal "]]>" would terminate the section early, so it is split
    // across two adjacent CDATA sections.
    const cdata = (value) => `<![CDATA[${String(value).replace(/\]\]>/g, ']]]]><![CDATA[>')}]]>`;

    const videos = data.map((item) => `
        <video>
            <eventNumber>${escapeXml(item.eventNumber ?? 'N/A')}</eventNumber>
            <title>${cdata(item.title || 'Unknown')}</title>
            <videoUrl>${cdata(item.videoUrl ?? '')}</videoUrl>
            <sourceUrl>${cdata(item.sourceUrl ?? '')}</sourceUrl>
            <format>${escapeXml(item.format ?? '')}</format>
            <fileSize>${escapeXml(item.fileSize ?? 'Unknown')}</fileSize>
            <foundAt>${escapeXml(item.foundAt ?? '')}</foundAt>
            <depth>${escapeXml(item.depth ?? '')}</depth>
            <note>${cdata(item.note ?? '')}</note>
        </video>
    `).join('');

    const xmlContent = `<?xml version="1.0" encoding="UTF-8"?>
<videoResults>
    <metadata>
        <totalEvents>${escapeXml(metadata.totalEvents)}</totalEvents>
        <videosFound>${escapeXml(metadata.videosFound)}</videosFound>
        <exportedAt>${escapeXml(metadata.exportedAt)}</exportedAt>
        <format>${escapeXml(metadata.format)}</format>
    </metadata>
    <videos count="${data.length}">
    ${videos}
    </videos>
</videoResults>`;

    return xmlContent;
}