import path from 'node:path';
import { Readable } from 'node:stream';

import { Actor, log } from 'apify';
import * as cheerio from 'cheerio';
import got from 'got';
// hpagent provides the proxy agents that got needs to route requests through a proxy URL.
import { HttpProxyAgent, HttpsProxyAgent } from 'hpagent';
import guard from 'robots-txt-guard';
import parseRobots from 'robots-txt-parse';
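
/**
 * Image URL Scraper
 *
 * Starts from `startUrl`, follows links up to `maxCrawlDepth` (optionally limited to the
 * hostnames listed in `scope`), collects image URLs from <img src> attributes, inline
 * background-image styles and direct links to image files, and pushes the results to the
 * default dataset. robots.txt is honoured unless `respectRobotsTxt` is disabled.
 */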
await Actor.init();

const input = (await Actor.getInput()) ?? {};
const proxyConfiguration = await Actor.createProxyConfiguration(input.proxyConfiguration);
// newUrl() is async; createProxyConfiguration() returns undefined when no proxy is configured.
const proxyUrl = proxyConfiguration ? await proxyConfiguration.newUrl() : undefined;
// got has no built-in proxy support, so build http(s) agents from the proxy URL
// (this assumes the hpagent package is installed).
const proxyAgent = proxyUrl
    ? { http: new HttpProxyAgent({ proxy: proxyUrl }), https: new HttpsProxyAgent({ proxy: proxyUrl }) }
    : undefined;

if (proxyUrl) log.info(`Using proxy: ${proxyUrl}`);

const {
    startUrl,
    maxCrawlDepth = 1,
    maxConcurrency = 10,
    imageExtensions = ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'svg'],
    respectRobotsTxt = true,
    userAgent = 'Mozilla/5.0 (compatible; ApifyBot/1.0; +https://apify.com/bot)',
    useScope = false,
    scope = [],
    includeSubdomains = false,
} = input;

if (!startUrl) throw new Error('startUrl is required!');

log.info('Starting Image URL Scraper', { startUrl, maxCrawlDepth, maxConcurrency });
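
// Illustrative input (values are placeholders, not defaults):
// {
//     "startUrl": "https://example.com",
//     "maxCrawlDepth": 2,
//     "maxConcurrency": 5,
//     "respectRobotsTxt": true,
//     "useScope": true,
//     "scope": ["example.com"],
//     "includeSubdomains": true,
//     "proxyConfiguration": { "useApifyProxy": true }
// }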

const robotsGuards = new Map();

// Fetches and parses robots.txt once per hostname; a cached value of null means "allow everything".
async function getGuardForUrl(url) {
    const { hostname } = new URL(url);
    if (!robotsGuards.has(hostname)) {
        if (!respectRobotsTxt) {
            robotsGuards.set(hostname, null);
        } else {
            try {
                const robotsUrl = new URL('/robots.txt', url).href;
                const robotsTxtText = await got(robotsUrl, {
                    timeout: { request: 5000 },
                    headers: { 'User-Agent': userAgent },
                    agent: proxyAgent,
                }).text();

                // robots-txt-parse is a streaming parser and returns a promise.
                const parsedRobots = await parseRobots(Readable.from(robotsTxtText));
                const guardInstance = guard(parsedRobots);

                robotsGuards.set(hostname, guardInstance);
            } catch (err) {
                log.warning(`Could not load robots.txt for ${hostname}, allowing all by default. Error: ${err}`);
                robotsGuards.set(hostname, null);
            }
        }
    }
    return robotsGuards.get(hostname);
}
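
// Example (illustrative): for a robots.txt containing "User-agent: *" / "Disallow: /private",
// a guard built from it behaves roughly like:
//   guardInstance.isAllowed('*', '/private/page.html') === false
//   guardInstance.isAllowed('*', '/public/page.html')  === true
// robots-txt-guard matches on agent name and path, which is why crawlPage below passes
// the configured userAgent and the URL's pathname rather than the full URL.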

const requestQueue = await Actor.openRequestQueue();
await requestQueue.addRequest({ url: startUrl, userData: { depth: 0 } });

const processedUrls = new Set();
const foundImages = [];
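
// Downloads one page, records every image URL found on it and, while below
// maxCrawlDepth, enqueues the page's outgoing links for further crawling.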
const crawlPage = async (request) => {
    const { url, userData: { depth } } = request;

    const guardian = await getGuardForUrl(url);
    // robots-txt-guard matches on agent name and path, not on a full URL.
    if (guardian && !guardian.isAllowed(userAgent, new URL(url).pathname)) {
        log.warning(`Blocked by robots.txt: ${url}`);
        return;
    }

    if (processedUrls.has(url)) {
        log.info(`Already processed ${url}, skipping.`);
        return;
    }
    processedUrls.add(url);

    log.info(`Processing ${url} (depth: ${depth})`);

    let body;
    try {
        const response = await got(url, {
            timeout: { request: 10000 },
            headers: { 'User-Agent': userAgent },
            agent: proxyAgent,
        });
        const contentType = response.headers['content-type'] || '';
        if (!contentType.includes('text/html')) {
            log.info(`Skipping non-HTML content at ${url}`);
            return;
        }
        body = response.body;
    } catch (error) {
        log.error(`Failed to download ${url}: ${error.message}`);
        return;
    }

    const $ = cheerio.load(body);
    const imagesOnPage = [];

    // <img src="..."> elements.
    $('img[src]').each((_, el) => {
        const src = $(el).attr('src');
        if (src) imagesOnPage.push(src);
    });

    // Inline background-image styles (a style attribute may contain several).
    $('[style]').each((_, el) => {
        const style = $(el).attr('style') || '';
        for (const match of style.matchAll(/background-image:\s*url\(["']?([^"')]+)["']?\)/gi)) {
            imagesOnPage.push(match[1]);
        }
    });

    // Resolve to absolute URLs and keep only those whose pathname has a known image
    // extension (using the pathname so query strings do not break the check).
    const filteredImages = imagesOnPage
        .map((src) => {
            try {
                return new URL(src, url).href;
            } catch {
                return null;
            }
        })
        .filter((src) => src && imageExtensions.includes(path.extname(new URL(src).pathname).substring(1).toLowerCase()));

    const uniqueImages = [...new Set(filteredImages)];

    log.info(`Found ${uniqueImages.length} images on ${url}`);

    for (const imgUrl of uniqueImages) {
        foundImages.push({
            url: imgUrl,
            sourcePage: url,
            detectedAt: new Date().toISOString(),
        });
    }

    if (depth < maxCrawlDepth) {
        const links = [];
        $('a[href]').each((_, el) => {
            const href = $(el).attr('href');
            if (!href) return;
            try {
                const absoluteUrl = new URL(href, url).href;
                links.push(absoluteUrl);
                // Links that point directly at an image file are recorded as results too.
                const ext = path.extname(new URL(absoluteUrl).pathname).substring(1).toLowerCase();
                if (imageExtensions.includes(ext)) {
                    foundImages.push({
                        url: absoluteUrl,
                        sourcePage: url,
                        detectedAt: new Date().toISOString(),
                    });
                }
            } catch {
                // Ignore hrefs that cannot be resolved to valid URLs.
            }
        });

        const uniqueLinks = [...new Set(links)];

        // When scoping is enabled, only hostnames listed in `scope` (optionally including
        // their subdomains) are followed.
        function isUrlAllowedByScope(targetUrl) {
            if (!useScope) return true;
            try {
                const { hostname } = new URL(targetUrl);
                return scope.some((domain) => {
                    if (includeSubdomains) {
                        return hostname === domain || hostname.endsWith(`.${domain}`);
                    }
                    return hostname === domain;
                });
            } catch {
                return false;
            }
        }
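
        // Example (illustrative): with scope = ['example.com'] and includeSubdomains = true,
        // 'https://blog.example.com/post' would be followed while 'https://example.org/' would not.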

        let enqueuedCount = 0;
        for (const link of uniqueLinks) {
            if (!processedUrls.has(link) && isUrlAllowedByScope(link)) {
                await requestQueue.addRequest({
                    url: link,
                    userData: { depth: depth + 1 },
                });
                enqueuedCount++;
            }
        }

        log.info(`Enqueued ${enqueuedCount} of ${uniqueLinks.length} links from ${url}`);
    }
};
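
// A simple worker pool: spin up `concurrency` loops that pull requests from the shared
// queue until it reports itself finished; failed requests are retried a few times.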
const concurrency = Math.min(maxConcurrency, 20);
const promises = [];

for (let i = 0; i < concurrency; i++) {
    promises.push((async () => {
        // Keep polling until the queue is finished: fetchNextRequest() can return null
        // while other workers are still adding new requests.
        while (!(await requestQueue.isFinished())) {
            const request = await requestQueue.fetchNextRequest();
            if (!request) {
                await new Promise((resolve) => setTimeout(resolve, 500));
                continue;
            }

            try {
                await crawlPage(request);
                await requestQueue.markRequestHandled(request);
            } catch (err) {
                log.error(`Error crawling ${request.url}: ${err.message}`);
                // Reclaim the request so it is retried, but give up after a few attempts.
                request.retryCount += 1;
                if (request.retryCount < 3) {
                    await requestQueue.reclaimRequest(request);
                } else {
                    await requestQueue.markRequestHandled(request);
                }
            }
        }
    })());
}

await Promise.all(promises);

// pushData() accepts an array, and the collected records already have the final shape
// ({ url, sourcePage, detectedAt }), so push them in one call.
if (foundImages.length > 0) {
    await Actor.pushData(foundImages);
}

log.info(`Crawl finished. Collected ${foundImages.length} image URLs.`);
await Actor.exit();