1import { readFile } from 'node:fs/promises';
2import type { IncomingMessage } from 'node:http';
3import { dirname } from 'node:path';
4import { fileURLToPath, URL } from 'node:url';
5
6import type {
7 AutoscaledPool,
8 Awaitable,
9 Dictionary,
10 JSDOMCrawlerOptions,
11 JSDOMCrawlingContext,
12 ProxyConfiguration,
13 Request,
14} from '@crawlee/jsdom';
15import {
16 Dataset,
17 JSDOMCrawler,
18 KeyValueStore,
19 log,
20 RequestList,
21 RequestQueueV2,
22} from '@crawlee/jsdom';
23import type { ApifyEnv } from 'apify';
24import { Actor } from 'apify';
25
26import type {
27 CrawlerSetupOptions,
28 RequestMetadata,
29} from '@apify/scraper-tools';
30import {
31 constants as scraperToolsConstants,
32 createContext,
33 tools,
34} from '@apify/scraper-tools';
35
36import type { Input } from './consts.js';
37import { ProxyRotation } from './consts.js';
38
// Session usage limits (indexed by proxy rotation mode) and the userData key
// under which request metadata is stored; both come from @apify/scraper-tools.
const { SESSION_MAX_USAGE_COUNTS, META_KEY } = scraperToolsConstants;

// Input schema, read once while the module is evaluated (top-level await).
// The path is resolved relative to this module's URL.
const SCHEMA = JSON.parse(
    await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), {
        encoding: 'utf8',
    }),
);

// Passed to the autoscaled pool as `systemStatusOptions.maxEventLoopOverloadedRatio`.
const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
// Used as the session pool's `persistStateKeyValueStoreId` when `sessionPoolName` is set.
const SESSION_STORE_NAME = 'APIFY-JSDOM-SCRAPER-SESSION-STORE';
// Key-value store record whose existence means the start URLs were already enqueued.
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
47
48
49
50
51
/**
 * Collects everything needed to build a `JSDOMCrawler` from the Actor input
 * and to invoke the user-supplied page function for every crawled page.
 *
 * The constructor validates and evaluates the input synchronously and kicks
 * off asynchronous initialization (storages, request queue seeding, proxy
 * configuration). `createCrawler()` awaits that initialization before it
 * builds the crawler.
 */
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'JSDOM Scraper';
    /** JSON snapshot of the input taken before any further processing. */
    rawInput: string;
    env: ApifyEnv;

    /** In-memory store handed to the page function context as `globalStore`. */
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    /**
     * Declared for the `CrawlerSetupOptions` contract. This class never assigns
     * it; custom data is always read from `input.customData`.
     */
    customData: unknown;
    input: Input;
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: JSDOMCrawler;
    dataset!: Dataset;
    /** Number of items pushed so far; starts from the dataset's existing item count. */
    pagesOutputted!: number;
    proxyConfiguration?: ProxyConfiguration;
    /** Settles once `_initializeAsync()` has finished. */
    private initPromise: Promise<void>;

    /**
     * @param input Actor input. It is validated against the input schema only
     *   when `Actor.isAtHome()` is false.
     * @throws Error when `pseudoUrls` / `initialCookies` contain non-objects,
     *   or when a helper from `@apify/scraper-tools` rejects the input
     *   (`checkInputOrThrow`, `evalFunctionOrThrow`, `evalFunctionArrayOrThrow`).
     */
    constructor(input: Input) {
        // Set the log level first so that nothing logged below is missed.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Snapshot the input as a string so later mutations cannot affect it.
        this.rawInput = JSON.stringify(input);

        // Give scraper-tools a chance to load the page function from this module's directory.
        tools.maybeLoadPageFunctionFromDisk(
            input,
            dirname(fileURLToPath(import.meta.url)),
        );

        // Validate against the schema only when `Actor.isAtHome()` is false.
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();

        // Validations that the schema check above does not cover. Missing
        // arrays are treated as empty, matching the guard used for
        // `initialCookies` in `_createNavigationHooks`.
        for (const purl of this.input.pseudoUrls ?? []) {
            if (!tools.isPlainObject(purl)) {
                throw new Error(
                    'The pseudoUrls Array must only contain Objects.',
                );
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error(
                    'The userData property of a pseudoUrl must be an Object.',
                );
            }
        }

        for (const cookie of this.input.initialCookies ?? []) {
            if (!tools.isPlainObject(cookie)) {
                throw new Error(
                    'The initialCookies Array must only contain Objects.',
                );
            }
        }

        // The per-session usage limit depends on the selected proxy rotation mode.
        this.maxSessionUsageCount =
            SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];

        // Turn the user-provided source strings into callable functions.
        this.evaledPageFunction = tools.evalFunctionOrThrow(
            this.input.pageFunction,
        );

        this.evaledPreNavigationHooks = this.input.preNavigationHooks
            ? tools.evalFunctionArrayOrThrow(
                  this.input.preNavigationHooks,
                  'preNavigationHooks',
              )
            : [];

        this.evaledPostNavigationHooks = this.input.postNavigationHooks
            ? tools.evalFunctionArrayOrThrow(
                  this.input.postNavigationHooks,
                  'postNavigationHooks',
              )
            : [];

        // Names of the storages to open during async initialization.
        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;

        // Placeholders; the real instances are assigned in `_initializeAsync()`
        // and `createCrawler()`.
        this.crawler = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.proxyConfiguration = null!;
        this.initPromise = this._initializeAsync();
    }

    /**
     * Opens the storages, seeds the request queue with the start URLs on the
     * first run, reads the current dataset size and creates the proxy
     * configuration.
     */
    private async _initializeAsync(): Promise<void> {
        // The start URL objects from the input are patched in place before
        // they are handed to the RequestList.
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });

        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

        // Seed the queue only if the flag record does not exist yet.
        const alreadySeeded = await this.keyValueStore.recordExists(
            REQUEST_QUEUE_INIT_FLAG_KEY,
        );
        if (!alreadySeeded) {
            const requests: Request[] = [];
            const requestList = await RequestList.open(null, startUrls);
            for await (const request of requestList) {
                // With a result limit set, stop reading start URLs once 1.5x
                // that limit has been collected.
                if (
                    this.input.maxResultsPerCrawl > 0 &&
                    requests.length >= 1.5 * this.input.maxResultsPerCrawl
                ) {
                    break;
                }
                requests.push(request);
            }

            const { waitForAllRequestsToBeAdded } =
                await this.requestQueue.addRequestsBatched(requests);

            // Deliberately not awaited so initialization does not block on the
            // remaining batches. Rejections are logged rather than left
            // unhandled.
            void waitForAllRequestsToBeAdded
                .then(async () => {
                    await this.keyValueStore.setValue(
                        REQUEST_QUEUE_INIT_FLAG_KEY,
                        '1',
                    );
                })
                .catch((error: unknown) => {
                    const reason =
                        error instanceof Error ? error.message : String(error);
                    log.error(
                        `Failed to finish request queue initialization: ${reason}`,
                    );
                });
        }

        // Continue counting results from what is already in the dataset.
        this.dataset = await Dataset.open(this.datasetName);
        const info = await this.dataset.getInfo();
        this.pagesOutputted = info?.itemCount ?? 0;

        // The Apify SDK's proxy configuration type differs from the one the
        // crawler options expect, hence the cast.
        this.proxyConfiguration = (await Actor.createProxyConfiguration(
            this.input.proxyConfiguration,
        )) as unknown as ProxyConfiguration;
    }

    /**
     * Builds the `JSDOMCrawler` from the input once async initialization has
     * completed.
     *
     * @returns The created crawler, which is also stored in `this.crawler`.
     */
    async createCrawler(): Promise<JSDOMCrawler> {
        await this.initPromise;

        const { input } = this;

        const options: JSDOMCrawlerOptions = {
            proxyConfiguration: this.proxyConfiguration,
            requestHandler: this._requestHandler.bind(this),
            preNavigationHooks: [],
            runScripts: input.runScripts ?? true,
            hideInternalConsole: !(input.showInternalConsole ?? false),
            postNavigationHooks: [],
            requestQueue: this.requestQueue,
            navigationTimeoutSecs: input.pageLoadTimeoutSecs,
            requestHandlerTimeoutSecs: input.pageFunctionTimeoutSecs,
            ignoreSslErrors: input.ignoreSslErrors,
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: input.respectRobotsTxtFile,
            maxRequestRetries: input.maxRequestRetries,
            // A limit of 0 in the input means "no limit".
            maxRequestsPerCrawl:
                input.maxPagesPerCrawl === 0
                    ? undefined
                    : input.maxPagesPerCrawl,
            additionalMimeTypes: input.additionalMimeTypes,
            autoscaledPoolOptions: {
                maxConcurrency: input.maxConcurrency,
                systemStatusOptions: {
                    maxEventLoopOverloadedRatio:
                        MAX_EVENT_LOOP_OVERLOADED_RATIO,
                },
            },
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                // Session state is persisted only when a session pool name is given.
                persistStateKeyValueStoreId: input.sessionPoolName
                    ? SESSION_STORE_NAME
                    : undefined,
                persistStateKey: input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

        // "Until failure" rotation uses a pool with a single session.
        if (input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        // `suggestResponseEncoding` carries the encoding name;
        // `forceResponseEncoding` only selects which option receives it.
        if (input.suggestResponseEncoding) {
            if (input.forceResponseEncoding) {
                options.forceResponseEncoding = input.suggestResponseEncoding;
            } else {
                options.suggestResponseEncoding =
                    input.suggestResponseEncoding;
            }
        }

        this.crawler = new JSDOMCrawler(options);

        return this.crawler;
    }

    /**
     * Fills `options.preNavigationHooks` / `options.postNavigationHooks` with
     * a built-in hook (header normalization + initial cookies) followed by
     * the user-provided hooks.
     */
    private _createNavigationHooks(options: JSDOMCrawlerOptions): void {
        options.preNavigationHooks!.push(async ({ request, session }) => {
            // Lower-case all request header names.
            request.headers = Object.fromEntries(
                Object.entries(request.headers ?? {}).map(
                    ([key, value]) => [key.toLowerCase(), value] as const,
                ),
            );

            // Add the initial cookies that the session does not have yet.
            const { initialCookies } = this.input;
            if (initialCookies && initialCookies.length) {
                const cookiesToSet = session
                    ? tools.getMissingCookiesFromSession(
                          session,
                          initialCookies,
                          request.url,
                      )
                    : initialCookies;
                if (cookiesToSet?.length) {
                    // No-op when the request has no session.
                    session?.setCookies(cookiesToSet, request.url);
                }
            }
        });

        options.preNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
        );
        options.postNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
        );
    }

    /**
     * Wraps each hook so that it receives a copy of the crawling context
     * extended with `Apify`, `Actor` and `customData`.
     */
    private _runHookWithEnhancedContext(
        hooks: ((...args: unknown[]) => Awaitable<void>)[],
    ) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }

    /**
     * Logs the first line of the last error message of a request passed to
     * the crawler's `failedRequestHandler` and stores an error record for it
     * in the dataset.
     */
    private async _failedRequestHandler({
        request,
    }: JSDOMCrawlingContext): Promise<void> {
        const lastError =
            request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }

    /**
     * Request handler of the crawler.
     *
     * Prepares the arguments exposed to the page function, stops early when
     * the result limit was reached, invokes the user page function, enqueues
     * links unless the page function asked to skip them, and finally stores
     * the page function result in the dataset.
     */
    private async _requestHandler(
        crawlingContext: JSDOMCrawlingContext,
    ): Promise<void> {
        const { request, response, window, crawler } = crawlingContext;
        const pageFunctionArguments: Dictionary = {};

        // Copy via property descriptors so getters on the context are not
        // triggered during the copy.
        const props = Object.getOwnPropertyDescriptors(crawlingContext);
        for (const key of ['json', 'body']) {
            // Guarded so a context without these own properties does not crash.
            const descriptor = props[key];
            if (descriptor) descriptor.configurable = true;
        }
        Object.defineProperties(pageFunctionArguments, props);
        // NOTE(review): the same descriptors are also defined on this
        // CrawlerSetup instance. The page function is invoked as a method of
        // this instance, so user code may reach them through `this`. Intent is
        // unconfirmed; kept for backward compatibility.
        Object.defineProperties(
            this,
            Object.getOwnPropertyDescriptors(pageFunctionArguments),
        );

        // Make sure the request carries scraper metadata before it is used below.
        tools.ensureMetaData(request);

        // Abort the crawl when the maximum number of results was reached.
        const aborted = await this._handleMaxResultsPerCrawl(
            crawler.autoscaledPool,
        );
        if (aborted) return;

        // Set up the context and its state for the page function.
        const { context, state } = createContext({
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        });

        // Invoke the user page function.
        const pageFunctionResult = await this.evaledPageFunction(context);

        // Enqueue more links unless the page function set `skipLinks`, and
        // only when a window is available.
        if (!state.skipLinks && window) {
            await this._handleLinks(crawlingContext);
        }

        // Save the page function's result to the dataset.
        await this._handleResult(
            request,
            response,
            pageFunctionResult as Dictionary,
        );
    }

    /**
     * Aborts the autoscaled pool once the user-set result limit was reached.
     *
     * @returns `true` when the pool was aborted, `false` otherwise (no limit
     *   set, limit not reached yet, or no pool available).
     */
    private async _handleMaxResultsPerCrawl(
        autoscaledPool?: AutoscaledPool,
    ): Promise<boolean> {
        const { maxResultsPerCrawl } = this.input;
        if (!maxResultsPerCrawl) return false;
        if (this.pagesOutputted < maxResultsPerCrawl) return false;
        if (!autoscaledPool) return false;

        log.info(
            `User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
        );
        await autoscaledPool.abort();
        return true;
    }

    /**
     * Enqueues links found on the page, unless no link selector is
     * configured or the request already sits at the maximum crawling depth.
     * Every enqueued request records its parent and depth under `META_KEY`.
     */
    private async _handleLinks({
        request,
        enqueueLinks,
    }: JSDOMCrawlingContext): Promise<void> {
        if (!(this.input.linkSelector && this.requestQueue)) return;

        const currentDepth = (request.userData![META_KEY] as RequestMetadata)
            .depth;
        const { maxCrawlingDepth } = this.input;
        // A falsy `maxCrawlingDepth` disables the depth check.
        if (maxCrawlingDepth && currentDepth >= maxCrawlingDepth) {
            log.debug(
                `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
            );
            return;
        }

        await enqueueLinks({
            selector: this.input.linkSelector,
            pseudoUrls: this.input.pseudoUrls,
            globs: this.input.globs,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };

                // Same unique-key settings as applied to the start URLs.
                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        });
    }

    /**
     * Builds the dataset payload for a request, pushes it to the dataset and
     * increments the output counter.
     *
     * @param request Request the result belongs to.
     * @param response HTTP response, when one is available.
     * @param pageFunctionResult Value returned by the page function.
     * @param isError Passed through to `createDatasetPayload`; set to `true`
     *   by the failed-request handler.
     */
    private async _handleResult(
        request: Request,
        response?: IncomingMessage,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ): Promise<void> {
        const payload = tools.createDatasetPayload(
            request,
            response,
            pageFunctionResult,
            isError,
        );
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}