import { readFile } from 'node:fs/promises';
import type { IncomingMessage } from 'node:http';
import { dirname } from 'node:path';
import { fileURLToPath, URL } from 'node:url';

import type {
    AutoscaledPool,
    Awaitable,
    CheerioCrawlerOptions,
    CheerioCrawlingContext,
    Dictionary,
    ProxyConfiguration,
    Request,
} from '@crawlee/cheerio';
import {
    CheerioCrawler,
    Dataset,
    KeyValueStore,
    log,
    RequestList,
    RequestQueueV2,
} from '@crawlee/cheerio';
import type { ApifyEnv } from 'apify';
import { Actor } from 'apify';
import { load } from 'cheerio';

import type {
    CrawlerSetupOptions,
    RequestMetadata,
} from '@apify/scraper-tools';
import {
    constants as scraperToolsConstants,
    createContext,
    tools,
} from '@apify/scraper-tools';

import type { Input } from './consts.js';
import { ProxyRotation } from './consts.js';

const { SESSION_MAX_USAGE_COUNTS, META_KEY } = scraperToolsConstants;
const SCHEMA = JSON.parse(
    await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8'),
);

const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
const SESSION_STORE_NAME = 'APIFY-CHEERIO-SCRAPER-SESSION-STORE';
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
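
/**
 * Holds the state needed to build the crawler and the per-request
 * `pageFunction` context: parsed input, evaluated hook functions
 * and the storages used by the run.
 */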
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'Cheerio Scraper';
    rawInput: string;
    env: ApifyEnv;
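
    // In-memory store shared by all pageFunction invocations of this run.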
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    customData: unknown;
    input: Input;
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: CheerioCrawler;
    dataset!: Dataset;
    pagesOutputted!: number;
    proxyConfiguration?: ProxyConfiguration;
    private initPromise: Promise<void>;

    constructor(input: Input) {
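        // Switch to debug logging as early as possible.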
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);
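
        // Keep a copy of the raw input for the pageFunction context.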
        this.rawInput = JSON.stringify(input);
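
        // Load the pageFunction from disk if the input points to a file.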
        tools.maybeLoadPageFunctionFromDisk(
            input,
            dirname(fileURLToPath(import.meta.url)),
        );
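
        // Validate the input against the schema when not running on the Apify platform.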
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();
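
        // Validate user-provided structures from the input.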
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) {
                throw new Error(
                    'The pseudoUrls Array must only contain Objects.',
                );
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error(
                    'The userData property of a pseudoUrl must be an Object.',
                );
            }
        });

        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) {
                throw new Error(
                    'The initialCookies Array must only contain Objects.',
                );
            }
        });
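
        // Session usage limit derived from the chosen proxy rotation strategy.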
        this.maxSessionUsageCount =
            SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];
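
        // Turn the pageFunction and hook sources into callable functions.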
        this.evaledPageFunction = tools.evalFunctionOrThrow(
            this.input.pageFunction,
        );

        if (this.input.preNavigationHooks) {
            this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.preNavigationHooks,
                'preNavigationHooks',
            );
        } else {
            this.evaledPreNavigationHooks = [];
        }

        if (this.input.postNavigationHooks) {
            this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.postNavigationHooks,
                'postNavigationHooks',
            );
        } else {
            this.evaledPostNavigationHooks = [];
        }
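
        // Named storages (undefined falls back to the run's default storages).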
        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;
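
        // Initialized asynchronously; awaited in createCrawler() via initPromise.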
        this.crawler = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.proxyConfiguration = null!;
        this.initPromise = this._initializeAsync();
    }

    private async _initializeAsync() {
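        // Start URLs: use extended unique keys and honor the
        // keepUrlFragments input.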
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });
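
        // KeyValueStore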
        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
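
        // RequestQueue, seeded from the start URLs on the first run only.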
        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

        if (
            !(await this.keyValueStore.recordExists(
                REQUEST_QUEUE_INIT_FLAG_KEY,
            ))
        ) {
            const requests: Request[] = [];
            for await (const request of await RequestList.open(
                null,
                startUrls,
            )) {
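                // Collecting more than ~1.5x the result limit is pointless,
                // since the crawl aborts once the limit is reached.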
                if (
                    this.input.maxResultsPerCrawl > 0 &&
                    requests.length >= 1.5 * this.input.maxResultsPerCrawl
                ) {
                    break;
                }
                requests.push(request);
            }

            const { waitForAllRequestsToBeAdded } =
                await this.requestQueue.addRequestsBatched(requests);
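
            // Once all requests are in the queue, set the init flag so a
            // resurrected run does not enqueue the start URLs again.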
            void waitForAllRequestsToBeAdded.then(async () => {
                await this.keyValueStore.setValue(
                    REQUEST_QUEUE_INIT_FLAG_KEY,
                    '1',
                );
            });
        }
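
        // Dataset, with the count of pages already outputted (for resurrection).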
        this.dataset = await Dataset.open(this.datasetName);
        const info = await this.dataset.getInfo();
        this.pagesOutputted = info?.itemCount ?? 0;
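
        // Proxy configuration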
        this.proxyConfiguration = (await Actor.createProxyConfiguration(
            this.input.proxyConfiguration,
        )) as any as ProxyConfiguration;
    }
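
    /**
     * Resolves to a new crawler instance configured from the input.
     * Waits for the asynchronous initialization to finish first.
     */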
    async createCrawler() {
        await this.initPromise;

        const options: CheerioCrawlerOptions = {
            proxyConfiguration: this.proxyConfiguration,
            requestHandler: this._requestHandler.bind(this),
            preNavigationHooks: [],
            postNavigationHooks: [],
            requestQueue: this.requestQueue,
            navigationTimeoutSecs: this.input.pageLoadTimeoutSecs,
            requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
            ignoreSslErrors: this.input.ignoreSslErrors,
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
            maxRequestRetries: this.input.maxRequestRetries,
            maxRequestsPerCrawl:
                this.input.maxPagesPerCrawl === 0
                    ? undefined
                    : this.input.maxPagesPerCrawl,
            additionalMimeTypes: this.input.additionalMimeTypes,
            autoscaledPoolOptions: {
                maxConcurrency: this.input.maxConcurrency,
                systemStatusOptions: {
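                    // Cheerio parses HTML synchronously on the event loop,
                    // so tolerate a higher overload ratio than the default.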
                    maxEventLoopOverloadedRatio:
                        MAX_EVENT_LOOP_OVERLOADED_RATIO,
                },
            },
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                persistStateKeyValueStoreId: this.input.sessionPoolName
                    ? SESSION_STORE_NAME
                    : undefined,
                persistStateKey: this.input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

        if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        if (this.input.suggestResponseEncoding) {
            if (this.input.forceResponseEncoding) {
                options.forceResponseEncoding =
                    this.input.suggestResponseEncoding;
            } else {
                options.suggestResponseEncoding =
                    this.input.suggestResponseEncoding;
            }
        }

        this.crawler = new CheerioCrawler(options);

        return this.crawler;
    }

    private _createNavigationHooks(options: CheerioCrawlerOptions) {
        options.preNavigationHooks!.push(async ({ request, session }) => {
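            // Normalize all request header names to lower case.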
            request.headers = Object.entries(request.headers ?? {}).reduce(
                (newHeaders, [key, value]) => {
                    newHeaders[key.toLowerCase()] = value;
                    return newHeaders;
                },
                {} as Dictionary<string>,
            );
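
            // Add the initial cookies from the input, if any.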
            if (this.input.initialCookies && this.input.initialCookies.length) {
                const cookiesToSet = session
                    ? tools.getMissingCookiesFromSession(
                          session,
                          this.input.initialCookies,
                          request.url,
                      )
                    : this.input.initialCookies;
                if (cookiesToSet?.length) {
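                    // Set only the cookies the session does not have yet.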
                    session?.setCookies(cookiesToSet, request.url);
                }
            }
        });

        options.preNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
        );
        options.postNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
        );
    }

    private _runHookWithEnhancedContext(
        hooks: ((...args: unknown[]) => Awaitable<void>)[],
    ) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }

    private async _failedRequestHandler({ request }: CheerioCrawlingContext) {
        const lastError =
            request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }
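
    /**
     * First initializes the state and page context exposed to the user's
     * `pageFunction`, then invokes the `pageFunction` and, depending on the
     * resulting state, enqueues more links and saves the return value to
     * the default dataset.
     */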
    private async _requestHandler(crawlingContext: CheerioCrawlingContext) {
        const { request, response, $, crawler } = crawlingContext;
        const pageFunctionArguments: Dictionary = {};
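
        // Copy the crawling context onto the pageFunction arguments, making
        // the lazy `json` and `body` descriptors re-definable later.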
        const props = Object.getOwnPropertyDescriptors(crawlingContext);
        ['json', 'body'].forEach((key) => {
            props[key].configurable = true;
        });
        Object.defineProperties(pageFunctionArguments, props);
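
        // Expose a standalone (empty) Cheerio instance and a simplified
        // response object to the pageFunction.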
        pageFunctionArguments.cheerio = load([]);
        pageFunctionArguments.response = {
            status: response!.statusCode,
            headers: response!.headers,
        };
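
        /**
         * PRE-PROCESSING:
         * Convenience properties and validation.
         */

        // Make sure an object with internal metadata is present on every request.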
        tools.ensureMetaData(request);
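
        // Abort the crawl if the maximum number of results was reached.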
        const aborted = await this._handleMaxResultsPerCrawl(
            crawler.autoscaledPool,
        );
        if (aborted) return;
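
        // Set up and create the pageFunction context.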
        const contextOptions = {
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        };
        const { context, state } = createContext(contextOptions);
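
        /**
         * USER FUNCTION EXECUTION
         */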
        const pageFunctionResult = await this.evaledPageFunction(context);
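
        /**
         * POST-PROCESSING
         */

        // Enqueue more links unless the pageFunction called skipLinks()
        // or the response had no HTML to parse.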
        if (!state.skipLinks && !!$) await this._handleLinks(crawlingContext);
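
        // Save the pageFunction's result to the default dataset.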
        await this._handleResult(
            request,
            response,
            pageFunctionResult as Dictionary,
        );
    }

    private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
        if (
            !this.input.maxResultsPerCrawl ||
            this.pagesOutputted < this.input.maxResultsPerCrawl
        )
            return false;
        if (!autoscaledPool) return false;
        log.info(
            `User-set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
        );
        await autoscaledPool.abort();
        return true;
    }

    private async _handleLinks({
        request,
        enqueueLinks,
    }: CheerioCrawlingContext) {
        if (!(this.input.linkSelector && this.requestQueue)) return;
        const currentDepth = (request.userData![META_KEY] as RequestMetadata)
            .depth;
        const hasReachedMaxDepth =
            this.input.maxCrawlingDepth &&
            currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(
                `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
            );
            return;
        }

        await enqueueLinks({
            selector: this.input.linkSelector,
            pseudoUrls: this.input.pseudoUrls,
            globs: this.input.globs,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };
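
                // Propagate the unique-key settings used for the start URLs.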
                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        });
    }

    private async _handleResult(
        request: Request,
        response?: IncomingMessage,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ) {
        const payload = tools.createDatasetPayload(
            request,
            response,
            pageFunctionResult,
            isError,
        );
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}
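
// Usage sketch (illustrative, not part of this module): an actor's main
// entrypoint would typically wire this class up roughly as follows, with
// the `Input` shape defined by the actor's INPUT_SCHEMA.json.
//
//   await Actor.init();
//   const input = (await Actor.getInput()) as Input;
//   const setup = new CrawlerSetup(input);
//   const crawler = await setup.createCrawler();
//   await crawler.run();
//   await Actor.exit();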