const Apify = require('apify');

const { log } = Apify.utils;

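// This actor reads start URLs from its input, crawls each page with
// CheerioCrawler, and stores the page title plus all <head> meta tags
// in the default dataset.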
Apify.main(async () => {
    // getInput() resolves to null when the actor has no input at all.
    const input = (await Apify.getInput()) || {};
    const { urls = [], proxy = { useApifyProxy: false } } = input;

    // A single "url" field is accepted in addition to the "urls" array.
    if (input.url) urls.push(input.url);

    const requests = [];
    for (const url of urls) {
        // The URL constructor throws on an invalid URL instead of returning
        // a falsy value, so the check must go through try/catch.
        try {
            new URL(url);
        } catch (err) {
            throw new Error(`All URLs must be valid URLs! Received: ${url}`);
        }
        requests.push({ url });
    }

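    // The name passed to openRequestList() persists the list's state, so a
    // resurrected or migrated run resumes instead of starting over.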
    const requestList = await Apify.openRequestList('start-urls', requests);
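    // createProxyConfiguration() resolves to undefined when useApifyProxy is
    // false, in which case the crawler makes direct connections.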
    const proxyConfiguration = await Apify.createProxyConfiguration({ ...proxy });

    const crawler = new Apify.CheerioCrawler({
        requestList,
        proxyConfiguration,
        maxConcurrency: 50,
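        // The autoscaled pool raises concurrency toward this cap only while
        // CPU and memory allow it.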
        handlePageFunction: async ({ $, request }) => {
            const meta = {};

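            // Collect every <meta> tag in <head>, keyed by its name,
            // property, or http-equiv attribute (Open Graph tags, for
            // example, use "property").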
            for (const tag of $('head meta')) {
                const name = $(tag).attr('name') || $(tag).attr('property') || $(tag).attr('http-equiv');
                const content = $(tag).attr('content');
                if (name) meta[name] = content ? content.trim() : null;
            }

            const result = {
                url: request.url,
                title: ($('head title').text() || '').trim(),
                meta,
            };

            await Apify.pushData(result);
        },
    });

    log.info('Starting the crawl...');
    await crawler.run();
    log.info('Scraping finished! Metadata for each site is available in "Results".');
});