Download HTML from URLs avatar

Download HTML from URLs

Try for free

No credit card required

Go to Store
Download HTML from URLs

Download HTML from URLs

mtrunkat/url-list-download-html
Try for free

No credit card required

This actor takes a list of URLs and downloads the HTML of each page.

Dockerfile

1# Dockerfile contains instructions on how to build a Docker image that
2# will contain all the code and configuration needed to run your actor.
3# For a full Dockerfile reference,
4# see https://docs.docker.com/engine/reference/builder/
5
6# First, specify the base Docker image. Apify provides the following
7# base images for your convenience:
8#  apify/actor-node-basic (Node.js 10 on Alpine Linux, small and fast)
9#  apify/actor-node-chrome (Node.js 10 + Chrome on Debian)
10#  apify/actor-node-chrome-xvfb (Node.js 10 + Chrome + Xvfb on Debian)
11# For more information, see https://apify.com/docs/actor#base-images
12# Note that you can use any other image from Docker Hub.
13FROM apify/actor-node-chrome
14
15# Second, copy just package.json since it should be the only file
16# that affects NPM install in the next step
17COPY package.json ./
18
19# Install NPM packages, skip optional and development dependencies to
20# keep the image small. Avoid logging too much and print the dependency
21# tree for debugging
22RUN npm --quiet set progress=false \
23 && npm install --only=prod --no-optional \
24 && echo "Installed NPM packages:" \
25 && npm list \
26 && echo "Node.js version:" \
27 && node --version \
28 && echo "NPM version:" \
29 && npm --version
30
31# Next, copy the remaining files and directories with the source code.
32# Since we do this after NPM install, quick build will be really fast
33# for most source file changes.
34COPY . ./
35
36# Optionally, specify how to launch the source code of your actor.
37# By default, Apify's base Docker images define the CMD instruction
38# that runs the source code using the command specified
39# in the "scripts.start" section of the package.json file.
40# In short, the instruction looks something like this:
41# CMD npm start

INPUT_SCHEMA.json

1{
2    "title": "Input",
3    "type": "object",
4    "description": "Use the following form to configure this scraper. The URL list is required and all other fields are optional.",
5    "schemaVersion": 1,
6    "properties": {
7        "requestListSources": {
8            "title": "Start URLs",
9            "type": "array",
10            "description": "URLs to start with",
11            "prefill": [
12                { "url": "https://apify.com" }
13            ],
14            "editor": "requestListSources",
15            "minItems": 1
16        },
17        "proxyConfiguration": {
18            "title": "Proxy configuration",
19            "type": "object",
20            "description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.",
21            "prefill": { "useApifyProxy": false },
22            "default": {},
23            "editor": "proxy"
24        },
25        "handlePageTimeoutSecs": {
26            "title": "Page timeout",
27            "type": "integer",
28            "description": "Maximum time the scraper will spend processing one page.",
29            "minimum": 1,
30            "default": 60,
31            "maximum": 360,
32            "unit": "secs"
33        },
34        "useChrome": {
35            "title": "Use Chrome",
36            "type": "boolean",
37            "description": "The scraper will use a real Chrome browser instead of a Chromium masking as Chrome. Using this option may help with bypassing certain anti-scraping protections, but risks that the scraper will be unstable or not work at all.",
38            "default": false,
39            "groupCaption": "Browser masking options",
40            "groupDescription": "Settings that help mask as a real user and prevent scraper detection."
41        },
42        "useStealth": {
43            "title": "Use Stealth",
44            "type": "boolean",
45            "description": "The scraper will apply various browser emulation techniques to match a real user as closely as possible. This feature works best in conjunction with the Use Chrome option and also carries the risk of making the scraper unstable.",
46            "default": false
47        }
48    },
49    "required": ["requestListSources"]
50}

main.js

const Apify = require('apify');

/**
 * Actor entry point.
 *
 * Reads the actor INPUT, builds a request list from `requestListSources`,
 * and runs a PuppeteerCrawler over it. For every page that is handled, one
 * record with the page URL, a timestamp and the `outerHTML` of
 * `document.body` is pushed to the default dataset. For every request that
 * ultimately fails, a record flagged with `'#error': true` is pushed instead.
 */
Apify.main(async () => {
    const input = await Apify.getValue('INPUT');

    console.log(input);

    // Without this guard a missing INPUT surfaces as an opaque
    // "Cannot read property 'requestListSources' of null" TypeError.
    if (!input) {
        throw new Error('Actor INPUT is missing; expected an object with "requestListSources".');
    }

    const requestList = await Apify.openRequestList('my-list', input.requestListSources);

    // The proxy settings from the input are copied verbatim into the
    // browser launch options; a fresh object is used so the input itself
    // is not mutated by the assignments below.
    const launchPuppeteerOptions = Object.assign({}, input.proxyConfiguration);

    // These used to be bare property reads (`launchPuppeteerOptions.useChrome;`),
    // which are no-ops, so the corresponding input switches had no effect.
    // NOTE(review): whether `stealth` is honored depends on the installed
    // apify SDK version - confirm against the version pinned in package.json.
    if (input.useChrome) launchPuppeteerOptions.useChrome = true;
    if (input.useStealth) launchPuppeteerOptions.stealth = true;

    /**
     * Stores the HTML of a successfully opened page.
     * If the request carries `userData.waitForSelector`, the handler waits
     * for that selector to appear before reading the HTML.
     */
    const handlePageFunction = async ({ request, response, page }) => {
        if (request.userData.waitForSelector) {
            await page.waitForSelector(request.userData.waitForSelector);
        }

        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            html: await page.evaluate(() => document.body.outerHTML),
            '#debug': Apify.utils.createRequestDebugInfo(request, response),
            '#error': false,
        });
    };

    /**
     * Stores a marker record for a request the crawler gave up on, so that
     * failed URLs are still represented in the output.
     */
    const handleFailedRequestFunction = async ({ request }) => {
        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            '#debug': Apify.utils.createRequestDebugInfo(request),
            '#error': true,
        });
    };

    const crawlerOptions = {
        requestList,
        handlePageFunction,
        handleFailedRequestFunction,
        launchPuppeteerOptions,
    };

    // Only override the crawler's own default when a timeout was provided.
    if (input.handlePageTimeoutSecs) {
        crawlerOptions.handlePageTimeoutSecs = input.handlePageTimeoutSecs;
    }

    const puppeteerCrawler = new Apify.PuppeteerCrawler(crawlerOptions);
    await puppeteerCrawler.run();
});

package.json

1{
2    "name": "my-actor",
3    "version": "0.0.1",
4    "dependencies": {
5        "apify": "^0.14.15"
6    },
7    "scripts": {
8        "start": "node main.js"
9    },
10    "author": "Me!"
11}
Developer
Maintained by Community

Actor Metrics

  • 95 monthly users

  • 17 stars

  • >99% runs succeeded

  • Created in Feb 2018

  • Modified 9 months ago

Categories