Probe Page Resources avatar

Probe Page Resources

Try for free

No credit card required

Go to Store
Probe Page Resources

Probe Page Resources

jancurn/probe-page-resources
Try for free

No credit card required

Sequentially loads a list of URLs in headless Chrome and analyzes the HTTP resources requested by each page. The source code is available at https://github.com/jancurn/act-probe-page-resources

Dockerfile

# This is a template for a Dockerfile used to build and run this actor.
# The base image name below is set during the actor build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-puppeteer-chrome

# Copy just package.json and package-lock.json first. They should be the only
# files that affect "npm install" in the next step, so the dependency layer
# stays cached when only the source code changes.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree and tool versions for debugging.
# NOTE: --omit=dev / --omit=optional replace the deprecated
# --only=prod / --no-optional flags.
RUN npm --quiet set progress=false \
 && npm install --omit=dev --omit=optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to the container.
# Do this in the last step, to have a fast build if only the source code changed.
COPY --chown=myuser:myuser . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

main.js

1const chromeLauncher = require('chrome-launcher');
2const CDP = require('chrome-remote-interface');
3const _ = require('underscore');
4const Apify = require('apify');
5const typeCheck = require('type-check').typeCheck;
6
7
// Definition of the expected input, written in the type syntax of the
// `type-check` package and validated with typeCheck() before any work starts.
//   urls       - URLs to probe, visited one after another (required)
//   waitSecs   - seconds to keep waiting after each page's load event (optional)
//   verboseLog - switches the Chrome launcher log level to 'verbose' (optional)
//   headers    - extra HTTP headers passed to Network.setExtraHTTPHeaders (optional)
const INPUT_TYPE = `{
    urls: [String],
    waitSecs: Maybe Number,
    verboseLog: Maybe Boolean,
    headers: Maybe Object     
}`;
15
16
/**
 * Looks up a header value by name, ignoring the case of the header name.
 * HTTP header names are case-insensitive, so a server may send `Location`
 * while a caller asks for `location`.
 *
 * @param {Object} headers - map of header name to header value
 * @param {string} name - header name to look for
 * @returns {*} the matching value, or `undefined` when no header matches
 */
const getHeaderIgnoreCase = (headers, name) => {
    const wanted = name.toLowerCase();
    const key = Object.keys(headers).find((candidate) => candidate.toLowerCase() === wanted);
    return key === undefined ? undefined : headers[key];
};

/**
 * Actor entry point. Validates the input, launches Chrome, opens each input
 * URL in turn while recording every network request the page makes, and
 * stores the collected results under the 'OUTPUT' key.
 *
 * Each result has the shape `{ url, requests }`, where `requests` maps a
 * request ID to `{ url, method, requestedAt, status, mimeType, type,
 * redirects, errorText, canceled }` (fields appear as the matching events arrive).
 *
 * @throws {Error} when the input does not match INPUT_TYPE
 */
Apify.main(async () => {
    // Fetch and check the input
    const input = await Apify.getValue('INPUT');
    if (!typeCheck(INPUT_TYPE, input)) {
        console.log('Expected input:');
        console.log(INPUT_TYPE);
        console.log('Received input:');
        console.dir(input);
        throw new Error('Received invalid input');
    }

    // Launch Chrome
    const chrome = await launchChrome({
        headless: !!process.env.APIFY_HEADLESS,
        verboseLog: input.verboseLog
    });

    let client = null;
    try {
        client = await CDP({ port: chrome.port });

        // Result of the page currently being probed; the event handlers below write into it.
        let currentResult = null;

        // Extract domains
        const { Network, Page } = client;

        // Add HTTP headers
        if (input.headers) {
            await Network.setExtraHTTPHeaders({ headers: input.headers });
            const userAgent = getHeaderIgnoreCase(input.headers, 'User-Agent');
            if (userAgent) await Network.setUserAgentOverride({ userAgent });
        }

        // Setup event handlers. Registration with a callback is synchronous, so no await is needed.
        Network.requestWillBeSent((params) => {
            // Ignore events fired before the first navigation has started.
            if (!currentResult) return;

            const known = currentResult.requests[params.requestId];
            if (!known) {
                currentResult.requests[params.requestId] = {
                    url: params.request.url,
                    method: params.request.method,
                    requestedAt: new Date(params.wallTime * 1000)
                };
                return;
            }

            // On redirects, the Network.requestWillBeSent() is fired multiple times
            // with the same requestId and the subsequent requests contain the 'redirectResponse' field
            known.redirects = known.redirects || [];
            const redirect = _.pick(params.redirectResponse, 'url', 'status');
            const redirectHeaders = params.redirectResponse && params.redirectResponse.headers;
            redirect.location = redirectHeaders ? getHeaderIgnoreCase(redirectHeaders, 'location') : null;
            known.redirects.push(redirect);
        });

        Network.responseReceived((params) => {
            // Note that responses might come from the previous page, in which
            // case the request is unknown to the current result.
            const req = currentResult ? currentResult.requests[params.requestId] : null;
            if (!req) return;

            req.status = params.response.status;
            req.mimeType = params.response.mimeType;
            req.type = params.type;
        });

        Network.loadingFailed((params) => {
            // Note that request failures might come from the previous page
            const req = currentResult ? currentResult.requests[params.requestId] : null;
            if (!req) return;

            req.type = params.type;
            req.errorText = params.errorText;
            req.canceled = params.canceled;
        });

        // Enable events
        await Promise.all([Network.enable(), Page.enable()]);

        // Disable cache
        await Network.setCacheDisabled({ cacheDisabled: true });

        // Iterate and probe all URLs
        const results = [];
        for (const url of input.urls) {
            console.log(`Navigating to URL: ${url}`);
            currentResult = {
                url,
                requests: {}
            };
            results.push(currentResult);

            await Page.navigate({ url });
            await Page.loadEventFired();

            // Wait input.waitSecs seconds (no extra wait when it is missing or not a number)
            await new Promise((resolve) => setTimeout(resolve, input.waitSecs * 1000 || 0));
            await Page.stopLoading();
        }

        // Save results
        await Apify.setValue('OUTPUT', results);
    } finally {
        // Always release the DevTools connection and the browser, even when probing failed.
        try {
            if (client) await client.close();
        } finally {
            // Only useful for local development
            await chrome.kill();
        }
    }

    console.log('Done');
});
122
123
// Code inspired by https://developers.google.com/web/updates/2017/04/headless-chrome
/**
 * Launches Chrome through chrome-launcher and logs its process ID, debugging
 * port and user agent string.
 *
 * @param {Object} [options]
 * @param {boolean} [options.headless] - when truthy, adds the `--disable-gpu` and `--headless` flags
 * @param {boolean} [options.verboseLog] - when truthy, uses log level 'verbose' instead of 'error'
 * @returns {Promise<Object>} the object resolved by `chromeLauncher.launch()`;
 *     this file reads its `pid` and `port` and calls its `kill()`
 */
const launchChrome = async (options = {}) => {
    console.log('Launching Chrome...');

    // Only pass real flags: an empty string would reach Chrome as a bogus argument.
    const chromeFlags = [
        ...(options.headless ? ['--disable-gpu', '--headless'] : []),
        '--no-sandbox',
    ];

    const chrome = await chromeLauncher.launch({
        chromeFlags,
        logLevel: options.verboseLog ? 'verbose' : 'error',
    });

    const version = await CDP.Version({ port: chrome.port });
    console.log(`Chrome launched (pid: ${chrome.pid}, port: ${chrome.port}, userAgent: ${version['User-Agent']})`);

    return chrome;
};

package.json

1{
2    "name": "apify-project",
3    "version": "0.0.1",
4    "description": "",
5    "author": "It's not you it's me",
6    "license": "ISC",
7    "dependencies": {
8        "chrome-launcher": "latest",
9        "chrome-remote-interface": "latest",
10        "underscore": "latest",
11        "apify": "^2.2.2",
12        "type-check": "latest"
13    },
14    "scripts": {
15        "start": "node main.js"
16    }
17}
Developer
Maintained by Community

Actor Metrics

  • 1 monthly user

  • 3 stars

  • >99% runs succeeded

  • Created in Aug 2017

  • Modified a year ago

Categories