Page Rank Actor

Under maintenance

Pricing

Pay per usage

Developed by

Ludvík Prokopec

Maintained by Community

The Page Rank Actor crawls the provided start URLs and ranks the discovered pages that match the configured glob patterns using the PageRank algorithm.

Rating: 0.0 (0 reviews)

Monthly users: 0

Runs succeeded: >99%

Last modified: a month ago

.dockerignore

# configurations
.idea
.vscode

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
quote_type = single
max_line_length = 100
trailing_comma = true

[*.md]
indent_size = 1

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
.vscode
dist
node_modules
apify_storage
storage

.prettierrc.js

const config = {
    trailingComma: 'all',
};

// eslint-disable-next-line import/no-default-export
export default config;

eslint.config.mjs

import apifyTypescriptConfig from '@apify/eslint-config/ts';

export default [
    ...apifyTypescriptConfig,
    {
        languageOptions: {
            sourceType: 'module',
            parserOptions: {
                projectService: true,
                tsconfigRootDir: import.meta.dirname,
            },
        },
    },
];

package.json

{
    "name": "page-rank-actor",
    "version": "0.0.1",
    "type": "module",
    "description": "Page rank Actor ranks URLs that satisfy the glob patterns.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.2.6",
        "crawlee": "^3.11.5",
        "graphology": "^0.26.0",
        "graphology-gexf": "^0.13.2",
        "graphology-layout-forceatlas2": "^0.10.1",
        "puppeteer": "*",
        "uniqolor": "^1.1.1",
        "zod": "^3.24.2"
    },
    "devDependencies": {
        "@apify/eslint-config": "1.0.0",
        "@apify/tsconfig": "^0.1.0",
        "eslint": "^9.19.0",
        "prettier": "^3.5.3",
        "tsx": "^4.6.2",
        "typescript": "^5.3.3",
        "typescript-eslint": "^8.23.0",
        "vitest": "^3.0.8"
    },
    "scripts": {
        "start": "npm run start:dev",
        "start:prod": "node dist/main.js",
        "start:dev": "tsx src/main.ts",
        "build": "rm -rf dist && tsc",
        "lint": "eslint src/**.ts",
        "lint:fix": "eslint src/**.ts --fix",
        "test": "vitest --run",
        "test:update": "vitest --run -u",
        "format": "prettier --write ."
    },
    "author": "It's not you it's me",
    "license": "ISC"
}

tsconfig.json

{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "module": "NodeNext",
        "moduleResolution": "NodeNext",
        "target": "ES2022",
        "outDir": "dist",
        "noUnusedLocals": false,
        "skipLibCheck": true,
        "strictNullChecks": true,
        "lib": ["DOM"]
    },
    "include": ["./src/**/*"]
}

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:20 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./

# Build the project (dependencies were already installed above).
RUN npm run build

# Create final image
FROM apify/actor-node-puppeteer-chrome:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, builds will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "page-rank-actor",
    "title": "Page rank Actor",
    "description": "Page rank Actor ranks URLs that satisfy the glob patterns.",
    "version": "0.0",
    "meta": {
        "templateId": "ts-crawlee-puppeteer-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Page rank Actor",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://apify.com"
                }
            ]
        },
        "alpha": {
            "title": "Alpha",
            "type": "string",
            "description": "The damping factor used in the PageRank algorithm, which determines the probability that a user will continue to follow links.",
            "default": "0.85",
            "prefill": "0.85",
            "editor": "textfield"
        },
        "epsilon": {
            "title": "Epsilon",
            "type": "string",
            "description": "A small positive value used to control the precision of the PageRank algorithm, helping to determine when the algorithm has converged. It is used as a threshold for stopping the iterative computation process. The default value is `Number.EPSILON * 10`.",
            "default": "2.220446049250313e-15",
            "editor": "textfield"
        },
        "includeUrlGlobs": {
            "sectionCaption": "Crawler settings",
            "title": "Include URLs (globs)",
            "type": "array",
            "description": "Glob patterns matching URLs of pages that will be included in crawling. \n\nSetting this option will disable the default Start URLs based scoping and will allow you to customize the crawling scope yourself. Note that this affects only links found on pages, but not **Start URLs** - if you want to crawl a page, make sure to specify its URL in the **Start URLs** field. \n\nFor example `https://{store,docs}.example.com/**` lets the crawler access all URLs starting with `https://store.example.com/` or `https://docs.example.com/`, and `https://example.com/**/*\\?*foo=*` lets the crawler access all URLs that contain the `foo` query parameter with any value.\n\nLearn more about globs and test them [here](https://www.digitalocean.com/community/tools/glob?comments=true&glob=https%3A%2F%2Fexample.com%2Fscrape_this%2F%2A%2A&matches=false&tests=https%3A%2F%2Fexample.com%2Ftools%2F&tests=https%3A%2F%2Fexample.com%2Fscrape_this%2F&tests=https%3A%2F%2Fexample.com%2Fscrape_this%2F123%3Ftest%3Dabc&tests=https%3A%2F%2Fexample.com%2Fdont_scrape_this).",
            "editor": "globs"
        },
        "excludeUrlGlobs": {
            "title": "Exclude URLs (globs)",
            "type": "array",
            "description": "Glob patterns matching URLs of pages that will be excluded from crawling. Note that this affects only links found on pages, but not **Start URLs**, which are always crawled. \n\nFor example `https://{store,docs}.example.com/**` excludes all URLs starting with `https://store.example.com/` or `https://docs.example.com/`, and `https://example.com/**/*\\?*foo=*` excludes all URLs that contain the `foo` query parameter with any value.\n\nLearn more about globs and test them [here](https://www.digitalocean.com/community/tools/glob?comments=true&glob=https%3A%2F%2Fexample.com%2Fdont_scrape_this%2F%2A%2A&matches=false&tests=https%3A%2F%2Fexample.com%2Ftools%2F&tests=https%3A%2F%2Fexample.com%2Fdont_scrape_this%2F&tests=https%3A%2F%2Fexample.com%2Fdont_scrape_this%2F123%3Ftest%3Dabc&tests=https%3A%2F%2Fexample.com%2Fscrape_this).",
            "editor": "globs"
        },
        "maxCrawlPages": {
            "title": "Max pages",
            "type": "integer",
            "description": "The maximum number of pages to crawl. It includes the start URLs, pagination pages, pages with no content, etc. The crawler will automatically finish after reaching this number. This setting is useful to prevent accidental crawler runaway.",
            "minimum": 0,
            "default": 9999999
        },
        "maxCrawlDepth": {
            "title": "Max crawling depth",
            "type": "integer",
            "description": "The maximum number of links starting from the start URL that the crawler will recursively follow. The start URLs have depth `0`, the pages linked directly from the start URLs have depth `1`, and so on.\n\nThis setting is useful to prevent accidental crawler runaway. By setting it to `0`, the Actor will only crawl the Start URLs.",
            "minimum": 0,
            "default": 20
        },
        "linkSelector": {
            "title": "Link selector",
            "type": "string",
            "description": "A CSS selector matching links to be enqueued.",
            "default": "a",
            "editor": "textfield"
        }
    }
}
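
When the Actor is started via the API or the Apify console, these fields arrive as a single JSON input object. A minimal example input, using the schema's prefill and default values plus a hypothetical include glob (a sketch, not a required configuration):

{
    "startUrls": [{ "url": "https://apify.com" }],
    "alpha": "0.85",
    "epsilon": "2.220446049250313e-15",
    "includeUrlGlobs": [{ "glob": "https://apify.com/**" }],
    "maxCrawlPages": 100,
    "maxCrawlDepth": 2,
    "linkSelector": "a"
}

Note that `alpha` and `epsilon` are strings here (textfield editors); src/inputSchema.ts coerces them to numbers before the run.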

images/HTMLGraphScreenshot.png

src/PageRank.test.ts

import { describe, expect, test } from 'vitest';

import { PageRank } from './PageRank.js';

describe('PageRank', () => {
    test('Should calculate the ranks of the nodes', () => {
        const pageRank = new PageRank();

        /**
         * A -> B
         * B -> A | C
         * C -> A | D
         * D -> A | B
         */
        pageRank.addLink({ from: 'A', to: 'B' });

        pageRank.addLink({ from: 'B', to: 'A' });
        pageRank.addLink({ from: 'B', to: 'C' });

        pageRank.addLink({ from: 'C', to: 'A' });
        pageRank.addLink({ from: 'C', to: 'D' });

        pageRank.addLink({ from: 'D', to: 'A' });
        pageRank.addLink({ from: 'D', to: 'B' });

        const vector = pageRank.rank();

        expect(pageRank.getMatrix()).toMatchInlineSnapshot(`
          [
            [
              0,
              1,
              0,
              0,
            ],
            [
              1,
              0,
              1,
              0,
            ],
            [
              1,
              0,
              0,
              1,
            ],
            [
              1,
              1,
              0,
              0,
            ],
          ]
        `);

        expect(vector).toMatchInlineSnapshot(`
          [
            {
              "importance": 1,
              "inlinks": [
                "A",
                "D",
              ],
              "outlinks": [
                "A",
                "C",
              ],
              "pageRank": 0.36403338045385997,
              "url": "B",
            },
            {
              "importance": 2,
              "inlinks": [
                "B",
                "C",
                "D",
              ],
              "outlinks": [
                "B",
              ],
              "pageRank": 0.3245614035087722,
              "url": "A",
            },
            {
              "importance": 3,
              "inlinks": [
                "B",
              ],
              "outlinks": [
                "A",
                "D",
              ],
              "pageRank": 0.19221418669289012,
              "url": "C",
            },
            {
              "importance": 4,
              "inlinks": [
                "C",
              ],
              "outlinks": [
                "A",
                "B",
              ],
              "pageRank": 0.11919102934447878,
              "url": "D",
            },
          ]
        `);
    });
});

src/PageRank.ts

import { SquareMatrix } from './SquareMatrix.js';
import { Vector } from './Vector.js';

export type RankArgs = {
    alpha?: number;
    epsilon?: number;
};

export type RankPayloadItem = {
    url: string;
    pageRank: number;
    inlinks: string[];
    outlinks: string[];
};

export type AddLinkArgs = {
    from: string;
    to: string;
};

export class PageRank {
    private matrix = new SquareMatrix(0, []);
    private nodes: string[] = [];

    /**
     * Just for quick access to the node index.
     * Can be replaced with nodes.indexOf(node) but that is O(n).
     */
    private nodeIndexes: Record<string, number> = {};

    /**
     * Add a link from the source page to the target page.
     * @param from - The source page.
     * @param to - The target page.
     */
    addLink({ from, to }: AddLinkArgs): void {
        this.upsertNode(from);
        this.upsertNode(to);
        this.addLinkToMatrix(from, to);
    }

    /**
     * Iteratively calculate the rank of the nodes.
     * @param alpha - The damping factor (between 0 and 1).
     * @param epsilon - The convergence threshold: iteration stops once the difference between the previous and the current rank vector drops below this value.
     * @returns The ranks of the nodes.
     * @see - https://web.stanford.edu/class/cs315b/assignment3.html
     */
    rank({ alpha = 0.85, epsilon = Number.EPSILON * 10 }: RankArgs = {}): RankPayloadItem[] {
        /**
         * Number of nodes (N)
         */
        const N = this.matrix.size();
        const transitionMatrix = new SquareMatrix(N, this.matrix.values);

        /**
         * Initialize the rank of the nodes.
         * PR(p;0) = 1/N
         */
        let pageRanks = new Vector(Array.from({ length: N }, () => 1 / N));

        /**
         * Normalize the adjacency matrix into a stochastic transition matrix.
         * A stochastic matrix is a square matrix used to describe the transitions of a Markov chain.
         * For each row, the sum of the elements is equal to 1.
         */
        for (let i = 0; i < N; i++) {
            const outboundLinks = transitionMatrix.values[i].reduce((sum, val) => sum + val, 0);

            if (outboundLinks === 0) {
                /**
                 * Handle dangling nodes by distributing probability equally
                 */
                for (let j = 0; j < N; j++) {
                    transitionMatrix.values[i][j] = 1 / N;
                }
            } else {
                /**
                 * Normalize the matrix values to sum to 1 (stochastic matrix)
                 */
                for (let j = 0; j < N; j++) {
                    transitionMatrix.values[i][j] /= outboundLinks;
                }
            }
        }

        /**
         * Apply the damping factor to the transition matrix.
         * This makes the matrix aperiodic and irreducible.
         * Aperiodic and irreducible Markov chains converge to a unique stationary distribution.
         */
        for (let i = 0; i < N; i++) {
            for (let j = 0; j < N; j++) {
                // eslint-disable-next-line operator-linebreak
                transitionMatrix.values[i][j] =
                    alpha * transitionMatrix.values[i][j] + (1 - alpha) / N;
            }
        }

        /**
         * The difference between the previous and the current rank in the iteration.
         */
        let diff = Infinity;

        while (diff >= epsilon) {
            /**
             * P(k+1) = M * P(k)
             * = PR(p;t+1) = alpha * SUM(PR(q;t) / L(q)) + ((1 - alpha) / N)
             * Alpha is already applied to the transition matrix (M).
             */
            const newPageRanks = this.multiply(transitionMatrix, pageRanks);

            /**
             * L2 norm (Euclidean distance) between the previous and the current rank in the iteration.
             */
            diff = Math.sqrt(
                pageRanks.values.reduce(
                    (sum, val, index) => sum + (val - newPageRanks.values[index]) ** 2,
                    0,
                ),
            );

            pageRanks = newPageRanks;
        }

        const inlinks = this.matrix.values.map((_, rowIndex) =>
            // eslint-disable-next-line implicit-arrow-linebreak
            this.nodes.filter((_node, colIndex) => this.matrix.values[colIndex][rowIndex] > 0),
        );

        const outlinks = this.matrix.values.map((_, rowIndex) =>
            // eslint-disable-next-line implicit-arrow-linebreak
            this.nodes.filter((_node, colIndex) => this.matrix.values[rowIndex][colIndex] > 0),
        );

        return this.nodes
            .map((node, nodeIndex) => ({
                url: node,
                pageRank: pageRanks.values[nodeIndex],
                inlinks: inlinks[nodeIndex],
                outlinks: outlinks[nodeIndex],
            }))
            .sort((a, b) => b.pageRank - a.pageRank)
            .map((pageRankNode, index) => ({
                ...pageRankNode,
                importance: index + 1,
            }));
    }

    getMatrix(): number[][] {
        return this.matrix.values;
    }

    getNodes(): string[] {
        return this.nodes;
    }

    private upsertNode(node: string): void {
        if (this.getNodeIndex(node) === undefined) {
            this.addNode(node);
        }
    }

    private addNode(node: string): void {
        this.nodeIndexes[node] = this.nodes.length;
        this.nodes.push(node);

        this.matrix = this.matrix.resizeBy(1);
    }

    private getNodeIndex(node: string): number | undefined {
        return this.nodeIndexes[node];
    }

    private addLinkToMatrix(from: string, to: string): void {
        const fromIndex = this.getNodeIndex(from)!;
        const toIndex = this.getNodeIndex(to)!;

        const value = this.matrix.at(fromIndex, toIndex);
        this.matrix.values[fromIndex][toIndex] = value + 1;
    }

    private multiply(matrix: SquareMatrix, vector: Vector) {
        const result = new Vector(Array.from({ length: matrix.size() }, () => 0));

        for (let i = 0; i < matrix.size(); i++) {
            for (let j = 0; j < matrix.size(); j++) {
                result.values[i] += vector.values[j] * matrix.values[j][i];
            }
        }

        return result;
    }
}
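
For reference, the iteration implemented in rank() is the standard PageRank recurrence sketched in the comments above. In LaTeX notation, with N nodes, damping factor \alpha, L(q) the number of outlinks of page q, and dangling rows replaced by uniform 1/N rows:

PR(p; 0) = \frac{1}{N}

PR(p; t+1) = \frac{1 - \alpha}{N} + \alpha \sum_{q \in In(p)} \frac{PR(q; t)}{L(q)}

The loop stops once \lVert PR(\cdot; t+1) - PR(\cdot; t) \rVert_2 < \epsilon, i.e. when the L2 distance between consecutive rank vectors drops below epsilon.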

src/SquareMatrix.ts

export class SquareMatrix {
    /** Values of the matrix as a two dimensional array */
    private _values: number[][];

    constructor(size: number, values?: number[][]) {
        // Create a matrix filled with 0 by default
        this._values = new Array<number[]>(size)
            .fill([])
            .map(() => new Array<number>(size).fill(0));

        if (values) {
            this.values = values;
        }
    }

    size(): number {
        return this._values.length;
    }

    get values(): number[][] {
        return this._values;
    }

    /**
     * Set values into the matrix.
     * If the given matrix is too wide, the values are cropped to the current matrix size.
     * If the given matrix is too small, the remaining cells are filled with 0.
     * @param newValues Arrays of new values.
     */
    set values(newValues: number[][]) {
        const minRow = Math.min(newValues.length, this.size());
        const minCol = minRow > 0 ? Math.min(newValues[0].length, this.size()) : 0;

        for (let r = 0; r < minRow; r++) {
            for (let c = 0; c < minCol; c++) {
                this.values[r][c] = newValues[r][c];
            }
        }
    }

    /**
     * Get a matrix value from its position.
     * @param row Matrix row, from 0 to `rows`
     * @param col Matrix column, from 0 to `columns`
     */
    at(row: number, col: number): number {
        return this.values[row][col];
    }

    resizeBy(resize: number): SquareMatrix {
        return new SquareMatrix(this.size() + resize, this.values);
    }
}
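
A minimal usage sketch (assuming the class above, not part of the Actor itself): resizeBy returns a larger matrix and copies the existing values over, which is how PageRank.addNode grows the adjacency matrix by one row and column per new node.

import { SquareMatrix } from './SquareMatrix.js';

const m = new SquareMatrix(2, [
    [0, 1],
    [0, 0],
]);

const grown = m.resizeBy(1); // 3x3 matrix

console.log(grown.at(0, 1)); // 1 - existing values are preserved
console.log(grown.at(2, 2)); // 0 - new cells are filled with 0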

src/Vector.ts

export class Vector {
    /** Values of the vector */
    private _values: number[];

    constructor(values?: number[]) {
        // Create a vector filled with 0 by default
        this._values = new Array<number>((values || [0]).length).fill(0);

        if (values) {
            this.values = values;
        }
    }

    size() {
        return this._values.length;
    }

    get values() {
        return this._values;
    }

    /**
     * Set values into the vector.
     * If the given vector is too long, the values are cropped to the current vector size.
     * If the given vector is too short, the remaining cells are filled with 0.
     * @param newValues Arrays of new values.
     */
    set values(newValues: number[]) {
        const minSize = Math.min(this._values.length, newValues.length);
        for (let i = 0; i < minSize; i++) {
            this._values[i] = newValues[i];
        }
    }

    /**
     * Get a vector value from its position.
     * @param row Vector index, from 0 to `size`
     */
    at(row: number): number {
        return this._values[row];
    }
}

src/createPageRankRouter.ts

import {
    constructGlobObjectsFromGlobs,
    createPuppeteerRouter,
    createRequests,
    EnqueueStrategy,
    extractUrlsFromPage,
    type Dictionary,
    type GlobInput,
    type PuppeteerCrawlingContext,
    type RouterHandler,
} from 'crawlee';

import { PageRank } from './PageRank.js';

export type CreatePageRankRouterArgs = {
    includeUrlGlobs?: GlobInput[];
    excludeUrlGlobs?: GlobInput[];
    linkSelector: string;
    maxCrawlDepth?: number;
};

export type UserData = {
    depth: number;
};

export type CreatePageRankRouterPayload = {
    router: RouterHandler<PuppeteerCrawlingContext<Dictionary>>;
    pageRank: PageRank;
};

export const createPageRankRouter = ({
    includeUrlGlobs,
    excludeUrlGlobs,
    linkSelector,
    maxCrawlDepth,
}: CreatePageRankRouterArgs): CreatePageRankRouterPayload => {
    const pageRank = new PageRank();

    const router = createPuppeteerRouter();

    router.addDefaultHandler(async ({ page, request, log, enqueueLinks }) => {
        const title = await page.title();
        log.debug(`${title}`, { url: request.loadedUrl });

        const { depth } = request.userData;

        const url = new URL(request.loadedUrl);

        const urls = await extractUrlsFromPage(page, linkSelector, url.origin);

        /**
         * Construct the URL patterns from the include and exclude URL globs.
         * URL patterns are used to determine which URLs to enqueue and add to a page rank graph.
         */
        const urlPatternObjects = constructGlobObjectsFromGlobs(includeUrlGlobs ?? []);
        const urlExcludePatternObjects = constructGlobObjectsFromGlobs(excludeUrlGlobs ?? []);

        /**
         * Filter the URLs based on the URL patterns.
         */
        const nextRequests = createRequests(
            urls,
            urlPatternObjects,
            urlExcludePatternObjects,
            EnqueueStrategy.SameHostname,
        );

        /**
         * Add filtered URLs to the page rank graph.
         */
        for (const nextRequest of nextRequests) {
            const nextRequestURL = new URL(nextRequest.url);

            /**
             * Always add the link without query parameters or hash fragments.
             */
            const nextUrl = `${nextRequestURL.origin}${nextRequestURL.pathname}`;

            pageRank.addLink({
                from: request.url,
                to: nextUrl,
            });
        }

        /**
         * Enqueue the filtered URLs for further crawling.
         */
        await enqueueLinks({
            urls: nextRequests.map((nextRequest) => nextRequest.url),
            userData: {
                depth: depth + 1,
            } as UserData,
            transformRequestFunction: (req) => {
                /**
                 * If the maximum crawl depth is reached, do not enqueue the request.
                 */
                if (typeof maxCrawlDepth === 'number' && depth >= maxCrawlDepth) {
                    return null;
                }

                return req;
            },
        });
    });

    return { router, pageRank };
};

src/inputSchema.ts

import { type GlobInput, type Request } from 'crawlee';
import { z } from 'zod';

export const inputSchema = z.object({
    /**
     * No additional validation is needed as it is already validated by the JSON schema.
     */
    startUrls: z.custom<Request[]>(),

    includeUrlGlobs: z.custom<GlobInput[]>().optional(),
    excludeUrlGlobs: z.custom<GlobInput[]>().optional(),
    maxCrawlPages: z.custom<number>(),
    maxCrawlDepth: z.custom<number>(),
    linkSelector: z.custom<string>(),

    alpha: z.coerce
        .number()
        .refine((val) => val > 0, { message: 'Alpha must be greater than 0' })
        .refine((val) => val < 1, { message: 'Alpha must be less than 1' })
        .default(0.85),
    epsilon: z.coerce
        .number()
        .refine((val) => val > 0, { message: 'Epsilon must be greater than 0' })
        .default(Number.EPSILON * 10),
});

export const processInput = (input: unknown) => {
    return inputSchema.parse(input);
};

export type InputSchema = z.infer<typeof inputSchema>;
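
A small sketch of how processInput behaves with hypothetical values: alpha and epsilon arrive from the input form as strings and are coerced to numbers, while an out-of-range alpha is rejected by the refinements above.

import { processInput } from './inputSchema.js';

const input = processInput({
    startUrls: [{ url: 'https://apify.com' }],
    maxCrawlPages: 100,
    maxCrawlDepth: 2,
    linkSelector: 'a',
    alpha: '0.85', // coerced to the number 0.85
    epsilon: '1e-12', // coerced to the number 1e-12
});

// Passing alpha: '1.5' instead would throw a ZodError with the message
// 'Alpha must be less than 1'.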

src/main.ts

import { Actor, Dataset } from 'apify';
import { PuppeteerCrawler } from 'crawlee';

import { createPageRankRouter } from './createPageRankRouter.js';
import { processInput } from './inputSchema.js';
import { Graphology } from './utils/Graphology.js';

await Actor.init();

const input = processInput(await Actor.getInput());

const {
    startUrls,
    alpha,
    epsilon,
    includeUrlGlobs,
    excludeUrlGlobs,
    maxCrawlDepth,
    maxCrawlPages,
    linkSelector,
} = input;

const proxyConfiguration = await Actor.createProxyConfiguration();

const { router, pageRank } = createPageRankRouter({
    excludeUrlGlobs,
    includeUrlGlobs,
    linkSelector,
    maxCrawlDepth,
});

const crawler = new PuppeteerCrawler({
    proxyConfiguration,
    maxRequestsPerCrawl: maxCrawlPages,
    requestHandler: router,
    launchContext: {
        launchOptions: {
            args: [
                '--disable-gpu', // Mitigates the "crashing GPU process" issue in Docker containers
                '--no-sandbox', // Mitigates the "sandboxed" process issue in Docker containers
            ],
        },
    },
});

await crawler.run(startUrls);

const pageRanks = pageRank.rank({
    alpha,
    epsilon,
});

await Dataset.pushData(pageRanks);

const graphology = new Graphology({
    adjacencyMatrix: pageRank.getMatrix(),
    nodes: pageRank.getNodes(),
    pageRanks: pageRanks.map((pageRankNode) => pageRankNode.pageRank),
});

const graph = graphology.createGraph();

const gexfGraph = Graphology.toGEFX(graph);

const outputStore = await Actor.openKeyValueStore('OUTPUT');

await outputStore.setValue('Graph', gexfGraph, {
    contentType: 'application/xml',
});

const htmlGraph = Graphology.toSigmaHTML(graph);

const htmlGraphKey = 'HTMLGraph';

await outputStore.setValue(htmlGraphKey, htmlGraph, {
    contentType: 'text/html',
});

const htmlGraphUrl = `https://api.apify.com/v2/key-value-stores/${
    outputStore.id
}/records/${htmlGraphKey}`;

await Actor.setStatusMessage(
    `The HTML graph has been saved to the key-value store: ${htmlGraphUrl}.`,
    {
        isStatusMessageTerminal: true,
    },
);

await Actor.exit();
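
Each item pushed to the dataset above has the RankPayloadItem shape plus the importance ordinal added by rank(). For example, one item could look like this (values taken from the unit test in src/PageRank.test.ts, with node names standing in for URLs):

{
    "url": "B",
    "pageRank": 0.36403338045385997,
    "importance": 1,
    "inlinks": ["A", "D"],
    "outlinks": ["A", "C"]
}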

src/utils/Graphology.ts

/* eslint-disable indent */
import { DirectedGraph } from 'graphology';
import { write } from 'graphology-gexf';
import ForceAtlas2 from 'graphology-layout-forceatlas2';
import uniqolor from 'uniqolor';

/**
 * This is a workaround to fix the type definition of the `graphology-layout-forceatlas2` package.
 */
const forceAtlas2 = ForceAtlas2 as unknown as typeof ForceAtlas2.default;

export type GraphologyOptions = {
    pageRanks: number[];
    nodes: string[];
    adjacencyMatrix: number[][];
};

export class Graphology {
    private nodes: string[];
    private pageRanks: number[];
    private adjacencyMatrix: number[][];

    private minRank: number;
    private maxRank: number;

    constructor(opts: GraphologyOptions) {
        this.pageRanks = opts.pageRanks;
        this.nodes = opts.nodes;
        this.adjacencyMatrix = opts.adjacencyMatrix;

        this.minRank = Math.min(...opts.pageRanks);
        this.maxRank = Math.max(...opts.pageRanks);
    }

    createGraph(): DirectedGraph {
        const graph = new DirectedGraph();

        this.nodes.forEach((node) => this.addNode(graph, node));

        this.adjacencyMatrix.forEach((row, i) => {
            row.forEach((weight, j) => {
                if (weight === 0) {
                    return;
                }

                this.addEdge(
                    graph,
                    {
                        node: this.nodes[i],
                        color: graph.getNodeAttribute(this.createNodeID(i), 'color'),
                    },
                    {
                        node: this.nodes[j],
                        color: graph.getNodeAttribute(this.createNodeID(i), 'color'),
                    },
                    weight,
                );
            });
        });

        const settings = forceAtlas2.inferSettings(graph);

        forceAtlas2.assign(graph, {
            iterations: 50,
            settings,
        });

        return graph;
    }

    static toGEFX(graph: DirectedGraph): string {
        return write(graph);
    }

    static toSigmaHTML(graph: DirectedGraph): string {
        return `
            <!DOCTYPE html>
            <html lang="en">
                <head>
                    <meta charset="UTF-8" />
                    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
                    <title>Actor Page Rank - Graph</title>
                    <style>
                        body {
                            margin: 0;
                            font-family: Arial, sans-serif;
                        }

                        #container {
                            width: 100vw;
                            height: 100vh;
                        }
                    </style>
                    <script type="importmap">
                        {
                            "imports": {
                                "sigma": "https://cdnjs.cloudflare.com/ajax/libs/sigma.js/3.0.0/sigma.min.js",
                                "graphology": "https://cdn.jsdelivr.net/npm/graphology@0.26.0/dist/graphology.umd.min.js"
                            }
                        }
                    </script>
                </head>
                <body>
                    <div id="container"></div>
                    <script type="module">
                        import * as sigma from 'sigma';
                        import 'graphology';

                        const graph = new graphology.DirectedGraph();

                        ${graph
                            .mapNodes((node, attributes) => {
                                return `graph.addNode("${node}", ${JSON.stringify(attributes)});`;
                            })
                            .join('\n')};

                        ${graph
                            .mapEdges((_edge, attributes, source, target) => {
                                return `graph.addEdge("${source}", "${target}", ${JSON.stringify(attributes)});`;
                            })
                            .join('\n')};

                        const renderer = new Sigma(graph, document.getElementById('container'), {
                            renderEdgeLabels: false,
                            renderLabels: false,
                        });

                        const toggledNodes = new Set();

                        renderer.on('clickNode', ({ node }) => {
                            const nodeWasToggled = toggledNodes.has(node);

                            if (nodeWasToggled) {
                                toggledNodes.delete(node);
                            } else {
                                toggledNodes.add(node);
                            }

                            graph.forEachEdge((edge, attributes, source, target) => {
                                if (source === node || target === node) {
                                    const isEdgeToggled = toggledNodes.has(source) || toggledNodes.has(target);
                                    const edgeHidden = !isEdgeToggled;

                                    graph.setEdgeAttribute(edge, 'hidden', edgeHidden);
                                }
                            });

                            renderer.refresh();
                        });
                    </script>
                </body>
            </html>
        `;
    }

    private addNode(graph: DirectedGraph, node: string) {
        const nodeIndex = this.nodes.indexOf(node);

        const color = uniqolor(node);

        // eslint-disable-next-line operator-linebreak
        const normalizedRank =
            (this.pageRanks[nodeIndex] - this.minRank) / (this.maxRank - this.minRank);

        // eslint-disable-next-line operator-linebreak
        const size =
            // eslint-disable-next-line operator-linebreak
            Graphology.MIN_NODE_SIZE +
            normalizedRank * (Graphology.MAX_NODE_SIZE - Graphology.MIN_NODE_SIZE);

        graph.addNode(this.createNodeID(nodeIndex), {
            label: node,
            size,
            color: color.color,

            /**
             * Randomize the position of the nodes.
             * The atlas2 graph layout algorithm will take care of the rest.
             */
            x: Math.random(),
            y: Math.random(),
        });
    }

    private addEdge(
        graph: DirectedGraph,
        from: { node: string; color: string },
        to: { node: string; color: string },
        weight: number,
    ) {
        const fromIndex = this.nodes.indexOf(from.node);
        const toIndex = this.nodes.indexOf(to.node);

        graph.addEdge(this.createNodeID(fromIndex), this.createNodeID(toIndex), {
            label: `weight: ${weight}`,
            source: this.createNodeID(fromIndex),
            target: this.createNodeID(toIndex),
            size: 1,
            color: from.color,
            type: 'arrow',
            hidden: true,
        });
    }

    private createNodeID(index: number) {
        return `n${index}`;
    }

    static readonly MIN_NODE_SIZE = 5;
    static readonly MAX_NODE_SIZE = 30;
}

Pricing

Pricing model

Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.