Page Rank Actor
Under maintenance
The Page Rank Actor ranks URLs that satisfy the given glob patterns.
.dockerignore
# configurations
.idea
.vscode

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
quote_type = single
max_line_length = 100
trailing_comma = true

[*.md]
indent_size = 1
.gitignore
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
.vscode
dist
node_modules
apify_storage
storage
.prettierrc.js
const config = {
    trailingComma: 'all',
};

// eslint-disable-next-line import/no-default-export
export default config;
eslint.config.mjs
import apifyTypescriptConfig from '@apify/eslint-config/ts';

export default [
    ...apifyTypescriptConfig,
    {
        languageOptions: {
            sourceType: 'module',
            parserOptions: {
                projectService: true,
                tsconfigRootDir: import.meta.dirname,
            },
        },
    },
];
package.json
{
    "name": "page-rank-actor",
    "version": "0.0.1",
    "type": "module",
    "description": "Page rank Actor ranks URLs that satisfy the glob patterns.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.2.6",
        "crawlee": "^3.11.5",
        "graphology": "^0.26.0",
        "graphology-gexf": "^0.13.2",
        "graphology-layout-forceatlas2": "^0.10.1",
        "puppeteer": "*",
        "uniqolor": "^1.1.1",
        "zod": "^3.24.2"
    },
    "devDependencies": {
        "@apify/eslint-config": "1.0.0",
        "@apify/tsconfig": "^0.1.0",
        "eslint": "^9.19.0",
        "prettier": "^3.5.3",
        "tsx": "^4.6.2",
        "typescript": "^5.3.3",
        "typescript-eslint": "^8.23.0",
        "vitest": "^3.0.8"
    },
    "scripts": {
        "start": "npm run start:dev",
        "start:prod": "node dist/main.js",
        "start:dev": "tsx src/main.ts",
        "build": "rm -rf dist && tsc",
        "lint": "eslint src/**.ts",
        "lint:fix": "eslint src/**.ts --fix",
        "test": "vitest --run",
        "test:update": "vitest --run -u",
        "format": "prettier --write ."
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
tsconfig.json
{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "module": "NodeNext",
        "moduleResolution": "NodeNext",
        "target": "ES2022",
        "outDir": "dist",
        "noUnusedLocals": false,
        "skipLibCheck": true,
        "strictNullChecks": true,
        "lib": ["DOM"]
    },
    "include": ["./src/**/*"]
}
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:20 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Build the project.
RUN npm run build

# Create final image
FROM apify/actor-node-puppeteer-chrome:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from the builder image
COPY --from=builder /home/myuser/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
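
If you want to build the image locally rather than letting the Apify platform build it, a plausible invocation (an assumption, not documented in the repository) is to run `docker build` from the repository root so the `COPY . ./` instructions see the sources:

```
docker build -f .actor/Dockerfile -t page-rank-actor .
```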
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "page-rank-actor",
    "title": "Page rank Actor",
    "description": "Page rank Actor ranks URLs that satisfy the glob patterns.",
    "version": "0.0",
    "meta": {
        "templateId": "ts-crawlee-puppeteer-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Page rank Actor",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://apify.com"
                }
            ]
        },
        "alpha": {
            "title": "Alpha",
            "type": "string",
            "description": "The damping factor used in the PageRank algorithm, which determines the probability that a user will continue to follow links.",
            "default": "0.85",
            "prefill": "0.85",
            "editor": "textfield"
        },
        "epsilon": {
            "title": "Epsilon",
            "type": "string",
            "description": "A small positive value used to control the precision of the PageRank algorithm, helping to determine when the algorithm has converged. It is used as a threshold for stopping the iterative computation process. The default value is `Number.EPSILON * 10`.",
            "default": "2.220446049250313e-15",
            "editor": "textfield"
        },
        "includeUrlGlobs": {
            "sectionCaption": "Crawler settings",
            "title": "Include URLs (globs)",
            "type": "array",
            "description": "Glob patterns matching URLs of pages that will be included in crawling. \n\nSetting this option will disable the default Start URLs based scoping and will allow you to customize the crawling scope yourself. Note that this affects only links found on pages, but not **Start URLs** - if you want to crawl a page, make sure to specify its URL in the **Start URLs** field. \n\nFor example `https://{store,docs}.example.com/**` lets the crawler access all URLs starting with `https://store.example.com/` or `https://docs.example.com/`, and `https://example.com/**/*\\?*foo=*` allows the crawler to access all URLs that contain `foo` query parameter with any value.\n\nLearn more about globs and test them [here](https://www.digitalocean.com/community/tools/glob?comments=true&glob=https%3A%2F%2Fexample.com%2Fscrape_this%2F%2A%2A&matches=false&tests=https%3A%2F%2Fexample.com%2Ftools%2F&tests=https%3A%2F%2Fexample.com%2Fscrape_this%2F&tests=https%3A%2F%2Fexample.com%2Fscrape_this%2F123%3Ftest%3Dabc&tests=https%3A%2F%2Fexample.com%2Fdont_scrape_this).",
            "editor": "globs"
        },
        "excludeUrlGlobs": {
            "title": "Exclude URLs (globs)",
            "type": "array",
            "description": "Glob patterns matching URLs of pages that will be excluded from crawling. Note that this affects only links found on pages, but not **Start URLs**, which are always crawled. \n\nFor example `https://{store,docs}.example.com/**` excludes all URLs starting with `https://store.example.com/` or `https://docs.example.com/`, and `https://example.com/**/*\\?*foo=*` excludes all URLs that contain `foo` query parameter with any value.\n\nLearn more about globs and test them [here](https://www.digitalocean.com/community/tools/glob?comments=true&glob=https%3A%2F%2Fexample.com%2Fdont_scrape_this%2F%2A%2A&matches=false&tests=https%3A%2F%2Fexample.com%2Ftools%2F&tests=https%3A%2F%2Fexample.com%2Fdont_scrape_this%2F&tests=https%3A%2F%2Fexample.com%2Fdont_scrape_this%2F123%3Ftest%3Dabc&tests=https%3A%2F%2Fexample.com%2Fscrape_this).",
            "editor": "globs"
        },
45 "maxCrawlPages": {
46 "title": "Max pages",
47 "type": "integer",
48 "description": "The maximum number pages to crawl. It includes the start URLs, pagination pages, pages with no content, etc. The crawler will automatically finish after reaching this number. This setting is useful to prevent accidental crawler runaway.",
49 "minimum": 0,
50 "default": 9999999
51 },
52 "maxCrawlDepth": {
53 "title": "Max crawling depth",
54 "type": "integer",
55 "description": "The maximum number of links starting from the start URL that the crawler will recursively follow. The start URLs have depth `0`, the pages linked directly from the start URLs have depth `1`, and so on.\n\nThis setting is useful to prevent accidental crawler runaway. By setting it to `0`, the Actor will only crawl the Start URLs.",
56 "minimum": 0,
57 "default": 20
58 },
59 "linkSelector": {
60 "title": "Link selector",
61 "type": "string",
62 "description": "A CSS selector matching links to be enqueued.",
63 "default": "a",
64 "editor": "textfield"
65 }
66 }
67}
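
For orientation, here is a hypothetical input that satisfies this schema (the values are illustrative). Note that `alpha` and `epsilon` arrive as strings and are coerced to numbers in `src/inputSchema.ts`:

```json
{
    "startUrls": [{ "url": "https://apify.com" }],
    "alpha": "0.85",
    "epsilon": "2.220446049250313e-15",
    "includeUrlGlobs": ["https://apify.com/**"],
    "maxCrawlPages": 1000,
    "maxCrawlDepth": 5,
    "linkSelector": "a"
}
```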
images/HTMLGraphScreenshot.png
src/PageRank.test.ts
import { describe, expect, test } from 'vitest';

import { PageRank } from './PageRank.js';

describe('PageRank', () => {
    test('Should calculate the ranks of the nodes', () => {
        const pageRank = new PageRank();

        /**
         * A -> B
         * B -> A | C
         * C -> A | D
         * D -> A | B
         */
        pageRank.addLink({ from: 'A', to: 'B' });

        pageRank.addLink({ from: 'B', to: 'A' });
        pageRank.addLink({ from: 'B', to: 'C' });

        pageRank.addLink({ from: 'C', to: 'A' });
        pageRank.addLink({ from: 'C', to: 'D' });

        pageRank.addLink({ from: 'D', to: 'A' });
        pageRank.addLink({ from: 'D', to: 'B' });

        const vector = pageRank.rank();

        expect(pageRank.getMatrix()).toMatchInlineSnapshot(`
          [
            [
              0,
              1,
              0,
              0,
            ],
            [
              1,
              0,
              1,
              0,
            ],
            [
              1,
              0,
              0,
              1,
            ],
            [
              1,
              1,
              0,
              0,
            ],
          ]
        `);

        expect(vector).toMatchInlineSnapshot(`
          [
            {
              "importance": 1,
              "inlinks": [
                "A",
                "D",
              ],
              "outlinks": [
                "A",
                "C",
              ],
              "pageRank": 0.36403338045385997,
              "url": "B",
            },
            {
              "importance": 2,
              "inlinks": [
                "B",
                "C",
                "D",
              ],
              "outlinks": [
                "B",
              ],
              "pageRank": 0.3245614035087722,
              "url": "A",
            },
            {
              "importance": 3,
              "inlinks": [
                "B",
              ],
              "outlinks": [
                "A",
                "D",
              ],
              "pageRank": 0.19221418669289012,
              "url": "C",
            },
            {
              "importance": 4,
              "inlinks": [
                "C",
              ],
              "outlinks": [
                "A",
                "B",
              ],
              "pageRank": 0.11919102934447878,
              "url": "D",
            },
          ]
        `);
    });
});
src/PageRank.ts
import { SquareMatrix } from './SquareMatrix.js';
import { Vector } from './Vector.js';

export type RankArgs = {
    alpha?: number;
    epsilon?: number;
};

export type RankPayloadItem = {
    url: string;
    pageRank: number;
    inlinks: string[];
    outlinks: string[];
    /** 1-based position of the page after sorting by PageRank (added by `rank()`). */
    importance: number;
};

export type AddLinkArgs = {
    from: string;
    to: string;
};

export class PageRank {
    private matrix = new SquareMatrix(0, []);
    private nodes: string[] = [];

    /**
     * Just for quick access to the node index.
     * Can be replaced with nodes.indexOf(node) but it's O(n).
     */
    private nodeIndexes: Record<string, number> = {};

    /**
     * Add a link from source page to target page.
     * @param from - The source page.
     * @param to - The target page.
     */
    addLink({ from, to }: AddLinkArgs): void {
        this.upsertNode(from);
        this.upsertNode(to);
        this.addLinkToMatrix(from, to);
    }

    /**
     * Iteratively calculate the rank of the nodes.
     * @param alpha - The damping factor (between 0 and 1).
     * @param epsilon - The minimum difference between the previous and the current rank in the iteration.
     * @returns The ranks of the nodes.
     * @see - https://web.stanford.edu/class/cs315b/assignment3.html
     */
    rank({ alpha = 0.85, epsilon = Number.EPSILON * 10 }: RankArgs = {}): RankPayloadItem[] {
        /**
         * Number of nodes (N)
         */
        const N = this.matrix.size();
        const transitionMatrix = new SquareMatrix(N, this.matrix.values);

        /**
         * Initialize the rank of the nodes.
         * PR(p;0)=1/N
         */
        let pageRanks = new Vector(Array.from({ length: N }, () => 1 / N));

        /**
         * Normalize the adjacency matrix into a stochastic transition matrix.
         * A stochastic matrix is a square matrix used to describe the transitions of a Markov chain.
         * For each row, the sum of the elements is equal to 1.
         */
        for (let i = 0; i < N; i++) {
            const outboundLinks = transitionMatrix.values[i].reduce((sum, val) => sum + val, 0);

            if (outboundLinks === 0) {
                /**
                 * Handle dangling nodes by distributing probability equally
                 */
                for (let j = 0; j < N; j++) {
                    transitionMatrix.values[i][j] = 1 / N;
                }
            } else {
                /**
                 * Normalize the matrix values to sum to 1 (stochastic matrix)
                 */
                for (let j = 0; j < N; j++) {
                    transitionMatrix.values[i][j] /= outboundLinks;
                }
            }
        }

        /**
         * Apply the damping factor to the transition matrix.
         * Makes the matrix aperiodic and irreducible.
         * Aperiodic and irreducible Markov chains converge to a unique stationary distribution.
         */
        for (let i = 0; i < N; i++) {
            for (let j = 0; j < N; j++) {
                // eslint-disable-next-line operator-linebreak
                transitionMatrix.values[i][j] =
                    alpha * transitionMatrix.values[i][j] + (1 - alpha) / N;
            }
        }

        /**
         * The difference between the previous and the current rank in the iteration.
         */
        let diff = Infinity;

        while (diff >= epsilon) {
            /**
             * P(k+1) = M * P(k)
             * = PR(p;t+1)=alpha*SUM(PR(q;t)/L(q))+((1-alpha)/N)
             * Alpha is already applied to the transition matrix (M).
             */
            const newPageRanks = this.multiply(transitionMatrix, pageRanks);

            /**
             * L2 norm (Euclidean distance) between the previous and the current rank in the iteration.
             */
            diff = Math.sqrt(
                pageRanks.values.reduce(
                    (sum, val, index) => sum + (val - newPageRanks.values[index]) ** 2,
                    0,
                ),
            );

            pageRanks = newPageRanks;
        }

        const inlinks = this.matrix.values.map((_, rowIndex) =>
            // eslint-disable-next-line implicit-arrow-linebreak
            this.nodes.filter((_node, colIndex) => this.matrix.values[colIndex][rowIndex] > 0),
        );

        const outlinks = this.matrix.values.map((_, rowIndex) =>
            // eslint-disable-next-line implicit-arrow-linebreak
            this.nodes.filter((_node, colIndex) => this.matrix.values[rowIndex][colIndex] > 0),
        );

        return this.nodes
            .map((node, nodeIndex) => ({
                url: node,
                pageRank: pageRanks.values[nodeIndex],
                inlinks: inlinks[nodeIndex],
                outlinks: outlinks[nodeIndex],
            }))
            .sort((a, b) => b.pageRank - a.pageRank)
            .map((pageRankNode, index) => ({
                ...pageRankNode,
                importance: index + 1,
            }));
    }

    getMatrix(): number[][] {
        return this.matrix.values;
    }

    getNodes(): string[] {
        return this.nodes;
    }

    private upsertNode(node: string): void {
        if (this.getNodeIndex(node) === undefined) {
            this.addNode(node);
        }
    }

    private addNode(node: string): void {
        this.nodeIndexes[node] = this.nodes.length;
        this.nodes.push(node);

        this.matrix = this.matrix.resizeBy(1);
    }

    private getNodeIndex(node: string): number | undefined {
        return this.nodeIndexes[node];
    }

    private addLinkToMatrix(from: string, to: string): void {
        const fromIndex = this.getNodeIndex(from)!;
        const toIndex = this.getNodeIndex(to)!;

        const value = this.matrix.at(fromIndex, toIndex);
        this.matrix.values[fromIndex][toIndex] = value + 1;
    }

    private multiply(matrix: SquareMatrix, vector: Vector) {
        const result = new Vector(Array.from({ length: matrix.size() }, () => 0));

        for (let i = 0; i < matrix.size(); i++) {
            for (let j = 0; j < matrix.size(); j++) {
                result.values[i] += vector.values[j] * matrix.values[j][i];
            }
        }

        return result;
    }
}
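
In math notation, `rank()` implements the standard damped PageRank power iteration sketched in the comments above:

```latex
PR(p;\,t+1) = \frac{1-\alpha}{N} + \alpha \sum_{q \in B_p} \frac{PR(q;\,t)}{L(q)}
```

where \(B_p\) is the set of pages linking to \(p\) and \(L(q)\) is the number of outbound links of \(q\) (dangling pages are treated as linking to every page). The loop stops once the L2 distance between consecutive rank vectors falls below `epsilon`.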
src/SquareMatrix.ts
export class SquareMatrix {
    /** Values of the matrix as a two dimensional array */
    private _values: number[][];

    constructor(size: number, values?: number[][]) {
        // Create matrix filled with 0 by default
        this._values = new Array<number[]>(size)
            .fill([])
            .map(() => new Array<number>(size).fill(0));

        if (values) {
            this.values = values;
        }
    }

    size(): number {
        return this._values.length;
    }

    get values(): number[][] {
        return this._values;
    }

    /**
     * Set values into the matrix.
     * If the parameter matrix is too wide, the values are cropped to the current matrix size.
     * If the parameter matrix is too small, the remaining cells are filled with 0.
     * @param newValues Arrays of new values.
     */
    set values(newValues: number[][]) {
        const minRow = Math.min(newValues.length, this.size());
        const minCol = minRow > 0 ? Math.min(newValues[0].length, this.size()) : 0;

        for (let r = 0; r < minRow; r++) {
            for (let c = 0; c < minCol; c++) {
                this.values[r][c] = newValues[r][c];
            }
        }
    }

    /**
     * Get a matrix value from its position.
     * @param row Matrix row, from 0 to `rows`
     * @param col Matrix column, from 0 to `columns`
     */
    at(row: number, col: number): number {
        return this.values[row][col];
    }

    /**
     * Create a new square matrix grown (or shrunk) by `resize` rows and columns,
     * preserving the existing values.
     */
    resizeBy(resize: number): SquareMatrix {
        return new SquareMatrix(this.size() + resize, this.values);
    }
}
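
A minimal sketch of how the class behaves (illustrative, not part of the repository): `resizeBy` returns a new, larger matrix with the old values preserved and new cells zero-filled, which is how `PageRank` grows its adjacency matrix one node at a time.

```ts
import { SquareMatrix } from './SquareMatrix.js';

const m = new SquareMatrix(2, [
    [0, 1],
    [1, 0],
]);

// Grow to 3x3: existing values are copied, new cells default to 0.
const larger = m.resizeBy(1);

console.log(larger.at(0, 1)); // 1
console.log(larger.at(2, 2)); // 0
```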
src/Vector.ts
export class Vector {
    /** Values of the vector */
    private _values: number[];

    constructor(values?: number[]) {
        // Create a vector filled with 0 by default
        this._values = new Array<number>((values || [0]).length).fill(0);

        if (values) {
            this.values = values;
        }
    }

    size() {
        return this._values.length;
    }

    get values() {
        return this._values;
    }

    /**
     * Set values into the vector.
     * If the parameter vector is too long, the values are cropped to the current vector size.
     * If the parameter vector is too short, the remaining cells are filled with 0.
     * @param newValues Array of new values.
     */
    set values(newValues: number[]) {
        const minSize = Math.min(this._values.length, newValues.length);
        for (let i = 0; i < minSize; i++) {
            this._values[i] = newValues[i];
        }
    }

    /**
     * Get a vector value from its position.
     * @param row Vector index, from 0 to `size`
     */
    at(row: number): number {
        return this._values[row];
    }
}
src/createPageRankRouter.ts
import {
    constructGlobObjectsFromGlobs,
    createPuppeteerRouter,
    createRequests,
    EnqueueStrategy,
    extractUrlsFromPage,
    type Dictionary,
    type GlobInput,
    type PuppeteerCrawlingContext,
    type RouterHandler,
} from 'crawlee';

import { PageRank } from './PageRank.js';

export type CreatePageRankRouterArgs = {
    includeUrlGlobs?: GlobInput[];
    excludeUrlGlobs?: GlobInput[];
    linkSelector: string;
    maxCrawlDepth?: number;
};

export type UserData = {
    depth: number;
};

export type CreatePageRankRouterPayload = {
    router: RouterHandler<PuppeteerCrawlingContext<Dictionary>>;
    pageRank: PageRank;
};

export const createPageRankRouter = ({
    includeUrlGlobs,
    excludeUrlGlobs,
    linkSelector,
    maxCrawlDepth,
}: CreatePageRankRouterArgs): CreatePageRankRouterPayload => {
    const pageRank = new PageRank();

    const router = createPuppeteerRouter();

    router.addDefaultHandler(async ({ page, request, log, enqueueLinks }) => {
        const title = await page.title();
        log.debug(`${title}`, { url: request.loadedUrl });

        /**
         * Start URLs carry no depth in their userData, so default to 0.
         */
        const { depth = 0 } = request.userData;

        const url = new URL(request.loadedUrl ?? request.url);

        const urls = await extractUrlsFromPage(page, linkSelector, url.origin);

        /**
         * Construct the URL patterns from the include and exclude URL globs.
         * URL patterns are used to determine which URLs to enqueue and add to a page rank graph.
         */
        const urlPatternObjects = constructGlobObjectsFromGlobs(includeUrlGlobs ?? []);
        const urlExcludePatternObjects = constructGlobObjectsFromGlobs(excludeUrlGlobs ?? []);

        /**
         * Filter the URLs based on the URL patterns.
         */
        const nextRequests = createRequests(
            urls,
            urlPatternObjects,
            urlExcludePatternObjects,
            EnqueueStrategy.SameHostname,
        );

        /**
         * Add filtered URLs to the page rank graph.
         */
        for (const nextRequest of nextRequests) {
            const nextRequestURL = new URL(nextRequest.url);

            /**
             * Always add the link without query parameters or hash fragments.
             */
            const nextUrl = `${nextRequestURL.origin}${nextRequestURL.pathname}`;

            pageRank.addLink({
                from: request.url,
                to: nextUrl,
            });
        }

        /**
         * Enqueue the filtered URLs for further crawling.
         */
        await enqueueLinks({
            urls: nextRequests.map((nextRequest) => nextRequest.url),
            userData: {
                depth: depth + 1,
            } as UserData,
            transformRequestFunction: (req) => {
                /**
                 * If the maximum crawl depth is reached, do not enqueue the request.
                 */
                if (typeof maxCrawlDepth === 'number' && depth >= maxCrawlDepth) {
                    return null;
                }

                return req;
            },
        });
    });

    return { router, pageRank };
};
src/inputSchema.ts
import { type GlobInput, type Request } from 'crawlee';
import { z } from 'zod';

export const inputSchema = z.object({
    /**
     * No additional validation is needed as it is already validated by the JSON schema.
     */
    startUrls: z.custom<Request[]>(),

    includeUrlGlobs: z.custom<GlobInput[]>().optional(),
    excludeUrlGlobs: z.custom<GlobInput[]>().optional(),
    maxCrawlPages: z.custom<number>(),
    maxCrawlDepth: z.custom<number>(),
    linkSelector: z.custom<string>(),

    alpha: z.coerce
        .number()
        .refine((val) => val > 0, { message: 'Alpha must be greater than 0' })
        .refine((val) => val < 1, { message: 'Alpha must be less than 1' })
        .default(0.85),
    epsilon: z.coerce
        .number()
        .refine((val) => val > 0, { message: 'Epsilon must be greater than 0' })
        .default(Number.EPSILON * 10),
});

export const processInput = (input: unknown) => {
    return inputSchema.parse(input);
};

export type InputSchema = z.infer<typeof inputSchema>;
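
A quick sketch of the coercion behavior (illustrative, not part of the repository): the input form submits `alpha` and `epsilon` as strings, and `z.coerce.number()` converts them before the range checks run.

```ts
import { processInput } from './inputSchema.js';

const input = processInput({
    startUrls: [{ url: 'https://apify.com' }],
    maxCrawlPages: 100,
    maxCrawlDepth: 3,
    linkSelector: 'a',
    alpha: '0.85',
    epsilon: '1e-12',
});

console.log(typeof input.alpha); // "number"

// Out-of-range values throw a ZodError, e.g. "Alpha must be less than 1":
// processInput({ ...input, alpha: '1.5' });
```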
src/main.ts
import { Actor, Dataset } from 'apify';
import { PuppeteerCrawler } from 'crawlee';

import { createPageRankRouter } from './createPageRankRouter.js';
import { processInput } from './inputSchema.js';
import { Graphology } from './utils/Graphology.js';

await Actor.init();

const input = processInput(await Actor.getInput());

const {
    startUrls,
    alpha,
    epsilon,
    includeUrlGlobs,
    excludeUrlGlobs,
    maxCrawlDepth,
    maxCrawlPages,
    linkSelector,
} = input;

const proxyConfiguration = await Actor.createProxyConfiguration();

const { router, pageRank } = createPageRankRouter({
    excludeUrlGlobs,
    includeUrlGlobs,
    linkSelector,
    maxCrawlDepth,
});

const crawler = new PuppeteerCrawler({
    proxyConfiguration,
    maxRequestsPerCrawl: maxCrawlPages,
    requestHandler: router,
    launchContext: {
        launchOptions: {
            args: [
                '--disable-gpu', // Mitigates the "crashing GPU process" issue in Docker containers
                '--no-sandbox', // Mitigates the "sandboxed" process issue in Docker containers
            ],
        },
    },
});

await crawler.run(startUrls);

const pageRanks = pageRank.rank({
    alpha,
    epsilon,
});

await Dataset.pushData(pageRanks);

const graphology = new Graphology({
    adjacencyMatrix: pageRank.getMatrix(),
    nodes: pageRank.getNodes(),
    pageRanks: pageRanks.map((pageRankNode) => pageRankNode.pageRank),
});

const graph = graphology.createGraph();

const gexfGraph = Graphology.toGEFX(graph);

const outputStore = await Actor.openKeyValueStore('OUTPUT');

await outputStore.setValue('Graph', gexfGraph, {
    contentType: 'application/xml',
});

const htmlGraph = Graphology.toSigmaHTML(graph);

const htmlGraphKey = 'HTMLGraph';

await outputStore.setValue(htmlGraphKey, htmlGraph, {
    contentType: 'text/html',
});

const htmlGraphUrl = `https://api.apify.com/v2/key-value-stores/${
    outputStore.id
}/records/${htmlGraphKey}`;

await Actor.setStatusMessage(
    `The HTML graph has been saved to the key-value store: ${htmlGraphUrl}.`,
    {
        isStatusMessageTerminal: true,
    },
);

await Actor.exit();
src/utils/Graphology.ts
/* eslint-disable indent */
import { DirectedGraph } from 'graphology';
import { write } from 'graphology-gexf';
import ForceAtlas2 from 'graphology-layout-forceatlas2';
import uniqolor from 'uniqolor';

/**
 * This is a workaround to fix the type definition of the `graphology-layout-forceatlas2` package.
 */
const forceAtlas2 = ForceAtlas2 as unknown as typeof ForceAtlas2.default;

export type GraphologyOptions = {
    pageRanks: number[];
    nodes: string[];
    adjacencyMatrix: number[][];
};

export class Graphology {
    private nodes: string[];
    private pageRanks: number[];
    private adjacencyMatrix: number[][];

    private minRank: number;
    private maxRank: number;

    constructor(opts: GraphologyOptions) {
        this.pageRanks = opts.pageRanks;
        this.nodes = opts.nodes;
        this.adjacencyMatrix = opts.adjacencyMatrix;

        this.minRank = Math.min(...opts.pageRanks);
        this.maxRank = Math.max(...opts.pageRanks);
    }

    createGraph(): DirectedGraph {
        const graph = new DirectedGraph();

        this.nodes.forEach((node) => this.addNode(graph, node));

        this.adjacencyMatrix.forEach((row, i) => {
            row.forEach((weight, j) => {
                if (weight === 0) {
                    return;
                }

                this.addEdge(
                    graph,
                    {
                        node: this.nodes[i],
                        color: graph.getNodeAttribute(this.createNodeID(i), 'color'),
                    },
                    {
                        node: this.nodes[j],
                        // Look up the target node's own color
                        color: graph.getNodeAttribute(this.createNodeID(j), 'color'),
                    },
                    weight,
                );
            });
        });
60
61 const settings = forceAtlas2.inferSettings(graph);
62
63 forceAtlas2.assign(graph, {
64 iterations: 50,
65 settings,
66 });
67
68 return graph;
69 }
70
71 static toGEFX(graph: DirectedGraph): string {
72 return write(graph);
73 }
74
75 static toSigmaHTML(graph: DirectedGraph): string {
76 return `
77 <!DOCTYPE html>
78 <html lang="en">
79 <head>
80 <meta charset="UTF-8" />
81 <meta name="viewport" content="width=device-width, initial-scale=1.0" />
82 <title>Actor Page Rank - Graph</title>
83 <style>
84 body {
85 margin: 0;
86 font-family: Arial, sans-serif;
87 }
88
89 #container {
90 width: 100vw;
91 height: 100vh;
92 }
93 </style>
94 <script type="importmap">
95 {
96 "imports": {
97 "sigma": "https://cdnjs.cloudflare.com/ajax/libs/sigma.js/3.0.0/sigma.min.js",
98 "graphology": "https://cdn.jsdelivr.net/npm/graphology@0.26.0/dist/graphology.umd.min.js"
99 }
100 }
101 </script>
102 </head>
103 <body>
104 <div id="container"></div>
105 <script type="module">
106 import * as sigma from 'sigma';
107 import 'graphology';
108
109 const graph = new graphology.DirectedGraph();
110
111 ${graph
112 .mapNodes((node, attributes) => {
113 return `graph.addNode("${node}", ${JSON.stringify(attributes)});`;
114 })
115 .join('\n')};
116
117 ${graph
118 .mapEdges((_edge, attributes, source, target) => {
119 return `graph.addEdge("${source}", "${target}", ${JSON.stringify(attributes)});`;
120 })
121 .join('\n')};
122
123 const renderer = new Sigma(graph, document.getElementById('container'), {
124 renderEdgeLabels: false,
125 renderLabels: false,
126 });
127
128 const toggledNodes = new Set();
129
130 renderer.on('clickNode', ({ node }) => {
131 const nodeWasToggled = toggledNodes.has(node);
132
133 if (nodeWasToggled) {
134 toggledNodes.delete(node);
135 } else {
136 toggledNodes.add(node);
137 }
138
139 graph.forEachEdge((edge, attributes, source, target) => {
140 if (source === node || target === node) {
141 const isEdgeToggled = toggledNodes.has(source) || toggledNodes.has(target);
142 const edgeHidden = !isEdgeToggled;
143
144 graph.setEdgeAttribute(edge, 'hidden', edgeHidden);
145 }
146 });
147
148 renderer.refresh();
149 });
150 </script>
151 </body>
152 </html>
153 `;
154 }
155
156 private addNode(graph: DirectedGraph, node: string) {
157 const nodeIndex = this.nodes.indexOf(node);
158
159 const color = uniqolor(node);
160
161 // eslint-disable-next-line operator-linebreak
162 const normalizedRank =
163 (this.pageRanks[nodeIndex] - this.minRank) / (this.maxRank - this.minRank);
164
165 // eslint-disable-next-line operator-linebreak
166 const size =
167 // eslint-disable-next-line operator-linebreak
168 Graphology.MIN_NODE_SIZE +
169 normalizedRank * (Graphology.MAX_NODE_SIZE - Graphology.MIN_NODE_SIZE);
170
171 graph.addNode(this.createNodeID(nodeIndex), {
172 label: node,
173 size,
174 color: color.color,
175
176 /**
177 * Randomize the position of the nodes.
178 * The atlas2 graph layout algorithm will take care of the rest.
179 */
180 x: Math.random(),
181 y: Math.random(),
182 });
183 }
184
185 private addEdge(
186 graph: DirectedGraph,
187 from: { node: string; color: string },
188 to: { node: string; color: string },
189 weight: number,
190 ) {
191 const fromIndex = this.nodes.indexOf(from.node);
192 const toIndex = this.nodes.indexOf(to.node);
193
194 graph.addEdge(this.createNodeID(fromIndex), this.createNodeID(toIndex), {
195 label: `weight: ${weight}`,
196 source: this.createNodeID(fromIndex),
197 target: this.createNodeID(toIndex),
198 size: 1,
199 color: from.color,
200 type: 'arrow',
201 hidden: true,
202 });
203 }
204
205 private createNodeID(index: number) {
206 return `n${index}`;
207 }
208
209 static readonly MIN_NODE_SIZE = 5;
210 static readonly MAX_NODE_SIZE = 30;
211}
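
For reference, the node sizing in `addNode` is plain min-max scaling of the PageRank values onto the `[MIN_NODE_SIZE, MAX_NODE_SIZE]` interval:

```latex
\mathit{size}(p) = S_{\min} + \frac{PR(p) - PR_{\min}}{PR_{\max} - PR_{\min}}\,(S_{\max} - S_{\min})
```

so the highest-ranked page renders at size 30 and the lowest at size 5. Note that the denominator is zero when all pages share the same rank.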
Pricing

Pricing model: Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.