My Actor

cute actor

Developed by Saksham Bhatia
Maintained by Community

Pricing: Pay per usage
Total users: 2
Monthly users: 1
Runs succeeded: >99%
Last modified: 2 days ago

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Scrape single page in JavaScript",
    "description": "Scrape data from a single page with a provided URL.",
    "version": "0.0",
    "meta": {
        "templateId": "js-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "../Dockerfile"
}

.actor/input_schema.json

{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "URL of the page",
            "type": "string",
            "description": "The URL of the website you want to get the data from.",
            "editor": "textfield",
            "prefill": "https://www.apify.com/"
        }
    },
    "required": ["url"]
}
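
For reference, a minimal input that satisfies this schema might look like the following. When running locally with the Apify CLI, this would be the contents of storage/key_value_stores/default/INPUT.json (the one path the .gitignore below deliberately keeps); the URL here is just the schema's prefill value.

{
    "url": "https://www.apify.com/"
}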

src/main.js

// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/).
import { Actor } from 'apify';
// Axios - Promise based HTTP client for the browser and node.js (Read more at https://axios-http.com/docs/intro).
import axios from 'axios';
// Cheerio - The fast, flexible & elegant library for parsing and manipulating HTML and XML (Read more at https://cheerio.js.org/).
import * as cheerio from 'cheerio';
// This is an ESM project, and as such, it requires you to specify extensions in your relative imports.
// Read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
// import { router } from './routes.js';

// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
await Actor.init();

// The structure of the input is defined in input_schema.json.
const input = await Actor.getInput();
const { url } = input;

// Fetch the HTML content of the page.
const response = await axios.get(url);

// Parse the downloaded HTML with Cheerio to enable data extraction.
const $ = cheerio.load(response.data);

// Extract all headings from the page (tag name and text).
const headings = [];
$('h1, h2, h3, h4, h5, h6').each((i, element) => {
    const headingObject = {
        level: $(element).prop('tagName').toLowerCase(),
        text: $(element).text(),
    };
    console.log('Extracted heading', headingObject);
    headings.push(headingObject);
});

// Save headings to the Dataset - a table-like storage.
await Actor.pushData(headings);

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();
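Once deployed, the Actor can also be invoked programmatically. Below is a minimal sketch using the official apify-client package (not listed in this Actor's package.json, so it would need to be installed separately); the Actor handle 'username/my-actor' and the APIFY_TOKEN environment variable are placeholders for your own values.

// Call the Actor on the Apify platform and read its results.
import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Start a run with the input defined in input_schema.json and wait for it to finish.
const run = await client.actor('username/my-actor').call({
    url: 'https://www.apify.com/',
});

// The headings pushed via Actor.pushData() end up in the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(items); // e.g. [{ level: 'h1', text: '...' }, ...]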

.dockerignore

# configurations
.idea
.vscode
.zed
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
.vscode
.zed
dist
node_modules
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:22
# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick rebuilds will be really fast
# for most source file changes.
COPY . ./
# Create and run as a non-root user.
RUN adduser -h /home/apify -D apify && \
    chown -R apify:apify ./
USER apify
# Run the image.
CMD npm start --silent

INPUT_SCHEMA.json

{
    "title": "Website Scraper Input",
    "description": "Input schema for scraping a webpage using a Node.js actor.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "List of URLs to begin scraping from.",
            "items": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "title": "URL"
                    }
                },
                "required": ["url"]
            }
        },
        "maxItems": {
            "title": "Maximum Items",
            "type": "integer",
            "description": "Limit the number of items to scrape (0 = no limit).",
            "default": 0
        },
        "includeImages": {
            "title": "Include Images",
            "type": "boolean",
            "description": "Whether to scrape image URLs.",
            "default": false
        },
        "keyword": {
            "title": "Search Keyword",
            "type": "string",
            "description": "Keyword to search on the page (optional)."
        }
    },
    "required": ["startUrls"]
}
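
Note that this root-level INPUT_SCHEMA.json describes a different input shape (startUrls, maxItems, includeImages, keyword) than the .actor/input_schema.json that actor.json actually references; src/main.js only reads a single url. If the Actor were switched to this schema, the extraction loop might be adapted along the lines of the following hypothetical sketch, which is not part of the Actor's current source.

import { Actor } from 'apify';
import axios from 'axios';
import * as cheerio from 'cheerio';

await Actor.init();

// startUrls is an array of { url } objects; a maxItems of 0 means no limit.
const { startUrls, maxItems = 0 } = await Actor.getInput();

const headings = [];
for (const { url } of startUrls) {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);
    $('h1, h2, h3, h4, h5, h6').each((i, element) => {
        headings.push({
            url,
            level: $(element).prop('tagName').toLowerCase(),
            text: $(element).text(),
        });
    });
    // Stop fetching further pages once the item limit is reached.
    if (maxItems > 0 && headings.length >= maxItems) break;
}

await Actor.pushData(maxItems > 0 ? headings.slice(0, maxItems) : headings);
await Actor.exit();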

package.json

{
    "name": "js-scrape-single-page",
    "version": "0.0.1",
    "type": "module",
    "description": "This is an example of an Apify Actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.4.2",
        "axios": "^1.5.0",
        "cheerio": "^1.0.0-rc.12"
    },
    "scripts": {
        "start": "node ./src/main.js",
        "test": "echo \"Error: oops, the Actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}