import { Actor, log } from 'apify';
import { CheerioCrawler } from 'crawlee';

interface Input {
    companyIco: string;
}

await Actor.init();

// Raw headers come as a flat [name, value, name, value, ...] array, so find the
// header name case-insensitively and return the value that follows it.
const getHeaderValue = (rawHeaders: string[], name: string): string | undefined => {
    const index = rawHeaders.findIndex((h) => h.toLowerCase() === name);
    return index === -1 ? undefined : rawHeaders[index + 1];
};

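// Derive the content type and a timestamped, sanitized filename for a downloaded
// document from its raw response headers.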
const getFileMetadata = (prefix: string, rawHeaders: string[]) => {
    const contentType = getHeaderValue(rawHeaders, 'content-type') ?? 'text/plain';
    const filename = getHeaderValue(rawHeaders, 'content-disposition')?.match(/filename="(.*)"/)?.[1] || 'unknown';

    return {
        contentType,
        filename: `${prefix}-${Date.now()}-${filename.replace(/[^a-zA-Z0-9_.-]/g, '-')}`,
    };
};

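// The only required input is the company identification number (IČO), which is
// used to search the public register at or.justice.cz.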
const {
    companyIco,
} = await Actor.getInput<Input>() ?? {} as Input;

// Fail fast with a clear message when the IČO is missing, instead of crashing
// later with a TypeError on `undefined`.
if (!companyIco) throw new Error('Missing required input field "companyIco".');

const kvs = await Actor.openKeyValueStore();

const LABELS = {
    START: 'START',
    SBIRKA_LISTIN: 'SBIRKA_LISTIN',
    LISTINA: 'LISTINA',
} as const;

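// The crawl has three stages, routed by request label:
//   START: company search results, enqueue links to the "Sbírka listin" pages
//   SBIRKA_LISTIN: document list, enqueue links to individual document detail pages
//   LISTINA: document detail, download the linked files into the key-value store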
const crawler = new CheerioCrawler({
    // TODO: Is this using proxy properly?
    proxyConfiguration: await Actor.createProxyConfiguration(),
    maxConcurrency: 10,
    requestHandler: async ({ enqueueLinks, request, $, sendRequest }) => {
        if (request.label === LABELS.START) {
            log.info('Enqueuing URLs from search page...');
            await enqueueLinks({
                selector: 'a[href^="./vypis-sl"]',
                label: LABELS.SBIRKA_LISTIN,
            });
        } else if (request.label === LABELS.SBIRKA_LISTIN) {
            log.info('Enqueuing URLs from document list...');
            await enqueueLinks({
                selector: 'a[href^="./vypis-sl-detail"]',
                label: LABELS.LISTINA,
            });
        } else if (request.label === LABELS.LISTINA) {
            const links = $('a[href^="/ias/content/download"]').toArray();
            log.info(`Found ${links.length} links`);

            await Promise.allSettled(links.map(async (link) => {
                log.info('Downloading document...');
                const downloadUrl = `https://or.justice.cz${link.attribs.href}`;
                const response = await sendRequest({ url: downloadUrl });
                // For some reason, we can only access raw headers here.
                const { contentType, filename } = getFileMetadata('file', response.rawHeaders);
                // Store the file in the default key-value store (the same store opened
                // above) and record its public URL in the dataset.
                await Actor.setValue(filename, response.rawBody, { contentType });
                await Actor.pushData({
                    url: request.url,
                    filename,
                    fileUrl: kvs.getPublicUrl(filename),
                });
            }));
        }
    },
});

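// Build the registry search URL; the IČO is passed as the `ico` query parameter
// with any non-digit characters stripped.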
const startUrl = new URL('https://or.justice.cz/ias/ui/rejstrik-$firma');
startUrl.searchParams.set('ico', companyIco.replace(/[^0-9]/g, ''));

await crawler.run([
    { url: startUrl.toString(), label: LABELS.START },
]);

// Gracefully exit the Actor process. It is recommended to end every Actor run with Actor.exit().
await Actor.exit();