Bzj Amazon Actor avatar

Bzj Amazon Actor

Under maintenance
Try for free

No credit card required

Go to Store
This Actor is under maintenance.

This Actor may be unreliable while under maintenance. Would you like to try a similar Actor instead?

See alternative Actors
Bzj Amazon Actor

Bzj Amazon Actor

fateful_orangerie/bzj-amazon-actor
Try for free

No credit card required

Crawl and extract unlimited data using Actors integrated with the Scrapeless Amazon Scraper API.

.dockerignore

1# configurations
2.idea
3.vscode
4
5# crawlee and apify storage folders
6apify_storage
7crawlee_storage
8storage
9
10# installed files
11node_modules
12
13# git folder
14.git
15
16# dist folder
17dist

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "root": true,
3    "env": {
4        "browser": true,
5        "es2020": true,
6        "node": true
7    },
8    "extends": [
9        "@apify/eslint-config-ts"
10    ],
11    "parserOptions": {
12        "project": "./tsconfig.json",
13        "ecmaVersion": 2020
14    },
15    "ignorePatterns": [
16        "node_modules",
17        "dist",
18        "**/*.d.ts"
19    ]
20}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.vscode
5storage
6apify_storage
7crawlee_storage
8node_modules
9dist
10tsconfig.tsbuildinfo
11storage/*
12!storage/key_value_stores
13storage/key_value_stores/*
14!storage/key_value_stores/default
15storage/key_value_stores/default/*
16!storage/key_value_stores/default/INPUT.json
17
18# Added by Apify CLI
19.venv

package.json

1{
2	"name": "bzj-amazon-actor",
3	"version": "0.0.1",
4	"type": "module",
5	"description": "This is an example of an Apify actor.",
6	"engines": {
7		"node": ">=18.0.0"
8	},
9	"dependencies": {
10		"apify": "^3.2.6",
11		"axios": "^1.5.0",
12		"cheerio": "^1.0.0-rc.12",
13		"scrapeless-sdk-node": "^0.0.3"
14	},
15	"devDependencies": {
16		"@apify/eslint-config-ts": "^0.3.0",
17		"@apify/tsconfig": "^0.1.0",
18		"@typescript-eslint/eslint-plugin": "^7.18.0",
19		"@typescript-eslint/parser": "^7.18.0",
20		"eslint": "^8.50.0",
21		"tsx": "^4.6.2",
22		"typescript": "^5.3.3"
23	},
24	"scripts": {
25		"start": "npm run start:dev",
26		"start:prod": "node dist/main.js",
27		"start:dev": "tsx src/main.ts",
28		"build": "tsc",
29		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
30	},
31	"author": "It's not you it's me",
32	"license": "ISC"
33}

tsconfig.json

1{
2    "extends": "@apify/tsconfig",
3    "compilerOptions": {
4        "module": "NodeNext",
5        "moduleResolution": "NodeNext",
6        "target": "ES2022",
7        "outDir": "dist",
8        "noUnusedLocals": false,
9        "skipLibCheck": true,
10        "lib": ["DOM"]
11    },
12    "include": [
13        "./src/**/*"
14    ]
15}

.actor/actor.json

1{
2  "actorSpecification": 1,
3  "name": "bzj-amazon-actor",
4  "title": "Scrape single page in TypeScript",
5  "description": "Scrape data from single page with provided URL.",
6  "version": "0.0",
7  "meta": {
8    "templateId": "ts-start"
9  },
10  "input": "./input_schema.json",
11  "dockerfile": "./Dockerfile",
12  "storages": {
13    "dataset": "./dataset_schema.json"
14  }
15}

.actor/dataset_schema.json

1{
2  "actorSpecification": 1,
3  "views": {
4    "overview": {
5      "title": "Overview",
6      "transformation": {
7        "fields": [
8          "count",
9          "data",
10          "code",
11          "message"
12        ]
13      },
14      "display": {
15        "component": "table",
16        "properties": {
17          "count": {
18            "label": "count",
19            "format": "text"
20          },
21          "data": {
22            "label": "data",
23            "format": "object"
24          },
25          "code": {
26            "label": "code",
27            "format": "text"
28          },
29          "message": {
30            "label": "message",
31            "format": "text"
32          }
33        }
34      }
35    }
36  }
37}

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:20 AS builder
5
6# Check preinstalled packages
7RUN npm ls crawlee apify puppeteer playwright
8
9# Copy just package.json and package-lock.json
10# to speed up the build using Docker layer cache.
11COPY package*.json ./
12
13# Install all dependencies. Don't audit to speed up the installation.
14RUN npm install --include=dev --audit=false
15
16# Next, copy the source files using the user set
17# in the base image.
18COPY . ./
19
20# Build the project: compile the TypeScript sources
21# into the dist folder.
22RUN npm run build
23
24# Create final image
25FROM apify/actor-node:20
26
27# Check preinstalled packages
28RUN npm ls crawlee apify puppeteer playwright
29
30# Copy just package.json and package-lock.json
31# to speed up the build using Docker layer cache.
32COPY package*.json ./
33
34# Install NPM packages, skip optional and development dependencies to
35# keep the image small. Avoid logging too much and print the dependency
36# tree for debugging
37RUN npm --quiet set progress=false \
38    && npm install --omit=dev --omit=optional \
39    && echo "Installed NPM packages:" \
40    && (npm list --omit=dev --all || true) \
41    && echo "Node.js version:" \
42    && node --version \
43    && echo "NPM version:" \
44    && npm --version \
45    && rm -r ~/.npm
46
47# Copy built JS files from builder image
48COPY --from=builder /usr/src/app/dist ./dist
49
50# Next, copy the remaining files and directories with the source code.
51# Since we do this after NPM install, quick build will be really fast
52# for most source file changes.
53COPY . ./
54
55
56# Run the image.
57CMD npm run start:prod --silent

.actor/input_schema.json

1{
2  "title": "Actor BZJ",
3  "type": "object",
4  "schemaVersion": 1,
5  "properties": {
6    "apiKey": {
7      "title": "API Key",
8      "type": "string",
9      "editor": "textfield",
10      "description": "Start getting your [API KEY](https://app.scrapeless.com/dashboard/account?tab=apiKey) for free"
11    },
12    "action": {
13      "title": "Scraper Action",
14      "type": "string",
15      "enum": [
16        "keywords",
17        "product",
18        "seller"
19      ],
20      "description": "Amazon Scraper action types used for crawling",
21      "prefill": "keywords"
22    },
23    "webhook": {
24      "title": "webhook",
25      "type": "string",
26      "editor": "textfield",
27      "description": "webhook URL to send the data to",
28      "default": ""
29    },
30    "keywords": {
31      "title": "Keywords",
32      "sectionCaption": "keywords options",
33      "sectionDescription": "Configuration of action keywords",
34      "type": "string",
35      "editor": "textfield",
36      "description": "Amazon keywords to search for",
37      "default": "iphone 12",
38      "prefill": "iPhone 12"
39    },
40    "maxConcurrency": {
41      "title": "Maximum concurrency",
42      "type": "integer",
43      "maximum": 100,
44      "description": "Maximum concurrency to use for crawling",
45      "default": 10,
46      "prefill": 10
47    },
48    "pages": {
49      "title": "Pages",
50      "type": "integer",
51      "maximum": 100,
52      "description": "Total number of pages crawled",
53      "default": 3,
54      "prefill": 3
55    },
56    "domain": {
57      "title": "Domain",
58      "type": "string",
59      "editor": "textfield",
60      "description": "Amazon domain",
61      "default": "com",
62      "prefill": "com"
63    },
64    "productUrl": {
65      "title": "Product details URL",
66      "sectionCaption": "product options",
67      "sectionDescription": "Configuration of action Product",
68      "type": "string",
69      "editor": "textfield",
70      "description": "Amazon product details URL",
71      "prefill": "https://www.amazon.com/dp/B0BQXHK363"
72    },
73    "sellerUrl": {
74      "title": "Seller details URL",
75      "sectionCaption": "seller options",
76      "sectionDescription": "Configuration of action seller",
77      "type": "string",
78      "editor": "textfield",
79      "description": "Amazon seller details URL",
80      "prefill": "https://www.amazon.com/dp/B0BQXHK363"
81    }
82  },
83  "required": [
84    "apiKey",
85    "action"
86  ]
87}

src/main.ts

1import { Actor, log } from 'apify';
2import Scrapeless from 'scrapeless-sdk-node';
3
// Initialize the Actor before reading input or using any storage.
await Actor.init();
5
/**
 * Scraper actions selectable through the `action` input field.
 * The string values are what is compared against the input and forwarded
 * to the scraper request, so they must stay in sync with the input schema.
 */
enum AmazonActionEnum {
    product = 'product',
    seller = 'seller',
    keywords = 'keywords',
}
11
/**
 * Shape of the Actor input.
 *
 * Only `apiKey` and `action` are listed as required by the input schema;
 * every other field may be absent and is given a fallback when the input
 * is destructured.
 */
interface Input {
    /** Scrapeless API key used to authenticate scraper requests. */
    apiKey: string;
    /** Which scraper action to run. */
    action: AmazonActionEnum;
    /** Webhook URL forwarded with every scraper request. */
    webhook?: string;
    /** URL used when `action` is `product`. */
    productUrl?: string;
    /** URL used when `action` is `seller`. */
    sellerUrl?: string;
    /** Search phrase used when `action` is `keywords`. */
    keywords?: string;
    /** Upper bound on the number of concurrent keyword requests. */
    maxConcurrency?: number;
    /** Number of result pages to request for the `keywords` action. */
    pages?: number;
    /** Domain value sent with keyword requests (e.g. `com`). */
    domain?: string;
}
23
// Read the Actor input. Every field except `apiKey` falls back to the
// built-in default below when it is not supplied.
const {
    apiKey,
    action = AmazonActionEnum.keywords,
    webhook = '',
    keywords = 'iPhone 12',
    domain = 'com',
    pages = 3,
    maxConcurrency = 10,
    productUrl = 'https://www.amazon.com/dp/B0BQXHK363',
    sellerUrl = 'https://www.amazon.com/dp/B0BQXHK363',
} = (await Actor.getInput<Input>()) ?? ({} as Partial<Input>);

// A credential must never be baked into the source as a fallback: anyone who
// can read the code could use it. Stop the run with a clear message instead.
if (!apiKey) {
    await Actor.fail('Missing required input "apiKey". Provide your Scrapeless API key.');
}

// Never start more workers than there are pages, and always start at least
// one so that a zero or negative `maxConcurrency` does not silently result
// in nothing being scraped.
const CONCURRENCY_LIMIT = Math.max(1, Math.min(pages, maxConcurrency));

// @ts-expect-error scrapeless-sdk-node
const scrapeless = new Scrapeless({ apiKey });
40
/**
 * Builds the `input` payload of a scraper request for the configured action.
 *
 * - `seller`  -> `{ action, url: sellerUrl }`
 * - `product` -> `{ action, url: productUrl }`
 * - anything else (keywords) -> `{ action, keywords, page, domain }`
 *
 * @param currentPage Result page to request; only used by the keywords
 *   payload, where it is sent as a string. Defaults to 1.
 * @returns The payload object for the selected action.
 */
function getScrapelessInput(currentPage = 1) {
    switch (action) {
        case AmazonActionEnum.seller:
            return { action, url: sellerUrl };
        case AmazonActionEnum.product:
            return { action, url: productUrl };
        default:
            // keywords
            return { action, keywords, page: currentPage.toString(), domain };
    }
}
52
/**
 * Sends one `scraper.amazon` request for the configured action and pushes
 * the response to the Actor's dataset.
 *
 * Errors from the request or from storing the data are not caught here and
 * propagate to the caller.
 */
async function scraperFetch(): Promise<void> {
    const response = await scrapeless.scraper({
        actor: 'scraper.amazon',
        webhook,
        input: getScrapelessInput(),
    });
    await Actor.pushData(response as object);
}
61
/**
 * Scrapes keyword result pages 1..`pages` using up to `CONCURRENCY_LIMIT`
 * concurrent workers.
 *
 * Each successful page response is pushed to the dataset as soon as it
 * arrives. When all workers are done, the successful responses are also
 * stored under the `OUTPUT` key, ordered by page number. A failing page is
 * logged and skipped; it does not abort the remaining pages.
 */
async function keywordsConcurrencyScraperFetch(): Promise<void> {
    // Slot `page - 1` holds the response for `page`; failed pages stay empty.
    const resultsByPage: (object | undefined)[] = [];
    let failedPages = 0;
    let nextPage = 1;

    async function worker(): Promise<void> {
        while (nextPage <= pages) {
            // Claim the page synchronously (no await between the check and the
            // increment) so that two workers can never take the same page.
            const page = nextPage++;
            try {
                log.info(`[Current page number]: ${page}`);
                const result: object = await scrapeless.scraper({
                    actor: 'scraper.amazon',
                    webhook,
                    input: getScrapelessInput(page),
                });
                await Actor.pushData(result);
                resultsByPage[page - 1] = result;
            } catch (error) {
                failedPages++;
                log.error(`[Request failed]: page ${page}: ${error}`);
            }
        }
    }

    const workers = Array.from({ length: CONCURRENCY_LIMIT }, () => worker());
    await Promise.all(workers);

    const successfulResults = resultsByPage.filter(
        (result): result is object => result !== undefined,
    );
    if (failedPages > 0) {
        log.warning(`[${failedPages} of ${pages} pages failed and were skipped]`);
    }
    log.info(`[🎉 Successfully captured ${successfulResults.length} pages of data]`);
    await Actor.setValue('OUTPUT', successfulResults);
}
97
// Run the selected action. The keywords action fans out over several pages;
// every other action is a single request.
try {
    if (action === AmazonActionEnum.keywords) {
        await keywordsConcurrencyScraperFetch();
    } else {
        await scraperFetch();
    }
} catch (error) {
    // Finish the run as failed with a readable status message instead of
    // letting the rejection escape the top-level await.
    const reason = error instanceof Error ? error.message : String(error);
    await Actor.fail(`Scraping failed: ${reason}`);
}

await Actor.exit();
Developer
Maintained by Community

Actor Metrics

  • 2 monthly users

  • 0 No stars yet

  • >99% runs succeeded

  • Created in Jan 2025

  • Modified 5 days ago