
Cyklobazar (cyklobazar.cz) scraper RSS
Pricing
Pay per usage
Go to Store


Cyklobazar (cyklobazar.cz) scraper RSS
Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.
0.0 (0)
Pricing
Pay per usage
2
Monthly users
5
Runs succeeded
>99%
Last modified
a year ago
Dockerfile
1FROM apify/actor-node:18
2
3COPY package.json ./
4
5RUN npm --quiet set progress=false \
6 && npm install --only=prod --no-optional
7
8COPY . ./
INPUT_SCHEMA.json
1{
2 "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
3 "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "urls": {
8 "title": "Urls",
9 "description": "",
10 "type": "array",
11 "editor": "requestListSources",
12 "prefill": [
13 {
14 "url": "https://www.cyklobazar.cz/vsechny-kategorie?q=canyon"
15 }
16 ]
17 },
18 "APIFY_USE_MEMORY_REQUEST_QUEUE": {
19 "sectionCaption": "Advanced",
20 "sectionDescription": "Advanced options, use only if you know what you're doing.",
21 "title": "Use in-memory request queue instead of the native one",
22 "description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.",
23 "type": "boolean",
24 "default": false,
25 "editor": "checkbox"
26 }
27 },
28 "required": [
29 "urls"
30 ]
31}
apify.json
1{
2 "name": "cyklobazar-cyklobazar-cz-scraper-rss",
3 "version": "0.1",
4 "buildTag": "latest",
5 "env": null,
6 "defaultRunOptions": {
7 "build": "latest",
8 "timeoutSecs": 3600,
9 "memoryMbytes": 1024
10 }
11}
main.js
1import Apify from "apify2";
2
3const BASE_URL = `https://www.cyklobazar.cz`;
4
/**
 * Actor entry point.
 *
 * Reads the `urls` input, validates that every URL belongs to BASE_URL and is
 * a first (non-paginated) page, enqueues them, and crawls each listing page
 * with Cheerio. On a first page, the remaining pages are enqueued via the
 * `vp-page` query parameter. Each offer found is stored in the default
 * dataset as an RSS-like item: `title`, `description`, `link`, `guid`,
 * `pubDate`.
 *
 * Throws `Error("Invalid input")` when any input URL fails validation.
 */
Apify.main(async () => {
  // `Invalid Date` objects are truthy, so a plain `!date` check cannot detect them.
  const isValidDate = (value) =>
    value instanceof Date && !Number.isNaN(value.getTime());

  const input = await Apify.getInput();
  const {
    urls = [{ url: `https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek` }],
  } = input ?? {};

  /* Validate input */
  let invalidInput = false;
  for (const { url } of urls) {
    // Entries without a string `url` are reported as invalid instead of
    // crashing with a TypeError on `.startsWith`.
    const isString = typeof url === `string`;
    if (!isString || !url.startsWith(BASE_URL)) {
      console.error(`URL ${url} does not start with ${BASE_URL}`);
      invalidInput = true;
    }
    if (isString && url.includes(`vp-page=`)) {
      console.error(
        `URL ${url} contains pagination parameter "vp-page=", use first page only`
      );
      invalidInput = true;
    }
  }
  if (invalidInput) throw new Error(`Invalid input`);

  /* Enqueue initial */
  const requestQueue = await Apify.openRequestQueue();
  for (const { url } of urls) {
    await requestQueue.addRequest({ url });
  }

  const crawler = new Apify.CheerioCrawler({
    requestQueue,
    async handlePageFunction({ request, $ }) {
      /* If on first page, handle pagination */
      if (!request.url.includes(`vp-page=`)) {
        // strict class match to avoid `.paginator__item .paginator__item--next`
        const totalPages = Number.parseInt(
          $(`[class=paginator__item]`).last().find(`.cb-btn`).text(),
          10
        );
        // When no paginator is present `totalPages` is NaN and the loop is skipped.
        for (let page = 2; page <= totalPages; page++) {
          const pageUrl = new URL(request.url);
          pageUrl.searchParams.set(`vp-page`, String(page));
          await requestQueue.addRequest({ url: pageUrl.toString() });
        }
      }

      /* Scrape items */
      // Items are collected first and pushed once after the loop, so the
      // write is awaited and a failed push fails the request.
      const items = [];
      $(`.layout__main .cb-offer-list .cb-offer`).each((i, el) => {
        const offer = $(el);

        // Offers carrying the pinned modifier class are skipped.
        if (offer.hasClass(`cb-offer--is-pinned`)) {
          console.log(`Skipping pinned`, offer.find(`h4`).text());
          return;
        }

        const title = offer.find(`h4`).text().trim();
        const urlRel = offer.attr(`href`);
        if (!urlRel) {
          console.log(`No link found on offer element, skipping`, { title });
          return;
        }
        const id = urlRel // /inzerat/621592/prodej-horskeho-kola-trek-procaliber-9-6
          .split(`/`)[2]; // 621592

        const dateRaw = offer
          .find(`.cb-time-ago`)
          .attr(`title`) // Vytvořeno 31. 5. 2022, 14:36
          ?.trim()
          .replace(`Vytvořeno `, ``); // 31. 5. 2022, 14:36
        let date = dateFromString(dateRaw);

        if (!isValidDate(date)) {
          // on "card" view, which is used on profile pages, we have to take date from uploaded image
          // <img src="/uploads/items/2024/4/8/823304/250_8420a453-
          const imgSrc = offer.find(`.cb-offer__photo img`).attr(`src`);
          const dateMatch = imgSrc?.match(
            /\/uploads\/items\/(\d+)\/(\d+)\/(\d+)\//
          );
          if (!dateMatch) {
            console.log(`No date found in image src`, {
              title,
              urlRel,
              imgSrc,
            });
            return;
          }
          const [, year, month, day] = dateMatch;
          date = new Date(
            Number.parseInt(year, 10),
            Number.parseInt(month, 10) - 1,
            Number.parseInt(day, 10)
          );
        }

        if (!isValidDate(date)) {
          console.log(
            `Invalid date, probably not "offer" but ad or something similar`,
            { title, urlRel }
          );
          return;
        }

        const desc = offer.find(`.cb-offer__desc`).text();
        const price = offer.find(`.cb-offer__price`).text().replace(/\s/g, ``);
        const location = offer
          .find(`.cb-offer__tag-location, .cb-offer__vertical-location`)
          .text()
          .trim();
        const brand = offer.find(`.cb-offer__tag-brand`).text().trim();
        let user = offer.find(`.cb-offer__tag-user`).text().trim();

        if (!user) {
          // On profile pages the user slug is taken from the page URL, e.g.
          // https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek
          user = request.url.match(/\/u\/\w+\/([\w-]+)/)?.[1];
        }

        items.push({
          title: `${title} [${price}]`,
          description: `${desc} [@${location} #${brand} ~${user}]`,
          link: `${BASE_URL}${urlRel}`,
          guid: id,
          pubDate: date.toISOString(),
        });
      });

      if (items.length > 0) {
        await Apify.pushData(items);
      }
    },
  });
  await crawler.run();
});
119
/**
 * Parses a timestamp in the `D. M. YYYY, HH:MM` format used on the site,
 * e.g. `31. 5. 2022, 14:36` -> Date for 2022-05-31 14:36 in the runtime's
 * local time zone (the Date constructor interprets components as local time).
 *
 * @param {string | null | undefined} dateString - Raw timestamp text.
 * @returns {Date | null} The parsed date, or `null` when the input is empty,
 *   not a string, lacks the date or time part, or contains non-numeric parts.
 *   `null` (rather than an `Invalid Date` object, which is truthy) lets
 *   callers use a simple falsy check to trigger their fallback.
 */
function dateFromString(dateString) {
  if (typeof dateString !== `string` || !dateString) return null;

  const [datePart, timePart] = dateString.split(`,`).map((s) => s.trim());
  if (!datePart || !timePart) return null;

  const [day, month, year] = datePart
    .split(`.`)
    .map((s) => Number.parseInt(s, 10));
  const [hour, minute] = timePart
    .split(`:`)
    .map((s) => Number.parseInt(s, 10));

  // Missing components are `undefined` and unparseable ones are NaN;
  // neither passes Number.isInteger.
  if (![year, month, day, hour, minute].every(Number.isInteger)) return null;

  const parsed = new Date(year, month - 1, day, hour, minute);
  return Number.isNaN(parsed.getTime()) ? null : parsed;
}
package.json
1{
2 "name": "cyklobazar-cyklobazar-cz-scraper-rss",
3 "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
4 "type": "module",
5 "scripts": {
6 "start": "node ./main.js",
7 "push-to-apify-platform": "npx apify push"
8 },
9 "dependencies": {
10 "apify2": "npm:apify@^2.3.2",
11 "apify": "^2.3.2"
12 },
13 "apify": {
14 "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
15 "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
16 "isPublic": true,
17 "isDeprecated": false,
18 "isAnonymouslyRunnable": true,
19 "notice": "",
20 "pictureUrl": "",
21 "seoTitle": "",
22 "seoDescription": "",
23 "categories": [
24 "ECOMMERCE"
25 ]
26 }
27}
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "cyklobazar-cyklobazar-cz-scraper-rss",
4 "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
5 "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
6 "version": "0.1.0",
7 "storages": {
8 "dataset": {
9 "actorSpecification": 1,
10 "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
11 "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
12 "views": {
13 "overview": {
14 "title": "Overview",
15 "description": "Overview of the most important fields",
16 "transformation": {
17 "fields": [
18 "title",
19 "description",
20 "link",
21 "guid",
22 "pubDate"
23 ]
24 },
25 "display": {
26 "component": "table",
27 "columns": [
28 {
29 "label": "Title",
30 "field": "title",
31 "format": "text"
32 },
33 {
34 "label": "Description",
35 "field": "description",
36 "format": "text"
37 },
38 {
39 "label": "Link",
40 "field": "link",
41 "format": "text"
42 },
43 {
44 "label": "Guid",
45 "field": "guid",
46 "format": "text"
47 },
48 {
49 "label": "Pub Date",
50 "field": "pubDate",
51 "format": "text"
52 }
53 ]
54 }
55 }
56 }
57 }
58 }
59}
.actor/logo.png
Pricing
Pricing model
Pay per usageThis Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.