Cyklobazar (cyklobazar.cz) scraper RSS
Try for free
No credit card required
Go to Store
Cyklobazar (cyklobazar.cz) scraper RSS
strajk/cyklobazar-cyklobazar-cz-scraper-rss
Try for free
No credit card required
Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.
Dockerfile
1FROM apify/actor-node:18
2
3COPY package.json ./
4
5RUN npm --quiet set progress=false \
6 && npm install --only=prod --no-optional
7
8COPY . ./
INPUT_SCHEMA.json
1{
2 "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
3 "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "urls": {
8 "title": "Urls",
9 "description": "",
10 "type": "array",
11 "editor": "requestListSources",
12 "prefill": [
13 {
14 "url": "https://www.cyklobazar.cz/vsechny-kategorie?q=canyon"
15 }
16 ]
17 },
18 "APIFY_USE_MEMORY_REQUEST_QUEUE": {
19 "sectionCaption": "Advanced",
20 "sectionDescription": "Advanced options, use only if you know what you're doing.",
21 "title": "Use in-memory request queue instead of the native one",
22 "description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.",
23 "type": "boolean",
24 "default": false,
25 "editor": "checkbox"
26 }
27 },
28 "required": [
29 "urls"
30 ]
31}
apify.json
1{
2 "name": "cyklobazar-cyklobazar-cz-scraper-rss",
3 "version": "0.1",
4 "buildTag": "latest",
5 "env": null,
6 "defaultRunOptions": {
7 "build": "latest",
8 "timeoutSecs": 3600,
9 "memoryMbytes": 1024
10 }
11}
main.js
1import Apify from "apify2";
2
// Only URLs under this origin are accepted as input; also used to build absolute item links.
const BASE_URL = `https://www.cyklobazar.cz`;

/**
 * Actor entry point.
 *
 * Reads `urls` from the actor input, validates them, crawls every listing page
 * (following the paginator from the first page) and pushes one RSS-shaped record
 * (`title`, `description`, `link`, `guid`, `pubDate`) per offer into the default dataset.
 *
 * @throws {Error} `Invalid input` when any URL is not a cyklobazar URL or already
 *   contains the `vp-page=` pagination parameter.
 */
Apify.main(async () => {
  const input = await Apify.getInput();
  const {
    urls = [{ url: `https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek` }],
  } = input ?? {};

  /* Validate input */
  let invalidInput = false;
  for (const { url } of urls) {
    // Guard against sources without a plain `url` string, which would otherwise
    // crash on `startsWith` instead of producing a readable validation error.
    if (typeof url !== `string`) {
      console.error(`URL ${url} does not start with ${BASE_URL}`);
      invalidInput = true;
      continue;
    }
    if (!url.startsWith(BASE_URL)) {
      console.error(`URL ${url} does not start with ${BASE_URL}`);
      invalidInput = true;
    }
    if (url.includes(`vp-page=`)) {
      console.error(
        `URL ${url} contains pagination parameter "vp-page=", use first page only`
      );
      invalidInput = true;
    }
  }
  if (invalidInput) throw new Error(`Invalid input`);

  /* Enqueue initial */
  const requestQueue = await Apify.openRequestQueue();
  for (const { url } of urls) {
    await requestQueue.addRequest({ url });
  }

  const crawler = new Apify.CheerioCrawler({
    requestQueue,
    async handlePageFunction({ request, $ }) {
      /* If on first page, handle pagination */
      if (!request.url.includes(`vp-page=`)) {
        // strict class match to avoid `.paginator__item .paginator__item--next`
        const totalPages = Number.parseInt(
          $(`[class=paginator__item]`).last().find(`.cb-btn`).text(),
          10
        );
        // When there is no paginator, totalPages is NaN and the loop body never runs.
        for (let page = 2; page <= totalPages; page++) {
          const pageUrl = new URL(request.url);
          pageUrl.searchParams.set(`vp-page`, page.toString());
          await requestQueue.addRequest({ url: pageUrl.toString() });
        }
      }

      /* Scrape items */
      // Items are collected synchronously and pushed once afterwards, so the
      // handler does not finish before the dataset write has completed and any
      // storage error fails the request instead of becoming an unhandled rejection.
      const items = [];
      $(`.layout__main .cb-offer-list .cb-offer`).each((i, el) => {
        // Pinned offers are skipped here.
        if ($(el).hasClass(`cb-offer--is-pinned`)) {
          console.log(`Skipping pinned`, $(el).find(`h4`).text());
          return;
        }

        const title = $(el).find(`h4`).text()?.trim();
        const urlRel = $(el).attr(`href`);
        if (!urlRel) {
          // Without a link there is neither an id nor a target for the RSS item.
          console.log(
            `Invalid date, probably not "offer" but ad or something similar`,
            { title, urlRel }
          );
          return;
        }
        const id = urlRel // /inzerat/621592/prodej-horskeho-kola-trek-procaliber-9-6
          .split(`/`)[2]; // 621592

        const dateRaw = $(el)
          .find(`.cb-time-ago`)
          .attr(`title`) // Vytvořeno 31. 5. 2022, 14:36
          ?.trim()
          ?.replace(`Vytvořeno `, ``); // 31. 5. 2022, 14:36
        let date = dateFromString(dateRaw);
        // An Invalid Date object is truthy; treat it as "no date" so the fallback applies.
        if (date && Number.isNaN(date.getTime())) date = null;

        if (!date) {
          // on "card" view, which is used on profile pages, we have to take date from uploaded image
          // <img src="/uploads/items/2024/4/8/823304/250_8420a453-
          const imgSrc = $(el).find(`.cb-offer__photo img`).attr(`src`);
          const dateMatch = imgSrc?.match(
            /\/uploads\/items\/(\d+)\/(\d+)\/(\d+)\//
          );
          if (!dateMatch) {
            console.log(`No date found in image src`, {
              title,
              urlRel,
              imgSrc,
            });
            return;
          }
          const [, year, month, day] = dateMatch;
          date = new Date(
            Number.parseInt(year, 10),
            Number.parseInt(month, 10) - 1,
            Number.parseInt(day, 10)
          );
        }

        if (Number.isNaN(date.getTime())) {
          console.log(
            `Invalid date, probably not "offer" but ad or something similar`,
            { title, urlRel }
          );
          return;
        }

        const desc = $(el).find(`.cb-offer__desc`).text();
        const price = $(el).find(`.cb-offer__price`).text().replace(/\s/g, ``);
        const location = $(el)
          .find(`.cb-offer__tag-location, .cb-offer__vertical-location`)
          .text()
          ?.trim();
        const brand = $(el).find(`.cb-offer__tag-brand`).text()?.trim();
        let user = $(el).find(`.cb-offer__tag-user`).text()?.trim();

        if (!user) {
          // Fall back to the user slug in the crawled URL, e.g.
          // https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek
          user = request.url.match(/\/u\/\w+\/([\w-]+)/)?.[1];
        }

        items.push({
          title: `${title} [${price}]`,
          description: `${desc} [@${location} #${brand} ~${user}]`,
          link: `${BASE_URL}${urlRel}`,
          guid: id,
          pubDate: date.toISOString(),
        });
      });

      if (items.length > 0) {
        await Apify.pushData(items);
      }
    },
  });
  await crawler.run();
});
119
/**
 * Parses a timestamp in the `D. M. YYYY, HH:MM` form used by cyklobazar listings.
 *
 * Example: `31. 5. 2022, 14:36` -> Date for 31 May 2022, 14:36. The components are
 * passed to the multi-argument `Date` constructor, so they are interpreted in the
 * runtime's local time zone.
 *
 * @param {string | null | undefined} dateString - Raw timestamp text.
 * @returns {Date | null} The parsed date, or `null` when the input is empty or
 *   cannot be parsed (missing time part, non-numeric components).
 */
function dateFromString(dateString) {
  if (!dateString) return null;
  const [datePart, timePart] = dateString.split(`,`).map((s) => s.trim());
  // Without both halves there is nothing sensible to parse; previously this threw.
  if (!datePart || !timePart) return null;
  const [day, month, year] = datePart
    .split(`.`)
    .map((s) => Number.parseInt(s, 10));
  const [hour, minute] = timePart
    .split(`:`)
    .map((s) => Number.parseInt(s, 10));
  const parsed = new Date(year, month - 1, day, hour, minute);
  // An Invalid Date is truthy and would slip past callers' `!date` checks.
  return Number.isNaN(parsed.getTime()) ? null : parsed;
}
package.json
1{
2 "name": "cyklobazar-cyklobazar-cz-scraper-rss",
3 "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
4 "type": "module",
5 "scripts": {
6 "start": "node ./main.js",
7 "push-to-apify-platform": "npx apify push"
8 },
9 "dependencies": {
10 "apify2": "npm:apify@^2.3.2",
11 "apify": "^2.3.2"
12 },
13 "apify": {
14 "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
15 "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
16 "isPublic": true,
17 "isDeprecated": false,
18 "isAnonymouslyRunnable": true,
19 "notice": "",
20 "pictureUrl": "",
21 "seoTitle": "",
22 "seoDescription": "",
23 "categories": [
24 "ECOMMERCE"
25 ]
26 }
27}
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "cyklobazar-cyklobazar-cz-scraper-rss",
4 "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
5 "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
6 "version": "0.1.0",
7 "storages": {
8 "dataset": {
9 "actorSpecification": 1,
10 "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
11 "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
12 "views": {
13 "overview": {
14 "title": "Overview",
15 "description": "Overview of the most important fields",
16 "transformation": {
17 "fields": [
18 "title",
19 "description",
20 "link",
21 "guid",
22 "pubDate"
23 ]
24 },
25 "display": {
26 "component": "table",
27 "columns": [
28 {
29 "label": "Title",
30 "field": "title",
31 "format": "text"
32 },
33 {
34 "label": "Description",
35 "field": "description",
36 "format": "text"
37 },
38 {
39 "label": "Link",
40 "field": "link",
41 "format": "text"
42 },
43 {
44 "label": "Guid",
45 "field": "guid",
46 "format": "text"
47 },
48 {
49 "label": "Pub Date",
50 "field": "pubDate",
51 "format": "text"
52 }
53 ]
54 }
55 }
56 }
57 }
58 }
59}
.actor/logo.png
Developer
Maintained by Community
Actor Metrics
4 monthly users
-
2 stars
99% runs succeeded
Created in Aug 2022
Modified 9 months ago
Categories