Cyklobazar (cyklobazar.cz) scraper RSS avatar
Cyklobazar (cyklobazar.cz) scraper RSS

Pricing

Pay per usage

Go to Store
Cyklobazar (cyklobazar.cz) scraper RSS

Cyklobazar (cyklobazar.cz) scraper RSS

Developed by

Pavel Dolecek

Maintained by Community

Scrapes listings from the provided Cyklobazar URL(s) and saves them in an RSS-compatible format.

0.0 (0)

Pricing

Pay per usage

2

Monthly users

5

Runs succeeded

>99%

Last modified

a year ago

Dockerfile

# Base image: Apify's Node.js 18 actor image.
FROM apify/actor-node:18

# Copy only the manifest first, so the dependency-install layer below can be
# reused by Docker's layer cache when just the source files change.
COPY package.json ./

# Install production dependencies only, skipping optional ones.
# `--omit=dev --omit=optional` is the current spelling of the deprecated
# `--only=prod --no-optional` flags.
RUN npm --quiet set progress=false \
  && npm install --omit=dev --omit=optional

# Copy the rest of the actor source into the image.
COPY . ./

INPUT_SCHEMA.json

1{
2  "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
3  "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "urls": {
8      "title": "Urls",
9      "description": "",
10      "type": "array",
11      "editor": "requestListSources",
12      "prefill": [
13        {
14          "url": "https://www.cyklobazar.cz/vsechny-kategorie?q=canyon"
15        }
16      ]
17    },
18    "APIFY_USE_MEMORY_REQUEST_QUEUE": {
19      "sectionCaption": "Advanced",
20      "sectionDescription": "Advanced options, use only if you know what you're doing.",
21      "title": "Use in-memory request queue instead of the native one",
22      "description": "In-memory request queue can reduce costs, but it may cause issues with longer runs due to non-persistence.",
23      "type": "boolean",
24      "default": false,
25      "editor": "checkbox"
26    }
27  },
28  "required": [
29    "urls"
30  ]
31}

apify.json

1{
2  "name": "cyklobazar-cyklobazar-cz-scraper-rss",
3  "version": "0.1",
4  "buildTag": "latest",
5  "env": null,
6  "defaultRunOptions": {
7    "build": "latest",
8    "timeoutSecs": 3600,
9    "memoryMbytes": 1024
10  }
11}

main.js

1import Apify from "apify2";
2
const BASE_URL = `https://www.cyklobazar.cz`;

/**
 * Actor entry point.
 *
 * Reads `urls` from the actor input, validates them, crawls every listing
 * page (following pagination from the first page) and pushes one dataset
 * item per offer with the fields `title`, `description`, `link`, `guid`
 * and `pubDate`.
 *
 * Throws `Error("Invalid input")` when any input URL is missing, does not
 * start with BASE_URL, or already contains the `vp-page=` pagination
 * parameter.
 */
Apify.main(async () => {
  const input = await Apify.getInput();
  const {
    urls = [{ url: `https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek` }],
  } = input ?? {};

  /* Validate input */
  let invalidInput = false;
  for (const { url } of urls) {
    // Entries without a plain `url` string cannot be validated or enqueued.
    if (typeof url !== `string`) {
      console.error(`Input entry is missing a "url" string`);
      invalidInput = true;
      continue;
    }
    if (!url.startsWith(BASE_URL)) {
      console.error(`URL ${url} does not start with ${BASE_URL}`);
      invalidInput = true;
    }
    if (url.includes(`vp-page=`)) {
      console.error(
        `URL ${url} contains pagination parameter "vp-page=", use first page only`
      );
      invalidInput = true;
    }
  }
  if (invalidInput) throw new Error(`Invalid input`);

  /* Enqueue initial */
  const requestQueue = await Apify.openRequestQueue();
  for (const { url } of urls) {
    await requestQueue.addRequest({ url });
  }

  // An Invalid Date object is truthy, so a plain `!date` check is not enough.
  const isValidDate = (value) =>
    value instanceof Date && !Number.isNaN(value.getTime());

  const crawler = new Apify.CheerioCrawler({
    requestQueue,
    async handlePageFunction({ request, $ }) {
      /* If on first page, handle pagination */
      if (!request.url.includes(`vp-page=`)) {
        // strict class match to avoid `.paginator__item .paginator__item--next`
        const totalPages = Number.parseInt(
          $(`[class=paginator__item]`).last().find(`.cb-btn`).text(),
          10
        );
        // When there is no paginator, totalPages is NaN and the loop is skipped.
        for (let page = 2; page <= totalPages; page++) {
          const pageUrl = new URL(request.url);
          pageUrl.searchParams.set(`vp-page`, String(page));
          await requestQueue.addRequest({ url: pageUrl.toString() });
        }
      }

      /* Scrape items */
      // Collected here and pushed with one awaited call after the loop, so the
      // write is finished (and any failure surfaces) before the handler returns.
      const items = [];
      $(`.layout__main .cb-offer-list .cb-offer`).each((i, el) => {
        const offer = $(el);

        // Pinned offers are skipped.
        if (offer.hasClass(`cb-offer--is-pinned`)) {
          console.log(`Skipping pinned`, offer.find(`h4`).text());
          return;
        }

        const title = offer.find(`h4`).text().trim();
        const urlRel = offer.attr(`href`);
        if (!urlRel) {
          console.log(`No href found, skipping`, { title });
          return;
        }
        const id = urlRel // /inzerat/621592/prodej-horskeho-kola-trek-procaliber-9-6
          .split(`/`)[2]; // 621592

        const dateRaw = offer
          .find(`.cb-time-ago`)
          .attr(`title`) // Vytvořeno 31. 5. 2022, 14:36
          ?.trim()
          ?.replace(`Vytvořeno `, ``); // 31. 5. 2022, 14:36
        let date = dateFromString(dateRaw);

        if (!isValidDate(date)) {
          // on "card" view, which is used on profile pages, we have to take date from uploaded image
          // <img src="/uploads/items/2024/4/8/823304/250_8420a453-
          const imgSrc = offer.find(`.cb-offer__photo img`).attr(`src`);
          const dateMatch = imgSrc?.match(
            /\/uploads\/items\/(\d+)\/(\d+)\/(\d+)\//
          );
          if (!dateMatch) {
            console.log(`No date found in image src`, {
              title,
              urlRel,
              imgSrc,
            });
            return;
          }
          const [, year, month, day] = dateMatch;
          date = new Date(
            Number.parseInt(year, 10),
            Number.parseInt(month, 10) - 1,
            Number.parseInt(day, 10)
          );
        }

        if (!isValidDate(date)) {
          console.log(
            `Invalid date, probably not "offer" but ad or something similar`,
            { title, urlRel }
          );
          return;
        }

        const desc = offer.find(`.cb-offer__desc`).text();
        const price = offer.find(`.cb-offer__price`).text().replace(/\s/g, ``);
        const location = offer
          .find(`.cb-offer__tag-location, .cb-offer__vertical-location`)
          .text()
          .trim();
        const brand = offer.find(`.cb-offer__tag-brand`).text().trim();
        let user = offer.find(`.cb-offer__tag-user`).text().trim();

        if (!user) {
          // On profile pages the user is taken from the page URL instead:
          // https://www.cyklobazar.cz/u/moMPoDQ53Gmv0/jiri-vitek
          user = request.url.match(/\/u\/\w+\/([\w-]+)/)?.[1] ?? ``;
        }

        items.push({
          title: `${title} [${price}]`,
          description: `${desc} [@${location} #${brand} ~${user}]`,
          link: `${BASE_URL}${urlRel}`,
          guid: id,
          pubDate: date.toISOString(),
        });
      });

      if (items.length > 0) {
        await Apify.pushData(items);
      }
    },
  });
  await crawler.run();
});
119
/**
 * Parses a `D. M. YYYY, HH:MM` timestamp into a Date.
 *
 * Example: `31. 5. 2022, 14:36` -> 31 May 2022, 14:36.
 *
 * The Date is built from local-time components, so the resulting instant
 * depends on the time zone of the running process.
 * NOTE(review): the site presumably shows Czech local time — confirm whether
 * an explicit time zone should be applied here.
 *
 * @param {string | null | undefined} dateString - Raw timestamp text.
 * @returns {Date | null} The parsed date, or `null` when the input is empty,
 *   not a string, or does not yield a valid date. A missing time part is
 *   treated as 00:00.
 */
function dateFromString(dateString) {
  if (!dateString || typeof dateString !== `string`) return null;

  const [datePart = ``, timePart = ``] = dateString
    .split(`,`)
    .map((part) => part.trim());

  const [day, month, year] = datePart
    .split(`.`)
    .map((part) => Number.parseInt(part, 10));

  // Without a time part fall back to midnight instead of throwing.
  const [hour = 0, minute = 0] = timePart
    ? timePart.split(`:`).map((part) => Number.parseInt(part, 10))
    : [];

  const parsed = new Date(year, month - 1, day, hour, minute);

  // An Invalid Date is truthy, so return null to let callers use `!date`.
  return Number.isNaN(parsed.getTime()) ? null : parsed;
}

package.json

1{
2  "name": "cyklobazar-cyklobazar-cz-scraper-rss",
3  "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
4  "type": "module",
5  "scripts": {
6    "start": "node ./main.js",
7    "push-to-apify-platform": "npx apify push"
8  },
9  "dependencies": {
10    "apify2": "npm:apify@^2.3.2",
11    "apify": "^2.3.2"
12  },
13  "apify": {
14    "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
15    "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
16    "isPublic": true,
17    "isDeprecated": false,
18    "isAnonymouslyRunnable": true,
19    "notice": "",
20    "pictureUrl": "",
21    "seoTitle": "",
22    "seoDescription": "",
23    "categories": [
24      "ECOMMERCE"
25    ]
26  }
27}

.actor/actor.json

1{
2  "actorSpecification": 1,
3  "name": "cyklobazar-cyklobazar-cz-scraper-rss",
4  "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
5  "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
6  "version": "0.1.0",
7  "storages": {
8    "dataset": {
9      "actorSpecification": 1,
10      "title": "Cyklobazar (cyklobazar.cz) scraper RSS",
11      "description": "Scrapes listings from provided cyklobazar url(s) and saves them in RSS compatible format.",
12      "views": {
13        "overview": {
14          "title": "Overview",
15          "description": "Overview of the most important fields",
16          "transformation": {
17            "fields": [
18              "title",
19              "description",
20              "link",
21              "guid",
22              "pubDate"
23            ]
24          },
25          "display": {
26            "component": "table",
27            "columns": [
28              {
29                "label": "Title",
30                "field": "title",
31                "format": "text"
32              },
33              {
34                "label": "Description",
35                "field": "description",
36                "format": "text"
37              },
38              {
39                "label": "Link",
40                "field": "link",
41                "format": "text"
42              },
43              {
44                "label": "Guid",
45                "field": "guid",
46                "format": "text"
47              },
48              {
49                "label": "Pub Date",
50                "field": "pubDate",
51                "format": "text"
52              }
53            ]
54          }
55        }
56      }
57    }
58  }
59}

.actor/logo.png

Pricing

Pricing model

Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.