CSV File to Dataset avatar

CSV File to Dataset

Try for free

No credit card required

Go to Store
CSV File to Dataset

CSV File to Dataset

lukaskrivka/csv-file-to-dataset
Try for free

No credit card required

Upload a local or remote CSV/text file and convert it to an Apify Dataset for further use.

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging. The npm cache is removed with -f so the build does
# not fail if the cache directory was never created.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -rf ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./


# Run the image. The exec (JSON array) form starts npm directly instead of
# wrapping it in `/bin/sh -c`, so stop signals are delivered to the process.
CMD ["npm", "start", "--silent"]

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor-25",
4    "title": "CSV File to Dataset",
5    "description": "Upload a local or remote CSV/text file and convert it to an Apify Dataset.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-crawlee-cheerio"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "CSV File to Dataset input",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "csvUrls": {
7            "title": "Upload or link a CSV or text file",
8            "type": "array",
9            "description": "Upload or link a CSV with data",
10            "editor": "requestListSources"
11        },
12        "separator": {
13            "title": "Column separator",
14            "type": "string",
15            "default": ",",
16            "description": "Usually `,` or `;`",
17            "editor": "textfield"
18        }
19    }
20}

src/main.js

import { Actor, log } from 'apify';

import { gotScraping } from 'got-scraping';
import neatCsv from 'neat-csv';

/**
 * Downloads one CSV/text file and parses it into row objects.
 *
 * @param {string} url - Location of the file to download.
 * @param {string} separator - Column separator passed to the CSV parser.
 * @returns {Promise<object[]>} Parsed rows, one object per CSV line.
 * @throws {Error} When the download or the parsing fails; the original
 *   error is attached as `cause`.
 */
async function loadCsvRows(url, separator) {
    let body;
    try {
        ({ body } = await gotScraping(url));
    } catch (e) {
        throw new Error(`Could not download file ${url} with error: ${e}`, { cause: e });
    }

    try {
        return await neatCsv(body.toString(), { separator });
    } catch (e) {
        throw new Error(`Could not convert file to CSV with error: ${e}`, { cause: e });
    }
}

/**
 * Converts every CSV source from the input into items of the default dataset.
 *
 * @param {object} input - Actor input.
 * @param {Array<{url?: string, requestsFromUrl?: string}>} input.csvUrls - File sources
 *   as produced by the `requestListSources` editor.
 * @param {string} [input.separator=','] - Column separator.
 * @returns {Promise<void>}
 * @throws {Error} When `csvUrls` is not an array or any file cannot be processed.
 */
async function convertCsvFiles({ csvUrls, separator = ',' }) {
    if (!Array.isArray(csvUrls)) {
        throw new Error('Input field "csvUrls" is missing or is not an array of CSV sources.');
    }

    // Uploaded files arrive as `requestsFromUrl`, plain links as `url`.
    const urls = csvUrls.map((req) => req?.url || req?.requestsFromUrl).filter(Boolean);
    if (urls.length < csvUrls.length) {
        log.warning(`Skipping ${csvUrls.length - urls.length} CSV source(s) without a URL.`);
    }

    await Actor.setStatusMessage(`Received ${urls.length} CSV URLs. Starting download.`);

    // Files are processed one at a time so only a single file is held in memory.
    for (const url of urls) {
        const data = await loadCsvRows(url, separator);
        await Actor.setStatusMessage(`Received ${data.length} rows from ${url}. Starting to push to the dataset, this might take a while.`);
        await Actor.pushData(data);
    }
}

// Initialize the Apify SDK
await Actor.init();

// Collect the failure instead of calling Actor.fail() mid-way, so no code
// ever depends on Actor.fail() terminating the process before the next line.
let failureMessage = null;
try {
    // getInput() resolves to null when the run has no input at all.
    const input = (await Actor.getInput()) ?? {};
    await convertCsvFiles(input);
} catch (e) {
    failureMessage = e instanceof Error ? e.message : String(e);
}

if (failureMessage === null) {
    await Actor.exit(`CSV successfully converted to a dataset with ID: ${Actor.getEnv().defaultDatasetId}`);
} else {
    await Actor.fail(failureMessage);
}

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage

package.json

1{
2    "name": "crawlee-cheerio-javascript",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is a boilerplate of an Apify actor.",
6    "engines": {
7        "node": ">=18.0.0"
8    },
9    "dependencies": {
10        "apify": "^3.1.10",
11        "crawlee": "^3.5.4",
12        "neat-csv": "^7.0.0"
13    },
14    "devDependencies": {
15        "@apify/eslint-config": "^0.4.0",
16        "eslint": "^8.50.0"
17    },
18    "scripts": {
19        "start": "node src/main.js",
20        "lint": "eslint ./src --ext .js,.jsx",
21        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
22        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
23    },
24    "author": "It's not you it's me",
25    "license": "ISC"
26}
Developer
Maintained by Community

Actor Metrics

  • 4 monthly users

  • 3 stars

  • 99% runs succeeded

  • Created in Nov 2023

  • Modified 4 months ago

Categories