LLM Dataset Processor
Try for free
No credit card required
Go to Store
LLM Dataset Processor
dusan.vystrcil/llm-dataset-processor
Try for free
No credit card required
Allows you to process the output of other Actors or a stored dataset with a single LLM prompt. It's useful if you need to enrich data, summarize content, extract specific information, or manipulate data in a structured way using AI.
.dockerignore
1# configurations
2.idea
3.vscode
4
5# crawlee and apify storage folders
6apify_storage
7crawlee_storage
8storage
9
10# installed files
11node_modules
12
13# git folder
14.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "root": true,
3 "env": {
4 "browser": true,
5 "es2020": true,
6 "node": true
7 },
8 "extends": [
9 "@apify/eslint-config-ts"
10 ],
11 "parserOptions": {
12 "project": "./tsconfig.json",
13 "ecmaVersion": 2020
14 },
15 "ignorePatterns": [
16 "node_modules",
17 "dist",
18 "**/*.d.ts"
19 ]
20}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5.vscode
6dist
7node_modules
8apify_storage
9storage
10.env
11
12# Added by Apify CLI
13.venv
CHANGELOG.md
1### 0.0.23 (2025-01-07)
2- Updated readme
3- Minor fixes for better readability
package.json
1{
2 "name": "LLMDatasetTest",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "This is a boilerplate of an Apify actor.",
6 "engines": {
7 "node": ">=18.0.0"
8 },
9 "dependencies": {
10 "@anthropic-ai/sdk": "^0.32.1",
11 "@google/generative-ai": "^0.21.0",
12 "apify": "^3.2.6",
13 "crawlee": "^3.11.5",
14 "openai": "^4.76.0"
15 },
16 "devDependencies": {
17 "@apify/eslint-config-ts": "^0.3.0",
18 "@apify/tsconfig": "^0.1.0",
19 "@typescript-eslint/eslint-plugin": "^7.18.0",
20 "@typescript-eslint/parser": "^7.18.0",
21 "eslint": "^8.50.0",
22 "tsx": "^4.6.2",
23 "typescript": "^5.3.3"
24 },
25 "scripts": {
26 "start": "npm run start:dev",
27 "start:prod": "node dist/main.js",
28 "start:dev": "tsx src/main.ts",
29 "build": "tsc",
30 "lint": "eslint ./src --ext .ts",
31 "lint:fix": "eslint ./src --ext .ts --fix",
32 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
33 },
34 "author": "It's not you it's me",
35 "license": "ISC"
36}
tsconfig.json
1{
2 "extends": "@apify/tsconfig",
3 "compilerOptions": {
4 "module": "NodeNext",
5 "moduleResolution": "NodeNext",
6 "target": "ES2022",
7 "outDir": "dist",
8 "noUnusedLocals": false,
9 "skipLibCheck": true,
10 "lib": ["DOM"]
11 },
12 "include": [
13 "./src/**/*"
14 ]
15}
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Build the project (runs the "build" script from package.json, i.e. `tsc`,
# which writes the compiled JS into ./dist).
RUN npm run build

# Create final image
FROM apify/actor-node:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image.
# `--from=builder` is required: without it COPY reads from the build context,
# not from the first stage where `npm run build` produced ./dist.
COPY --from=builder /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./


# Run the image.
CMD npm run start:prod --silent
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "llm-dataset-processor",
4 "title": "LLM Dataset Processor",
5 "description": "Process or enrich datasets with LLM-generated content.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "ts-crawlee-cheerio"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile"
12}
.actor/input_schema.json
1{
2 "title": "LLM Dataset Processor",
3 "description": "Choose specific dataset to process, select LLM, provide API token and craft your prompt template. We recommend testing your prompt first by enabling `Test Prompt Mode`. ",
4 "type": "object",
5 "schemaVersion": 1,
6 "required": [
7 "llmProviderApiKey",
8 "prompt",
9 "model",
10 "temperature",
11 "maxTokens"
12 ],
13 "properties": {
14 "inputDatasetId": {
15 "type": "string",
16 "title": "Input Dataset ID",
17 "description": "The ID of the dataset to process.",
18 "resourceType": "dataset"
19 },
20 "model": {
21 "type": "string",
22 "title": "Large Language Model",
23 "description": "The LLM to use for processing. Each model has different capabilities and pricing. GPT-4o-mini and Claude 3.5 Haiku are recommended for cost-effective processing, while models like Claude 3 Opus or GPT-4o offer higher quality but at a higher cost.",
24 "editor": "select",
25 "enumTitles": ["GPT-4o mini (Recommended)", "GPT-4o", "Claude 3.5 Haiku (Recommended)", "Claude 3.5 Sonnet", "Claude 3 Opus", "Gemini 1.5 Flash", "Gemini 1.5 Flash-8B (Recommended)" ,"Gemini 1.5 Pro"],
26 "enum": ["gpt-4o-mini", "gpt-4o", "claude-3-5-haiku-latest", "claude-3-5-sonnet-latest", "claude-3-opus-latest", "gemini-1.5-flash", "gemini-1.5-flash-8b", "gemini-1.5-pro"]
27 },
28 "llmProviderApiKey": {
29 "type": "string",
30 "title": "LLM Provider API Key",
31 "editor": "textfield",
32 "description": "Your API key for the LLM Provider (e.g., OpenAI).",
33 "isSecret": true
34 },
35 "temperature": {
36
37 "type": "string",
38 "title": "Temperature",
39 "editor": "textfield",
40 "description": "Sampling temperature for the LLM API (controls randomness). We recommend using a value closer to 0 for exact results. In case of more 'creative' results, we recommend to use a value closer to 1.",
41 "default": "0.1"
42 },
43 "multipleColumns": {
44 "type": "boolean",
45 "title": "Multiple columns in output",
46 "description": "When enabled, instructs the LLM to return responses as JSON objects, creating multiple columns in the output dataset. The columns need to be named and described in the prompt. If disabled, responses are stored in a single `llmresponse` column.",
47 "default": false
48 },
49 "prompt": {
50 "type": "string",
51 "title": "Prompt Template",
52 "description": "The prompt template to use for processing. You can use ${fieldName} placeholders to reference fields from the input dataset.",
53 "editor": "textarea",
54 "minLength": 1,
55 "prefill": "Summarize this text: ${text}"
56 },
57 "skipItemIfEmpty": {
58 "type": "boolean",
59 "title": "Skip item if one or more ${fields} are empty",
60 "description": "When enabled, items will be skipped if any ${field} referenced in the prompt is empty, null, undefined, or contains only whitespace. This helps prevent processing incomplete data.",
61 "default": true
62 },
63 "maxTokens": {
64 "type": "integer",
65 "title": "Max Tokens",
66 "editor": "number",
67 "description": "Maximum number of tokens in the LLM API response for each item.",
68 "default": 300
69 },
70 "testPrompt": {
71 "type": "boolean",
72 "title": "Test Prompt Mode",
73 "description": "Test mode that processes only a limited number of items (defined by `testItemsCount`). Use this to validate your prompt and configuration before running on the full dataset. We highly recommend enabling this option first to validate your prompt because of ambiguity of the LLM responses.",
74 "default": true
75 },
76 "testItemsCount": {
77 "type": "integer",
78 "title": "Test Items Count",
79 "description": "Number of items to process when `Test Prompt Mode` is enabled.",
80 "default": 3,
81 "minimum": 1
82 }
83 }
84}
src/main.ts
1// main.ts
2
3import { Actor, log } from 'apify';
4import { OpenAIProvider, AnthropicProvider, GoogleProvider, getProvider } from './providers/index.js';
5import { Input, OutputItem } from './types.js';
6
// Request pacing. The budget below is the OpenAI API lowest-tier rate limit;
// processItems() waits REQUEST_INTERVAL_MS after every processed item.
const MS_PER_MINUTE = 60 * 1000;
const RATE_LIMIT_PER_MINUTE = 500;
// Minimum pause between two consecutive requests, in milliseconds.
const REQUEST_INTERVAL_MS = Math.ceil(MS_PER_MINUTE / RATE_LIMIT_PER_MINUTE);

// Initialize the Actor before any of the SDK calls made further below.
await Actor.init();
12
/**
 * Reads a (possibly nested) field from `obj` using dot notation,
 * e.g. `"author.name"` resolves `obj.author.name`.
 *
 * @param obj - Value to read from; may itself be `null`/`undefined`.
 * @param path - Dot-separated list of property names.
 * @returns The value stored at `path`, or `undefined` when any step of the
 *          path cannot be followed.
 */
function getNestedValue(obj: any, path: string): any {
    return path.split('.').reduce(
        // Stop only on null/undefined. A plain truthiness check would hand back
        // a falsy intermediate value (0, '', false) as if it were the result of
        // the full path, e.g. "a.b" on { a: 0 } must be undefined, not 0.
        (current, key) => (current === null || current === undefined ? undefined : current[key]),
        obj,
    );
}
17
/**
 * Tells whether a value counts as "empty".
 *
 * Empty values are: `null`, `undefined`, strings that are blank after
 * trimming, arrays without elements and objects without own enumerable keys.
 * Everything else (numbers, booleans, functions, ...) is non-empty.
 *
 * @param value - Value to inspect.
 * @returns `true` when the value is empty by the rules above.
 */
function isEmpty(value: any): boolean {
    if (value === null || value === undefined) {
        return true;
    }

    switch (typeof value) {
        case 'string':
            return value.trim() === '';
        case 'object': {
            // Arrays are objects too; they are measured by length, others by key count.
            const size = Array.isArray(value) ? value.length : Object.keys(value).length;
            return size === 0;
        }
        default:
            return false;
    }
}
26
/**
 * Checks whether at least one `${field}` placeholder used in the prompt
 * resolves to an empty value on the given item.
 *
 * @param promptStr - Prompt template containing `${field}` placeholders.
 * @param item - Dataset item the placeholders are resolved against.
 * @returns `true` if any referenced field is empty (see `isEmpty`).
 */
function hasEmptyFields(promptStr: string, item: OutputItem): boolean {
    // The capture group holds the text between "${" and "}".
    for (const [, rawField] of promptStr.matchAll(/\$\{([^}]+)\}/g)) {
        const resolved = getNestedValue(item, rawField.trim());
        if (isEmpty(resolved)) {
            return true;
        }
    }
    return false;
}
36
/**
 * Substitutes every `${field}` placeholder in the prompt with the matching
 * value from the item. Dot notation (`${a.b}`) addresses nested fields.
 *
 * @param promptStr - Prompt template containing `${field}` placeholders.
 * @param item - Dataset item providing the values.
 * @returns The prompt with all placeholders replaced. Missing (`undefined`)
 *          and `null` values become an empty string; objects and arrays are
 *          rendered as JSON.
 */
function replacePlaceholders(promptStr: string, item: OutputItem): string {
    return promptStr.replace(/\$\{([^}]+)\}/g, (_match, fieldName: string) => {
        const value = getNestedValue(item, fieldName.trim());

        // null would otherwise end up in the prompt as the literal text "null".
        if (value === undefined || value === null) {
            return '';
        }
        // String() turns objects into "[object Object]", which is useless to
        // the model; JSON keeps the actual content readable.
        if (typeof value === 'object') {
            return JSON.stringify(value);
        }
        return String(value);
    });
}
44
/**
 * Loads the Actor input, verifies the required fields and fills in defaults
 * for the optional ones.
 *
 * The dataset to process is taken from `inputDatasetId`, falling back to
 * `payload.resource.defaultDatasetId` when the former is not set.
 *
 * @returns The normalized configuration for this run.
 * @throws Error when no input is available, no dataset ID can be determined,
 *         a required text field is missing/blank, `temperature` is not a
 *         number, or `maxTokens` is not a positive number.
 */
async function validateInput(): Promise<{
    inputDatasetId: string;
    llmProviderApiKey: string;
    prompt: string;
    model: string;
    temperature: string;
    maxTokens: number;
    skipItemIfEmpty: boolean;
    multipleColumns: boolean;
    testPrompt: boolean;
    testItemsCount: number;
}> {
    const input = await Actor.getInput<Input>();
    if (!input) {
        throw new Error('No input provided. Please provide the necessary input parameters.');
    }

    const {
        llmProviderApiKey,
        prompt,
        model,
        temperature,
        maxTokens,
        skipItemIfEmpty,
        multipleColumns = false,
        testPrompt = false,
        testItemsCount = 3,
    } = input;

    const inputDatasetId = input.inputDatasetId || input.payload?.resource?.defaultDatasetId;

    if (!inputDatasetId) {
        throw new Error('No inputDatasetId provided. Please provide the necessary input parameters.');
    }

    // Fail early with a readable message instead of a provider/API error later on.
    const missingFields = Object.entries({ llmProviderApiKey, prompt, model })
        .filter(([, value]) => typeof value !== 'string' || value.trim() === '')
        .map(([name]) => name);
    if (missingFields.length > 0) {
        throw new Error(`Missing required input field(s): ${missingFields.join(', ')}.`);
    }

    // Temperature arrives as text and is parsed with parseFloat() before use;
    // reject values that would turn into NaN/Infinity there.
    if (!Number.isFinite(parseFloat(String(temperature)))) {
        throw new Error(`Invalid temperature "${temperature}". Please provide a number, e.g. "0.1".`);
    }

    if (typeof maxTokens !== 'number' || !Number.isFinite(maxTokens) || maxTokens <= 0) {
        throw new Error(`Invalid maxTokens "${maxTokens}". Please provide a positive number.`);
    }

    return {
        inputDatasetId,
        llmProviderApiKey,
        prompt,
        model,
        temperature,
        maxTokens,
        skipItemIfEmpty: skipItemIfEmpty ?? false,
        multipleColumns,
        testPrompt,
        testItemsCount,
    };
}
93
/**
 * Loads the items of the input dataset.
 *
 * @param inputDatasetId - ID of the dataset to read.
 * @param testPrompt - When `true`, only the first `testItemsCount` items are returned.
 * @param testItemsCount - Upper bound on the number of items returned in test mode.
 * @returns The fetched items (all of them, or the leading slice in test mode).
 * @throws Error when the dataset does not exist; any other failure is logged
 *         and rethrown unchanged.
 */
async function fetchDatasetItems(inputDatasetId: string, testPrompt: boolean, testItemsCount: number): Promise<OutputItem[]> {
    try {
        // Existence check first, so a wrong ID produces an explicit error.
        const datasetInfo = await Actor.apifyClient.dataset(inputDatasetId).get();
        if (!datasetInfo) {
            throw new Error(`Dataset with ID ${inputDatasetId} does not exist`);
        }

        const source = await Actor.openDataset<OutputItem>(inputDatasetId);
        const allItems = (await source.getData()).items;

        if (!testPrompt) {
            log.info(`Fetched ${allItems.length} items from the input dataset.`);
            return allItems;
        }

        const sampleSize = Math.min(testItemsCount, allItems.length);
        const sample = allItems.slice(0, sampleSize);
        log.info(`Test mode enabled - processing ${sampleSize} items out of ${allItems.length}`);
        return sample;
    } catch (error) {
        const reason = error instanceof Error ? error.message : 'Unknown error occurred';
        log.error(`Error accessing dataset: ${reason}`);
        throw error;
    }
}
122
/**
 * Runs the prompt against every item, one after another, and stores each
 * result through `handleItemResponse`.
 *
 * Items with empty placeholder fields are skipped when `skipItemIfEmpty` is
 * set. After each processed item the loop waits `REQUEST_INTERVAL_MS`.
 *
 * @param items - Dataset items to process.
 * @param providers - Provider instances keyed by the name `getProvider` returns.
 * @param config - Prompt template and model settings for this run.
 * @throws Rethrows the first error raised while processing an item, after logging it.
 */
async function processItems(
    items: OutputItem[],
    providers: Record<string, OpenAIProvider | AnthropicProvider | GoogleProvider>,
    config: {
        prompt: string;
        model: string;
        temperature: string;
        maxTokens: number;
        skipItemIfEmpty: boolean;
        multipleColumns: boolean;
    }
): Promise<void> {
    const temperatureNum = parseFloat(config.temperature);
    // The template (including the optional JSON instruction) is the same for every item.
    const promptTemplate = buildFinalPrompt(config.prompt, config.multipleColumns);

    for (const [index, item] of items.entries()) {
        const position = index + 1;

        try {
            if (config.skipItemIfEmpty && hasEmptyFields(config.prompt, item)) {
                log.info(`Skipping item ${position} due to empty fields`);
                continue;
            }

            const finalPrompt = replacePlaceholders(promptTemplate, item);
            log.info(`Processing item ${position}/${items.length}`, { prompt: finalPrompt });

            const provider = getProvider(config.model);
            const llmresponse = await providers[provider].call(
                finalPrompt,
                config.model,
                temperatureNum,
                config.maxTokens,
            );

            log.info(`Item ${position} response:`, { response: llmresponse });

            await handleItemResponse(item, llmresponse, config.multipleColumns, {
                provider,
                model: config.model,
                temperature: temperatureNum,
                maxTokens: config.maxTokens,
                providers,
                finalPrompt,
            });

            // Pause between requests to stay within the configured rate limit.
            await new Promise(resolve => setTimeout(resolve, REQUEST_INTERVAL_MS));
        } catch (error) {
            const reason = error instanceof Error ? error.message : 'Unknown error occurred';
            log.error(`Error processing item ${position}: ${reason}`);
            throw error;
        }
    }
}
179
/**
 * Stores the LLM response for one item in the default dataset.
 *
 * - Single-column mode: the raw response is stored under `llmresponse`.
 * - Multi-column mode: the response is parsed as JSON and its keys are merged
 *   into the item. If parsing fails, the model is asked again (up to two
 *   retries). If no JSON object can be obtained, the last raw response is
 *   stored under `llmresponse` instead.
 *
 * The passed `item` is never modified; a copy is pushed.
 *
 * @param item - Source dataset item.
 * @param llmresponse - First response returned by the model for this item.
 * @param multipleColumns - Whether the response should be split into columns.
 * @param config - Provider/model settings and the prompt used for retries.
 * @throws Propagates errors from the retry API call and from `Actor.pushData`.
 */
async function handleItemResponse(
    item: OutputItem,
    llmresponse: string,
    multipleColumns: boolean,
    config: {
        provider: string;
        model: string;
        temperature: number;
        maxTokens: number;
        providers: Record<string, OpenAIProvider | AnthropicProvider | GoogleProvider>;
        finalPrompt: string;
    }
): Promise<void> {
    if (!multipleColumns) {
        // Push a copy so the caller's item is left untouched, like in the other branches.
        await Actor.pushData({ ...item, llmresponse });
        return;
    }

    const maxRetries = 2;
    let currentResponse = llmresponse;
    let parsedData: unknown;
    let parsedOk = false;

    for (let retriesLeft = maxRetries; ; retriesLeft--) {
        try {
            parsedData = JSON.parse(currentResponse);
            parsedOk = true;
            break;
        } catch {
            if (retriesLeft === 0) {
                log.error(`Failed to parse JSON after multiple attempts. Using raw response as single column.`);
                break;
            }
            log.warning(`Failed to parse JSON. Retrying...`);
            const retryPrompt = `${config.finalPrompt}\n\nThe last response was not valid JSON. Please return valid JSON this time.`;
            currentResponse = await config.providers[config.provider].call(
                retryPrompt,
                config.model,
                config.temperature,
                config.maxTokens
            );
        }
    }

    // Only a plain JSON object maps to named columns. Primitives cannot be
    // merged at all, and an array would produce columns called "0", "1", ...
    const isJsonObject = parsedOk
        && typeof parsedData === 'object'
        && parsedData !== null
        && !Array.isArray(parsedData);

    if (isJsonObject) {
        await Actor.pushData({ ...item, ...(parsedData as Record<string, unknown>) });
    } else {
        await Actor.pushData({ ...item, llmresponse: currentResponse });
    }
}
237
/**
 * Produces the prompt template that is sent to the model.
 *
 * @param promptText - Prompt template supplied by the user.
 * @param multipleColumns - When `true`, an instruction to answer with strict
 *        JSON only is appended after a blank line.
 * @returns `promptText` unchanged, or `promptText` followed by the JSON instruction.
 */
function buildFinalPrompt(promptText: string, multipleColumns: boolean): string {
    const jsonInstruction = 'Important: Return only a strict JSON object with the requested fields as keys. No extra text or explanations, no markdown, just JSON.';

    return multipleColumns
        ? [promptText, jsonInstruction].join('\n\n')
        : promptText;
}
247
/**
 * Dry-runs the prompt on a single item to check that the model answers with
 * parseable JSON, trying up to three times.
 *
 * After every failed attempt a reminder to return valid JSON is appended to
 * the prompt used for the next attempt.
 *
 * @param testItem - Item used to fill the prompt placeholders.
 * @param config - Providers, model settings and the prompt template.
 * @returns `true` as soon as one response parses as JSON; `false` when all
 *          attempts fail or the model returns an empty response (in which
 *          case `Actor.fail` is called first).
 * @throws Rethrows any error raised inside an attempt, after logging it.
 */
async function validateJsonFormat(testItem: OutputItem, config: {
    providers: Record<string, OpenAIProvider | AnthropicProvider | GoogleProvider>;
    model: string;
    temperature: string;
    maxTokens: number;
    prompt: string;
}): Promise<boolean> {
    const maxAttempts = 3;
    const providerName = getProvider(config.model);
    let promptToSend = replacePlaceholders(buildFinalPrompt(config.prompt, true), testItem);

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            const testResponse = await config.providers[providerName].call(
                promptToSend,
                config.model,
                parseFloat(config.temperature),
                config.maxTokens
            );

            // An empty answer cannot be fixed by retrying the JSON check.
            if (!testResponse) {
                log.error('Empty response received from the API');
                await Actor.fail('Empty response received from the API');
                return false;
            }

            let isJson = true;
            try {
                JSON.parse(testResponse);
            } catch {
                isJson = false;
            }
            if (isJson) {
                return true;
            }

            if (attempt >= maxAttempts) {
                log.error('JSON validation attempts exhausted. The prompt may not produce valid JSON.');
                log.debug('Final response that failed JSON parsing:', { response: testResponse });
                return false;
            }

            log.warning(`JSON validation attempt ${attempt} failed. Retrying...`);
            log.debug('Response that failed JSON parsing:', { response: testResponse });
            // The reminder accumulates: each retry extends the previous prompt.
            promptToSend = `${promptToSend}\n\nThe last response was not valid JSON. Please return valid JSON this time.`;
        } catch (apiError: any) {
            // Log the error details, then give up immediately instead of retrying.
            log.error('API call failed:', {
                error: apiError.message,
                type: apiError.type,
                code: apiError.code,
                param: apiError.param
            });
            throw apiError;
        }
    }

    // Not reachable in practice; keeps the return type total.
    return false;
}
306
/**
 * Entry point of the Actor: validates the input, loads the dataset items,
 * optionally verifies that the prompt yields JSON (multi-column mode) and
 * then processes all items.
 *
 * Ends the run with `Actor.exit()` on success and `Actor.fail()` on any error.
 */
async function run(): Promise<void> {
    try {
        const settings = await validateInput();
        const {
            inputDatasetId,
            model,
            prompt,
            temperature,
            maxTokens,
            multipleColumns,
        } = settings;

        // Log configuration details
        log.info('Configuration details:', {
            datasetId: inputDatasetId,
            model,
            promptTemplate: prompt,
            multipleColumns,
        });

        const items = await fetchDatasetItems(inputDatasetId, settings.testPrompt, settings.testItemsCount);

        // One client per vendor, all created with the same key; the model name
        // decides later which one is actually called.
        const apiKey = settings.llmProviderApiKey;
        const providers = {
            openai: new OpenAIProvider(apiKey),
            anthropic: new AnthropicProvider(apiKey),
            google: new GoogleProvider(apiKey),
        };

        const needsJsonCheck = items.length > 0 && multipleColumns;
        if (needsJsonCheck) {
            const producesJson = await validateJsonFormat(items[0], {
                providers,
                model,
                temperature,
                maxTokens,
                prompt,
            });

            if (!producesJson) {
                throw new Error('Failed to produce valid JSON after multiple attempts. Please adjust your prompt or disable multiple columns.');
            }
        }

        await processItems(items, providers, {
            prompt,
            model,
            temperature,
            maxTokens,
            skipItemIfEmpty: settings.skipItemIfEmpty,
            multipleColumns,
        });

        log.info('Actor finished successfully');
        await Actor.exit();
    } catch (error) {
        if (!(error instanceof Error)) {
            log.error('Actor failed with unknown error');
            await Actor.fail('Unknown error occurred');
            return;
        }
        log.error('Actor failed:', { error: error.message });
        await Actor.fail(error.message);
    }
}

await run();
src/types.ts
/**
 * Shape of the Actor input.
 *
 * The dataset to process is given either directly through `inputDatasetId`
 * or indirectly through `payload.resource.defaultDatasetId`, so neither of
 * them is guaranteed to be present.
 */
export interface Input {
    /** ID of the dataset to process; takes precedence over `payload`. */
    inputDatasetId?: string;
    /** Not read by the visible code; kept for compatibility. */
    defaultDatasetId?: string;
    /** API key handed to the LLM provider client. */
    llmProviderApiKey: string;
    /** Prompt template; may contain `${field}` placeholders. */
    prompt: string;
    /** Model identifier, e.g. `gpt-4o-mini`. */
    model: string;
    /** Sampling temperature as text; parsed with `parseFloat` before use. */
    temperature: string;
    /** Maximum number of tokens in each LLM response. */
    maxTokens: number;
    /** Skip items whose placeholder fields are empty. */
    skipItemIfEmpty?: boolean;
    /** Ask the model for JSON and spread its keys into output columns. */
    multipleColumns?: boolean;
    /** Process only the first `testItemsCount` items. */
    testPrompt?: boolean;
    /** Number of items processed in test mode. */
    testItemsCount?: number;
    /** Fallback source of the dataset ID — presumably set when the run is triggered by an integration/webhook; confirm against the caller. */
    payload?: Payload | null;
}

/** Wrapper object carrying the resource that provides the dataset ID. */
export interface Payload {
    resource: Resource;
}

/** Resource description; only its default dataset ID is used. */
export interface Resource {
    defaultDatasetId: string;
}
23
/**
 * A dataset item: arbitrary fields from the input dataset plus, once
 * processed in single-column mode, the model's answer.
 */
export interface OutputItem extends Record<string, any> {
    /**
     * Raw LLM response. Optional because items read from the input dataset do
     * not carry it yet; the key is lowercase because that is the column name
     * the Actor actually writes.
     */
    llmresponse?: string | null;
}
27
/** Contract implemented by every LLM vendor adapter. */
export interface LLMProvider {
    /**
     * Sends one prompt to the model and resolves with the text of its answer.
     *
     * @param promptText - Complete prompt to send.
     * @param model - Model identifier understood by the provider.
     * @param temperature - Sampling temperature.
     * @param maxTokens - Upper bound for the length of the response, in tokens.
     */
    call(
        promptText: string,
        model: string,
        temperature: number,
        maxTokens: number,
    ): Promise<string>;
}
src/providers/anthropic.ts
1import Anthropic from '@anthropic-ai/sdk';
2import { LLMProvider } from '../types.js';
3
/** `LLMProvider` adapter backed by the Anthropic Messages API. */
export class AnthropicProvider implements LLMProvider {
    private client: Anthropic;

    /**
     * @param apiKey - API key passed to the Anthropic client.
     */
    constructor(apiKey: string) {
        this.client = new Anthropic({ apiKey });
    }

    /**
     * Sends the prompt as a single user message.
     *
     * @returns The text of the first text block in the reply, or an empty
     *          string when the reply has no content or no text block.
     */
    async call(promptText: string, model: string, temperature: number, maxTokens: number): Promise<string> {
        const reply = await this.client.messages.create({
            model,
            max_tokens: maxTokens,
            temperature,
            messages: [{ role: 'user', content: promptText }],
        });

        // Only the first text block is used; other block types are ignored.
        for (const block of reply.content ?? []) {
            if (block.type === 'text') {
                return block.text || '';
            }
        }
        return '';
    }
}
src/providers/google.ts
1import { GoogleGenerativeAI } from '@google/generative-ai';
2import { LLMProvider } from '../types.js';
3
/** `LLMProvider` adapter backed by the Google Generative AI SDK. */
export class GoogleProvider implements LLMProvider {
    private client: GoogleGenerativeAI;

    /**
     * @param apiKey - API key passed to the Google Generative AI client.
     */
    constructor(apiKey: string) {
        this.client = new GoogleGenerativeAI(apiKey);
    }

    /**
     * Sends the prompt as a single user turn and resolves with the text of
     * the generated response.
     */
    async call(promptText: string, model: string, temperature: number, maxTokens: number): Promise<string> {
        const generationConfig = {
            temperature,
            maxOutputTokens: maxTokens,
        };
        const userTurn = { role: 'user', parts: [{ text: promptText }] };

        const { response } = await this.client
            .getGenerativeModel({ model })
            .generateContent({ contents: [userTurn], generationConfig });

        return response.text();
    }
}
src/providers/index.ts
1export * from './openai.js';
2export * from './anthropic.js';
3export * from './google.js';
4
/**
 * Maps a model identifier to the provider that serves it.
 *
 * Rules are checked in order: any name containing `claude-` is Anthropic,
 * names starting with `gemini-` are Google, names starting with `gpt-` are
 * OpenAI.
 *
 * @param model - Model identifier, e.g. `gpt-4o-mini`.
 * @returns The provider key.
 * @throws Error when the model name matches none of the rules.
 */
export const getProvider = (model: string): 'openai' | 'anthropic' | 'google' => {
    const rules: ReadonlyArray<[matches: boolean, provider: 'openai' | 'anthropic' | 'google']> = [
        [model.includes('claude-'), 'anthropic'],
        [model.startsWith('gemini-'), 'google'],
        [model.startsWith('gpt-'), 'openai'],
    ];

    for (const [matches, provider] of rules) {
        if (matches) {
            return provider;
        }
    }
    throw new Error(`Unknown model provider for model: ${model}`);
};
src/providers/openai.ts
1import OpenAI from 'openai';
2import { LLMProvider } from '../types.js';
3
/** `LLMProvider` adapter backed by the OpenAI Chat Completions API. */
export class OpenAIProvider implements LLMProvider {
    private client: OpenAI;

    /**
     * @param apiKey - API key passed to the OpenAI client.
     */
    constructor(apiKey: string) {
        this.client = new OpenAI({ apiKey });
    }

    /**
     * Sends the prompt as a single user message.
     *
     * @returns The content of the first choice, or an empty string when there
     *          is no choice or its content is empty.
     */
    async call(promptText: string, model: string, temperature: number, maxTokens: number): Promise<string> {
        const completion = await this.client.chat.completions.create({
            messages: [{ role: 'user', content: promptText }],
            model,
            temperature,
            max_tokens: maxTokens,
        });

        const [firstChoice] = completion.choices;
        return firstChoice?.message?.content || '';
    }
}
Developer
Maintained by Community
Actor Metrics
0 monthly users
-
1 star
>99% runs succeeded
Created in Dec 2024
Modified 5 days ago
Categories