LLMScraper

Developed by Ondřej Hlava · Maintained by Community

Finds the best scraper for your website and the data you need.

Rating: 0.0 (0 reviews)
Pricing: Pay per usage
Total users: 1
Monthly users: 1
Last modified: 6 days ago

.gitignore

# --- General ---
.DS_Store
.env
.env.*
# --- Logs ---
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
# --- IDEs ---
.vscode/*
!.vscode/extensions.json
.idea/
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
# --- Python ---
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
.venv/
venv/
ENV/
env/
.pytest_cache/
.coverage
htmlcov/
.tox/
.cache
.mypy_cache/
.dmypy.json
dmypy.json
# Project specific
scraped_results/
*.html
# --- Node.js / Frontend ---
frontend/node_modules/
frontend/dist/
frontend/dist-ssr/
frontend/.pnp
frontend/.pnp.js
frontend/.npm
node_modules
dist
dist-ssr
*.local
# --- Apify (added by Apify CLI) ---
storage/
apify_storage/
# --- Local test files ---
input.json
test_*

.python-version

3.10

Dockerfile

# Use the official Apify Python base image
FROM apify/actor-python:3.11
# Copy requirements and install dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt
# Copy the source code
COPY . ./
# Set the entrypoint
CMD ["python3", "main.py"]

main.py

1"""
2Main entry point for LLM Scraper Actor.
3
4This Actor uses Claude AI to automatically discover and test the best Apify actors
5for your web scraping task. No manual configuration needed!
6"""
7
8import asyncio
9import logging
10import os
11import sys
12from datetime import datetime
13from typing import Dict, Any, Optional
14
15# Add src to path for development
16sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
17
18from apify import Actor
19from llmscraper.llm_scraper import LLMScraperActor, LLMScraperInput, ProgressUpdate
20
21
22async def main():
23 """Main entry point for LLM Scraper Actor."""
24 async with Actor:
25 # Get input data
26 actor_input = await Actor.get_input() or {}
27
28 # Setup logging
29 logging.basicConfig(
30 level=logging.INFO if not actor_input.get('debugMode') else logging.DEBUG,
31 format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
32 )
33 logger = logging.getLogger(__name__)
34
35 try:
36 # Parse and validate input
37 config = LLMScraperInput(
38 target_url=actor_input.get('targetUrl', ''),
39 extraction_goal=actor_input.get('extractionGoal', ''),
40 anthropic_api_key=actor_input.get('claudeApiKey') or os.getenv('ANTHROPIC_API_KEY'),
41 max_actor_attempts=actor_input.get('maxActorAttempts', 10),
42 max_retries_per_actor=actor_input.get('maxRetriesPerActor', 3),
43 max_time_minutes=actor_input.get('maxTimeMinutes', 30),
44 mcp_url=actor_input.get('mcpUrl', 'https://mcp.apify.com/sse?enableAddingActors=true'),
45 model_name=actor_input.get('modelName', 'claude-3-5-haiku-latest'),
46 debug_mode=actor_input.get('debugMode', False),
47 prefer_specific_actors=actor_input.get('preferSpecificActors', True),
48 min_data_quality_score=actor_input.get('minDataQualityScore', 70) / 100.0, # Convert percentage to decimal
49 enable_proxy=actor_input.get('enableProxy', True)
50 )
51
52 # Validate configuration
53 config.validate()
54
55 logger.info("🚀 LLM Scraper Actor starting...")
56 logger.info(f"🎯 Target URL: {config.target_url}")
57 logger.info(f"🎯 Goal: {config.extraction_goal}")
58 logger.info(f"🤖 Model: {config.model_name}")
59
60 # Initialize progress tracking
61 progress_updates = []
62
63 def progress_callback(update: ProgressUpdate):
64 progress_updates.append(update)
65 logger.info(f"Progress: {update.message} ({update.progress:.1%})")
66
67 # Create and run LLM Scraper
68 scraper = LLMScraperActor(config)
69 scraper.set_progress_callback(progress_callback)
70 result = await scraper.run()
71
72 # Process results
73 if result.success:
74 logger.info(f"✅ Scraping completed successfully!")
75 logger.info(f"📊 Quality Score: {result.quality_score:.2f}")
76 logger.info(f"🎭 Best Actor: {result.best_actor_id}")
77 logger.info(f"📦 Items extracted: {len(result.scraped_data)}")
78
79 # Save data to dataset
80 for item in result.scraped_data:
81 await Actor.push_data({
82 "url": config.target_url,
83 "data": item,
84 "quality_score": result.quality_score,
85 "actor_used": result.best_actor_id,
86 "timestamp": datetime.now().isoformat(),
87 "success": True,
88 "error": None,
89 "extraction_goal": config.extraction_goal
90 })
91
92 # Save summary to key-value store
93 await Actor.set_value('SCRAPING_RESULT', {
94 "success": True,
95 "quality_score": result.quality_score,
96 "items_count": len(result.scraped_data),
97 "best_actor_id": result.best_actor_id,
98 "total_execution_time": result.total_execution_time,
99 "attempts_made": len(result.actor_attempts),
100 "target_url": config.target_url,
101 "extraction_goal": config.extraction_goal,
102 "model_used": config.model_name,
103 "progress_updates": [
104 {"message": u.message, "progress": u.progress, "timestamp": u.timestamp.isoformat()}
105 for u in progress_updates
106 ],
107 "actor_attempts": [
108 {
109 "actor_id": attempt.actor_id,
110 "success": attempt.success,
111 "quality_score": attempt.data_quality_score,
112 "execution_time": attempt.execution_time_seconds,
113 "error_message": attempt.error_message,
114 "attempt_number": attempt.attempt_number
115 }
116 for attempt in result.actor_attempts
117 ]
118 })
119
120 else:
121 error_msg = result.llm_reasoning or f"Status: {result.status}"
122 logger.error(f"❌ Scraping failed: {error_msg}")
123
124 # Save failure info to dataset
125 await Actor.push_data({
126 "url": config.target_url,
127 "data": None,
128 "quality_score": 0.0,
129 "actor_used": None,
130 "timestamp": datetime.now().isoformat(),
131 "success": False,
132 "error": error_msg,
133 "extraction_goal": config.extraction_goal,
134 "total_execution_time": result.total_execution_time,
135 "attempts_made": len(result.actor_attempts)
136 })
137
138 # Save failure summary to key-value store
139 await Actor.set_value('SCRAPING_RESULT', {
140 "success": False,
141 "error_message": error_msg,
142 "status": result.status,
143 "total_execution_time": result.total_execution_time,
144 "attempts_made": len(result.actor_attempts),
145 "target_url": config.target_url,
146 "extraction_goal": config.extraction_goal,
147 "model_used": config.model_name,
148 "progress_updates": [
149 {"message": u.message, "progress": u.progress, "timestamp": u.timestamp}
150 for u in progress_updates
151 ]
152 })
153
154 # Exit with error code
155 Actor.exit(exit_code=1, status_message=f"Scraping failed: {error_msg}")
156
157 except Exception as e:
158 logger.error(f"💥 Fatal error: {str(e)}", exc_info=True)
159
160 # Save error info
161 await Actor.push_data({
162 "url": actor_input.get('targetUrl', 'unknown'),
163 "data": None,
164 "quality_score": 0.0,
165 "actor_used": None,
166 "timestamp": datetime.now().isoformat(),
167 "success": False,
168 "error": str(e),
169 "extraction_goal": actor_input.get('extractionGoal', 'unknown')
170 })
171
172 Actor.exit(exit_code=1, status_message=f"Fatal error: {str(e)}")
173
174
175if __name__ == "__main__":
176 asyncio.run(main())
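
For reference, a minimal sketch of calling this Actor from your own code with apify-client, mirroring the input fields parsed in main.py above. The token, Actor ID, and input values are placeholders, not taken from this repository:

from apify_client import ApifyClient

# Hypothetical token and Actor ID -- substitute your own values.
client = ApifyClient("<YOUR_APIFY_TOKEN>")
run = client.actor("<ACTOR_ID>").call(run_input={
    "targetUrl": "https://example.com/products",          # required by main.py
    "extractionGoal": "Extract product names and prices",  # required by main.py
    "modelName": "claude-3-5-haiku-latest",                # default in main.py
    "maxActorAttempts": 10,
})

# Scraped items are pushed to the run's default dataset (see Actor.push_data above).
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item["quality_score"], item["actor_used"], item["data"])

# The run summary lands in the key-value store under SCRAPING_RESULT (see Actor.set_value above).
record = client.key_value_store(run["defaultKeyValueStoreId"]).get_record("SCRAPING_RESULT")
print(record["value"]["success"], record["value"].get("items_count"))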

package.json

{
  "name": "llm-scraper-actor",
  "version": "1.0.0",
  "description": "AI-powered web scraper that automatically discovers and tests the best Apify actors for your scraping task using Claude AI",
  "main": "main.py",
  "scripts": {
    "start": "python3 main.py",
    "start:local": "./venv/bin/python main.py"
  },
  "keywords": [
    "web-scraping",
    "artificial-intelligence",
    "ai-powered",
    "data-extraction",
    "apify-actor",
    "claude-ai",
    "llm-scraper",
    "intelligent-scraping",
    "automated-scraping",
    "mcp-server"
  ],
  "dependencies": {},
  "author": "",
  "license": "MIT"
}

pyproject.toml

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "llmscraper"
version = "0.1.0"
description = "Intelligent web scraping framework using AI-powered quality evaluation and multiple scraping strategies"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "anthropic>=0.54.0",
    "apify-client>=1.11.0",
    "beautifulsoup4>=4.12.0",
    "apify>=1.5.0",
]

[project.scripts]
llmscraper = "llmscraper.main:main"

[project.optional-dependencies]
dev = [
    "pytest>=7.0",
    "pytest-asyncio>=0.21.0",
    "black>=23.0",
    "isort>=5.12.0",
    "mypy>=1.5.0",
]

[tool.hatch.build.targets.wheel]
packages = ["src/llmscraper"]

[tool.hatch.build.targets.sdist]
include = [
    "/src",
    "/tests",
    "/examples.json",
    "/README.md",
]

requirements.txt

apify>=1.5.0
apify-client>=1.11.0
anthropic>=0.54.0
beautifulsoup4>=4.12.0
httpx>=0.27.0
python-dotenv>=1.0.0

uv.lock

version = 1
revision = 2
requires-python = ">=3.10"
[[package]]
name = "annotated-types"
version = "0.7.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
]
[[package]]
name = "anthropic"
version = "0.54.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
{ name = "distro" },
{ name = "httpx" },
{ name = "jiter" },
{ name = "pydantic" },
{ name = "sniffio" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/89/28/80cb9bb6e7ce77d404145b51da4257455805c17f0a6be528ff3286e3882f/anthropic-0.54.0.tar.gz", hash = "sha256:5e6f997d97ce8e70eac603c3ec2e7f23addeff953fbbb76b19430562bb6ba815", size = 312376, upload-time = "2025-06-11T02:46:27.642Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/de/b9/6ffb48e82c5e97b03cecee872d134a6b6666c2767b2d32ed709f3a60a8fe/anthropic-0.54.0-py3-none-any.whl", hash = "sha256:c1062a0a905daeec17ca9c06c401e4b3f24cb0495841d29d752568a1d4018d56", size = 288774, upload-time = "2025-06-11T02:46:25.578Z" },
]
[[package]]
name = "anyio"
version = "4.9.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "exceptiongroup", marker = "python_full_version < '3.11'" },
{ name = "idna" },
{ name = "sniffio" },
{ name = "typing-extensions", marker = "python_full_version < '3.13'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949, upload-time = "2025-03-17T00:02:54.77Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c", size = 100916, upload-time = "2025-03-17T00:02:52.713Z" },
]
[[package]]
name = "apify-client"
version = "1.11.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "apify-shared" },
{ name = "colorama" },
{ name = "httpx" },
{ name = "more-itertools" },
]
sdist = { url = "https://files.pythonhosted.org/packages/49/44/b7cae857f2129d4093bc5a0a2267fcbba7905207a0b7cc424dc3c7c90291/apify_client-1.11.0.tar.gz", hash = "sha256:c2e151754c35be9bc7c1028bf7cb127aeb1ffa2fbd1ec1ad7e97b901deb32e08", size = 346095, upload-time = "2025-06-13T11:46:39.129Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8f/24/d3273bfe5b4a96fd60c8d554edbab99274fae8cb2347b96f2e3fa0bc4d5b/apify_client-1.11.0-py3-none-any.whl", hash = "sha256:9d691960bdbeee17624a2a82aafc4f0bfba9b48820a48f559b7eba76bf01cb3c", size = 82550, upload-time = "2025-06-13T11:46:37.483Z" },
]
[[package]]
name = "apify-shared"
version = "1.4.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b2/a6/c8e2fa0b3bdc479d3ecde778e2381af199f910cf7c8baa3c207bcfe26e47/apify_shared-1.4.1.tar.gz", hash = "sha256:16e617c840fd27bf38d980f079c0b867c7378f68c7006b3d5a7d530d43930507", size = 13871, upload-time = "2025-04-28T12:20:01.113Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1d/f3/3446c8a7986fdc087024d4e174e4b3f587097a9b28f6f8e8c788199225b2/apify_shared-1.4.1-py3-none-any.whl", hash = "sha256:abac5712b6e8eb96693204cbb2702905e1971d9084b1716e7337852b5005290e", size = 12706, upload-time = "2025-04-28T12:19:59.792Z" },
]
[[package]]
name = "certifi"
version = "2025.6.15"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload-time = "2025-06-15T02:45:51.329Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload-time = "2025-06-15T02:45:49.977Z" },
]
[[package]]
name = "colorama"
version = "0.4.6"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
]
[[package]]
name = "distro"
version = "1.9.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },
]
[[package]]
name = "exceptiongroup"
version = "1.3.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typing-extensions", marker = "python_full_version < '3.13'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" },
]
[[package]]
name = "h11"
version = "0.16.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
]
[[package]]
name = "httpcore"
version = "1.0.9"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "h11" },
]
sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },
]
[[package]]
name = "httpx"
version = "0.28.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
{ name = "certifi" },
{ name = "httpcore" },
{ name = "idna" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
]
[[package]]
name = "idna"
version = "3.10"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" },
]
[[package]]
name = "jiter"
version = "0.10.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ee/9d/ae7ddb4b8ab3fb1b51faf4deb36cb48a4fbbd7cb36bad6a5fca4741306f7/jiter-0.10.0.tar.gz", hash = "sha256:07a7142c38aacc85194391108dc91b5b57093c978a9932bd86a36862759d9500", size = 162759, upload-time = "2025-05-18T19:04:59.73Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/be/7e/4011b5c77bec97cb2b572f566220364e3e21b51c48c5bd9c4a9c26b41b67/jiter-0.10.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:cd2fb72b02478f06a900a5782de2ef47e0396b3e1f7d5aba30daeb1fce66f303", size = 317215, upload-time = "2025-05-18T19:03:04.303Z" },
{ url = "https://files.pythonhosted.org/packages/8a/4f/144c1b57c39692efc7ea7d8e247acf28e47d0912800b34d0ad815f6b2824/jiter-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:32bb468e3af278f095d3fa5b90314728a6916d89ba3d0ffb726dd9bf7367285e", size = 322814, upload-time = "2025-05-18T19:03:06.433Z" },
{ url = "https://files.pythonhosted.org/packages/63/1f/db977336d332a9406c0b1f0b82be6f71f72526a806cbb2281baf201d38e3/jiter-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa8b3e0068c26ddedc7abc6fac37da2d0af16b921e288a5a613f4b86f050354f", size = 345237, upload-time = "2025-05-18T19:03:07.833Z" },
{ url = "https://files.pythonhosted.org/packages/d7/1c/aa30a4a775e8a672ad7f21532bdbfb269f0706b39c6ff14e1f86bdd9e5ff/jiter-0.10.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:286299b74cc49e25cd42eea19b72aa82c515d2f2ee12d11392c56d8701f52224", size = 370999, upload-time = "2025-05-18T19:03:09.338Z" },
{ url = "https://files.pythonhosted.org/packages/35/df/f8257abc4207830cb18880781b5f5b716bad5b2a22fb4330cfd357407c5b/jiter-0.10.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ed5649ceeaeffc28d87fb012d25a4cd356dcd53eff5acff1f0466b831dda2a7", size = 491109, upload-time = "2025-05-18T19:03:11.13Z" },
{ url = "https://files.pythonhosted.org/packages/06/76/9e1516fd7b4278aa13a2cc7f159e56befbea9aa65c71586305e7afa8b0b3/jiter-0.10.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2ab0051160cb758a70716448908ef14ad476c3774bd03ddce075f3c1f90a3d6", size = 388608, upload-time = "2025-05-18T19:03:12.911Z" },
{ url = "https://files.pythonhosted.org/packages/6d/64/67750672b4354ca20ca18d3d1ccf2c62a072e8a2d452ac3cf8ced73571ef/jiter-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03997d2f37f6b67d2f5c475da4412be584e1cec273c1cfc03d642c46db43f8cf", size = 352454, upload-time = "2025-05-18T19:03:14.741Z" },
{ url = "https://files.pythonhosted.org/packages/96/4d/5c4e36d48f169a54b53a305114be3efa2bbffd33b648cd1478a688f639c1/jiter-0.10.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c404a99352d839fed80d6afd6c1d66071f3bacaaa5c4268983fc10f769112e90", size = 391833, upload-time = "2025-05-18T19:03:16.426Z" },
{ url = "https://files.pythonhosted.org/packages/0b/de/ce4a6166a78810bd83763d2fa13f85f73cbd3743a325469a4a9289af6dae/jiter-0.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:66e989410b6666d3ddb27a74c7e50d0829704ede652fd4c858e91f8d64b403d0", size = 523646, upload-time = "2025-05-18T19:03:17.704Z" },
{ url = "https://files.pythonhosted.org/packages/a2/a6/3bc9acce53466972964cf4ad85efecb94f9244539ab6da1107f7aed82934/jiter-0.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b532d3af9ef4f6374609a3bcb5e05a1951d3bf6190dc6b176fdb277c9bbf15ee", size = 514735, upload-time = "2025-05-18T19:03:19.44Z" },
{ url = "https://files.pythonhosted.org/packages/b4/d8/243c2ab8426a2a4dea85ba2a2ba43df379ccece2145320dfd4799b9633c5/jiter-0.10.0-cp310-cp310-win32.whl", hash = "sha256:da9be20b333970e28b72edc4dff63d4fec3398e05770fb3205f7fb460eb48dd4", size = 210747, upload-time = "2025-05-18T19:03:21.184Z" },
{ url = "https://files.pythonhosted.org/packages/37/7a/8021bd615ef7788b98fc76ff533eaac846322c170e93cbffa01979197a45/jiter-0.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:f59e533afed0c5b0ac3eba20d2548c4a550336d8282ee69eb07b37ea526ee4e5", size = 207484, upload-time = "2025-05-18T19:03:23.046Z" },
{ url = "https://files.pythonhosted.org/packages/1b/dd/6cefc6bd68b1c3c979cecfa7029ab582b57690a31cd2f346c4d0ce7951b6/jiter-0.10.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:3bebe0c558e19902c96e99217e0b8e8b17d570906e72ed8a87170bc290b1e978", size = 317473, upload-time = "2025-05-18T19:03:25.942Z" },
{ url = "https://files.pythonhosted.org/packages/be/cf/fc33f5159ce132be1d8dd57251a1ec7a631c7df4bd11e1cd198308c6ae32/jiter-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:558cc7e44fd8e507a236bee6a02fa17199ba752874400a0ca6cd6e2196cdb7dc", size = 321971, upload-time = "2025-05-18T19:03:27.255Z" },
{ url = "https://files.pythonhosted.org/packages/68/a4/da3f150cf1d51f6c472616fb7650429c7ce053e0c962b41b68557fdf6379/jiter-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d613e4b379a07d7c8453c5712ce7014e86c6ac93d990a0b8e7377e18505e98d", size = 345574, upload-time = "2025-05-18T19:03:28.63Z" },
{ url = "https://files.pythonhosted.org/packages/84/34/6e8d412e60ff06b186040e77da5f83bc158e9735759fcae65b37d681f28b/jiter-0.10.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f62cf8ba0618eda841b9bf61797f21c5ebd15a7a1e19daab76e4e4b498d515b2", size = 371028, upload-time = "2025-05-18T19:03:30.292Z" },
{ url = "https://files.pythonhosted.org/packages/fb/d9/9ee86173aae4576c35a2f50ae930d2ccb4c4c236f6cb9353267aa1d626b7/jiter-0.10.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:919d139cdfa8ae8945112398511cb7fca58a77382617d279556b344867a37e61", size = 491083, upload-time = "2025-05-18T19:03:31.654Z" },
{ url = "https://files.pythonhosted.org/packages/d9/2c/f955de55e74771493ac9e188b0f731524c6a995dffdcb8c255b89c6fb74b/jiter-0.10.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ddbc6ae311175a3b03bd8994881bc4635c923754932918e18da841632349db", size = 388821, upload-time = "2025-05-18T19:03:33.184Z" },
{ url = "https://files.pythonhosted.org/packages/81/5a/0e73541b6edd3f4aada586c24e50626c7815c561a7ba337d6a7eb0a915b4/jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c440ea003ad10927a30521a9062ce10b5479592e8a70da27f21eeb457b4a9c5", size = 352174, upload-time = "2025-05-18T19:03:34.965Z" },
{ url = "https://files.pythonhosted.org/packages/1c/c0/61eeec33b8c75b31cae42be14d44f9e6fe3ac15a4e58010256ac3abf3638/jiter-0.10.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dc347c87944983481e138dea467c0551080c86b9d21de6ea9306efb12ca8f606", size = 391869, upload-time = "2025-05-18T19:03:36.436Z" },
{ url = "https://files.pythonhosted.org/packages/41/22/5beb5ee4ad4ef7d86f5ea5b4509f680a20706c4a7659e74344777efb7739/jiter-0.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:13252b58c1f4d8c5b63ab103c03d909e8e1e7842d302473f482915d95fefd605", size = 523741, upload-time = "2025-05-18T19:03:38.168Z" },
{ url = "https://files.pythonhosted.org/packages/ea/10/768e8818538e5817c637b0df52e54366ec4cebc3346108a4457ea7a98f32/jiter-0.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d1bbf3c465de4a24ab12fb7766a0003f6f9bce48b8b6a886158c4d569452dc5", size = 514527, upload-time = "2025-05-18T19:03:39.577Z" },
{ url = "https://files.pythonhosted.org/packages/73/6d/29b7c2dc76ce93cbedabfd842fc9096d01a0550c52692dfc33d3cc889815/jiter-0.10.0-cp311-cp311-win32.whl", hash = "sha256:db16e4848b7e826edca4ccdd5b145939758dadf0dc06e7007ad0e9cfb5928ae7", size = 210765, upload-time = "2025-05-18T19:03:41.271Z" },
{ url = "https://files.pythonhosted.org/packages/c2/c9/d394706deb4c660137caf13e33d05a031d734eb99c051142e039d8ceb794/jiter-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c9c1d5f10e18909e993f9641f12fe1c77b3e9b533ee94ffa970acc14ded3812", size = 209234, upload-time = "2025-05-18T19:03:42.918Z" },
{ url = "https://files.pythonhosted.org/packages/6d/b5/348b3313c58f5fbfb2194eb4d07e46a35748ba6e5b3b3046143f3040bafa/jiter-0.10.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1e274728e4a5345a6dde2d343c8da018b9d4bd4350f5a472fa91f66fda44911b", size = 312262, upload-time = "2025-05-18T19:03:44.637Z" },
{ url = "https://files.pythonhosted.org/packages/9c/4a/6a2397096162b21645162825f058d1709a02965606e537e3304b02742e9b/jiter-0.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7202ae396446c988cb2a5feb33a543ab2165b786ac97f53b59aafb803fef0744", size = 320124, upload-time = "2025-05-18T19:03:46.341Z" },
{ url = "https://files.pythonhosted.org/packages/2a/85/1ce02cade7516b726dd88f59a4ee46914bf79d1676d1228ef2002ed2f1c9/jiter-0.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23ba7722d6748b6920ed02a8f1726fb4b33e0fd2f3f621816a8b486c66410ab2", size = 345330, upload-time = "2025-05-18T19:03:47.596Z" },
{ url = "https://files.pythonhosted.org/packages/75/d0/bb6b4f209a77190ce10ea8d7e50bf3725fc16d3372d0a9f11985a2b23eff/jiter-0.10.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:371eab43c0a288537d30e1f0b193bc4eca90439fc08a022dd83e5e07500ed026", size = 369670, upload-time = "2025-05-18T19:03:49.334Z" },
{ url = "https://files.pythonhosted.org/packages/a0/f5/a61787da9b8847a601e6827fbc42ecb12be2c925ced3252c8ffcb56afcaf/jiter-0.10.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c675736059020365cebc845a820214765162728b51ab1e03a1b7b3abb70f74c", size = 489057, upload-time = "2025-05-18T19:03:50.66Z" },
{ url = "https://files.pythonhosted.org/packages/12/e4/6f906272810a7b21406c760a53aadbe52e99ee070fc5c0cb191e316de30b/jiter-0.10.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c5867d40ab716e4684858e4887489685968a47e3ba222e44cde6e4a2154f959", size = 389372, upload-time = "2025-05-18T19:03:51.98Z" },
{ url = "https://files.pythonhosted.org/packages/e2/ba/77013b0b8ba904bf3762f11e0129b8928bff7f978a81838dfcc958ad5728/jiter-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395bb9a26111b60141757d874d27fdea01b17e8fac958b91c20128ba8f4acc8a", size = 352038, upload-time = "2025-05-18T19:03:53.703Z" },
{ url = "https://files.pythonhosted.org/packages/67/27/c62568e3ccb03368dbcc44a1ef3a423cb86778a4389e995125d3d1aaa0a4/jiter-0.10.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6842184aed5cdb07e0c7e20e5bdcfafe33515ee1741a6835353bb45fe5d1bd95", size = 391538, upload-time = "2025-05-18T19:03:55.046Z" },
{ url = "https://files.pythonhosted.org/packages/c0/72/0d6b7e31fc17a8fdce76164884edef0698ba556b8eb0af9546ae1a06b91d/jiter-0.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:62755d1bcea9876770d4df713d82606c8c1a3dca88ff39046b85a048566d56ea", size = 523557, upload-time = "2025-05-18T19:03:56.386Z" },
{ url = "https://files.pythonhosted.org/packages/2f/09/bc1661fbbcbeb6244bd2904ff3a06f340aa77a2b94e5a7373fd165960ea3/jiter-0.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:533efbce2cacec78d5ba73a41756beff8431dfa1694b6346ce7af3a12c42202b", size = 514202, upload-time = "2025-05-18T19:03:57.675Z" },
{ url = "https://files.pythonhosted.org/packages/1b/84/5a5d5400e9d4d54b8004c9673bbe4403928a00d28529ff35b19e9d176b19/jiter-0.10.0-cp312-cp312-win32.whl", hash = "sha256:8be921f0cadd245e981b964dfbcd6fd4bc4e254cdc069490416dd7a2632ecc01", size = 211781, upload-time = "2025-05-18T19:03:59.025Z" },
{ url = "https://files.pythonhosted.org/packages/9b/52/7ec47455e26f2d6e5f2ea4951a0652c06e5b995c291f723973ae9e724a65/jiter-0.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:a7c7d785ae9dda68c2678532a5a1581347e9c15362ae9f6e68f3fdbfb64f2e49", size = 206176, upload-time = "2025-05-18T19:04:00.305Z" },
{ url = "https://files.pythonhosted.org/packages/2e/b0/279597e7a270e8d22623fea6c5d4eeac328e7d95c236ed51a2b884c54f70/jiter-0.10.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e0588107ec8e11b6f5ef0e0d656fb2803ac6cf94a96b2b9fc675c0e3ab5e8644", size = 311617, upload-time = "2025-05-18T19:04:02.078Z" },
{ url = "https://files.pythonhosted.org/packages/91/e3/0916334936f356d605f54cc164af4060e3e7094364add445a3bc79335d46/jiter-0.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cafc4628b616dc32530c20ee53d71589816cf385dd9449633e910d596b1f5c8a", size = 318947, upload-time = "2025-05-18T19:04:03.347Z" },
{ url = "https://files.pythonhosted.org/packages/6a/8e/fd94e8c02d0e94539b7d669a7ebbd2776e51f329bb2c84d4385e8063a2ad/jiter-0.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:520ef6d981172693786a49ff5b09eda72a42e539f14788124a07530f785c3ad6", size = 344618, upload-time = "2025-05-18T19:04:04.709Z" },
{ url = "https://files.pythonhosted.org/packages/6f/b0/f9f0a2ec42c6e9c2e61c327824687f1e2415b767e1089c1d9135f43816bd/jiter-0.10.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:554dedfd05937f8fc45d17ebdf298fe7e0c77458232bcb73d9fbbf4c6455f5b3", size = 368829, upload-time = "2025-05-18T19:04:06.912Z" },
{ url = "https://files.pythonhosted.org/packages/e8/57/5bbcd5331910595ad53b9fd0c610392ac68692176f05ae48d6ce5c852967/jiter-0.10.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5bc299da7789deacf95f64052d97f75c16d4fc8c4c214a22bf8d859a4288a1c2", size = 491034, upload-time = "2025-05-18T19:04:08.222Z" },
{ url = "https://files.pythonhosted.org/packages/9b/be/c393df00e6e6e9e623a73551774449f2f23b6ec6a502a3297aeeece2c65a/jiter-0.10.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5161e201172de298a8a1baad95eb85db4fb90e902353b1f6a41d64ea64644e25", size = 388529, upload-time = "2025-05-18T19:04:09.566Z" },
{ url = "https://files.pythonhosted.org/packages/42/3e/df2235c54d365434c7f150b986a6e35f41ebdc2f95acea3036d99613025d/jiter-0.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e2227db6ba93cb3e2bf67c87e594adde0609f146344e8207e8730364db27041", size = 350671, upload-time = "2025-05-18T19:04:10.98Z" },
{ url = "https://files.pythonhosted.org/packages/c6/77/71b0b24cbcc28f55ab4dbfe029f9a5b73aeadaba677843fc6dc9ed2b1d0a/jiter-0.10.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:15acb267ea5e2c64515574b06a8bf393fbfee6a50eb1673614aa45f4613c0cca", size = 390864, upload-time = "2025-05-18T19:04:12.722Z" },
{ url = "https://files.pythonhosted.org/packages/6a/d3/ef774b6969b9b6178e1d1e7a89a3bd37d241f3d3ec5f8deb37bbd203714a/jiter-0.10.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:901b92f2e2947dc6dfcb52fd624453862e16665ea909a08398dde19c0731b7f4", size = 522989, upload-time = "2025-05-18T19:04:14.261Z" },
{ url = "https://files.pythonhosted.org/packages/0c/41/9becdb1d8dd5d854142f45a9d71949ed7e87a8e312b0bede2de849388cb9/jiter-0.10.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d0cb9a125d5a3ec971a094a845eadde2db0de85b33c9f13eb94a0c63d463879e", size = 513495, upload-time = "2025-05-18T19:04:15.603Z" },
{ url = "https://files.pythonhosted.org/packages/9c/36/3468e5a18238bdedae7c4d19461265b5e9b8e288d3f86cd89d00cbb48686/jiter-0.10.0-cp313-cp313-win32.whl", hash = "sha256:48a403277ad1ee208fb930bdf91745e4d2d6e47253eedc96e2559d1e6527006d", size = 211289, upload-time = "2025-05-18T19:04:17.541Z" },
{ url = "https://files.pythonhosted.org/packages/7e/07/1c96b623128bcb913706e294adb5f768fb7baf8db5e1338ce7b4ee8c78ef/jiter-0.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:75f9eb72ecb640619c29bf714e78c9c46c9c4eaafd644bf78577ede459f330d4", size = 205074, upload-time = "2025-05-18T19:04:19.21Z" },
{ url = "https://files.pythonhosted.org/packages/54/46/caa2c1342655f57d8f0f2519774c6d67132205909c65e9aa8255e1d7b4f4/jiter-0.10.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:28ed2a4c05a1f32ef0e1d24c2611330219fed727dae01789f4a335617634b1ca", size = 318225, upload-time = "2025-05-18T19:04:20.583Z" },
{ url = "https://files.pythonhosted.org/packages/43/84/c7d44c75767e18946219ba2d703a5a32ab37b0bc21886a97bc6062e4da42/jiter-0.10.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a4c418b1ec86a195f1ca69da8b23e8926c752b685af665ce30777233dfe070", size = 350235, upload-time = "2025-05-18T19:04:22.363Z" },
{ url = "https://files.pythonhosted.org/packages/01/16/f5a0135ccd968b480daad0e6ab34b0c7c5ba3bc447e5088152696140dcb3/jiter-0.10.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d7bfed2fe1fe0e4dda6ef682cee888ba444b21e7a6553e03252e4feb6cf0adca", size = 207278, upload-time = "2025-05-18T19:04:23.627Z" },
{ url = "https://files.pythonhosted.org/packages/1c/9b/1d646da42c3de6c2188fdaa15bce8ecb22b635904fc68be025e21249ba44/jiter-0.10.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:5e9251a5e83fab8d87799d3e1a46cb4b7f2919b895c6f4483629ed2446f66522", size = 310866, upload-time = "2025-05-18T19:04:24.891Z" },
{ url = "https://files.pythonhosted.org/packages/ad/0e/26538b158e8a7c7987e94e7aeb2999e2e82b1f9d2e1f6e9874ddf71ebda0/jiter-0.10.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:023aa0204126fe5b87ccbcd75c8a0d0261b9abdbbf46d55e7ae9f8e22424eeb8", size = 318772, upload-time = "2025-05-18T19:04:26.161Z" },
{ url = "https://files.pythonhosted.org/packages/7b/fb/d302893151caa1c2636d6574d213e4b34e31fd077af6050a9c5cbb42f6fb/jiter-0.10.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c189c4f1779c05f75fc17c0c1267594ed918996a231593a21a5ca5438445216", size = 344534, upload-time = "2025-05-18T19:04:27.495Z" },
{ url = "https://files.pythonhosted.org/packages/01/d8/5780b64a149d74e347c5128d82176eb1e3241b1391ac07935693466d6219/jiter-0.10.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:15720084d90d1098ca0229352607cd68256c76991f6b374af96f36920eae13c4", size = 369087, upload-time = "2025-05-18T19:04:28.896Z" },
{ url = "https://files.pythonhosted.org/packages/e8/5b/f235a1437445160e777544f3ade57544daf96ba7e96c1a5b24a6f7ac7004/jiter-0.10.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e4f2fb68e5f1cfee30e2b2a09549a00683e0fde4c6a2ab88c94072fc33cb7426", size = 490694, upload-time = "2025-05-18T19:04:30.183Z" },
{ url = "https://files.pythonhosted.org/packages/85/a9/9c3d4617caa2ff89cf61b41e83820c27ebb3f7b5fae8a72901e8cd6ff9be/jiter-0.10.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce541693355fc6da424c08b7edf39a2895f58d6ea17d92cc2b168d20907dee12", size = 388992, upload-time = "2025-05-18T19:04:32.028Z" },
{ url = "https://files.pythonhosted.org/packages/68/b1/344fd14049ba5c94526540af7eb661871f9c54d5f5601ff41a959b9a0bbd/jiter-0.10.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31c50c40272e189d50006ad5c73883caabb73d4e9748a688b216e85a9a9ca3b9", size = 351723, upload-time = "2025-05-18T19:04:33.467Z" },
{ url = "https://files.pythonhosted.org/packages/41/89/4c0e345041186f82a31aee7b9d4219a910df672b9fef26f129f0cda07a29/jiter-0.10.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fa3402a2ff9815960e0372a47b75c76979d74402448509ccd49a275fa983ef8a", size = 392215, upload-time = "2025-05-18T19:04:34.827Z" },
{ url = "https://files.pythonhosted.org/packages/55/58/ee607863e18d3f895feb802154a2177d7e823a7103f000df182e0f718b38/jiter-0.10.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:1956f934dca32d7bb647ea21d06d93ca40868b505c228556d3373cbd255ce853", size = 522762, upload-time = "2025-05-18T19:04:36.19Z" },
{ url = "https://files.pythonhosted.org/packages/15/d0/9123fb41825490d16929e73c212de9a42913d68324a8ce3c8476cae7ac9d/jiter-0.10.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:fcedb049bdfc555e261d6f65a6abe1d5ad68825b7202ccb9692636c70fcced86", size = 513427, upload-time = "2025-05-18T19:04:37.544Z" },
{ url = "https://files.pythonhosted.org/packages/d8/b3/2bd02071c5a2430d0b70403a34411fc519c2f227da7b03da9ba6a956f931/jiter-0.10.0-cp314-cp314-win32.whl", hash = "sha256:ac509f7eccca54b2a29daeb516fb95b6f0bd0d0d8084efaf8ed5dfc7b9f0b357", size = 210127, upload-time = "2025-05-18T19:04:38.837Z" },
{ url = "https://files.pythonhosted.org/packages/03/0c/5fe86614ea050c3ecd728ab4035534387cd41e7c1855ef6c031f1ca93e3f/jiter-0.10.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5ed975b83a2b8639356151cef5c0d597c68376fc4922b45d0eb384ac058cfa00", size = 318527, upload-time = "2025-05-18T19:04:40.612Z" },
{ url = "https://files.pythonhosted.org/packages/b3/4a/4175a563579e884192ba6e81725fc0448b042024419be8d83aa8a80a3f44/jiter-0.10.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa96f2abba33dc77f79b4cf791840230375f9534e5fac927ccceb58c5e604a5", size = 354213, upload-time = "2025-05-18T19:04:41.894Z" },
]
[[package]]
name = "llmscraper"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "anthropic" },
{ name = "apify-client" },
]
[package.metadata]
requires-dist = [
{ name = "anthropic", specifier = ">=0.54.0" },
{ name = "apify-client", specifier = ">=1.11.0" },
]
[[package]]
name = "more-itertools"
version = "10.7.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ce/a0/834b0cebabbfc7e311f30b46c8188790a37f89fc8d756660346fe5abfd09/more_itertools-10.7.0.tar.gz", hash = "sha256:9fddd5403be01a94b204faadcff459ec3568cf110265d3c54323e1e866ad29d3", size = 127671, upload-time = "2025-04-22T14:17:41.838Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/2b/9f/7ba6f94fc1e9ac3d2b853fdff3035fb2fa5afbed898c4a72b8a020610594/more_itertools-10.7.0-py3-none-any.whl", hash = "sha256:d43980384673cb07d2f7d2d918c616b30c659c089ee23953f601d6609c67510e", size = 65278, upload-time = "2025-04-22T14:17:40.49Z" },
]
[[package]]
name = "pydantic"
version = "2.11.7"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "annotated-types" },
{ name = "pydantic-core" },
{ name = "typing-extensions" },
{ name = "typing-inspection" },
]
sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" },
]
[[package]]
name = "pydantic-core"
version = "2.33.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e5/92/b31726561b5dae176c2d2c2dc43a9c5bfba5d32f96f8b4c0a600dd492447/pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8", size = 2028817, upload-time = "2025-04-23T18:30:43.919Z" },
{ url = "https://files.pythonhosted.org/packages/a3/44/3f0b95fafdaca04a483c4e685fe437c6891001bf3ce8b2fded82b9ea3aa1/pydantic_core-2.33.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d", size = 1861357, upload-time = "2025-04-23T18:30:46.372Z" },
{ url = "https://files.pythonhosted.org/packages/30/97/e8f13b55766234caae05372826e8e4b3b96e7b248be3157f53237682e43c/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d", size = 1898011, upload-time = "2025-04-23T18:30:47.591Z" },
{ url = "https://files.pythonhosted.org/packages/9b/a3/99c48cf7bafc991cc3ee66fd544c0aae8dc907b752f1dad2d79b1b5a471f/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572", size = 1982730, upload-time = "2025-04-23T18:30:49.328Z" },
{ url = "https://files.pythonhosted.org/packages/de/8e/a5b882ec4307010a840fb8b58bd9bf65d1840c92eae7534c7441709bf54b/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02", size = 2136178, upload-time = "2025-04-23T18:30:50.907Z" },
{ url = "https://files.pythonhosted.org/packages/e4/bb/71e35fc3ed05af6834e890edb75968e2802fe98778971ab5cba20a162315/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b", size = 2736462, upload-time = "2025-04-23T18:30:52.083Z" },
{ url = "https://files.pythonhosted.org/packages/31/0d/c8f7593e6bc7066289bbc366f2235701dcbebcd1ff0ef8e64f6f239fb47d/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2", size = 2005652, upload-time = "2025-04-23T18:30:53.389Z" },
{ url = "https://files.pythonhosted.org/packages/d2/7a/996d8bd75f3eda405e3dd219ff5ff0a283cd8e34add39d8ef9157e722867/pydantic_core-2.33.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a", size = 2113306, upload-time = "2025-04-23T18:30:54.661Z" },
{ url = "https://files.pythonhosted.org/packages/ff/84/daf2a6fb2db40ffda6578a7e8c5a6e9c8affb251a05c233ae37098118788/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac", size = 2073720, upload-time = "2025-04-23T18:30:56.11Z" },
{ url = "https://files.pythonhosted.org/packages/77/fb/2258da019f4825128445ae79456a5499c032b55849dbd5bed78c95ccf163/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a", size = 2244915, upload-time = "2025-04-23T18:30:57.501Z" },
{ url = "https://files.pythonhosted.org/packages/d8/7a/925ff73756031289468326e355b6fa8316960d0d65f8b5d6b3a3e7866de7/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b", size = 2241884, upload-time = "2025-04-23T18:30:58.867Z" },
{ url = "https://files.pythonhosted.org/packages/0b/b0/249ee6d2646f1cdadcb813805fe76265745c4010cf20a8eba7b0e639d9b2/pydantic_core-2.33.2-cp310-cp310-win32.whl", hash = "sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22", size = 1910496, upload-time = "2025-04-23T18:31:00.078Z" },
{ url = "https://files.pythonhosted.org/packages/66/ff/172ba8f12a42d4b552917aa65d1f2328990d3ccfc01d5b7c943ec084299f/pydantic_core-2.33.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640", size = 1955019, upload-time = "2025-04-23T18:31:01.335Z" },
{ url = "https://files.pythonhosted.org/packages/3f/8d/71db63483d518cbbf290261a1fc2839d17ff89fce7089e08cad07ccfce67/pydantic_core-2.33.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7", size = 2028584, upload-time = "2025-04-23T18:31:03.106Z" },
{ url = "https://files.pythonhosted.org/packages/24/2f/3cfa7244ae292dd850989f328722d2aef313f74ffc471184dc509e1e4e5a/pydantic_core-2.33.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246", size = 1855071, upload-time = "2025-04-23T18:31:04.621Z" },
{ url = "https://files.pythonhosted.org/packages/b3/d3/4ae42d33f5e3f50dd467761304be2fa0a9417fbf09735bc2cce003480f2a/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f", size = 1897823, upload-time = "2025-04-23T18:31:06.377Z" },
{ url = "https://files.pythonhosted.org/packages/f4/f3/aa5976e8352b7695ff808599794b1fba2a9ae2ee954a3426855935799488/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc", size = 1983792, upload-time = "2025-04-23T18:31:07.93Z" },
{ url = "https://files.pythonhosted.org/packages/d5/7a/cda9b5a23c552037717f2b2a5257e9b2bfe45e687386df9591eff7b46d28/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de", size = 2136338, upload-time = "2025-04-23T18:31:09.283Z" },
{ url = "https://files.pythonhosted.org/packages/2b/9f/b8f9ec8dd1417eb9da784e91e1667d58a2a4a7b7b34cf4af765ef663a7e5/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a", size = 2730998, upload-time = "2025-04-23T18:31:11.7Z" },
{ url = "https://files.pythonhosted.org/packages/47/bc/cd720e078576bdb8255d5032c5d63ee5c0bf4b7173dd955185a1d658c456/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef", size = 2003200, upload-time = "2025-04-23T18:31:13.536Z" },
{ url = "https://files.pythonhosted.org/packages/ca/22/3602b895ee2cd29d11a2b349372446ae9727c32e78a94b3d588a40fdf187/pydantic_core-2.33.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e", size = 2113890, upload-time = "2025-04-23T18:31:15.011Z" },
{ url = "https://files.pythonhosted.org/packages/ff/e6/e3c5908c03cf00d629eb38393a98fccc38ee0ce8ecce32f69fc7d7b558a7/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d", size = 2073359, upload-time = "2025-04-23T18:31:16.393Z" },
{ url = "https://files.pythonhosted.org/packages/12/e7/6a36a07c59ebefc8777d1ffdaf5ae71b06b21952582e4b07eba88a421c79/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30", size = 2245883, upload-time = "2025-04-23T18:31:17.892Z" },
{ url = "https://files.pythonhosted.org/packages/16/3f/59b3187aaa6cc0c1e6616e8045b284de2b6a87b027cce2ffcea073adf1d2/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf", size = 2241074, upload-time = "2025-04-23T18:31:19.205Z" },
{ url = "https://files.pythonhosted.org/packages/e0/ed/55532bb88f674d5d8f67ab121a2a13c385df382de2a1677f30ad385f7438/pydantic_core-2.33.2-cp311-cp311-win32.whl", hash = "sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51", size = 1910538, upload-time = "2025-04-23T18:31:20.541Z" },
{ url = "https://files.pythonhosted.org/packages/fe/1b/25b7cccd4519c0b23c2dd636ad39d381abf113085ce4f7bec2b0dc755eb1/pydantic_core-2.33.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab", size = 1952909, upload-time = "2025-04-23T18:31:22.371Z" },
{ url = "https://files.pythonhosted.org/packages/49/a9/d809358e49126438055884c4366a1f6227f0f84f635a9014e2deb9b9de54/pydantic_core-2.33.2-cp311-cp311-win_arm64.whl", hash = "sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65", size = 1897786, upload-time = "2025-04-23T18:31:24.161Z" },
{ url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" },
{ url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" },
{ url = "https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" },
{ url = "https://files.pythonhosted.org/packages/ec/6b/1ec2c03837ac00886ba8160ce041ce4e325b41d06a034adbef11339ae422/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011", size = 1964199, upload-time = "2025-04-23T18:31:31.025Z" },
{ url = "https://files.pythonhosted.org/packages/2d/1d/6bf34d6adb9debd9136bd197ca72642203ce9aaaa85cfcbfcf20f9696e83/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f", size = 2120296, upload-time = "2025-04-23T18:31:32.514Z" },
{ url = "https://files.pythonhosted.org/packages/e0/94/2bd0aaf5a591e974b32a9f7123f16637776c304471a0ab33cf263cf5591a/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88", size = 2676109, upload-time = "2025-04-23T18:31:33.958Z" },
{ url = "https://files.pythonhosted.org/packages/f9/41/4b043778cf9c4285d59742281a769eac371b9e47e35f98ad321349cc5d61/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1", size = 2002028, upload-time = "2025-04-23T18:31:39.095Z" },
{ url = "https://files.pythonhosted.org/packages/cb/d5/7bb781bf2748ce3d03af04d5c969fa1308880e1dca35a9bd94e1a96a922e/pydantic_core-2.33.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b", size = 2100044, upload-time = "2025-04-23T18:31:41.034Z" },
{ url = "https://files.pythonhosted.org/packages/fe/36/def5e53e1eb0ad896785702a5bbfd25eed546cdcf4087ad285021a90ed53/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1", size = 2058881, upload-time = "2025-04-23T18:31:42.757Z" },
{ url = "https://files.pythonhosted.org/packages/01/6c/57f8d70b2ee57fc3dc8b9610315949837fa8c11d86927b9bb044f8705419/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6", size = 2227034, upload-time = "2025-04-23T18:31:44.304Z" },
{ url = "https://files.pythonhosted.org/packages/27/b9/9c17f0396a82b3d5cbea4c24d742083422639e7bb1d5bf600e12cb176a13/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea", size = 2234187, upload-time = "2025-04-23T18:31:45.891Z" },
{ url = "https://files.pythonhosted.org/packages/b0/6a/adf5734ffd52bf86d865093ad70b2ce543415e0e356f6cacabbc0d9ad910/pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290", size = 1892628, upload-time = "2025-04-23T18:31:47.819Z" },
{ url = "https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2", size = 1955866, upload-time = "2025-04-23T18:31:49.635Z" },
{ url = "https://files.pythonhosted.org/packages/0d/24/8b11e8b3e2be9dd82df4b11408a67c61bb4dc4f8e11b5b0fc888b38118b5/pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab", size = 1888894, upload-time = "2025-04-23T18:31:51.609Z" },
{ url = "https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" },
{ url = "https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" },
{ url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" },
{ url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" },
{ url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" },
{ url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" },
{ url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" },
{ url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" },
{ url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" },
{ url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" },
{ url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" },
{ url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, upload-time = "2025-04-23T18:32:14.034Z" },
{ url = "https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" },
{ url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" },
{ url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" },
{ url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" },
{ url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" },
{ url = "https://files.pythonhosted.org/packages/30/68/373d55e58b7e83ce371691f6eaa7175e3a24b956c44628eb25d7da007917/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa", size = 2023982, upload-time = "2025-04-23T18:32:53.14Z" },
{ url = "https://files.pythonhosted.org/packages/a4/16/145f54ac08c96a63d8ed6442f9dec17b2773d19920b627b18d4f10a061ea/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29", size = 1858412, upload-time = "2025-04-23T18:32:55.52Z" },
{ url = "https://files.pythonhosted.org/packages/41/b1/c6dc6c3e2de4516c0bb2c46f6a373b91b5660312342a0cf5826e38ad82fa/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d", size = 1892749, upload-time = "2025-04-23T18:32:57.546Z" },
{ url = "https://files.pythonhosted.org/packages/12/73/8cd57e20afba760b21b742106f9dbdfa6697f1570b189c7457a1af4cd8a0/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e", size = 2067527, upload-time = "2025-04-23T18:32:59.771Z" },
{ url = "https://files.pythonhosted.org/packages/e3/d5/0bb5d988cc019b3cba4a78f2d4b3854427fc47ee8ec8e9eaabf787da239c/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c", size = 2108225, upload-time = "2025-04-23T18:33:04.51Z" },
{ url = "https://files.pythonhosted.org/packages/f1/c5/00c02d1571913d496aabf146106ad8239dc132485ee22efe08085084ff7c/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec", size = 2069490, upload-time = "2025-04-23T18:33:06.391Z" },
{ url = "https://files.pythonhosted.org/packages/22/a8/dccc38768274d3ed3a59b5d06f59ccb845778687652daa71df0cab4040d7/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052", size = 2237525, upload-time = "2025-04-23T18:33:08.44Z" },
{ url = "https://files.pythonhosted.org/packages/d4/e7/4f98c0b125dda7cf7ccd14ba936218397b44f50a56dd8c16a3091df116c3/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c", size = 2238446, upload-time = "2025-04-23T18:33:10.313Z" },
{ url = "https://files.pythonhosted.org/packages/ce/91/2ec36480fdb0b783cd9ef6795753c1dea13882f2e68e73bce76ae8c21e6a/pydantic_core-2.33.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808", size = 2066678, upload-time = "2025-04-23T18:33:12.224Z" },
{ url = "https://files.pythonhosted.org/packages/7b/27/d4ae6487d73948d6f20dddcd94be4ea43e74349b56eba82e9bdee2d7494c/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8", size = 2025200, upload-time = "2025-04-23T18:33:14.199Z" },
{ url = "https://files.pythonhosted.org/packages/f1/b8/b3cb95375f05d33801024079b9392a5ab45267a63400bf1866e7ce0f0de4/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593", size = 1859123, upload-time = "2025-04-23T18:33:16.555Z" },
{ url = "https://files.pythonhosted.org/packages/05/bc/0d0b5adeda59a261cd30a1235a445bf55c7e46ae44aea28f7bd6ed46e091/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612", size = 1892852, upload-time = "2025-04-23T18:33:18.513Z" },
{ url = "https://files.pythonhosted.org/packages/3e/11/d37bdebbda2e449cb3f519f6ce950927b56d62f0b84fd9cb9e372a26a3d5/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7", size = 2067484, upload-time = "2025-04-23T18:33:20.475Z" },
{ url = "https://files.pythonhosted.org/packages/8c/55/1f95f0a05ce72ecb02a8a8a1c3be0579bbc29b1d5ab68f1378b7bebc5057/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e", size = 2108896, upload-time = "2025-04-23T18:33:22.501Z" },
{ url = "https://files.pythonhosted.org/packages/53/89/2b2de6c81fa131f423246a9109d7b2a375e83968ad0800d6e57d0574629b/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8", size = 2069475, upload-time = "2025-04-23T18:33:24.528Z" },
{ url = "https://files.pythonhosted.org/packages/b8/e9/1f7efbe20d0b2b10f6718944b5d8ece9152390904f29a78e68d4e7961159/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf", size = 2239013, upload-time = "2025-04-23T18:33:26.621Z" },
{ url = "https://files.pythonhosted.org/packages/3c/b2/5309c905a93811524a49b4e031e9851a6b00ff0fb668794472ea7746b448/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb", size = 2238715, upload-time = "2025-04-23T18:33:28.656Z" },
{ url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" },
]
[[package]]
name = "sniffio"
version = "1.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
]
[[package]]
name = "typing-extensions"
version = "4.14.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d1/bc/51647cd02527e87d05cb083ccc402f93e441606ff1f01739a62c8ad09ba5/typing_extensions-4.14.0.tar.gz", hash = "sha256:8676b788e32f02ab42d9e7c61324048ae4c6d844a399eebace3d4979d75ceef4", size = 107423, upload-time = "2025-06-02T14:52:11.399Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/69/e0/552843e0d356fbb5256d21449fa957fa4eff3bbc135a74a691ee70c7c5da/typing_extensions-4.14.0-py3-none-any.whl", hash = "sha256:a1514509136dd0b477638fc68d6a91497af5076466ad0fa6c338e44e359944af", size = 43839, upload-time = "2025-06-02T14:52:10.026Z" },
]
[[package]]
name = "typing-inspection"
version = "0.4.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" },
]

.actor/README.md

1# 🤖 LLM-Powered Web Scraper
2
3An intelligent Apify Actor that uses Claude AI to automatically discover, test, and select the best Apify actors for your web scraping tasks. No manual configuration needed!
4
5## ✨ Features
6
7- **🧠 AI-Powered Actor Discovery**: Uses Claude AI to automatically find and test the best Apify actors for your target website
8- **🔄 Smart Retry Logic**: Automatically adjusts parameters and retries failed attempts with different actors
9- **📊 Quality Assessment**: Evaluates scraped data quality across multiple dimensions (completeness, relevance, structure, volume)
10- **🎯 Priority-Based Testing**: Tests domain-specific actors first, then falls back to general-purpose ones
11- **📈 Real-time Progress**: Tracks and reports scraping progress with detailed logging
12- **🔗 MCP Integration**: Connects to Apify MCP Server for dynamic actor discovery and execution
13- **⚙️ Flexible Configuration**: Extensive customization options for timeout, quality thresholds, and model selection
14- **🛡️ Error Handling**: Robust error handling with detailed logging and graceful fallbacks
15
16## 🚀 Quick Start
17
181. **Set up your Claude API key** in the Actor input or as an environment variable
192. **Provide your target URL** and describe what data you want to extract
203. **Run the Actor** - it will automatically find and test the best scraping approach
21
22### Example Input
23
24```json
25{
26 "targetUrl": "https://books.toscrape.com/",
27 "extractionGoal": "Extract book information including title, price, star rating, and availability",
28 "claudeApiKey": "sk-ant-api03-...",
29 "maxActorAttempts": 5,
30 "maxTimeMinutes": 20
31}
32```
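You can also start runs programmatically with the `apify-client` Python package. The sketch below is illustrative only — the token and actor ID are placeholders you must replace with your own:

```python
from apify_client import ApifyClient

client = ApifyClient("<YOUR_APIFY_TOKEN>")

# Start the Actor with the same input as above and wait for it to finish
run = client.actor("<username>/llmscraper").call(run_input={
    "targetUrl": "https://books.toscrape.com/",
    "extractionGoal": "Extract book information including title, price, star rating, and availability",
    "claudeApiKey": "sk-ant-api03-...",
    "maxActorAttempts": 5,
    "maxTimeMinutes": 20,
})

# Read the scraped items from the run's default dataset
items = client.dataset(run["defaultDatasetId"]).list_items().items
print(f"Scraped {len(items)} items")
```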
33
34## 📝 Input Configuration
35
36### Required Fields
37
38- **`targetUrl`**: The URL of the website you want to scrape
39- **`extractionGoal`**: Describe what data you want to extract from the website
40- **`claudeApiKey`**: Your Anthropic Claude API key for AI-powered analysis
41
42### Optional Configuration
43
44- **`maxActorAttempts`** (default: 10): Maximum number of different actors to try
45- **`maxRetriesPerActor`** (default: 3): Maximum retry attempts per actor
46- **`maxTimeMinutes`** (default: 30): Maximum total execution time in minutes
47- **`modelName`** (default: "claude-3-5-haiku-latest"): Claude model to use
48- **`debugMode`** (default: false): Enable detailed logging
49- **`preferSpecificActors`** (default: true): Prioritize domain-specific actors
50- **`minDataQualityScore`** (default: 70): Minimum quality score (0-100) to accept results
51- **`enableProxy`** (default: true): Use proxy for scraping requests
52
53### Available Claude Models
54
55- `claude-3-5-haiku-latest` - Fast & cost-effective (recommended)
56- `claude-3-5-sonnet-latest` - Balanced performance and quality
57- `claude-3-opus-latest` - Maximum quality (slower, more expensive)
58
59## 📊 Output
60
61The Actor saves results to:
62
63### Dataset
64
65Each scraped item with metadata:
66
67```json
68{
69 "url": "https://example.com",
70 "data": {...},
71 "quality_score": 0.85,
72 "actor_used": "apify/web-scraper",
73 "timestamp": "2025-07-24T11:30:00Z",
74 "success": true,
75 "extraction_goal": "Extract product information",
76 "total_execution_time": 45.2,
77 "attempts_made": 3
78}
79```
80
81### Key-Value Store
82
83Summary information in `SCRAPING_RESULT`:
84
85```json
86{
87 "success": true,
88 "quality_score": 0.85,
89 "items_count": 25,
90 "best_actor_id": "apify/web-scraper",
91 "total_execution_time": 45.2,
92 "attempts_made": 3,
93 "progress_updates": [...],
94 "actor_attempts": [...]
95}
96```
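A quick way to read this summary after a run is to pull the record from the run's default key-value store. A minimal sketch using the `apify-client` Python package (again, the actor ID is a placeholder):

```python
from apify_client import ApifyClient

client = ApifyClient("<YOUR_APIFY_TOKEN>")

# Fetch SCRAPING_RESULT from the most recent run of the Actor
record = (
    client.actor("<username>/llmscraper")
    .last_run()
    .key_value_store()
    .get_record("SCRAPING_RESULT")
)

if record is not None:
    summary = record["value"]
    print(summary["success"], summary["quality_score"], summary["items_count"])
```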
97
98## 🔧 How It Works
99
1001. **Actor Discovery**: Connects to Apify MCP Server to discover available actors
1012. **AI Analysis**: Uses Claude to analyze the target website and select appropriate actors
1023. **Smart Testing**: Tests actors in priority order with intelligent parameter adjustment
1034. **Quality Evaluation**: Assesses data quality using multiple metrics (see the sketch after this list)
1045. **Retry Logic**: Automatically retries with different parameters if needed
1056. **Result Selection**: Returns the best results based on quality scores
106
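To make step 4 concrete, here is a toy version of multi-dimensional quality scoring. This is a sketch only — the field names and weighting are illustrative, not the actor's actual `quality_evaluator` implementation:

```python
def score_items(items, expected_fields):
    """Toy quality score: average of completeness, relevance, structure, volume."""
    if not items:
        return 0.0
    dict_items = [i for i in items if isinstance(i, dict)]
    # Completeness: share of expected fields that are non-empty across all items
    completeness = sum(
        sum(1 for f in expected_fields if item.get(f) not in (None, ""))
        for item in dict_items
    ) / (len(items) * len(expected_fields))
    # Relevance: share of expected fields that appear in at least one item
    relevance = sum(
        1 for f in expected_fields if any(f in item for item in dict_items)
    ) / len(expected_fields)
    # Structure: share of items that are structured (dicts, not raw strings)
    structure = len(dict_items) / len(items)
    # Volume: item count, saturating at 25
    volume = min(len(items), 25) / 25
    return (completeness + relevance + structure + volume) / 4


items = [{"title": "A Light in the Attic", "price": "£51.77", "rating": 3}]
print(round(score_items(items, ["title", "price", "rating", "availability"]), 2))  # ~0.64
```
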
107## 🏗️ Architecture
108
109The Actor consists of several key components:
110
111- **MCP Client** (`src/llmscraper/mcp/`): Handles communication with Apify MCP Server
112- **Claude Manager** (`src/llmscraper/claude/`): Manages AI conversations and tool calls
113- **LLM Scraper Actor** (`src/llmscraper/llm_scraper/`): Main orchestration logic
114- **Retry Logic** (`src/llmscraper/llm_scraper/retry_logic.py`): Intelligent parameter adjustment
115- **Quality Evaluator** (`src/llmscraper/llm_scraper/quality_evaluator.py`): Data quality assessment
116
117## 🔑 Environment Variables
118
119- `ANTHROPIC_API_KEY`: Your Anthropic Claude API key (alternative to input field)
120- `APIFY_TOKEN`: Automatically provided by Apify platform
121- `MCP_SERVER_URL`: Custom MCP server URL (optional)
122
123## ⚡ Performance Tips
124
1251. **Use Haiku Model**: For most tasks, `claude-3-5-haiku-latest` provides the best speed/cost ratio
1262. **Adjust Attempts**: Reduce `maxActorAttempts` for faster results, increase for better coverage
1273. **Quality Threshold**: Lower `minDataQualityScore` if you're getting no results
1284. **Time Limits**: Set appropriate `maxTimeMinutes` based on your needs
129
130## 🛠️ Development
131
132### Local Testing
133
134```bash
135# Install dependencies (using virtual environment)
136pip install -r requirements.txt
137
138# Or if you have the project's virtual environment:
139./venv/bin/pip install -r requirements.txt
140
141# Set up environment
142export ANTHROPIC_API_KEY=your_key_here
143
144# Run the actor locally
145python3 main.py
146
147# Or using npm scripts:
148npm run start # Uses system python3
149npm run start:local # Uses project virtual environment
150```
151
152### Project Structure
153
154```text
155LLMScraper/
156├── main.py # Actor entry point
157├── src/llmscraper/
158│ ├── mcp/ # MCP client implementation
159│ ├── claude/ # Claude AI integration
160│ ├── llm_scraper/ # Main scraper logic
161│ │ ├── actor.py # Main LLMScraperActor class
162│ │ ├── models.py # Input/output models
163│ │ ├── retry_logic.py # Intelligent retry logic
164│ │ └── quality_evaluator.py # Data quality assessment
165│ ├── scraping/ # Apify actor integrations
166│ └── utils/ # Configuration and utilities
167├── .actor/
168│ ├── actor.json # Actor metadata
169│ ├── input_schema.json # Input validation schema
170│ └── README.md # This file
171├── Dockerfile # Container configuration
172├── requirements.txt # Python dependencies
173├── package.json # Node.js metadata
174└── pyproject.toml # Python packaging configuration
175```
176
177## 📚 API Reference
178
179### Main Function
180
181```python
182from llmscraper.llm_scraper import LLMScraperActor, LLMScraperInput
183
184# Create configuration
185config = LLMScraperInput(
186 target_url="https://example.com",
187 extraction_goal="Extract product data",
188 anthropic_api_key="sk-ant-..."
189)
190
191# Run the scraper
192scraper = LLMScraperActor(config)
193result = await scraper.run(progress_callback=None)
194```
195
196### Configuration
197
198```python
199from llmscraper.llm_scraper.models import LLMScraperInput
200
201config = LLMScraperInput(
202 target_url="https://example.com",
203 extraction_goal="Extract product data",
204 anthropic_api_key="sk-ant-...",
205 max_actor_attempts=10,
206 max_retries_per_actor=3,
207 max_time_minutes=30,
208 model_name="claude-3-5-haiku-latest",
209 debug_mode=False,
210 prefer_specific_actors=True,
211 min_data_quality_score=0.7, # Note: API expects 0.0-1.0, input form uses 0-100
212 enable_proxy=True
213)
214```
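Because the Actor input form uses a 0-100 integer while the Python API expects a 0.0-1.0 decimal, convert before constructing the config. A minimal sketch (assuming the same `LLMScraperInput` import as above):

```python
from llmscraper.llm_scraper.models import LLMScraperInput

ui_score = 70  # value taken from the Actor input form (0-100)

config = LLMScraperInput(
    target_url="https://example.com",
    extraction_goal="Extract product data",
    anthropic_api_key="sk-ant-...",
    min_data_quality_score=ui_score / 100.0,  # API range: 0.0-1.0
)
config.validate()  # validate the configuration before running
```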
215
216## 🤝 Contributing
217
2181. Fork the repository
2192. Create a feature branch
2203. Make your changes
2214. Add tests
2225. Submit a pull request
223
224## 📄 License
225
226MIT License - see LICENSE file for details.
227
228## 🆘 Support & Troubleshooting
229
230### Common Issues
231
232- **API Key Issues**: Ensure your Claude API key is valid and has sufficient credits
233- **No Results Found**: Try reducing `minDataQualityScore` or increasing `maxActorAttempts`
234- **Timeout Errors**: Increase `maxTimeMinutes` for complex websites
235- **Quality Score Too Low**: Adjust your `extractionGoal` to be more specific
236
237### Debugging
238
239- Enable `debugMode: true` for detailed logging
240- Check the Actor logs for step-by-step execution details
241- Verify the target URL is accessible and returns content
242- Monitor the progress updates in the key-value store
243
244### Performance Optimization
245
246- Use `claude-3-5-haiku-latest` for faster, cost-effective processing
247- Set appropriate `maxActorAttempts` based on your time/quality requirements
248- Enable `preferSpecificActors` to prioritize domain-specific solutions
249
250## 🔄 Version History
251
252- **v1.0.0** (July 2025): Initial release with MCP integration, Claude AI, and intelligent retry logic
253 - AI-powered actor discovery and testing
254 - Multi-dimensional quality assessment
255 - Real-time progress tracking
256 - Comprehensive error handling and retry logic
257 - Support for all major Claude models

.actor/actor.json

{
"actorSpecification": 1,
"name": "LLMScraper",
"title": "� LLM-Powered Web Scraper",
"description": "AI-powered web scraper that automatically discovers and tests the best Apify actors for your scraping task using Claude AI. No manual configuration needed!",
"version": "1.0",
"buildTag": "latest",
"environmentVariables": {
"ANTHROPIC_API_KEY": "@claudeApiKey"
},
"dockerfile": "./Dockerfile",
"input": "./input_schema.json",
"storages": {
"dataset": {
"actorSpecification": 1,
"views": {
"scraped_data": {
"title": "Scraped Data",
"transformation": {},
"display": {
"component": "table",
"properties": {
"url": {
"label": "Source URL",
"format": "link"
},
"data": {
"label": "Extracted Data",
"format": "object"
},
"quality_score": {
"label": "Quality Score",
"format": "number"
},
"actor_used": {
"label": "Actor Used",
"format": "text"
},
"timestamp": {
"label": "Scraped At",
"format": "datetime"
},
"success": {
"label": "Success",
"format": "boolean"
},
"error": {
"label": "Error Message",
"format": "text"
}
}
}
}
}
}
}
}

.actor/input_schema.json

{
"title": "LLM Scraper Configuration",
"description": "AI-powered web scraper that automatically discovers and tests the best Apify actors for your scraping task using Claude AI",
"type": "object",
"schemaVersion": 1,
"properties": {
"targetUrl": {
"title": "Target URL",
"description": "The URL of the website you want to scrape",
"type": "string",
"editor": "textfield",
"prefill": "https://books.toscrape.com/"
},
"extractionGoal": {
"title": "Extraction Goal",
"description": "Describe what data you want to extract from the website",
"type": "string",
"editor": "textarea",
"prefill": "Extract product information including title, price, rating, and availability from the book listings"
},
"claudeApiKey": {
"title": "Claude API Key",
"description": "Your Anthropic Claude API key for AI-powered actor discovery and testing",
"type": "string",
"editor": "textfield",
"isSecret": true
},
"maxActorAttempts": {
"title": "Max Actor Attempts",
"description": "Maximum number of different actors to try",
"type": "integer",
"editor": "number",
"minimum": 1,
"maximum": 20,
"default": 10
},
"maxRetriesPerActor": {
"title": "Max Retries per Actor",
"description": "Maximum number of retry attempts per actor",
"type": "integer",
"editor": "number",
"minimum": 1,
"maximum": 10,
"default": 3
},
"maxTimeMinutes": {
"title": "Max Time (minutes)",
"description": "Maximum total execution time in minutes",
"type": "integer",
"editor": "number",
"minimum": 5,
"maximum": 120,
"default": 30,
"unit": "minutes"
},
"mcpUrl": {
"title": "MCP Server URL",
"description": "URL of the Apify MCP server for actor discovery",
"type": "string",
"editor": "textfield",
"default": "https://mcp.apify.com/sse?enableAddingActors=true"
},
"modelName": {
"title": "Claude Model",
"description": "Choose which Claude model to use for AI analysis",
"type": "string",
"editor": "select",
"default": "claude-3-5-haiku-latest",
"enum": [
"claude-3-5-haiku-latest",
"claude-3-5-sonnet-latest",
"claude-3-opus-latest",
"claude-3-haiku-20240307",
"claude-3-sonnet-20240229",
"claude-3-5-sonnet-20241022"
],
"enumTitles": [
"Claude 3.5 Haiku (Fast & Cost-Effective)",
"Claude 3.5 Sonnet (Balanced)",
"Claude 3 Opus (Maximum Quality)",
"Claude 3 Haiku (Legacy)",
"Claude 3 Sonnet (Legacy)",
"Claude 3.5 Sonnet (Legacy)"
]
},
"debugMode": {
"title": "Debug Mode",
"description": "Enable detailed logging for debugging",
"type": "boolean",
"default": false,
"editor": "checkbox",
"sectionCaption": "Advanced Settings",
"sectionDescription": "Configure advanced scraping behavior and quality settings"
},
"preferSpecificActors": {
"title": "Prefer Specific Actors",
"description": "Prioritize domain-specific actors over general-purpose ones",
"type": "boolean",
"default": true,
"editor": "checkbox"
},
"minDataQualityScore": {
"title": "Minimum Data Quality Score",
"description": "Minimum quality score (0-100) to accept results",
"type": "integer",
"editor": "number",
"minimum": 0,
"maximum": 100,
"default": 70
},
"enableProxy": {
"title": "Enable Proxy",
"description": "Use proxy for scraping requests",
"type": "boolean",
"default": true,
"editor": "checkbox"
}
},
"required": ["targetUrl", "extractionGoal", "claudeApiKey"]
}

src/llmscraper/__init__.py

1"""
2ScraperCodeGenerator - Intelligent Web Scraping with AI
3
4A smart web scraping framework that uses multiple scraping strategies
5and AI-powered quality evaluation to extract data from websites.
6Includes LLM Scraper for automated actor discovery and testing.
7"""
8
9from .pipeline import IntelligentScraperPipeline, run_intelligent_scraper
10from .models import ScrapingResult, GoalExtractionResult, PipelineConfig, ClaudeModel
11from .utils.config_parser import ConfigurationParser
12from .scraping.actor_multi_scraper import ActorMultiScraper
13
14# LLM Scraper functionality
15from .llm_scraper import (
16 LLMScraperActor, run_llm_scraper_actor, run_llm_scraper,
17 LLMScraperInput, LLMScraperOutput
18)
19
20# MCP and Claude functionality
21from .mcp import MCPClient
22from .claude import ClaudeManager
23
24__version__ = "0.1.0"
25__all__ = [
26 "IntelligentScraperPipeline",
27 "run_intelligent_scraper",
28 "ScrapingResult",
29 "GoalExtractionResult",
30 "PipelineConfig",
31 "ClaudeModel",
32 "ConfigurationParser",
33 "ActorMultiScraper",
34 # LLM Scraper
35 "LLMScraperActor",
36 "run_llm_scraper_actor",
37 "run_llm_scraper",
38 "LLMScraperInput",
39 "LLMScraperOutput",
40 # MCP and Claude
41 "MCPClient",
42 "ClaudeManager"
43 "ConfigurationParser",
44 "ActorMultiScraper"
45]

src/llmscraper/models.py

1"""
2Data models for the ScraperCodeGenerator pipeline.
3"""
4
5from dataclasses import dataclass, field
6from typing import Dict, Any, Optional, List
7from enum import Enum
8
9
10class ClaudeModel(Enum):
11 """Available Claude model versions."""
12 # Claude 4 models (latest)
13 CLAUDE_4_SONNET = "claude-sonnet-4-20250514"
14 CLAUDE_4_OPUS = "claude-opus-4-20250514"
15
16 # Claude 3.7 models
17 CLAUDE_3_7_SONNET = "claude-3-7-sonnet-20250219"
18
19 # Claude 3.5 models
20 CLAUDE_3_5_SONNET = "claude-3-5-sonnet-20241022"
21 CLAUDE_3_5_HAIKU = "claude-3-5-haiku-20241022"
22
23 # Claude 3 models
24 CLAUDE_3_SONNET = "claude-3-sonnet-20240229"
25 CLAUDE_3_HAIKU = "claude-3-haiku-20240307"
26
27
28@dataclass
29class ActorConfig:
30 """Configuration for an individual Apify actor."""
31 actor_id: str
32 enabled: bool = True
33 input: Dict[str, Any] = field(default_factory=dict)
34 name: Optional[str] = None
35 description: Optional[str] = None
36
37
38@dataclass
39class HTMLPruningConfig:
40 """Configuration for HTML pruning behavior."""
41 enabled: bool = True
42 max_list_items: int = 5
43 max_text_length: int = 500
44 prune_before_evaluation: bool = True
45 prune_percentage: float = 0.8 # Keep 80% of content, remove 20%
46
47
48@dataclass
49class PipelineConfig:
50 """Complete pipeline configuration."""
51 # Core settings
52 for_actor: bool = False
53 test_script: bool = False
54 output_script_path: Optional[str] = None
55
56 # Claude settings
57 claude_model: ClaudeModel = ClaudeModel.CLAUDE_4_SONNET
58 claude_api_key: Optional[str] = None
59
60 # HTML processing settings
61 html_pruning: HTMLPruningConfig = field(default_factory=HTMLPruningConfig)
62
63 # Actor configurations
64 actors: Dict[str, ActorConfig] = field(default_factory=dict)
65
66 # Execution settings
67 max_retries: int = 3
68 timeout_seconds: int = 60
69 concurrent_actors: bool = True
70
71 def get_enabled_actors(self) -> Dict[str, ActorConfig]:
72 """Get only enabled actors."""
73 return {name: config for name, config in self.actors.items() if config.enabled}
74
75
76@dataclass
77class ScrapingResult:
78 """Result of the complete scraping pipeline."""
79 success: bool
80 generated_script: Optional[str] = None
81 best_actor: Optional[str] = None
82 schema: Optional[Dict[str, Any]] = None
83 error_message: Optional[str] = None
84 quality_scores: Optional[Dict[str, int]] = None
85 extracted_data: Optional[List[Dict[str, Any]]] = None
86
87
88@dataclass
89class EvaluationResult:
90 """Result of HTML quality evaluation."""
91 score: int # 1-10 scale
92 reasoning: str
93
94
95@dataclass
96class PreEvaluationResult:
97 """Result of pre-evaluation checks before sending to Claude."""
98 is_valid_html: bool
99 score: Optional[int] = None # If we can determine score without Claude
100 reasoning: Optional[str] = None
101 should_continue_to_claude: bool = True
102
103
104@dataclass
105class GoalExtractionResult:
106 """Result of extracting goal from natural language prompt."""
107 goal: str
108 url: str
109 success: bool
110 error_message: Optional[str] = None
111
112
113def get_default_actor_configs() -> Dict[str, ActorConfig]:
114 """Get default actor configurations with common Apify actors."""
115 return {
116 "cheerio-scraper": ActorConfig(
117 actor_id="apify/cheerio-scraper",
118 name="Cheerio Scraper",
119 description="Fast jQuery-like server-side scraping",
120 enabled=True,
121 input={
122 "maxRequestRetries": 3,
123 "requestTimeoutSecs": 30,
124 "maxRequestsPerCrawl": 1,
125 "pseudoUrls": [],
126 "linkSelector": "",
127 "pageFunction": """
128 async function pageFunction(context) {
129 const { request, log, skipLinks, $ } = context;
130 return {
131 url: request.url,
132 title: $('title').text(),
133 html: $('html').html()
134 };
135 }
136 """,
137 "proxyConfiguration": {"useApifyProxy": True}
138 }
139 ),
140 "web-scraper": ActorConfig(
141 actor_id="apify/web-scraper",
142 name="Web Scraper",
143 description="Versatile web scraper with JavaScript support",
144 enabled=True,
145 input={
146 "maxRequestRetries": 3,
147 "requestTimeoutSecs": 30,
148 "maxPagesPerCrawl": 1,
149 "pageFunction": """
150 async function pageFunction(context) {
151 const { request, log, skipLinks, $ } = context;
152 return {
153 url: request.url,
154 title: $('title').text(),
155 html: $('html').html()
156 };
157 }
158 """,
159 "proxyConfiguration": {"useApifyProxy": True}
160 }
161 ),
162 "website-content-crawler": ActorConfig(
163 actor_id="apify/website-content-crawler",
164 name="Website Content Crawler",
165 description="Advanced crawler with Playwright support",
166 enabled=True,
167 input={
168 "maxCrawlPages": 1,
169 "crawler": "playwright",
170 "proxyConfiguration": {"useApifyProxy": True}
171 }
172 ),
173 "playwright-scraper": ActorConfig(
174 actor_id="apify/playwright-scraper",
175 name="Playwright Scraper",
176 description="Modern browser automation with Playwright",
177 enabled=False,
178 input={
179 "maxRequestRetries": 3,
180 "requestTimeoutSecs": 30,
181 "maxPagesPerCrawl": 1,
182 "pageFunction": """
183 async function pageFunction(context) {
184 const { request, log, page } = context;
185 const title = await page.title();
186 const html = await page.content();
187 return {
188 url: request.url,
189 title: title,
190 html: html
191 };
192 }
193 """,
194 "proxyConfiguration": {"useApifyProxy": True}
195 }
196 ),
197 "puppeteer-scraper": ActorConfig(
198 actor_id="apify/puppeteer-scraper",
199 name="Puppeteer Scraper",
200 description="Chrome-based scraping with Puppeteer",
201 enabled=False,
202 input={
203 "maxRequestRetries": 3,
204 "requestTimeoutSecs": 30,
205 "maxPagesPerCrawl": 1,
206 "pageFunction": """
207 async function pageFunction(context) {
208 const { request, log, page } = context;
209 const title = await page.title();
210 const html = await page.content();
211 return {
212 url: request.url,
213 title: title,
214 html: html
215 };
216 }
217 """,
218 "proxyConfiguration": {"useApifyProxy": True}
219 }
220 ),
221 "jsdom-scraper": ActorConfig(
222 actor_id="apify/jsdom-scraper",
223 name="JSDOM Scraper",
224 description="Lightweight JavaScript DOM scraping",
225 enabled=False,
226 input={
227 "maxRequestRetries": 3,
228 "requestTimeoutSecs": 30,
229 "maxPagesPerCrawl": 1,
230 "pageFunction": """
231 async function pageFunction(context) {
232 const { request, log, window } = context;
233 const $ = window.$;
234 return {
235 url: request.url,
236 title: $('title').text(),
237 html: $('html').html()
238 };
239 }
240 """,
241 "proxyConfiguration": {"useApifyProxy": True}
242 }
243 )
244 }

src/llmscraper/pipeline.py

1"""
2Main pipeline for intelligent web scraping.
3"""
4
5import logging
6from typing import Optional
7
8from .models import ScrapingResult, PipelineConfig
9from .scraping import MultiActorScraper
10from .scraping.actor_multi_scraper import ActorMultiScraper
11from .evaluation import HTMLQualityEvaluator
12from .generation import ScriptGenerator, ScriptExecutor
13from .utils import prune_html, validate_required_keys, get_api_key
14
15
16class IntelligentScraperPipeline:
17 """Main pipeline class that orchestrates the intelligent web scraping process."""
18
19 def __init__(self, apify_token: str, claude_api_key: str, actor_logger=None, config: Optional[PipelineConfig] = None):
20 """
21 Initialize the pipeline with required API tokens.
22
23 Args:
24 apify_token: Apify API token for web scraping
25 claude_api_key: Anthropic Claude API key for AI analysis
26 actor_logger: Optional Actor logger for actor mode
27 config: Optional pipeline configuration
28 """
29 # Validate API keys
30 validated_keys = validate_required_keys(
31 apify_token=apify_token,
32 claude_api_key=claude_api_key
33 )
34
35 self.apify_token = validated_keys['apify_token']
36 self.claude_api_key = validated_keys['claude_api_key']
37 self.config = config or PipelineConfig()
38
39 # Initialize components with configuration
40 self.multi_scraper = MultiActorScraper(self.apify_token)
41 self.actor_scraper = ActorMultiScraper() # For actor-to-actor communication
42 self.quality_evaluator = HTMLQualityEvaluator(self.claude_api_key, self.config.claude_model)
43 self.script_generator = ScriptGenerator(self.claude_api_key, self.config.claude_model)
44 self.script_executor = ScriptExecutor()
45
46 # Setup logging - use Actor logger if provided, otherwise standard logging
47 self.logger = actor_logger if actor_logger else logging.getLogger(__name__)
48 self.is_actor_mode = actor_logger is not None
49
50 async def run_complete_pipeline(self, target_url: str, user_goal: str,
51 output_script_path: Optional[str] = None,
52 prune_before_evaluation: bool = True,
53 test_script: bool = False,
54 for_actor: bool = False) -> ScrapingResult:
55 """
56 Run the complete intelligent scraping pipeline.
57
58 Args:
59 target_url: The URL to scrape
60 user_goal: Natural language description of what to extract
61 output_script_path: Path where to save the generated script (None for actor mode)
62 prune_before_evaluation: If True, prune HTML before quality evaluation
63 test_script: If True, test the generated script before finalizing
64 for_actor: If True, generate script for Apify actor format
65
66 Returns:
67 ScrapingResult containing the outcome and generated artifacts
68 """
69 self.logger.info(f"PIPELINE: Starting intelligent scraping pipeline for: {target_url}")
70 self.logger.info(f"PIPELINE: User goal: {user_goal}")
71 self.logger.info(f"PIPELINE: Actor mode: {for_actor}")
72
73 try:
74 # Step 1: Run multiple actors to scrape the website
75 self.logger.info("PIPELINE: Step 1: Running multi-actor scraping...")
76
77 # Use actor-aware scraper if running inside an Apify actor
78 if for_actor:
79 self.logger.info("PIPELINE: Using actor-to-actor communication...")
80 scraping_results = await self.actor_scraper.scrape_with_multiple_actors(target_url)
81 else:
82 self.logger.info("PIPELINE: Using client-based scraping...")
83 # Use configured actors instead of hardcoded ones
84 enabled_actors = self.config.get_enabled_actors()
85 if enabled_actors:
86 scraping_results = self.multi_scraper.scrape_with_multiple_actors(target_url, enabled_actors)
87 else:
88 scraping_results = self.multi_scraper.scrape_with_multiple_actors(target_url)
89
90 if not any(content for content in scraping_results.values() if content):
91 return ScrapingResult(
92 success=False,
93 error_message="All scraping actors failed to retrieve content"
94 )
95
96 # Step 2: Evaluate quality of each result
97 self.logger.info("PIPELINE: Step 2: Evaluating HTML quality for each actor...")
98 quality_scores, best_actor, best_html = self._evaluate_html_quality(
99 scraping_results, user_goal, prune_before_evaluation
100 )
101
102 if not best_html:
103 return ScrapingResult(
104 success=False,
105 error_message="No actor produced quality HTML content",
106 quality_scores=quality_scores
107 )
108
109 self.logger.info(f"PIPELINE: Best actor selected: {best_actor} with score {quality_scores[best_actor]}/10")
110
111 # Step 3: Prune the best HTML to reduce token count
112 self.logger.info("PIPELINE: Step 3: Pruning HTML content...")
113
114 # Use configuration for pruning settings
115 if self.config.html_pruning.enabled:
116 pruned_html = prune_html(
117 best_html,
118 max_list_items=self.config.html_pruning.max_list_items,
119 max_text_length=self.config.html_pruning.max_text_length,
120 prune_percentage=self.config.html_pruning.prune_percentage
121 )
122 else:
123 pruned_html = best_html
124
125 original_length = len(best_html)
126 pruned_length = len(pruned_html)
127 reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0
128
129 self.logger.info(f"PIPELINE: HTML pruned: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")
130
131 # Step 4: Generate Python scraping script
132 self.logger.info("PIPELINE: Step 4: Generating Python scraping script...")
133 generated_script = self.script_generator.generate_scraping_script(
134 target_url, best_actor, pruned_html, user_goal, for_actor
135 )
136
137 if not generated_script:
138 return ScrapingResult(
139 success=False,
140 error_message="Failed to generate scraping script",
141 best_actor=best_actor,
142 quality_scores=quality_scores
143 )
144
145 # Step 5: Test the script if requested
146 extracted_data = None
147 if test_script:
148 self.logger.info("PIPELINE: Step 5: Testing generated script...")
149 test_result = self.script_executor.test_script(generated_script, best_html)
150
151 if test_result["success"]:
152 self.logger.info(f"PIPELINE: ✅ Script test passed! Extracted {test_result.get('item_count', 0)} items")
153 extracted_data = test_result["data"]
154 else:
155 self.logger.warning(f"PIPELINE: ⚠️ Script test failed: {test_result['error']}")
156 # Continue anyway, but log the issue
157
158 # Step 6: Save the generated script (only if not actor mode)
159 if output_script_path and not for_actor:
160 self.logger.info(f"PIPELINE: Step 6: Saving generated script to {output_script_path}")
161 with open(output_script_path, 'w', encoding='utf-8') as f:
162 f.write(generated_script)
163
164 self.logger.info("PIPELINE: ✅ Pipeline completed successfully!")
165
166 return ScrapingResult(
167 success=True,
168 generated_script=generated_script,
169 best_actor=best_actor,
170 quality_scores=quality_scores,
171 extracted_data=extracted_data
172 )
173
174 except Exception as e:
175 self.logger.error(f"PIPELINE: Pipeline failed with error: {str(e)}")
176 return ScrapingResult(
177 success=False,
178 error_message=f"Pipeline error: {str(e)}"
179 )
180
181 def _evaluate_html_quality(self, scraping_results: dict, user_goal: str,
182 prune_before_evaluation: bool) -> tuple[dict, str, str]:
183 """Evaluate HTML quality for each scraping result."""
184 quality_scores = {}
185 best_actor = None
186 best_html = None
187 best_score = 0
188
189 for actor_name, html_content in scraping_results.items():
190 if html_content:
191 self.logger.info(f"PIPELINE: Evaluating {actor_name}...")
192
193 # Optionally prune HTML before evaluation
194 evaluation_html = html_content
195 if prune_before_evaluation:
196 original_length = len(html_content)
197 # Use more aggressive pruning for evaluation
198 evaluation_html = prune_html(
199 html_content,
200 max_list_items=3,
201 max_text_length=100,
202 prune_percentage=0.5 # More aggressive for evaluation
203 )
204 pruned_length = len(evaluation_html)
205 reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0
206 self.logger.info(f"PIPELINE: {actor_name} HTML pruned for evaluation: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")
207
208 evaluation = self.quality_evaluator.evaluate_html_quality(user_goal, evaluation_html)
209
210 if evaluation:
211 quality_scores[actor_name] = evaluation.score
212 self.logger.info(f"PIPELINE: {actor_name} quality score: {evaluation.score}/10 - {evaluation.reasoning}")
213
214 if evaluation.score > best_score:
215 best_score = evaluation.score
216 best_actor = actor_name
217 best_html = html_content # Keep original HTML, not pruned version
218 else:
219 quality_scores[actor_name] = 0
220 self.logger.warning(f"PIPELINE: Failed to evaluate {actor_name}")
221 else:
222 quality_scores[actor_name] = 0
223 self.logger.warning(f"PIPELINE: {actor_name} returned no content")
224
225 return quality_scores, best_actor, best_html
226
227
228async def run_intelligent_scraper(target_url: str, user_goal: str,
229 apify_token: Optional[str] = None,
230 claude_api_key: Optional[str] = None,
231 output_path: Optional[str] = "generated_scraper.py",
232 prune_before_evaluation: bool = True,
233 test_script: bool = False,
234 for_actor: bool = False,
235 actor_logger=None,
236 config: Optional[PipelineConfig] = None) -> ScrapingResult:
237 """
238 Convenience function to run the complete intelligent scraping pipeline.
239
240 Args:
241 target_url: URL to scrape
242 user_goal: Natural language description of extraction goal
243 apify_token: Apify API token (uses APIFY_TOKEN env var if not provided)
244 claude_api_key: Claude API key (uses CLAUDE_API_KEY env var if not provided)
245 output_path: Path to save the generated script (None for actor mode)
246 prune_before_evaluation: If True, prune HTML before quality evaluation
247 test_script: If True, test the generated script before finalizing
248 for_actor: If True, generate script for Apify actor format
249 actor_logger: Optional Actor logger for actor mode
250 config: Optional pipeline configuration
251
252 Returns:
253 ScrapingResult with the outcome
254 """
255 # Get tokens from environment if not provided
256 if not apify_token:
257 apify_token = get_api_key("APIFY_TOKEN")
258 if not claude_api_key:
259 claude_api_key = get_api_key("CLAUDE_API_KEY")
260
261 if not apify_token:
262 return ScrapingResult(
263 success=False,
264 error_message="APIFY_TOKEN not provided and not found in environment variables"
265 )
266
267 if not claude_api_key:
268 return ScrapingResult(
269 success=False,
270 error_message="CLAUDE_API_KEY not provided and not found in environment variables"
271 )
272
273 # Create and run pipeline
274 pipeline = IntelligentScraperPipeline(apify_token, claude_api_key, actor_logger, config)
275 return await pipeline.run_complete_pipeline(
276 target_url, user_goal, output_path, prune_before_evaluation, test_script, for_actor
277 )

src/llmscraper/claude/__init__.py

1"""
2Claude conversation management package.
3"""
4
5from .manager import ClaudeManager, ToolCall, ConversationResult
6
7__all__ = ['ClaudeManager', 'ToolCall', 'ConversationResult']

src/llmscraper/claude/manager.py

1"""
2Claude conversation manager for LLM interactions and tool calling.
3"""
4
5import asyncio
6import logging
7from typing import Dict, List, Any, Optional, Callable
8from dataclasses import dataclass
9import json
10import time
11
12import anthropic
13from anthropic.types import MessageParam, ContentBlockParam, Message
14
15from ..mcp import MCPClient, MCPCallResult
16
17
18@dataclass
19class ToolCall:
20 """Represents a tool call to be executed."""
21 id: str
22 name: str
23 arguments: Dict[str, Any]
24
25
26@dataclass
27class ConversationResult:
28 """Result from a conversation step."""
29 message: str
30 tool_calls: List[ToolCall]
31 reasoning: str
32 is_final: bool = False
33 error: Optional[str] = None
34
35
36class ClaudeManager:
37 """Manages conversations with Claude and tool execution."""
38
39 def __init__(self, api_key: str, model: str = "claude-3-5-haiku-latest",
40 max_tokens: int = 2048, max_tool_calls_per_round: int = 10):
41 """
42 Initialize Claude manager.
43
44 Args:
45 api_key: Anthropic API key
46 model: Claude model to use
47 max_tokens: Maximum tokens per response
48 max_tool_calls_per_round: Maximum tool calls per conversation round
49 """
50 self.client = anthropic.Anthropic(api_key=api_key)
51 self.model = model
52 self.max_tokens = max_tokens
53 self.max_tool_calls_per_round = max_tool_calls_per_round
54 self.conversation: List[MessageParam] = []
55 self.logger = logging.getLogger(__name__)
56
57 # System prompt for LLM scraper
58 self.system_prompt = ""
59
60 def set_system_prompt(self, target_url: str, extraction_goal: str,
61 max_attempts: int, max_retries: int, max_time: int):
62 """Set the system prompt with specific parameters."""
63 self.system_prompt = f"""You are an expert web scraping agent that systematically tests Apify Actors to find the best one for a specific task.
64
65TARGET: {target_url}
66GOAL: {extraction_goal}
67LIMITS: {max_attempts} attempts, {max_retries} retries per actor, {max_time} minutes
68
69STRATEGY:
701. Search for relevant actors using search-actors
712. From search results, extract a prioritized list of actor names to test
723. For each actor: get details, configure input, test it
734. Analyze results and try next actor if current one fails
745. Stop when you find an actor that successfully extracts the target data
75
76ACTOR SELECTION PRIORITY:
77- Domain-specific scrapers first (e.g., "linkedin-scraper" for LinkedIn URLs)
78- Popular scrapers with high user counts and runs
79- General web scrapers as fallback (web-scraper, cheerio-scraper, website-content-crawler, rag-web-browser)
80
81TESTING APPROACH:
82- Start with simple, minimal input configurations
83- If actor fails, try different input parameters (proxy settings, timeouts, formats)
84- Analyze failure patterns and adjust accordingly
85- Don't waste attempts on obviously unsuitable actors
86
87RESULT ANALYSIS:
88- Look for actual extracted data relevant to the goal
89- Check data quality and completeness
90- Prefer actors that return structured, relevant data
91- Stop testing when you find a working solution
92
93IMPORTANT:
94- Be systematic and efficient with your attempts
95- Extract actor names from search results as a prioritized list
96- Test actors one by one until success
97- Focus on finding ANY working solution, then optimize if needed
98- Provide clear reasoning for each decision
99
100Start by searching for actors, then systematically test them."""
101
102 async def process_query(self, query: str, mcp_client: MCPClient,
103 on_progress: Optional[Callable[[str, str], None]] = None) -> ConversationResult:
104 """
105 Process a user query with Claude, handling tool calls.
106
107 Args:
108 query: The user query/request
109 mcp_client: MCP client for tool execution
110 on_progress: Optional callback for progress updates (role, content)
111
112 Returns:
113 ConversationResult with the response and any tool calls made
114 """
115 try:
116 self.logger.info(f"Processing query: {query[:100]}...")
117 self.logger.debug(f"Full query: {query}")
118
119 # Add user message to conversation
120 self.conversation.append({"role": "user", "content": query})
121 self.logger.debug(f"Added user message to conversation. Total messages: {len(self.conversation)}")
122
123 # Get available tools
124 tools = mcp_client.format_tools_for_claude()
125
126 # Start conversation loop
127 total_tool_calls = 0
128 reasoning_parts = []
129 all_tool_calls = []
130
131 while total_tool_calls < self.max_tool_calls_per_round:
132 # Create message with Claude
133 response = await self._create_message(tools)
134
135 if not response:
136 break
137
138 # Process response blocks
139 assistant_content = []
140 tool_use_blocks = []
141 text_content = ""
142
143 for block in response.content:
144 if block.type == 'text':
145 assistant_content.append(block)
146 text_content += block.text
147 reasoning_parts.append(block.text)
148
149 # Log Claude's text responses
150 self.logger.info(f"Claude says: {block.text}")
151
152 if on_progress:
153 on_progress("assistant", block.text)
154
155 elif block.type == 'tool_use':
156 assistant_content.append(block)
157 tool_use_blocks.append(block)
158
159 # Log tool usage
160 self.logger.info(f"Claude wants to use tool: {block.name} with args: {block.input}")
161
162 tool_call = ToolCall(
163 id=block.id,
164 name=block.name,
165 arguments=block.input
166 )
167 all_tool_calls.append(tool_call)
168
169 if on_progress:
170 on_progress("tool_call", f"Calling {block.name} with {block.input}")
171
172 # Add assistant message to conversation
173 self.conversation.append({
174 "role": "assistant",
175 "content": assistant_content
176 })
177 self.logger.debug(f"Added assistant message to conversation. Total messages: {len(self.conversation)}")
178
179 # If no tool calls, we're done
180 if not tool_use_blocks:
181 self.logger.info("No tool calls in response, conversation complete")
182 return ConversationResult(
183 message=text_content,
184 tool_calls=all_tool_calls,
185 reasoning=" ".join(reasoning_parts),
186 is_final=True
187 )
188
189 # Execute tool calls
190 tool_results = []
191 for block in tool_use_blocks:
192 total_tool_calls += 1
193
194 # Limit reached: answer this tool_use with an error result (the API requires a result for every tool call)
195 if total_tool_calls > self.max_tool_calls_per_round:
196 tool_results.append({"type": "tool_result", "tool_use_id": block.id, "content": f"Tool call limit reached ({self.max_tool_calls_per_round})", "is_error": True})
197 continue
198
199 # Execute tool
200 try:
201 self.logger.info(f"Executing tool: {block.name} with input: {block.input}")
202 result = await mcp_client.call_tool(block.name, block.input)
203
204 # Format result content
205 if isinstance(result.content, list):
206 # Convert list to readable string
207 result_content = json.dumps(result.content, indent=2)
208 elif isinstance(result.content, dict):
209 result_content = json.dumps(result.content, indent=2)
210 else:
211 result_content = str(result.content)
212
213 # Log the raw result
214 self.logger.info(f"Tool {block.name} returned {len(result_content)} chars")
215 if result.is_error:
216 self.logger.warning(f"Tool {block.name} failed: {result_content[:200]}...")
217 else:
218 self.logger.info(f"Tool {block.name} succeeded. Result preview: {result_content[:200]}...")
219
220 # Truncate only if it would exceed context limits
221 result_content = self._truncate_result_if_needed(result_content, block.name)
222
223 if result.is_error:
224 result_content = f"Error: {result_content}"
225
226 except Exception as e:
227 self.logger.error(f"Tool execution failed for {block.name}: {str(e)}")
228 result_content = f"Tool execution failed: {str(e)}"
229 result = MCPCallResult(content=result_content, is_error=True)
230
231 # Create tool result block
232 tool_result = {
233 "type": "tool_result",
234 "tool_use_id": block.id,
235 "content": result_content,
236 "is_error": getattr(result, 'is_error', False)
237 }
238
239 tool_results.append(tool_result)
240
241 if on_progress:
242 status = "ERROR" if getattr(result, 'is_error', False) else "SUCCESS"
243 on_progress("tool_result", f"{block.name}: {status}")
244
245 # Add tool results to conversation
246 if tool_results:
247 self.conversation.append({
248 "role": "user",
249 "content": tool_results
250 })
251 self.logger.debug(f"Added tool results to conversation. Total messages: {len(self.conversation)}")
252
253 # Continue conversation loop
254 await asyncio.sleep(0.1) # Small delay to prevent rate limiting
255
256 # Return final result
257 return ConversationResult(
258 message=" ".join(reasoning_parts),
259 tool_calls=all_tool_calls,
260 reasoning=" ".join(reasoning_parts),
261 is_final=total_tool_calls >= self.max_tool_calls_per_round,
262 error="Tool call limit reached" if total_tool_calls >= self.max_tool_calls_per_round else None
263 )
264
265 except Exception as e:
266 self.logger.error(f"Error processing query: {str(e)}")
267 return ConversationResult(
268 message=f"Error processing query: {str(e)}",
269 tool_calls=[],
270 reasoning="",
271 error=str(e)
272 )
273
274 async def _create_message(self, tools: List[Dict[str, Any]]) -> Optional[Message]:
275 """Create a message with Claude."""
276 try:
277 response = await asyncio.to_thread(
278 self.client.messages.create,
279 model=self.model,
280 max_tokens=self.max_tokens,
281 system=self.system_prompt,
282 messages=self.conversation,
283 tools=tools if tools else None
284 )
285 return response
286
287 except Exception as e:
288 self.logger.error(f"Error creating message: {str(e)}")
289 return None
290
291 def _truncate_result_if_needed(self, content: str, tool_name: str) -> str:
292 """Truncate results only if they would exceed Claude's context limits."""
293 # Claude models have different context limits:
294 # Haiku: ~200K tokens (~800K chars)
295 # Sonnet: ~200K tokens (~800K chars)
296 # Opus: ~200K tokens (~800K chars)
297
298 # Conservative limit to leave room for conversation history and system prompt
299 max_chars_per_result = 50000 # ~12.5K tokens
300
301 if len(content) <= max_chars_per_result:
302 return content
303
304 self.logger.warning(f"Truncating {tool_name} result from {len(content)} to {max_chars_per_result} chars")
305
306 # For search results, keep the structure but limit items
307 if tool_name == "search-actors" and content.startswith('['):
308 try:
309 data = json.loads(content)
310 if isinstance(data, list) and len(data) > 10:
311 # Keep first 10 results for search instead of 5
312 truncated = data[:10]
313 result = json.dumps(truncated, indent=2)
314 if len(result) <= max_chars_per_result:
315 return result + f"\n... (showing 10 of {len(data)} results)"
316 except (json.JSONDecodeError, TypeError):
317 pass
318
319 # For actor execution results, try to preserve key data
320 if tool_name in ["run-actor", "execute-actor"]:
321 try:
322 data = json.loads(content)
323 if isinstance(data, list) and len(data) > 5:
324 # Keep first 5 items for actor results
325 truncated = data[:5]
326 result = json.dumps(truncated, indent=2)
327 if len(result) <= max_chars_per_result:
328 return result + f"\n... [Showing 5 of {len(data)} scraped items. Total result was {len(content)} chars.]"
329 except (json.JSONDecodeError, TypeError):
330 pass
331
332 # Generic truncation with smart ending
333 truncated = content[:max_chars_per_result]
334
335 # Try to end at a complete JSON object or line
336 if content.startswith('{') or content.startswith('['):
337 # Find last complete line
338 last_newline = truncated.rfind('\n')
339 if last_newline > max_chars_per_result // 2:
340 truncated = truncated[:last_newline]
341
342 if tool_name in ["run-actor", "execute-actor"]:
343 truncated += f"\n\n... [Result truncated - original length: {len(content)} chars. Contains scraped data that was cut for context management.]"
344 else:
345 truncated += f"\n\n... [Truncated - original length: {len(content)} chars]"
346
347 return truncated
348
349 def reset_conversation(self):
350 """Reset the conversation history."""
351 self.conversation = []
352
353 def get_conversation_summary(self) -> str:
354 """Get a summary of the conversation."""
355 summary_parts = []
356
357 for msg in self.conversation:
358 role = msg["role"]
359 content = msg["content"]
360
361 if isinstance(content, str):
362 summary_parts.append(f"{role}: {content[:100]}...")
363 elif isinstance(content, list):
364 text_parts = []
365 for block in content:
366 if isinstance(block, dict):
367 if block.get("type") == "text":
368 text_parts.append(block.get("text", "")[:50])
369 elif block.get("type") == "tool_use":
370 text_parts.append(f"[TOOL: {block.get('name', 'unknown')}]")
371 elif block.get("type") == "tool_result":
372 text_parts.append("[TOOL_RESULT]")
373
374 summary_parts.append(f"{role}: {' '.join(text_parts)}")
375
376 return "\n".join(summary_parts)

src/llmscraper/evaluation/__init__.py

1"""
2Evaluation module for ScraperCodeGenerator.
3"""
4
5from .html_quality_evaluator import HTMLQualityEvaluator
6
7__all__ = ["HTMLQualityEvaluator"]

src/llmscraper/evaluation/html_quality_evaluator.py

1"""
2HTML quality evaluation using Claude AI.
3"""
4
5import json
6import logging
7import re
8from typing import Optional
9
10import anthropic
11
12from ..models import EvaluationResult, PreEvaluationResult, ClaudeModel
13
14
15class HTMLQualityEvaluator:
16 """Evaluates HTML quality for web scraping using Claude AI."""
17
18 def __init__(self, claude_api_key: str, claude_model: ClaudeModel = ClaudeModel.CLAUDE_3_5_SONNET):
19 """Initialize with Claude API key and model."""
20 if not claude_api_key or not claude_api_key.strip():
21 raise ValueError("Claude API key cannot be empty")
22
23 self.client = anthropic.Anthropic(api_key=claude_api_key)
24 self.claude_model = claude_model
25 self.logger = logging.getLogger(__name__)
26
27 def evaluate_html_quality(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:
28 """
29 Evaluate HTML quality for data extraction.
30
31 Args:
32 user_goal: User's extraction goal
33 html_content: HTML content to evaluate
34
35 Returns:
36 EvaluationResult or None if evaluation fails
37 """
38 try:
39 # Pre-evaluation checks
40 pre_eval = self._pre_evaluate_html(html_content)
41 if not pre_eval.should_continue_to_claude:
42 if pre_eval.score is not None:
43 return EvaluationResult(score=pre_eval.score, reasoning=pre_eval.reasoning)
44 return None
45
46 # Claude evaluation
47 return self._evaluate_with_claude(user_goal, html_content)
48
49 except Exception as e:
50 self.logger.error(f"Error evaluating HTML quality: {str(e)}")
51 return None
52
53 def _pre_evaluate_html(self, html_content: str) -> PreEvaluationResult:
54 """Perform basic HTML validation checks."""
55 if not html_content or not html_content.strip():
56 return PreEvaluationResult(
57 is_valid_html=False,
58 score=1,
59 reasoning="Empty or whitespace-only HTML content",
60 should_continue_to_claude=False
61 )
62
63 # Check for common failure indicators
64 content_lower = html_content.lower()
65
66 # Bot detection/blocking indicators
67 blocking_indicators = [
68 'please verify you are a human',
69 'access denied',
70 'blocked',
71 'captcha',
72 'cloudflare',
73 'ddos protection',
74 'security check',
75 'bot detected'
76 ]
77
78 for indicator in blocking_indicators:
79 if indicator in content_lower:
80 return PreEvaluationResult(
81 is_valid_html=False,
82 score=1,
83 reasoning=f"HTML appears to be blocked/bot-detected (found: '{indicator}')",
84 should_continue_to_claude=False
85 )
86
87 # Check for minimal HTML structure
88 if not re.search(r'<html|<body|<div|<p|<span', content_lower):
89 return PreEvaluationResult(
90 is_valid_html=False,
91 score=2,
92 reasoning="HTML lacks basic structural elements",
93 should_continue_to_claude=False
94 )
95
96 return PreEvaluationResult(
97 is_valid_html=True,
98 should_continue_to_claude=True
99 )
100
101 def _evaluate_with_claude(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:
102 """Evaluate HTML using Claude AI."""
103 try:
104 prompt = self._create_evaluation_prompt(user_goal, html_content)
105
106 response = self.client.messages.create(
107 model=self.claude_model.value,
108 max_tokens=500,
109 messages=[{"role": "user", "content": prompt}]
110 )
111
112 content = response.content[0].text
113 return self._parse_evaluation_response(content)
114
115 except Exception as e:
116 self.logger.error(f"Error in Claude evaluation: {str(e)}")
117 return None
118
119 def _create_evaluation_prompt(self, user_goal: str, html_content: str) -> str:
120 """Create the evaluation prompt for Claude."""
121 return f"""You are an expert web scraper evaluator. Analyze the provided HTML and determine how suitable it is for extracting the requested data.
122
123USER EXTRACTION GOAL:
124{user_goal}
125
126HTML CONTENT TO EVALUATE:
127{html_content}
128
129Evaluate the HTML on a scale of 1-10 based on:
1301. Presence of the target data elements
1312. HTML structure quality and accessibility
1323. Whether the page loaded correctly (not blocked, error page, etc.)
1334. How easy it would be to extract the requested data
134
135Return your evaluation in this EXACT JSON format:
136{{
137 "score": [1-10 integer],
138 "reasoning": "[brief explanation of the score]"
139}}
140
141Only return the JSON, no other text.
142"""
143
144 def _parse_evaluation_response(self, response: str) -> Optional[EvaluationResult]:
145 """Parse Claude's evaluation response."""
146 try:
147 # Extract JSON from response
148 json_match = re.search(r'\{.*\}', response, re.DOTALL)
149 if not json_match:
150 raise ValueError("No JSON found in response")
151
152 data = json.loads(json_match.group())
153
154 score = data.get('score')
155 reasoning = data.get('reasoning', '')
156
157 if not isinstance(score, int) or score < 1 or score > 10:
158 raise ValueError(f"Invalid score: {score}")
159
160 return EvaluationResult(score=score, reasoning=reasoning)
161
162 except Exception as e:
163 self.logger.error(f"Error parsing evaluation response: {str(e)}")
164 return None
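For orientation, a minimal usage sketch of the evaluator above. The API key and HTML snippets are placeholders; evaluate_html_quality makes a live Anthropic API call, so only the pre-evaluation path runs offline.

from llmscraper.evaluation import HTMLQualityEvaluator

evaluator = HTMLQualityEvaluator(claude_api_key="sk-ant-...")

# Blocked pages short-circuit in _pre_evaluate_html with score 1, before any
# API call is made:
blocked = evaluator._pre_evaluate_html("<html>Access denied - captcha</html>")
print(blocked.score, blocked.reasoning)

# A plausible page goes on to Claude for a 1-10 suitability score:
html = "<html><body><div class='product'><h2>Widget</h2><span>$9.99</span></div></body></html>"
result = evaluator.evaluate_html_quality(
    user_goal="Extract product names and prices",
    html_content=html,
)
if result is not None:
    print(f"score={result.score}/10 reasoning={result.reasoning}")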

src/llmscraper/generation/__init__.py

1"""
2Generation module for ScraperCodeGenerator.
3"""
4
5from .script_generator import ScriptGenerator
6from .script_executor import ScriptExecutor
7
8__all__ = ["ScriptGenerator", "ScriptExecutor"]

src/llmscraper/generation/script_executor.py

1"""
2Script execution and testing functionality.
3"""
4
5import subprocess
6import tempfile
7import os
8 import sys
9import logging
10from typing import Dict, Any, Optional
11import ast
12import traceback
13
14
15class ScriptExecutor:
16 """Executes and tests generated scraping scripts."""
17
18 def __init__(self):
19 """Initialize the script executor."""
20 self.logger = logging.getLogger(__name__)
21
22 def test_script(self, script_content: str, html_content: str) -> Dict[str, Any]:
23 """
24 Test a scraping script against sample HTML content.
25
26 Args:
27 script_content: The Python script to test
28 html_content: Sample HTML to test against
29
30 Returns:
31 Dict with test results including success, data, and errors
32 """
33 try:
34 # Extract the extract_data function from the script
35 extract_function = self._extract_function_from_script(script_content, 'extract_data')
36
37 if not extract_function:
38 return {
39 "success": False,
40 "error": "Could not find extract_data function in script",
41 "data": None
42 }
43
44 # Create a restricted execution environment (note: __import__ is exposed so the generated code can import bs4/re; this is not a strict sandbox)
45 safe_globals = {
46 '__builtins__': {
47 'len': len,
48 'str': str,
49 'int': int,
50 'float': float,
51 'bool': bool,
52 'list': list,
53 'dict': dict,
54 'range': range,
55 'enumerate': enumerate,
56 'zip': zip,
57 'isinstance': isinstance,
58 'hasattr': hasattr,
59 'getattr': getattr,
60 'print': print,
61 '__import__': __import__,
62 }
63 }
64
65 # Import necessary modules into the environment
66 exec("from bs4 import BeautifulSoup", safe_globals)
67 exec("import re", safe_globals)
68 exec("import json", safe_globals)
69
70 # Execute the function definition
71 exec(extract_function, safe_globals)
72
73 # Call the function with the HTML content
74 extracted_data = safe_globals['extract_data'](html_content)
75
76 return {
77 "success": True,
78 "data": extracted_data,
79 "error": None,
80 "data_type": type(extracted_data).__name__,
81 "item_count": len(extracted_data) if isinstance(extracted_data, (list, dict)) else 1
82 }
83
84 except Exception as e:
85 self.logger.error(f"Error testing script: {str(e)}")
86 return {
87 "success": False,
88 "error": str(e),
89 "data": None,
90 "traceback": traceback.format_exc()
91 }
92
93 def _extract_function_from_script(self, script_content: str, function_name: str) -> Optional[str]:
94 """Extract a specific function from a script."""
95 try:
96 # Parse the script into an AST
97 tree = ast.parse(script_content)
98
99 # Find the function definition
100 for node in ast.walk(tree):
101 if isinstance(node, ast.FunctionDef) and node.name == function_name:
102 # Get the source code of the function
103 lines = script_content.split('\n')
104 start_line = node.lineno - 1
105
106 # Find the end of the function
107 end_line = start_line + 1
108 while end_line < len(lines):
109 line = lines[end_line]
110 # Check if this line starts a new function or class
111 if line.strip() and not line.startswith(' ') and not line.startswith('\t'):
112 break
113 end_line += 1
114
115 return '\n'.join(lines[start_line:end_line])
116
117 return None
118
119 except Exception as e:
120 self.logger.error(f"Error extracting function: {str(e)}")
121 return None
122
123 def validate_script_syntax(self, script_content: str) -> Dict[str, Any]:
124 """
125 Validate the syntax of a Python script.
126
127 Args:
128 script_content: The Python script to validate
129
130 Returns:
131 Dict with validation results
132 """
133 try:
134 # Try to parse the script
135 ast.parse(script_content)
136
137 return {
138 "valid": True,
139 "error": None
140 }
141
142 except SyntaxError as e:
143 return {
144 "valid": False,
145 "error": f"Syntax error: {str(e)}",
146 "line": e.lineno,
147 "offset": e.offset
148 }
149 except Exception as e:
150 return {
151 "valid": False,
152 "error": f"Parse error: {str(e)}"
153 }
154
155 def run_script_in_sandbox(self, script_content: str, timeout: int = 60) -> Dict[str, Any]:
156 """
157 Run a complete script in a sandboxed environment.
158
159 Args:
160 script_content: The complete Python script
161 timeout: Maximum execution time in seconds
162
163 Returns:
164 Dict with execution results
165 """
166 try:
167 # Create a temporary file
168 with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as temp_file:
169 temp_file.write(script_content)
170 temp_file_path = temp_file.name
171
172 try:
173 # Run the script
174 result = subprocess.run(
175 [sys.executable, temp_file_path], # use the current interpreter
176 capture_output=True,
177 text=True,
178 timeout=timeout,
179 cwd=os.path.dirname(temp_file_path)
180 )
181
182 return {
183 "success": result.returncode == 0,
184 "stdout": result.stdout,
185 "stderr": result.stderr,
186 "return_code": result.returncode
187 }
188
189 finally:
190 # Clean up the temporary file
191 os.unlink(temp_file_path)
192
193 except subprocess.TimeoutExpired:
194 return {
195 "success": False,
196 "stdout": "",
197 "stderr": f"Script execution timed out after {timeout} seconds",
198 "return_code": -1
199 }
200 except Exception as e:
201 return {
202 "success": False,
203 "stdout": "",
204 "stderr": str(e),
205 "return_code": -1
206 }
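A sketch of the intended flow through ScriptExecutor: validate syntax statically, then dry-run the extract_data function against sample HTML. The script and HTML below are fabricated for illustration.

from llmscraper.generation import ScriptExecutor

SCRIPT = '''
def extract_data(html_content):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    return [{"title": h2.get_text(strip=True)} for h2 in soup.find_all("h2")]
'''

HTML = "<html><body><h2>First</h2><h2>Second</h2></body></html>"

executor = ScriptExecutor()

# Cheap AST-based check first, then an in-process test run.
if executor.validate_script_syntax(SCRIPT)["valid"]:
    outcome = executor.test_script(SCRIPT, HTML)
    # Expected: success=True, item_count=2,
    # data=[{'title': 'First'}, {'title': 'Second'}]
    print(outcome["success"], outcome["item_count"], outcome["data"])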

src/llmscraper/generation/script_generator.py

1"""
2Code generation functionality for creating scraping scripts.
3"""
4
5import logging
6from typing import Optional
7import re
8
9import anthropic
10from ..models import ClaudeModel
11
12
13class ScriptGenerator:
14 """Generates Python scraping scripts using Claude AI."""
15
16 def __init__(self, claude_api_key: str, claude_model: ClaudeModel = ClaudeModel.CLAUDE_3_5_SONNET):
17 """Initialize with Claude API key and model."""
18 if not claude_api_key or not claude_api_key.strip():
19 raise ValueError("Claude API key cannot be empty")
20
21 self.client = anthropic.Anthropic(api_key=claude_api_key)
22 self.claude_model = claude_model
23 self.logger = logging.getLogger(__name__)
24
25 def generate_scraping_script(self, target_url: str, best_actor: str,
26 pruned_html: str, user_goal: str,
27 for_actor: bool = False) -> Optional[str]:
28 """
29 Generate a complete Python scraping script.
30
31 Args:
32 target_url: The target URL to scrape
33 best_actor: Name of the best performing actor
34 pruned_html: Sample HTML content for reference
35 user_goal: User's extraction goal
36 for_actor: If True, generate for Apify actor (key-value store output)
37
38 Returns:
39 Complete Python script as string, or None if generation fails
40 """
41 try:
42 # Generate the HTML parsing code from Claude
43 parsing_code = self._generate_html_parsing_code(pruned_html, user_goal)
44
45 if not parsing_code:
46 self.logger.error("Failed to generate HTML parsing code")
47 return None
48
49 # Create the complete script
50 if for_actor:
51 return self._create_actor_script(target_url, best_actor, parsing_code, user_goal)
52 else:
53 return self._create_standalone_script(target_url, best_actor, parsing_code, user_goal)
54
55 except Exception as e:
56 self.logger.error(f"Error generating script: {str(e)}")
57 return None
58
59 def _generate_html_parsing_code(self, pruned_html: str, user_goal: str) -> Optional[str]:
60 """Generate HTML parsing/extraction code using Claude."""
61 try:
62 prompt = f"""Generate Python code that parses HTML content and extracts data based on the user's goal. You should generate ONLY the extraction logic, not a complete script.
63
64## USER GOAL:
65{user_goal}
66
67## SAMPLE HTML (for reference):
68{pruned_html}
69
70## REQUIREMENTS:
711. Create a function called `extract_data(html_content)` that takes HTML string as input
722. Use BeautifulSoup to parse the HTML
733. Extract the data according to the user's goal using CSS selectors, attributes, text content, etc.
744. Return the extracted data as a Python dictionary or list of dictionaries
755. Handle missing or malformed data gracefully
766. Include appropriate error handling
77
78## EXAMPLE OUTPUT FORMAT:
79```python
80def extract_data(html_content):
81 from bs4 import BeautifulSoup
82
83 soup = BeautifulSoup(html_content, 'html.parser')
84 results = []
85
86 # Your extraction logic here
87 # Use soup.find(), soup.find_all(), CSS selectors, etc.
88
89 return results
90```
91
92Generate ONLY the `extract_data` function and any helper functions needed. Do not include imports outside the function, full scripts, or other boilerplate code."""
93
94 self.logger.info("Requesting HTML parsing code generation from Claude...")
95
96 response = self.client.messages.create(
97 model=self.claude_model.value,
98 max_tokens=2000,
99 messages=[{"role": "user", "content": prompt}]
100 )
101
102 parsing_code = response.content[0].text
103
104 # Extract Python code from response if wrapped in code blocks
105 if "```python" in parsing_code:
106 code_match = re.search(r'```python\n(.*?)\n```', parsing_code, re.DOTALL)
107 if code_match:
108 parsing_code = code_match.group(1)
109
110 return parsing_code
111
112 except Exception as e:
113 self.logger.error(f"Error generating HTML parsing code: {str(e)}")
114 return None
115
116 def _create_standalone_script(self, target_url: str, best_actor: str,
117 parsing_code: str, user_goal: str) -> str:
118 """Create a standalone Python script."""
119 return f'''#!/usr/bin/env python3
120"""
121Generated Web Scraper
122Target: {target_url}
123Goal: {user_goal}
124Best Actor: {best_actor}
125Generated by: ScraperCodeGenerator
126
127This script is completely standalone and does not require the original ScraperCodeGenerator project.
128"""
129
130import os
131import json
132import logging
133from typing import Dict, Any, List, Optional
134
135# Check and import required libraries
136try:
137 import requests
138except ImportError:
139 raise ImportError("requests not installed. Please install using: pip install requests")
140
141try:
142 from bs4 import BeautifulSoup
143except ImportError:
144 raise ImportError("beautifulsoup4 not installed. Please install using: pip install beautifulsoup4")
145
146try:
147 from apify_client import ApifyClient
148except ImportError:
149 raise ImportError("apify-client not installed. Please install using: pip install apify-client")
150
151
152{parsing_code}
153
154
155def run_actor_scraping(target_url: str, apify_token: str) -> Optional[str]:
156 """
157 Run the best performing actor to get HTML content.
158
159 Args:
160 target_url: URL to scrape
161 apify_token: Apify API token
162
163 Returns:
164 HTML content or None if failed
165 """
166 client = ApifyClient(apify_token)
167
168 # Actor configuration for {best_actor}
169 actor_input = {{
170 "startUrls": [{{"url": target_url}}],
171 "maxRequestRetries": 3,
172 "requestTimeoutSecs": 30,
173 "maxPagesPerCrawl": 1,
174 }}
175
176 # Add actor-specific configuration
177 if "{best_actor}" == "cheerio-scraper":
178 actor_input.update({{
179 "pageFunction": \'\'\'
180 async function pageFunction(context) {{
181 const {{ request, log, $ }} = context;
182 try {{
183 const title = $('title').text() || '';
184 const html = $('html').html() || '';
185 return {{
186 url: request.url,
187 title: title,
188 html: html
189 }};
190 }} catch (error) {{
191 log.error('Error in pageFunction:', error);
192 return {{
193 url: request.url,
194 title: '',
195 html: ''
196 }};
197 }}
198 }}
199 \'\'\',
200 "proxyConfiguration": {{"useApifyProxy": True}}
201 }})
202 actor_id = "apify/cheerio-scraper"
203 elif "{best_actor}" == "web-scraper":
204 actor_input.update({{
205 "pageFunction": \'\'\'
206 async function pageFunction(context) {{
207 const {{ request, log, page }} = context;
208 try {{
209 const title = await page.title();
210 const html = await page.content();
211 return {{
212 url: request.url,
213 title: title,
214 html: html
215 }};
216 }} catch (error) {{
217 log.error('Error in pageFunction:', error);
218 return {{
219 url: request.url,
220 title: '',
221 html: ''
222 }};
223 }}
224 }}
225 \'\'\',
226 "proxyConfiguration": {{"useApifyProxy": True}}
227 }})
228 actor_id = "apify/web-scraper"
229 elif "{best_actor}" == "website-content-crawler":
230 actor_input = {{
231 "startUrls": [{{"url": target_url}}],
232 "maxCrawlPages": 1,
233 "crawler": "playwright",
234 "proxyConfiguration": {{"useApifyProxy": True}}
235 }}
236 actor_id = "apify/website-content-crawler"
237 else:
238 # Fallback to simple requests if actor not recognized
239 logging.warning(f"Unknown actor '{best_actor}', falling back to requests")
240 try:
241 response = requests.get(target_url, timeout=30)
242 response.raise_for_status()
243 return response.text
244 except Exception as e:
245 logging.error(f"Failed to fetch with requests: {{e}}")
246 return None
247
248 try:
249 # Run the actor
250 logging.info(f"Running {{actor_id}} actor...")
251 run = client.actor(actor_id).call(run_input=actor_input)
252
253 # Get the dataset items
254 dataset_client = client.dataset(run["defaultDatasetId"])
255 items = list(dataset_client.iterate_items())
256
257 if not items:
258 logging.warning("No items returned from actor")
259 return None
260
261 # Extract HTML content
262 item = items[0]
263 html_content = item.get('html') or item.get('text') or item.get('markdown', '')
264
265 if not html_content:
266 logging.warning("No HTML content found in actor result")
267 return None
268
269 return html_content
270
271 except Exception as e:
272 logging.error(f"Error running actor: {{e}}")
273 return None
274
275
276def main():
277 """Main function to run the scraper."""
278 # Setup logging
279 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
280 logger = logging.getLogger(__name__)
281
282 # Configuration
283 target_url = "{target_url}"
284 apify_token = os.getenv("APIFY_TOKEN")
285
286 if not apify_token:
287 logger.error("APIFY_TOKEN environment variable not set")
288 logger.info("Please set your Apify API token: export APIFY_TOKEN='your_token_here'")
289 logger.info("Get your token at: https://console.apify.com/")
290 return
291
292 try:
293 logger.info(f"🚀 Starting scraper for: {{target_url}}")
294 logger.info(f"📝 Goal: {user_goal}")
295 logger.info(f"🏆 Using best actor: {best_actor}")
296
297 # Get HTML content using the best performing actor
298 html_content = run_actor_scraping(target_url, apify_token)
299
300 if not html_content:
301 logger.error("Failed to get HTML content")
302 return
303
304 logger.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")
305
306 # Extract data using the generated parsing code
307 logger.info("🔍 Extracting data from HTML...")
308 extracted_data = extract_data(html_content)
309
310 if not extracted_data:
311 logger.warning("No data was extracted from the HTML")
312 return
313
314 # Prepare final results
315 results = {{
316 "target_url": target_url,
317 "extraction_goal": "{user_goal}",
318 "actor_used": "{best_actor}",
319 "data": extracted_data,
320 "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1
321 }}
322
323 # Output results
324 print("\\n" + "="*60)
325 print("📊 EXTRACTION RESULTS")
326 print("="*60)
327 print(json.dumps(results, indent=2, ensure_ascii=False))
328
329 # Save to file
330 output_file = "extracted_data.json"
331 with open(output_file, 'w', encoding='utf-8') as f:
332 json.dump(results, f, indent=2, ensure_ascii=False)
333
334 logger.info(f"💾 Results saved to {{output_file}}")
335 logger.info(f"🎉 Successfully extracted {{results['total_items']}} items!")
336
337 except Exception as e:
338 logger.error(f"❌ Scraping failed: {{e}}")
339 import traceback
340 traceback.print_exc()
341
342
343if __name__ == "__main__":
344 main()
345'''
346
347 def _create_actor_script(self, target_url: str, best_actor: str,
348 parsing_code: str, user_goal: str) -> str:
349 """Create a script for Apify actor."""
350 return f'''"""
351Apify Actor Script
352Target: {target_url}
353Goal: {user_goal}
354Best Actor: {best_actor}
355Generated by: ScraperCodeGenerator
356
357This script is completely standalone and does not require the original ScraperCodeGenerator project.
358"""
359
360import json
361from typing import Optional
362
363# Check and import required libraries
364try:
365 from apify import Actor
366except ImportError:
367 raise ImportError("apify not installed. Please install using: pip install apify")
368
369try:
370 from bs4 import BeautifulSoup
371except ImportError:
372 raise ImportError("beautifulsoup4 not installed. Please install using: pip install beautifulsoup4")
373
374try:
375 from apify_client import ApifyClient
376except ImportError:
377 raise ImportError("apify-client not installed. Please install using: pip install apify-client")
378
379
380{parsing_code}
381
382
383async def run_actor_scraping(target_url: str, apify_token: str) -> Optional[str]:
384 """
385 Run the best performing actor to get HTML content.
386
387 Args:
388 target_url: URL to scrape
389 apify_token: Apify API token
390
391 Returns:
392 HTML content or None if failed
393 """
394 client = ApifyClient(apify_token)
395
396 # Actor configuration for {best_actor}
397 actor_input = {{
398 "startUrls": [{{"url": target_url}}],
399 "maxRequestRetries": 3,
400 "requestTimeoutSecs": 30,
401 "maxPagesPerCrawl": 1,
402 }}
403
404 # Add actor-specific configuration
405 if "{best_actor}" == "cheerio-scraper":
406 actor_input.update({{
407 "pageFunction": \'\'\'
408 async function pageFunction(context) {{
409 const {{ request, log, $ }} = context;
410 try {{
411 const title = $('title').text() || '';
412 const html = $('html').html() || '';
413 return {{
414 url: request.url,
415 title: title,
416 html: html
417 }};
418 }} catch (error) {{
419 log.error('Error in pageFunction:', error);
420 return {{
421 url: request.url,
422 title: '',
423 html: ''
424 }};
425 }}
426 }}
427 \'\'\',
428 "proxyConfiguration": {{"useApifyProxy": True}}
429 }})
430 actor_id = "apify/cheerio-scraper"
431 elif "{best_actor}" == "web-scraper":
432 actor_input.update({{
433 "pageFunction": \'\'\'
434 async function pageFunction(context) {{
435 const {{ request, log, page }} = context;
436 try {{
437 const title = await page.title();
438 const html = await page.content();
439 return {{
440 url: request.url,
441 title: title,
442 html: html
443 }};
444 }} catch (error) {{
445 log.error('Error in pageFunction:', error);
446 return {{
447 url: request.url,
448 title: '',
449 html: ''
450 }};
451 }}
452 }}
453 \'\'\',
454 "proxyConfiguration": {{"useApifyProxy": True}}
455 }})
456 actor_id = "apify/web-scraper"
457 elif "{best_actor}" == "website-content-crawler":
458 actor_input = {{
459 "startUrls": [{{"url": target_url}}],
460 "maxCrawlPages": 1,
461 "crawler": "playwright",
462 "proxyConfiguration": {{"useApifyProxy": True}}
463 }}
464 actor_id = "apify/website-content-crawler"
465 else:
466 Actor.log.error(f"Unknown actor: {best_actor}")
467 return None
468
469 try:
470 # Run the actor
471 Actor.log.info(f"Running {{actor_id}} actor...")
472 run = client.actor(actor_id).call(run_input=actor_input)
473
474 # Get the dataset items
475 dataset_client = client.dataset(run["defaultDatasetId"])
476 items = list(dataset_client.iterate_items())
477
478 if not items:
479 Actor.log.warning("No items returned from actor")
480 return None
481
482 # Extract HTML content
483 item = items[0]
484 html_content = item.get('html') or item.get('text') or item.get('markdown', '')
485
486 if not html_content:
487 Actor.log.warning("No HTML content found in actor result")
488 return None
489
490 return html_content
491
492 except Exception as e:
493 Actor.log.error(f"Error running actor: {{e}}")
494 return None
495
496
497async def main():
498 """Main actor function."""
499 async with Actor:
500 # Get input
501 actor_input = await Actor.get_input() or {{}}
502 target_url = actor_input.get('targetUrl', '{target_url}')
503 user_goal = actor_input.get('userGoal', '{user_goal}')
504 apify_token = actor_input.get('apifyToken') or Actor.config.token
505
506 Actor.log.info(f"🚀 Starting scraper for: {{target_url}}")
507 Actor.log.info(f"📝 Goal: {{user_goal}}")
508 Actor.log.info(f"🏆 Using best actor: {best_actor}")
509
510 try:
511 # Get HTML content using the best performing actor
512 html_content = await run_actor_scraping(target_url, apify_token)
513
514 if not html_content:
515 await Actor.fail(status_message=f"Failed to get HTML content from {best_actor} actor")
516 return
517
518 Actor.log.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")
519
520 # Extract data using the generated parsing code
521 Actor.log.info("🔍 Extracting data from HTML...")
522 extracted_data = extract_data(html_content)
523
524 if not extracted_data:
525 Actor.log.warning("No data was extracted from the HTML")
526 extracted_data = []
527
528 # Prepare final results
529 results = {{
530 "target_url": target_url,
531 "extraction_goal": user_goal,
532 "actor_used": "{best_actor}",
533 "data": extracted_data,
534 "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1
535 }}
536
537 # Save to key-value store
538 await Actor.set_value('OUTPUT', results)
539
540 Actor.log.info(f"🎉 Successfully extracted {{results['total_items']}} items!")
541 Actor.log.info("💾 Results saved to key-value store as 'OUTPUT'")
542
543 except Exception as e:
544 Actor.log.error(f"❌ Scraping failed: {{e}}")
545 await Actor.fail(status_message=str(e))
546
547
548if __name__ == "__main__":
549 import asyncio
550 asyncio.run(main())
551'''

src/llmscraper/llm_scraper/__init__.py

1"""
2LLM Scraper package - Intelligent web scraping using Claude and Apify MCP.
3"""
4
5from .actor import LLMScraperActor, run_llm_scraper_actor, run_llm_scraper
6from .models import (
7 LLMScraperInput, LLMScraperOutput, ActorAttempt,
8 ScrapingStrategy, ProgressUpdate
9)
10from .retry_logic import RetryManager
11from .quality_evaluator import DataQualityEvaluator, QualityMetrics
12
13__all__ = [
14 'LLMScraperActor',
15 'run_llm_scraper_actor',
16 'run_llm_scraper',
17 'LLMScraperInput',
18 'LLMScraperOutput',
19 'ActorAttempt',
20 'ScrapingStrategy',
21 'ProgressUpdate',
22 'RetryManager',
23 'DataQualityEvaluator',
24 'QualityMetrics'
25]

src/llmscraper/llm_scraper/actor.py

1"""
2Main LLM Scraper Actor implementation.
3"""
4
5import asyncio
6import logging
7import time
8from datetime import datetime, timedelta
9from typing import Dict, List, Any, Optional, Callable
10from urllib.parse import urlparse
11import json
12import os
13
14from apify import Actor
15
16from ..mcp import MCPClient
17from ..claude import ClaudeManager, ConversationResult
18from .models import (
19 LLMScraperInput, LLMScraperOutput, ActorAttempt,
20 ScrapingStrategy, ProgressUpdate
21)
22from .retry_logic import RetryManager
23from .quality_evaluator import DataQualityEvaluator
24
25
26class LLMScraperActor:
27 """
28 Main LLM Scraper Actor that uses Claude to discover and test Apify actors.
29 """
30
31 def __init__(self, input_config: LLMScraperInput):
32 """Initialize the LLM Scraper Actor."""
33 self.config = input_config
34 self.start_time = datetime.now()
35 self.logger = logging.getLogger(__name__)
36
37 # Initialize components
38 self.mcp_client: Optional[MCPClient] = None
39 self.claude_manager: Optional[ClaudeManager] = None
40 self.retry_manager = RetryManager(
41 max_retries_per_actor=input_config.max_retries_per_actor
42 )
43 self.quality_evaluator = DataQualityEvaluator()
44
45 # State tracking
46 self.output = LLMScraperOutput(
47 success=False,
48 status="initializing",
49 claude_model_used=input_config.model_name,
50 mcp_server_used=input_config.mcp_url
51 )
52 self.current_attempts = 0
53 self.tested_actors: Dict[str, List[ActorAttempt]] = {}
54
55 # Progress callback
56 self.on_progress: Optional[Callable[[ProgressUpdate], None]] = None
57
58 async def run(self) -> LLMScraperOutput:
59 """
60 Main execution method.
61
62 Returns:
63 LLMScraperOutput with results
64 """
65 try:
66 self.logger.info(f"🚀 Starting LLM Scraper for {self.config.target_url}")
67
68 # Validate input
69 self.config.validate()
70
71 # Initialize connections
72 await self._initialize_connections()
73
74 # Main scraping process
75 await self._execute_scraping_process()
76
77 # Finalize results
78 self._finalize_results()
79
80 self.logger.info(f"✅ LLM Scraper completed: {self.output.status}")
81 return self.output
82
83 except Exception as e:
84 self.logger.error(f"❌ LLM Scraper failed: {str(e)}")
85 self.output.success = False
86 self.output.status = "failed"
87 self.output.llm_reasoning = f"Execution failed: {str(e)}"
88 self._finalize_results()
89 return self.output
90
91 async def _initialize_connections(self):
92 """Initialize MCP client and Claude manager."""
93 self._emit_progress("initializing", "Setting up connections...")
94
95 # Get API keys
96 anthropic_key = (self.config.anthropic_api_key or
97 os.getenv('ANTHROPIC_API_KEY') or
98 os.getenv('CLAUDE_API_KEY'))
99
100 if not anthropic_key:
101 raise ValueError("Anthropic API key is required (anthropic_api_key, or the ANTHROPIC_API_KEY / CLAUDE_API_KEY env vars)")
102
103 apify_token = os.getenv('APIFY_TOKEN')
104 if not apify_token:
105 raise ValueError("APIFY_TOKEN environment variable is required")
106
107 # Initialize MCP client
108 self.mcp_client = MCPClient(
109 server_url=self.config.mcp_url,
110 apify_token=apify_token
111 )
112
113 connection_success = await self.mcp_client.connect()
114 if not connection_success:
115 self.logger.warning("MCP server connection failed, using fallback mode")
116
117 # Initialize Claude manager
118 self.claude_manager = ClaudeManager(
119 api_key=anthropic_key,
120 model=self.config.model_name,
121 max_tool_calls_per_round=min(self.config.max_actor_attempts, 15)
122 )
123
124 # Set system prompt with specific parameters
125 self.claude_manager.set_system_prompt(
126 target_url=self.config.target_url,
127 extraction_goal=self.config.extraction_goal,
128 max_attempts=self.config.max_actor_attempts,
129 max_retries=self.config.max_retries_per_actor,
130 max_time=self.config.max_time_minutes
131 )
132
133 self.logger.info("✅ Connections initialized successfully")
134
135 async def _execute_scraping_process(self):
136 """Execute the main scraping discovery and testing process."""
137 # Phase 1: Actor Discovery
138 self._emit_progress("discovery", "Discovering suitable actors...")
139 candidate_actors = await self._discover_actors()
140
141 if not candidate_actors:
142 self.output.status = "no_actors_found"
143 self.output.llm_reasoning = "No suitable actors found for the given task"
144 return
145
146 self.logger.info(f"Found {len(candidate_actors)} candidate actors to test")
147 for i, actor in enumerate(candidate_actors[:10], 1):
148 self.logger.info(f" {i}. {actor['name']} - {actor['title']} (priority: {actor['priority']:.2f})")
149 if len(candidate_actors) > 10:
150 self.logger.info(f" ... and {len(candidate_actors) - 10} more actors")
151
152 # Phase 2: Actor Testing
153 self._emit_progress("testing", f"Testing {len(candidate_actors)} actors...")
154 await self._test_actors(candidate_actors)
155
156 # Determine final status
157 if self._has_successful_attempts():
158 self.output.success = True
159 self.output.status = "completed"
160 elif self._time_exceeded():
161 self.output.status = "timeout"
162 elif self.current_attempts >= self.config.max_actor_attempts:
163 self.output.status = "limit_reached"
164 else:
165 self.output.status = "no_successful_actors"
166
167 def _build_discovery_query(self) -> str:
168 """Build the initial query for Claude."""
169 domain = urlparse(self.config.target_url).netloc
170
171 query = f"""I need to find the best Apify actors to scrape this website:
172
173TARGET URL: {self.config.target_url}
174DOMAIN: {domain}
175EXTRACTION GOAL: {self.config.extraction_goal}
176
177Please search for relevant actors and provide me with a prioritized list of actor names to test.
178
179Process:
1801. Use search-actors to find actors specifically for this domain/platform
1812. If no specific actors found, search for general web scraping actors
1823. Analyze the search results and extract the most promising actor names
1834. Provide a prioritized list based on relevance, popularity, and suitability
184
185Focus on finding actor NAMES/IDs that I can test, not on executing them yet.
186Start by searching for actors relevant to this task."""
187
188 return query
189
190 async def _discover_actors(self) -> List[Dict[str, Any]]:
191 """
192 Phase 1: Discover candidate actors for the scraping task.
193
194 Returns:
195 List of candidate actors with metadata
196 """
197 # Create discovery query
198 query = self._build_discovery_query()
199
200 # Process with Claude for discovery only
201 result = await self.claude_manager.process_query(
202 query=query,
203 mcp_client=self.mcp_client,
204 on_progress=self._handle_claude_progress
205 )
206
207 # Extract candidate actors from tool calls
208 candidate_actors = []
209 for tool_call in result.tool_calls:
210 if tool_call.name == "search-actors":
211 # Get the actual result from MCP client
212 search_result = await self.mcp_client.call_tool(
213 tool_call.name,
214 tool_call.arguments
215 )
216
217 if not search_result.is_error and isinstance(search_result.content, list):
218 for actor_data in search_result.content:
219 candidate_actors.append({
220 "name": actor_data.get("name", ""),
221 "title": actor_data.get("title", ""),
222 "description": actor_data.get("description", ""),
223 "username": actor_data.get("username", ""),
224 "stats": actor_data.get("stats", {}),
225 "priority": self._calculate_actor_priority(actor_data)
226 })
227
228 # Sort by priority (higher is better)
229 candidate_actors.sort(key=lambda x: x["priority"], reverse=True)
230
231 # Limit to max attempts
232 candidate_actors = candidate_actors[:self.config.max_actor_attempts]
233
234 self.logger.info(f"Discovered {len(candidate_actors)} candidate actors")
235 for i, actor in enumerate(candidate_actors[:5]):
236 self.logger.info(f" {i+1}. {actor['name']} (priority: {actor['priority']:.2f})")
237
238 return candidate_actors
239
240 def _calculate_actor_priority(self, actor_data: Dict[str, Any]) -> float:
241 """Calculate priority score for an actor based on relevance and popularity."""
242 score = 0.0
243
244 name = actor_data.get("name", "").lower()
245 description = actor_data.get("description", "").lower()
246 stats = actor_data.get("stats", {})
247
248 # Domain-specific bonus
249 domain = urlparse(self.config.target_url).netloc.lower()
250 if domain.replace("www.", "") in name:
251 score += 10.0
252
253 # Platform-specific bonuses
254 platform_keywords = {
255 "linkedin": ["linkedin"],
256 "facebook": ["facebook", "fb"],
257 "instagram": ["instagram", "insta"],
258 "twitter": ["twitter", "x.com"],
259 "youtube": ["youtube"],
260 "amazon": ["amazon"],
261 "shopify": ["shopify"],
262 "ecommerce": ["shop", "store", "product", "ecommerce"]
263 }
264
265 for platform, keywords in platform_keywords.items():
266 if any(keyword in domain for keyword in keywords):
267 if any(keyword in name for keyword in keywords):
268 score += 8.0
269 elif any(keyword in description for keyword in keywords):
270 score += 5.0
271
272 # General scraper types
273 if "web-scraper" in name:
274 score += 4.0
275 elif "cheerio-scraper" in name:
276 score += 4.5
277 elif "website-content-crawler" in name:
278 score += 3.0
279 elif "puppeteer" in name:
280 score += 3.5
281
282 # Popularity bonus
283 users = stats.get("users", 0)
284 runs = stats.get("runs", 0)
285
286 if users > 1000:
287 score += 2.0
288 elif users > 100:
289 score += 1.0
290
291 if runs > 10000:
292 score += 2.0
293 elif runs > 1000:
294 score += 1.0
295
296 return score
297
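# Illustrative walk-through of the scoring above: for target
# https://www.amazon.com/... and an actor named "amazon-product-scraper"
# with 5,000 users and 50,000 runs, the platform keyword "amazon" matches
# the name (+8.0), users > 1000 (+2.0) and runs > 10000 (+2.0), giving a
# priority of 12.0. The +10.0 domain bonus would only fire if the literal
# string "amazon.com" appeared in the actor name.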
298 async def _test_actors(self, candidate_actors: List[Dict[str, Any]]):
299 """
300 Phase 2: Test each actor individually in separate conversations.
301
302 Args:
303 candidate_actors: List of actors to test
304 """
305 for i, actor in enumerate(candidate_actors):
306 if self.current_attempts >= self.config.max_actor_attempts:
307 self.logger.info("Reached maximum attempts limit")
308 break
309
310 if self._time_exceeded():
311 self.logger.info("Reached time limit")
312 break
313
314 actor_name = actor["name"]
315 self.logger.info(f"Testing actor {i+1}/{len(candidate_actors)}: {actor_name}")
316
317 # Test this actor in a separate conversation
318 success = await self._test_single_actor(actor)
319
320 if success and self.config.min_data_quality_score > 0:
321 # If we found a good enough actor, we might stop here
322 best_attempt = self._get_best_attempt()
323 if (best_attempt and
324 best_attempt.data_quality_score >= self.config.min_data_quality_score):
325 self.logger.info(f"Found satisfactory actor: {actor_name} " +
326 f"(quality: {best_attempt.data_quality_score:.2f})")
327 break
328
329 async def _test_single_actor(self, actor_info: Dict[str, Any]) -> bool:
330 """
331 Test a single actor in its own conversation.
332
333 Args:
334 actor_info: Actor information from discovery phase
335
336 Returns:
337 True if actor was successful, False otherwise
338 """
339 actor_name = actor_info["name"]
340
341 # Create a fresh Claude manager for this actor test
342 test_claude = ClaudeManager(
343 api_key=self.claude_manager.client.api_key,
344 model=self.config.model_name,
345 max_tool_calls_per_round=self.config.max_retries_per_actor + 2
346 )
347
348 # Set specialized system prompt for testing this specific actor
349 test_claude.system_prompt = f"""You are testing a specific Apify actor to see if it can scrape the target data.
350
351ACTOR TO TEST: {actor_name}
352ACTOR DESCRIPTION: {actor_info.get('description', 'No description')}
353
354TARGET URL: {self.config.target_url}
355EXTRACTION GOAL: {self.config.extraction_goal}
356
357YOUR TASK:
3581. Get details about this actor using get-actor-details
3592. Understand its input schema and capabilities
3603. Configure optimal input parameters for the target URL and goal
3614. Run the actor with those parameters
3625. Analyze the results to determine if it successfully extracted the target data
3636. If it fails, try up to {self.config.max_retries_per_actor} different input configurations
364
365IMPORTANT:
366- Focus ONLY on testing this specific actor
367- Try different input configurations if the first attempt fails
368- Look for the target data in the results
369- Determine success based on whether relevant data was extracted
370
371Start by getting details about the actor, then test it."""
372
373 # Create test query
374 test_query = f"""Please test the actor '{actor_name}' for my scraping task.
375
376Target URL: {self.config.target_url}
377Goal: {self.config.extraction_goal}
378
379Test this actor systematically and determine if it can extract the required data."""
380
381 # Run the test conversation
382 self._emit_progress(
383 "testing",
384 f"Testing {actor_name}...",
385 actor_name=actor_name.split('/')[-1],
386 attempt_number=1
387 )
388
389 try:
390 result = await test_claude.process_query(
391 query=test_query,
392 mcp_client=self.mcp_client,
393 on_progress=lambda role, content: self._handle_test_progress(actor_name, role, content)
394 )
395
396 # Analyze the tool calls to extract actor execution results
397 success = await self._analyze_single_actor_test(actor_name, result.tool_calls)
398
399 return success
400
401 except Exception as e:
402 self.logger.error(f"Error testing actor {actor_name}: {str(e)}")
403 return False
404
405 async def _analyze_single_actor_test(self, actor_name: str, tool_calls: List) -> bool:
406 """
407 Analyze the results of testing a single actor.
408
409 Args:
410 actor_name: Name of the actor being tested
411 tool_calls: Tool calls made during the test
412
413 Returns:
414 True if the actor was successful
415 """
416 actor_executions = []
417
418 # Find all actor execution attempts
419 for tool_call in tool_calls:
420 if tool_call.name in ["run-actor", "execute-actor"]:
421 if tool_call.arguments.get('actor_id') == actor_name or tool_call.arguments.get('actor') == actor_name:
422 actor_executions.append(tool_call)
423
424 if not actor_executions:
425 self.logger.warning(f"No executions found for {actor_name}")
426 return False
427
428 # Process each execution attempt
429 best_attempt = None
430 for i, execution in enumerate(actor_executions):
431 attempt = await self._create_attempt_from_execution(actor_name, execution, i + 1)
432
433 # Add to tracking
434 if actor_name not in self.tested_actors:
435 self.tested_actors[actor_name] = []
436 self.tested_actors[actor_name].append(attempt)
437 self.output.add_attempt(attempt)
438 self.current_attempts += 1
439
440 if attempt.success and (not best_attempt or attempt.data_quality_score > best_attempt.data_quality_score):
441 best_attempt = attempt
442
443 return best_attempt is not None and best_attempt.success
444
445 async def _create_attempt_from_execution(self, actor_name: str, execution_tool_call, attempt_number: int) -> ActorAttempt:
446 """Create an ActorAttempt from a tool call execution."""
447 input_config = execution_tool_call.arguments.get('input', {})
448
449 # Execute the actor to get real results
450 result = await self.mcp_client.call_tool(execution_tool_call.name, execution_tool_call.arguments)
451
452 attempt = ActorAttempt(
453 actor_id=actor_name,
454 actor_name=actor_name.split('/')[-1] if '/' in actor_name else actor_name,
455 attempt_number=attempt_number,
456 input_config=input_config,
457 timestamp=datetime.now().isoformat()
458 )
459
460 if result.is_error:
461 attempt.success = False
462 attempt.error_message = str(result.content)
463 attempt.error_type = "execution_error"
464 self.logger.warning(f"Actor {actor_name} failed: {attempt.error_message}")
465 else:
466 # Analyze the results to determine success
467 extracted_data = self._extract_data_from_result(result.content)
468
469 if extracted_data:
470 attempt.success = True
471 attempt.extracted_data = extracted_data
472 attempt.result_count = len(extracted_data) if isinstance(extracted_data, list) else 1
473 attempt.data_quality_score = self._evaluate_data_quality(extracted_data)
474 attempt.execution_time_seconds = 30.0 # Placeholder
475
476 self.logger.info(f"Actor {actor_name} succeeded: {attempt.result_count} items, " +
477 f"quality: {attempt.data_quality_score:.2f}")
478 else:
479 attempt.success = False
480 attempt.error_message = "No relevant data extracted"
481 attempt.error_type = "no_data"
482 self.logger.warning(f"Actor {actor_name} returned no relevant data")
483
484 return attempt
485
486 def _extract_data_from_result(self, result_content) -> List[Dict[str, Any]]:
487 """Extract meaningful data from actor execution result."""
488 if isinstance(result_content, list):
489 return result_content
490 elif isinstance(result_content, dict):
491 # Look for common data fields
492 if "items" in result_content:
493 return result_content["items"]
494 elif "data" in result_content:
495 return result_content["data"]
496 elif "results" in result_content:
497 return result_content["results"]
498 else:
499 return [result_content]
500 else:
501 return []
502
503 def _evaluate_data_quality(self, data: List[Dict[str, Any]]) -> float:
504 """Evaluate the quality of extracted data."""
505 if not data:
506 return 0.0
507
508 # Simple quality scoring based on data completeness
509 total_score = 0.0
510
511 for item in data:
512 item_score = 0.0
513
514 # Check for common useful fields
515 if "title" in item or "name" in item:
516 item_score += 0.3
517 if "price" in item or "cost" in item:
518 item_score += 0.2
519 if "url" in item or "link" in item:
520 item_score += 0.2
521 if "description" in item:
522 item_score += 0.2
523 if len(item.keys()) > 3: # Has multiple fields
524 item_score += 0.1
525
526 total_score += min(item_score, 1.0)
527
528 return min(total_score / len(data), 1.0)
529
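# Worked example of the scoring above (illustrative): an item like
# {"title": "Widget", "price": "$9.99", "url": "https://...", "sku": "W-1"}
# scores 0.3 (title) + 0.2 (price) + 0.2 (url) + 0.1 (more than 3 fields)
# = 0.8, while a bare {"title": "Widget"} scores 0.3; a two-item list with
# those items averages to (0.8 + 0.3) / 2 = 0.55.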
530 def _handle_test_progress(self, actor_name: str, role: str, content: str):
531 """Handle progress updates during individual actor testing."""
532 if role == "tool_call":
533 self.logger.debug(f"[{actor_name}] Tool call: {content}")
534 elif role == "tool_result":
535 self.logger.debug(f"[{actor_name}] Tool result: {content[:100]}...")
536
537 def _get_best_attempt(self) -> Optional[ActorAttempt]:
538 """Get the best successful attempt so far."""
539 best_attempt = None
540
541 for attempts in self.tested_actors.values():
542 for attempt in attempts:
543 if attempt.success and (not best_attempt or
544 attempt.data_quality_score > best_attempt.data_quality_score):
545 best_attempt = attempt
546
547 return best_attempt
548
549 def _handle_claude_progress(self, role: str, content: str):
550 """Handle progress updates from Claude conversation."""
551 if role == "tool_call":
552 self._emit_progress("testing", content)
553 elif role == "tool_result":
554 self._emit_progress("analyzing", f"Tool result: {content}")
555
556 def _has_successful_attempts(self) -> bool:
557 """Check if we have any successful attempts."""
558 return any(
559 attempt.success for attempts in self.tested_actors.values()
560 for attempt in attempts
561 )
562
563 def _time_exceeded(self) -> bool:
564 """Check if maximum execution time has been exceeded."""
565 elapsed = datetime.now() - self.start_time
566 return elapsed.total_seconds() > (self.config.max_time_minutes * 60)
567
568 def _finalize_results(self):
569 """Finalize the output results."""
570 end_time = datetime.now()
571 self.output.total_execution_time_seconds = (end_time - self.start_time).total_seconds()
572
573 # Add recommendations based on results
574 if self.output.success:
575 self.output.add_recommendation(
576 f"Use {self.output.best_actor_name} with the provided optimal configuration for best results"
577 )
578 else:
579 self.output.add_recommendation(
580 "Consider trying with different extraction goals or using manual actor configuration"
581 )
582 if self.current_attempts >= self.config.max_actor_attempts:
583 self.output.add_recommendation(
584 "Increase max_actor_attempts limit to test more actors"
585 )
586 if self._time_exceeded():
587 self.output.add_recommendation(
588 "Increase max_time_minutes to allow more thorough testing"
589 )
590
591 self.output.calculate_performance_summary()
592
593 self.logger.info(f"Final results: {self.output.total_attempts_made} attempts, "
594 f"{self.output.unique_actors_tested} actors tested, "
595 f"success: {self.output.success}")
596
597 def _emit_progress(self, stage: str, message: str, **kwargs):
598 """Emit a progress update."""
599 # Calculate progress based on stage and current attempts
600 progress = 0.0
601 if stage == "initializing":
602 progress = 0.1
603 elif stage == "discovery":
604 progress = 0.2
605 elif stage == "testing":
606 # Progress through testing based on attempts made
607 max_attempts = self.config.max_actor_attempts
608 progress = 0.2 + (0.6 * (self.current_attempts / max_attempts))
609 elif stage == "analyzing":
610 progress = 0.8
611 elif stage == "completed":
612 progress = 1.0
613
614 update = ProgressUpdate(
615 timestamp=datetime.now().isoformat(),
616 stage=stage,
617 message=message,
618 progress=progress,
619 **kwargs
620 )
621
622 self.logger.info(f"[{stage.upper()}] {message}")
623
624 if self.on_progress:
625 self.on_progress(update)
626
627 def set_progress_callback(self, callback: Callable[[ProgressUpdate], None]):
628 """Set progress callback for real-time updates."""
629 self.on_progress = callback
630
631
632# Actor entry point function
633async def run_llm_scraper_actor() -> LLMScraperOutput:
634 """
635 Main entry point for running as an Apify Actor.
636
637 Returns:
638 LLMScraperOutput with results
639 """
640 async with Actor:
641 Actor.log.info("🚀 LLM Scraper Actor starting...")
642
643 # Get input
644 actor_input = await Actor.get_input() or {}
645
646 # Parse input
647 try:
648 input_config = LLMScraperInput(
649 target_url=actor_input.get('targetUrl', ''),
650 extraction_goal=actor_input.get('extractionGoal', ''),
651 max_actor_attempts=actor_input.get('maxActorAttempts', 10),
652 max_retries_per_actor=actor_input.get('maxRetriesPerActor', 3),
653 max_time_minutes=actor_input.get('maxTimeMinutes', 30),
654 anthropic_api_key=actor_input.get('claudeApiKey'),
655 mcp_url=actor_input.get('mcpUrl', "https://mcp.apify.com/sse?enableAddingActors=true"),
656 model_name=actor_input.get('modelName', "claude-3-5-haiku-latest"),
657 debug_mode=actor_input.get('debugMode', False),
658 prefer_specific_actors=actor_input.get('preferSpecificActors', True),
659 min_data_quality_score=actor_input.get('minDataQualityScore', 70) / 100.0, # percentage -> decimal, matching main.py
660 enable_proxy=actor_input.get('enableProxy', True)
661 )
662 except Exception as e:
663 Actor.log.error(f"Invalid input: {str(e)}")
664 await Actor.fail(status_message=f"Invalid input: {str(e)}")
665 return LLMScraperOutput(success=False, status="failed", llm_reasoning=f"Input validation failed: {str(e)}")
666
667 # Create and run scraper
668 scraper = LLMScraperActor(input_config)
669
670 # Set up progress callback to emit to Actor log
671 def progress_callback(update: ProgressUpdate):
672 Actor.log.info(f"[{update.stage.upper()}] {update.message}")
673 # Push progress updates to dataset for real-time monitoring
674 asyncio.create_task(Actor.push_data({
675 "type": "progress",
676 "timestamp": update.timestamp,
677 "stage": update.stage,
678 "message": update.message,
679 "details": update.to_dict()
680 }))
681
682 scraper.set_progress_callback(progress_callback)
683
684 # Run the scraper
685 result = await scraper.run()
686
687 # Push final results
688 await Actor.push_data({
689 "type": "final_result",
690 **result.to_dict()
691 })
692
693 Actor.log.info(f"✅ LLM Scraper completed with status: {result.status}")
694
695 return result
696
697
698# Standalone function for use outside of Actor context
699async def run_llm_scraper(input_config: LLMScraperInput,
700 progress_callback: Optional[Callable[[ProgressUpdate], None]] = None) -> LLMScraperOutput:
701 """
702 Run LLM Scraper outside of Apify Actor context.
703
704 Args:
705 input_config: Input configuration
706 progress_callback: Optional callback for progress updates
707
708 Returns:
709 LLMScraperOutput with results
710 """
711 scraper = LLMScraperActor(input_config)
712
713 if progress_callback:
714 scraper.set_progress_callback(progress_callback)
715
716 return await scraper.run()
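Finally, a sketch of running the scraper outside the Apify platform via run_llm_scraper. The URL, goal, and callback are illustrative; APIFY_TOKEN and an Anthropic API key are expected in the environment.

import asyncio

from llmscraper.llm_scraper import LLMScraperInput, ProgressUpdate, run_llm_scraper


async def demo() -> None:
    config = LLMScraperInput(
        target_url="https://example.com/products",
        extraction_goal="Extract product names and prices from the listing page",
        max_actor_attempts=5,
        max_time_minutes=10,
    )

    def on_progress(update: ProgressUpdate) -> None:
        print(f"[{update.stage}] {update.progress:.0%} {update.message}")

    result = await run_llm_scraper(config, progress_callback=on_progress)
    print(result.status, result.best_actor_name, result.quality_score)


asyncio.run(demo())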

src/llmscraper/llm_scraper/models.py

1"""
2Input and output models for LLM Scraper Actor.
3"""
4
5from dataclasses import dataclass, field
6from typing import Dict, List, Any, Optional, Union
7from datetime import datetime
8import json
9
10
11@dataclass
12class LLMScraperInput:
13 """Input schema for LLM Scraper Actor."""
14
15 # Required fields
16 target_url: str
17 extraction_goal: str
18
19 # Optional configuration
20 max_actor_attempts: int = 10
21 max_retries_per_actor: int = 3
22 max_time_minutes: int = 30
23
24 # API keys (optional - can use environment variables)
25 anthropic_api_key: Optional[str] = None
26
27 # MCP configuration
28 mcp_url: str = "https://mcp.apify.com/sse?enableAddingActors=true"
29 model_name: str = "claude-3-5-haiku-latest"
30
31 # Advanced options
32 debug_mode: bool = False
33 prefer_specific_actors: bool = True
34 min_data_quality_score: float = 0.7
35 enable_proxy: bool = True
36
37 def validate(self) -> bool:
38 """Validate input parameters."""
39 if not self.target_url or not self.target_url.startswith(('http://', 'https://')):
40 raise ValueError("target_url must be a valid HTTP/HTTPS URL")
41
42 if not self.extraction_goal or len(self.extraction_goal.strip()) < 10:
43 raise ValueError("extraction_goal must be at least 10 characters describing what to extract")
44
45 if self.max_actor_attempts < 1 or self.max_actor_attempts > 50:
46 raise ValueError("max_actor_attempts must be between 1 and 50")
47
48 if self.max_retries_per_actor < 1 or self.max_retries_per_actor > 10:
49 raise ValueError("max_retries_per_actor must be between 1 and 10")
50
51 if self.max_time_minutes < 1 or self.max_time_minutes > 120:
52 raise ValueError("max_time_minutes must be between 1 and 120")
53
54 return True
55
56
57@dataclass
58class ActorAttempt:
59 """Represents a single actor execution attempt."""
60
61 actor_id: str
62 actor_name: str
63 attempt_number: int
64 input_config: Dict[str, Any]
65 timestamp: str
66
67 # Results
68 success: bool = False
69 execution_time_seconds: float = 0.0
70 result_count: int = 0
71 data_quality_score: float = 0.0
72 extracted_data: Optional[List[Dict[str, Any]]] = None
73
74 # Error details
75 error_message: Optional[str] = None
76 error_type: Optional[str] = None
77
78 # Metadata
79 run_id: Optional[str] = None
80 dataset_id: Optional[str] = None
81
82 def to_dict(self) -> Dict[str, Any]:
83 """Convert to dictionary for JSON serialization."""
84 return {
85 "actor_id": self.actor_id,
86 "actor_name": self.actor_name,
87 "attempt_number": self.attempt_number,
88 "input_config": self.input_config,
89 "timestamp": self.timestamp,
90 "success": self.success,
91 "execution_time_seconds": self.execution_time_seconds,
92 "result_count": self.result_count,
93 "data_quality_score": self.data_quality_score,
94 "error_message": self.error_message,
95 "error_type": self.error_type,
96 "run_id": self.run_id,
97 "dataset_id": self.dataset_id,
98 "has_data": bool(self.extracted_data)
99 }
100
101
102@dataclass
103class ScrapingStrategy:
104 """Represents a scraping strategy (actor + configuration)."""
105
106 actor_id: str
107 actor_name: str
108 priority: int # 1 = highest priority
109 input_template: Dict[str, Any]
110 expected_data_fields: List[str]
111 reasoning: str
112
113 # Success metrics
114 success_rate: float = 0.0
115 avg_quality_score: float = 0.0
116 avg_execution_time: float = 0.0
117
118 def to_dict(self) -> Dict[str, Any]:
119 """Convert to dictionary."""
120 return {
121 "actor_id": self.actor_id,
122 "actor_name": self.actor_name,
123 "priority": self.priority,
124 "input_template": self.input_template,
125 "expected_data_fields": self.expected_data_fields,
126 "reasoning": self.reasoning,
127 "success_rate": self.success_rate,
128 "avg_quality_score": self.avg_quality_score,
129 "avg_execution_time": self.avg_execution_time
130 }
131
132
133@dataclass
134class LLMScraperOutput:
135 """Output schema for LLM Scraper Actor."""
136
137 # Overall success
138 success: bool
139 status: str # "completed", "failed", "timeout", "limit_reached"
140
141 # Best result
142 best_actor_id: Optional[str] = None
143 best_actor_name: Optional[str] = None
144 optimal_input_config: Optional[Dict[str, Any]] = None
145 final_extracted_data: Optional[List[Dict[str, Any]]] = None
146
147 # Execution details
148 total_execution_time_seconds: float = 0.0
149 total_attempts_made: int = 0
150 unique_actors_tested: int = 0
151
152 # Quality metrics
153 best_data_quality_score: float = 0.0
154 best_result_count: int = 0
155
156 # Strategy and reasoning
157 llm_reasoning: str = ""
158 strategies_discovered: List[ScrapingStrategy] = field(default_factory=list)
159 all_attempts: List[ActorAttempt] = field(default_factory=list)
160
161 # Performance analysis
162 performance_summary: Dict[str, Any] = field(default_factory=dict)
163 recommendations: List[str] = field(default_factory=list)
164
165 # Metadata
166 completion_timestamp: str = ""
167 claude_model_used: str = ""
168 mcp_server_used: str = ""
169
170 def __post_init__(self):
171 """Set completion timestamp."""
172 if not self.completion_timestamp:
173 self.completion_timestamp = datetime.now().isoformat()
174
175 def add_attempt(self, attempt: ActorAttempt):
176 """Add an attempt to the results."""
177 self.all_attempts.append(attempt)
178 self.total_attempts_made = len(self.all_attempts)
179
180 # Update best result if this is better
181 if (attempt.success and
182 attempt.data_quality_score > self.best_data_quality_score):
183
184 self.best_actor_id = attempt.actor_id
185 self.best_actor_name = attempt.actor_name
186 self.optimal_input_config = attempt.input_config
187 self.final_extracted_data = attempt.extracted_data
188 self.best_data_quality_score = attempt.data_quality_score
189 self.best_result_count = attempt.result_count
190
191 # Update unique actors count
192 unique_actors = set(a.actor_id for a in self.all_attempts)
193 self.unique_actors_tested = len(unique_actors)
194
195 def calculate_performance_summary(self):
196 """Calculate performance metrics."""
197 if not self.all_attempts:
198 return
199
200 successful_attempts = [a for a in self.all_attempts if a.success]
201
202 self.performance_summary = {
203 "total_attempts": len(self.all_attempts),
204 "successful_attempts": len(successful_attempts),
205 "success_rate": len(successful_attempts) / len(self.all_attempts),
206 "avg_execution_time": sum(a.execution_time_seconds for a in self.all_attempts) / len(self.all_attempts),
207 "avg_quality_score": sum(a.data_quality_score for a in successful_attempts) / len(successful_attempts) if successful_attempts else 0,
208 "actors_tested": self.unique_actors_tested,
209 "best_actor": self.best_actor_name,
210 "total_runtime": self.total_execution_time_seconds
211 }
212
213 def add_recommendation(self, recommendation: str):
214 """Add a recommendation for future improvements."""
215 if recommendation not in self.recommendations:
216 self.recommendations.append(recommendation)
217
218 def to_dict(self) -> Dict[str, Any]:
219 """Convert to dictionary for JSON output."""
220 self.calculate_performance_summary()
221
222 return {
223 "success": self.success,
224 "status": self.status,
225 "best_actor_id": self.best_actor_id,
226 "best_actor_name": self.best_actor_name,
227 "optimal_input_config": self.optimal_input_config,
228 "final_extracted_data": self.final_extracted_data,
229 "total_execution_time_seconds": self.total_execution_time_seconds,
230 "total_attempts_made": self.total_attempts_made,
231 "unique_actors_tested": self.unique_actors_tested,
232 "best_data_quality_score": self.best_data_quality_score,
233 "best_result_count": self.best_result_count,
234 "llm_reasoning": self.llm_reasoning,
235 "strategies_discovered": [s.to_dict() for s in self.strategies_discovered],
236 "all_attempts": [a.to_dict() for a in self.all_attempts],
237 "performance_summary": self.performance_summary,
238 "recommendations": self.recommendations,
239 "completion_timestamp": self.completion_timestamp,
240 "claude_model_used": self.claude_model_used,
241 "mcp_server_used": self.mcp_server_used
242 }
243
244 def to_json(self, indent: int = 2) -> str:
245 """Convert to JSON string."""
246 return json.dumps(self.to_dict(), indent=indent, default=str)
247
248 # Convenience properties for backward compatibility
249 @property
250 def quality_score(self) -> float:
251 """Alias for best_data_quality_score."""
252 return self.best_data_quality_score
253
254 @property
255 def scraped_data(self) -> List[Dict[str, Any]]:
256 """Alias for final_extracted_data."""
257 return self.final_extracted_data or []
258
259 @property
260 def total_execution_time(self) -> float:
261 """Alias for total_execution_time_seconds."""
262 return self.total_execution_time_seconds
263
264 @property
265 def actor_attempts(self) -> List[ActorAttempt]:
266 """Alias for all_attempts."""
267 return self.all_attempts
268
269
270@dataclass
271class ProgressUpdate:
272 """Progress update for real-time status reporting."""
273
274 timestamp: str
275 stage: str # "discovery", "testing", "retrying", "analyzing", "completed"
276 message: str
277 progress: float = 0.0 # completion fraction from 0.0 to 1.0
278 actor_name: Optional[str] = None
279 attempt_number: Optional[int] = None
280 success: Optional[bool] = None
281 details: Optional[Dict[str, Any]] = None
282
283 def __post_init__(self):
284 """Set timestamp if not provided."""
285 if not self.timestamp:
286 self.timestamp = datetime.now().isoformat()
287
288 def to_dict(self) -> Dict[str, Any]:
289 """Convert to dictionary."""
290 return {
291 "timestamp": self.timestamp,
292 "stage": self.stage,
293 "message": self.message,
294 "progress": self.progress,
295 "actor_name": self.actor_name,
296 "attempt_number": self.attempt_number,
297 "success": self.success,
298 "details": self.details or {}
299 }
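
Usage sketch (not part of the module): taken together, these dataclasses form the actor's reporting pipeline: each run becomes an ActorAttempt, add_attempt() promotes the best-scoring success, and to_dict()/to_json() produce the final report. A minimal, hypothetical example; field names are taken from the definitions above, and keyword construction avoids assuming field order:

from datetime import datetime

output = LLMScraperOutput(success=True, status="completed")
output.add_attempt(ActorAttempt(
    actor_id="apify/cheerio-scraper",  # illustrative actor
    actor_name="Cheerio Scraper",
    attempt_number=1,
    input_config={"startUrls": [{"url": "https://example.com"}]},
    timestamp=datetime.now().isoformat(),
    success=True,
    result_count=12,
    data_quality_score=0.82,
))
print(output.best_actor_name)  # "Cheerio Scraper"
print(output.to_json())        # full serialized report, including the performance summary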

src/llmscraper/llm_scraper/quality_evaluator.py

1"""
2Data quality evaluator for assessing scraping results.
3"""
4
5import logging
6from typing import Dict, List, Any, Optional, Tuple
7from dataclasses import dataclass
8import re
9import json
10
11
12@dataclass
13class QualityMetrics:
14 """Quality metrics for scraped data."""
15
16 completeness_score: float # 0-1, how complete the data is
17 relevance_score: float # 0-1, how relevant to extraction goal
18 structure_score: float # 0-1, how well-structured the data is
19 volume_score: float # 0-1, appropriate amount of data
20 overall_score: float # 0-1, weighted overall score
21
22 # Detailed metrics
23 total_items: int = 0
24 non_empty_fields: int = 0
25 total_fields: int = 0
26 unique_items: int = 0
27
28 # Quality indicators
29 has_required_fields: bool = False
30 has_duplicates: bool = False
31 has_errors: bool = False
32
33 feedback: Optional[List[str]] = None
34
35 def __post_init__(self):
36 if self.feedback is None:
37 self.feedback = []
38
39
40class DataQualityEvaluator:
41 """Evaluates the quality of scraped data."""
42
43 def __init__(self):
44 """Initialize the quality evaluator."""
45 self.logger = logging.getLogger(__name__)
46
47 # Common field names that indicate good data
48 self.valuable_fields = {
49 'title', 'name', 'price', 'cost', 'amount', 'value',
50 'description', 'text', 'content', 'body', 'summary',
51 'url', 'link', 'href', 'address', 'location',
52 'date', 'time', 'timestamp', 'created', 'updated',
53 'rating', 'score', 'review', 'feedback',
54 'category', 'type', 'tag', 'label', 'status',
55 'id', 'identifier', 'key', 'reference',
56 'email', 'phone', 'contact', 'author', 'user'
57 }
58
59 def evaluate_data_quality(self, data: List[Dict[str, Any]],
60 extraction_goal: str,
61 target_url: str) -> QualityMetrics:
62 """
63 Evaluate the quality of scraped data.
64
65 Args:
66 data: List of scraped data items
67 extraction_goal: Original extraction goal
68 target_url: Target URL that was scraped
69
70 Returns:
71 QualityMetrics with detailed quality assessment
72 """
73 if not data or not isinstance(data, list):
74 return QualityMetrics(
75 completeness_score=0.0,
76 relevance_score=0.0,
77 structure_score=0.0,
78 volume_score=0.0,
79 overall_score=0.0,
80 feedback=["No data found or data is not in expected list format"]
81 )
82
83 # Calculate individual metrics
84 completeness = self._calculate_completeness(data)
85 relevance = self._calculate_relevance(data, extraction_goal)
86 structure = self._calculate_structure_quality(data)
87 volume = self._calculate_volume_quality(data, extraction_goal)
88
89 # Calculate overall score (weighted average)
90 overall = (
91 completeness * 0.3 + # 30% weight on completeness
92 relevance * 0.35 + # 35% weight on relevance
93 structure * 0.2 + # 20% weight on structure
94 volume * 0.15 # 15% weight on volume
95 )
96
97 # Detailed analysis
98 total_items = len(data)
99 unique_items = len(set(json.dumps(item, sort_keys=True, default=str) for item in data))
100 has_duplicates = unique_items < total_items
101
102 # Count fields
103 all_fields = set()
104 non_empty_count = 0
105 total_field_count = 0
106
107 for item in data:
108 if isinstance(item, dict):
109 all_fields.update(item.keys())
110 for key, value in item.items():
111 total_field_count += 1
112 if value and str(value).strip():
113 non_empty_count += 1
114
115 # Check for required fields based on extraction goal
116 has_required_fields = self._check_required_fields(data, extraction_goal)
117
118 # Check for errors in data
119 has_errors = self._check_for_errors(data)
120
121 # Generate feedback
122 feedback = self._generate_feedback(data, extraction_goal, completeness, relevance, structure, volume)
123
124 return QualityMetrics(
125 completeness_score=completeness,
126 relevance_score=relevance,
127 structure_score=structure,
128 volume_score=volume,
129 overall_score=overall,
130 total_items=total_items,
131 non_empty_fields=non_empty_count,
132 total_fields=total_field_count,
133 unique_items=unique_items,
134 has_required_fields=has_required_fields,
135 has_duplicates=has_duplicates,
136 has_errors=has_errors,
137 feedback=feedback
138 )
139
140 def _calculate_completeness(self, data: List[Dict[str, Any]]) -> float:
141 """Calculate how complete the data is (non-empty fields)."""
142 if not data:
143 return 0.0
144
145 total_fields = 0
146 filled_fields = 0
147
148 for item in data:
149 if isinstance(item, dict):
150 for key, value in item.items():
151 total_fields += 1
152 if value and str(value).strip() and str(value).strip() not in ['', 'null', 'None', 'undefined']:
153 filled_fields += 1
154
155 return filled_fields / total_fields if total_fields > 0 else 0.0
156
157 def _calculate_relevance(self, data: List[Dict[str, Any]], extraction_goal: str) -> float:
158 """Calculate how relevant the data is to the extraction goal."""
159 if not data or not extraction_goal:
160 return 0.0
161
162 goal_keywords = set(re.findall(r'\w+', extraction_goal.lower()))
163
164 # Remove common stop words
165 stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'among', 'under', 'over', 'within', 'without', 'across', 'around', 'near', 'beyond', 'behind', 'except', 'until', 'since', 'while', 'although', 'because', 'if', 'when', 'where', 'how', 'what', 'who', 'which', 'why', 'extract', 'get', 'find', 'scrape', 'data', 'information', 'from', 'website', 'page'}
166 goal_keywords -= stop_words
167
168 if not goal_keywords:
169 return 0.5 # Neutral score if no meaningful keywords
170
171 relevance_scores = []
172
173 for item in data:
174 if isinstance(item, dict):
175 item_text = ' '.join(str(v).lower() for v in item.values() if v)
176 item_keywords = set(re.findall(r'\w+', item_text))
177
178 # Calculate keyword overlap
179 overlap = len(goal_keywords & item_keywords)
180 score = overlap / len(goal_keywords)
181 relevance_scores.append(score)
182
183 return sum(relevance_scores) / len(relevance_scores) if relevance_scores else 0.0
184
185 def _calculate_structure_quality(self, data: List[Dict[str, Any]]) -> float:
186 """Calculate how well-structured the data is."""
187 if not data:
188 return 0.0
189
190 structure_scores = []
191
192 # Check consistency of fields across items
193 all_fields = set()
194 for item in data:
195 if isinstance(item, dict):
196 all_fields.update(item.keys())
197
198 if not all_fields:
199 return 0.0
200
201 for item in data:
202 if isinstance(item, dict):
203 # Score based on:
204 # 1. Field consistency (has common fields)
205 # 2. Field name quality (meaningful names)
206 # 3. Data type consistency
207
208 field_score = len(item.keys()) / len(all_fields) if all_fields else 0
209
210 # Check for meaningful field names
211 meaningful_fields = 0
212 for field in item.keys():
213 field_lower = field.lower()
214 if any(valuable in field_lower for valuable in self.valuable_fields):
215 meaningful_fields += 1
216
217 meaning_score = meaningful_fields / len(item.keys()) if item.keys() else 0
218
219 # Combined structure score for this item
220 item_score = (field_score + meaning_score) / 2
221 structure_scores.append(item_score)
222
223 return sum(structure_scores) / len(structure_scores) if structure_scores else 0.0
224
225 def _calculate_volume_quality(self, data: List[Dict[str, Any]], extraction_goal: str) -> float:
226 """Calculate if the volume of data is appropriate."""
227 if not data:
228 return 0.0
229
230 data_count = len(data)
231
232 # Determine expected volume based on extraction goal
233 goal_lower = extraction_goal.lower()
234
235 if any(word in goal_lower for word in ['all', 'every', 'each', 'list']):
236 # Expecting larger dataset
237 if data_count >= 10:
238 return 1.0
239 elif data_count >= 5:
240 return 0.8
241 elif data_count >= 2:
242 return 0.6
243 else:
244 return 0.3
245 elif any(word in goal_lower for word in ['first', 'top', 'main', 'primary']):
246 # Expecting smaller, focused dataset
247 if 1 <= data_count <= 5:
248 return 1.0
249 elif data_count <= 10:
250 return 0.8
251 else:
252 return 0.6
253 else:
254 # General expectation
255 if 3 <= data_count <= 20:
256 return 1.0
257 elif 1 <= data_count <= 30:
258 return 0.8
259 elif data_count > 30:
260 return 0.7
261 else:
262 return 0.4
263
264 def _check_required_fields(self, data: List[Dict[str, Any]], extraction_goal: str) -> bool:
265 """Check if data contains fields that seem required based on the goal."""
266 if not data:
267 return False
268
269 goal_lower = extraction_goal.lower()
270 required_patterns = []
271
272 # Map goal keywords to expected fields
273 if any(word in goal_lower for word in ['price', 'cost', 'amount']):
274 required_patterns.extend(['price', 'cost', 'amount', 'value', '$'])
275 if any(word in goal_lower for word in ['title', 'name', 'product']):
276 required_patterns.extend(['title', 'name', 'product'])
277 if any(word in goal_lower for word in ['description', 'text', 'content']):
278 required_patterns.extend(['description', 'text', 'content', 'body'])
279 if any(word in goal_lower for word in ['rating', 'review', 'score']):
280 required_patterns.extend(['rating', 'review', 'score', 'star'])
281 if any(word in goal_lower for word in ['url', 'link']):
282 required_patterns.extend(['url', 'link', 'href'])
283
284 if not required_patterns:
285 return True # No specific requirements identified
286
287 # Check if any item has fields matching the patterns
288 for item in data:
289 if isinstance(item, dict):
290 item_fields = ' '.join(item.keys()).lower()
291 item_values = ' '.join(str(v) for v in item.values()).lower()
292
293 for pattern in required_patterns:
294 if pattern in item_fields or pattern in item_values:
295 return True
296
297 return False
298
299 def _check_for_errors(self, data: List[Dict[str, Any]]) -> bool:
300 """Check if data contains obvious errors."""
301 error_indicators = [
302 'error', 'exception', 'failed', 'null', 'undefined',
303 'not found', '404', '500', 'blocked', 'denied'
304 ]
305
306 for item in data:
307 if isinstance(item, dict):
308 item_text = ' '.join(str(v).lower() for v in item.values() if v)
309 if any(error in item_text for error in error_indicators):
310 return True
311
312 return False
313
314 def _generate_feedback(self, data: List[Dict[str, Any]], extraction_goal: str,
315 completeness: float, relevance: float,
316 structure: float, volume: float) -> List[str]:
317 """Generate human-readable feedback about data quality."""
318 feedback = []
319
320 # Overall assessment
321 overall = (completeness * 0.3 + relevance * 0.35 + structure * 0.2 + volume * 0.15)
322
323 if overall >= 0.8:
324 feedback.append("Excellent data quality - this actor produced high-quality results")
325 elif overall >= 0.6:
326 feedback.append("Good data quality - results are usable with minor issues")
327 elif overall >= 0.4:
328 feedback.append("Fair data quality - results have some issues but may be usable")
329 else:
330 feedback.append("Poor data quality - consider trying a different actor or configuration")
331
332 # Specific feedback
333 if completeness < 0.5:
334 feedback.append("Data has many empty fields - try adjusting extraction settings")
335
336 if relevance < 0.5:
337 feedback.append("Data doesn't closely match extraction goal - consider more specific actor or different parameters")
338
339 if structure < 0.5:
340 feedback.append("Data structure is inconsistent - try a different actor or modify extraction logic")
341
342 if volume < 0.5:
343 feedback.append("Data volume is suboptimal - consider adjusting maxResults or crawl settings")
344
345 # Positive feedback
346 if completeness > 0.8:
347 feedback.append("Data fields are well-populated")
348
349 if relevance > 0.8:
350 feedback.append("Data is highly relevant to extraction goal")
351
352 if structure > 0.8:
353 feedback.append("Data has consistent, well-structured format")
354
355 return feedback
356
357 def compare_results(self, results1: QualityMetrics, results2: QualityMetrics) -> str:
358 """Compare two quality results and recommend the better one."""
359 if results1.overall_score > results2.overall_score:
360 winner = "first"
361 score_diff = results1.overall_score - results2.overall_score
362 elif results2.overall_score > results1.overall_score:
363 winner = "second"
364 score_diff = results2.overall_score - results1.overall_score
365 else:
366 return "Both results have equal overall quality scores"
367
368 comparison = f"The {winner} result is better with {score_diff:.2f} higher overall score"
369
370 # Add specific reasons
371 reasons = []
372 if abs(results1.relevance_score - results2.relevance_score) > 0.1:
373 if results1.relevance_score > results2.relevance_score:
374 reasons.append("first result is more relevant to extraction goal")
375 else:
376 reasons.append("second result is more relevant to extraction goal")
377
378 if abs(results1.completeness_score - results2.completeness_score) > 0.1:
379 if results1.completeness_score > results2.completeness_score:
380 reasons.append("first result has more complete data")
381 else:
382 reasons.append("second result has more complete data")
383
384 if reasons:
385 comparison += f" because {' and '.join(reasons)}"
386
387 return comparison
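
Usage sketch (not part of the module): the overall score is the weighted average defined in evaluate_data_quality (0.30 completeness + 0.35 relevance + 0.20 structure + 0.15 volume), so perfectly complete but entirely irrelevant data caps out at 0.65. A minimal, hypothetical run with illustrative items:

evaluator = DataQualityEvaluator()
items = [
    {"title": "Acme Widget", "price": "$19.99", "url": "https://example.com/w1"},
    {"title": "Acme Gadget", "price": "$24.99", "url": "https://example.com/w2"},
]
metrics = evaluator.evaluate_data_quality(
    data=items,
    extraction_goal="extract product titles and prices",
    target_url="https://example.com",
)
print(f"overall={metrics.overall_score:.2f}")
for note in metrics.feedback:
    print("-", note)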

src/llmscraper/llm_scraper/retry_logic.py

1"""
2Retry logic for intelligent actor execution with parameter adjustment.
3"""
4
5import asyncio
6import logging
7from typing import Dict, List, Any, Optional, Tuple
8from dataclasses import dataclass
9from datetime import datetime
10import copy
11
12
13@dataclass
14class RetryAttempt:
15 """Represents a retry attempt with adjusted parameters."""
16 attempt_number: int
17 adjusted_input: Dict[str, Any]
18 reasoning: str
19 timestamp: str
20
21
22class RetryManager:
23 """Manages intelligent retry logic with parameter adjustments."""
24
25 def __init__(self, max_retries_per_actor: int = 3):
26 """Initialize retry manager."""
27 self.max_retries_per_actor = max_retries_per_actor
28 self.logger = logging.getLogger(__name__)
29
30 # Track retry history
31 self.retry_history: Dict[str, List[RetryAttempt]] = {}
32
33 def should_retry(self, actor_id: str, error_message: str) -> bool:
34 """
35 Determine if an actor should be retried based on error and history.
36
37 Args:
38 actor_id: ID of the actor that failed
39 error_message: Error message from the failed attempt
40
41 Returns:
42 True if should retry, False otherwise
43 """
44 current_attempts = len(self.retry_history.get(actor_id, []))
45
46 if current_attempts >= self.max_retries_per_actor:
47 self.logger.debug(f"Max retries reached for {actor_id}")
48 return False
49
50 # Check if error is retryable
51 retryable_errors = [
52 "timeout", "rate limit", "proxy", "network", "temporary",
53 "service unavailable", "too many requests", "blocked"
54 ]
55
56 error_lower = error_message.lower()
57 is_retryable = any(err in error_lower for err in retryable_errors)
58
59 if not is_retryable:
60 self.logger.debug(f"Error not retryable for {actor_id}: {error_message}")
61 return False
62
63 self.logger.info(f"Will retry {actor_id} (attempt {current_attempts + 1}/{self.max_retries_per_actor})")
64 return True
65
66 def adjust_input_for_retry(self, actor_id: str, base_input: Dict[str, Any],
67 error_message: str, attempt_number: int) -> Tuple[Dict[str, Any], str]:
68 """
69 Adjust input parameters for retry based on the error and attempt number.
70
71 Args:
72 actor_id: ID of the actor
73 base_input: Original input configuration
74 error_message: Error message from failed attempt
75 attempt_number: Current attempt number (1-based)
76
77 Returns:
78 Tuple of (adjusted_input, reasoning)
79 """
80 adjusted_input = copy.deepcopy(base_input)
81 adjustments = []
82
83 error_lower = error_message.lower()
84
85 # Attempt-based adjustments
86 if attempt_number == 1:
87 # First retry: Basic adjustments
88 if "timeout" in error_lower or "slow" in error_lower:
89 if "requestTimeoutSecs" in adjusted_input:
90 adjusted_input["requestTimeoutSecs"] = adjusted_input.get("requestTimeoutSecs", 30) * 2
91 adjustments.append("increased request timeout")
92
93 if "maxRequestRetries" in adjusted_input:
94 adjusted_input["maxRequestRetries"] = min(adjusted_input.get("maxRequestRetries", 3) + 2, 10)
95 adjustments.append("increased retry attempts")
96
97 if "rate limit" in error_lower or "blocked" in error_lower:
98 # Enable proxy if not already enabled
99 if "proxyConfiguration" not in adjusted_input:
100 adjusted_input["proxyConfiguration"] = {"useApifyProxy": True}
101 adjustments.append("enabled proxy")
102 elif not adjusted_input["proxyConfiguration"].get("useApifyProxy"):
103 adjusted_input["proxyConfiguration"]["useApifyProxy"] = True
104 adjustments.append("enabled Apify proxy")
105
106 # Reduce load if memory/resource issues
107 if "memory" in error_lower or "resource" in error_lower:
108 if "maxResults" in adjusted_input:
109 adjusted_input["maxResults"] = max(adjusted_input.get("maxResults", 10) // 2, 1)
110 adjustments.append("reduced max results")
111
112 if "maxPagesPerCrawl" in adjusted_input:
113 adjusted_input["maxPagesPerCrawl"] = max(adjusted_input.get("maxPagesPerCrawl", 10) // 2, 1)
114 adjustments.append("reduced pages per crawl")
115
116 elif attempt_number == 2:
117 # Second retry: More aggressive adjustments
118 if "proxyConfiguration" in adjusted_input:
119 proxy_config = adjusted_input["proxyConfiguration"]
120 if "apifyProxyGroups" not in proxy_config:
121 proxy_config["apifyProxyGroups"] = ["RESIDENTIAL"]
122 adjustments.append("switched to residential proxies")
123 elif "RESIDENTIAL" not in proxy_config.get("apifyProxyGroups", []):
124 proxy_config["apifyProxyGroups"] = ["RESIDENTIAL"]
125 adjustments.append("switched to residential proxies")
126
127 # Reduce concurrent operations
128 if "maxConcurrency" in adjusted_input:
129 adjusted_input["maxConcurrency"] = max(adjusted_input.get("maxConcurrency", 5) // 2, 1)
130 adjustments.append("reduced concurrency")
131
132 # Increase wait times
133 if "dynamicContentWaitSecs" in adjusted_input:
134 adjusted_input["dynamicContentWaitSecs"] = min(adjusted_input.get("dynamicContentWaitSecs", 10) * 2, 60)
135 adjustments.append("increased wait time for dynamic content")
136
137 elif attempt_number >= 3:
138 # Final retry: Conservative settings
139 adjustments.append("using conservative settings")
140
141 # Minimal resource usage
142 if "maxResults" in adjusted_input:
143 adjusted_input["maxResults"] = min(adjusted_input.get("maxResults", 10), 5)
144 if "maxPagesPerCrawl" in adjusted_input:
145 adjusted_input["maxPagesPerCrawl"] = 1
146 if "maxConcurrency" in adjusted_input:
147 adjusted_input["maxConcurrency"] = 1
148
149 # Maximum timeouts and retries
150 if "requestTimeoutSecs" in adjusted_input:
151 adjusted_input["requestTimeoutSecs"] = 120
152 if "maxRequestRetries" in adjusted_input:
153 adjusted_input["maxRequestRetries"] = 10
154
155 # Force proxy usage
156 adjusted_input["proxyConfiguration"] = {
157 "useApifyProxy": True,
158 "apifyProxyGroups": ["RESIDENTIAL"]
159 }
160
161 # Actor-specific adjustments
162 if "web-scraper" in actor_id:
163 self._adjust_web_scraper_input(adjusted_input, error_lower, adjustments)
164 elif "cheerio-scraper" in actor_id:
165 self._adjust_cheerio_scraper_input(adjusted_input, error_lower, adjustments)
166 elif "website-content-crawler" in actor_id:
167 self._adjust_website_crawler_input(adjusted_input, error_lower, adjustments)
168
169 # Record this retry attempt
170 retry_attempt = RetryAttempt(
171 attempt_number=attempt_number,
172 adjusted_input=adjusted_input,
173 reasoning=f"Retry {attempt_number}: {', '.join(adjustments) if adjustments else 'no specific adjustments'}",
174 timestamp=datetime.now().isoformat()
175 )
176
177 if actor_id not in self.retry_history:
178 self.retry_history[actor_id] = []
179 self.retry_history[actor_id].append(retry_attempt)
180
181 return adjusted_input, retry_attempt.reasoning
182
183 def _adjust_web_scraper_input(self, input_config: Dict[str, Any], error_lower: str, adjustments: List[str]):
184 """Apply web-scraper specific adjustments."""
185 if "javascript" in error_lower or "js" in error_lower:
186 # Disable JavaScript if causing issues
187 if "useChrome" in input_config:
188 input_config["useChrome"] = False
189 adjustments.append("disabled Chrome/JavaScript")
190
191 if "page function" in error_lower:
192 # Simplify page function if it exists
193 if "pageFunction" in input_config:
194 # Use a minimal page function
195 input_config["pageFunction"] = """
196 async function pageFunction(context) {
197 const { request, log, $ } = context;
198 return {
199 url: request.url,
200 title: $('title').text(),
201 html: $('body').html()
202 };
203 }
204 """
205 adjustments.append("simplified page function")
206
207 def _adjust_cheerio_scraper_input(self, input_config: Dict[str, Any], error_lower: str, adjustments: List[str]):
208 """Apply cheerio-scraper specific adjustments."""
209 if "selector" in error_lower or "parse" in error_lower:
210 # Simplify selectors if parsing issues
211 if "pageFunction" in input_config:
212 input_config["pageFunction"] = """
213 async function pageFunction(context) {
214 const { request, $ } = context;
215 return {
216 url: request.url,
217 title: $('title').text(),
218 content: $('body').text()
219 };
220 }
221 """
222 adjustments.append("simplified cheerio selectors")
223
224 def _adjust_website_crawler_input(self, input_config: Dict[str, Any], error_lower: str, adjustments: List[str]):
225 """Apply website-content-crawler specific adjustments."""
226 if "content" in error_lower or "extraction" in error_lower:
227 # Adjust content extraction settings
228 if "htmlTransformer" in input_config:
229 input_config["htmlTransformer"] = "readableText"
230 adjustments.append("switched to readable text extraction")
231
232 if "readableTextCharThreshold" in input_config:
233 input_config["readableTextCharThreshold"] = 50 # Lower threshold
234 adjustments.append("lowered text threshold")
235
236 if "crawler" in error_lower or "navigation" in error_lower:
237 # Simplify crawler settings
238 if "maxCrawlDepth" in input_config:
239 input_config["maxCrawlDepth"] = 0 # Only crawl start page
240 adjustments.append("limited to single page")
241
242 if "crawlerType" in input_config:
243 input_config["crawlerType"] = "cheerio" # Use simpler crawler
244 adjustments.append("switched to Cheerio crawler")
245
246 def get_retry_history(self, actor_id: str) -> List[RetryAttempt]:
247 """Get retry history for an actor."""
248 return self.retry_history.get(actor_id, [])
249
250 def reset_history(self, actor_id: Optional[str] = None):
251 """Reset retry history for a specific actor or all actors."""
252 if actor_id:
253 self.retry_history.pop(actor_id, None)
254 else:
255 self.retry_history.clear()
256
257 def get_total_retries(self) -> int:
258 """Get total number of retries across all actors."""
259 return sum(len(attempts) for attempts in self.retry_history.values())
260
261 def calculate_retry_success_rate(self) -> float:
262 """Calculate overall retry success rate (placeholder for future implementation)."""
263 # This would be implemented based on actual success tracking
264 total_retries = self.get_total_retries()
265 if total_retries == 0:
266 return 0.0
267
268 # Placeholder calculation
269 return 0.6 # 60% success rate on retries
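
Usage sketch (not part of the module): the manager escalates through the three attempt tiers above, so a rate-limit error would enable the Apify proxy on the first retry and switch to residential proxies on the second. A minimal, hypothetical loop with an illustrative error message:

manager = RetryManager(max_retries_per_actor=3)
actor_id = "apify/cheerio-scraper"
adjusted = {"startUrls": [{"url": "https://example.com"}], "requestTimeoutSecs": 30}
error = "Request failed: rate limit exceeded"

attempt = 1
while manager.should_retry(actor_id, error):
    adjusted, reasoning = manager.adjust_input_for_retry(actor_id, adjusted, error, attempt)
    print(f"retry {attempt}: {reasoning}")
    # ... re-run the actor with `adjusted` here and break on success ...
    attempt += 1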

src/llmscraper/mcp/__init__.py

1"""
2MCP (Model Context Protocol) package for Apify integration.
3"""
4
5from .client import MCPClient, MCPTool, MCPCallResult
6
7__all__ = ['MCPClient', 'MCPTool', 'MCPCallResult']

src/llmscraper/mcp/client.py

1"""
2MCP (Model Context Protocol) Client for Apify integration.
3"""
4
5import asyncio
6import httpx
7import logging
8from typing import Dict, List, Any, Optional, Union
9from dataclasses import dataclass
10import json
11
12
13@dataclass
14class MCPTool:
15 """Represents an MCP tool."""
16 name: str
17 description: str
18 input_schema: Dict[str, Any]
19
20
21@dataclass
22class MCPCallResult:
23 """Result from an MCP tool call."""
24 content: Union[str, List[Dict[str, Any]]]
25 is_error: bool = False
26 error_message: Optional[str] = None
27
28
29class MCPClient:
30 """Client for connecting to Apify MCP Server."""
31
32 def __init__(self, server_url: str, apify_token: str, timeout: int = 300):
33 """
34 Initialize MCP client.
35
36 Args:
37 server_url: URL of the MCP server (e.g., https://mcp.apify.com)
38 apify_token: Apify API token for authentication
39 timeout: Default timeout for requests
40 """
41 # Fallback mode is used for now because the MCP server endpoints are not yet finalized
42 self.server_url = server_url.rstrip('/')
43 self.apify_token = apify_token
44 self.timeout = timeout
45 self.available_tools: List[MCPTool] = []
46 self.logger = logging.getLogger(__name__)
47 self.use_fallback_only = True # Force fallback mode until MCP endpoints are clarified
48
49 self.headers = {
50 'Authorization': f'Bearer {apify_token}',
51 'Content-Type': 'application/json',
52 'User-Agent': 'LLMScraper/1.0.0'
53 }
54
55 async def connect(self) -> bool:
56 """
57 Connect to MCP server and fetch available tools.
58
59 Returns:
60 True if connection successful, False otherwise
61 """
62 try:
63 self.logger.info("Initializing Apify client in fallback mode")
64
65 # For now, skip MCP server connection and use direct Apify API
66 if self.use_fallback_only:
67 await self._setup_fallback_tools()
68 self.logger.info(f"Connected in fallback mode. Available tools: {len(self.available_tools)}")
69 return True
70
71 # Original MCP connection logic (currently disabled)
72 self.logger.info(f"Connecting to MCP server: {self.server_url}")
73 await self._list_tools()
74 self.logger.info(f"Successfully connected to MCP server. Available tools: {len(self.available_tools)}")
75 return True
76
77 except Exception as e:
78 self.logger.error(f"Failed to connect to MCP server: {str(e)}")
79 return False
80
81 async def _list_tools(self) -> List[MCPTool]:
82 """Fetch available tools from MCP server using proper MCP protocol."""
83 try:
84 async with httpx.AsyncClient(timeout=30) as client:
85 # Use proper MCP protocol format
86 payload = {
87 "jsonrpc": "2.0",
88 "id": 1,
89 "method": "tools/list",
90 "params": {}
91 }
92
93 # Try the streamable endpoint first, then SSE fallback
94 endpoints_to_try = [
95 self.server_url, # https://mcp.apify.com
96 f"{self.server_url}/sse" # https://mcp.apify.com/sse
97 ]
98
99 for endpoint in endpoints_to_try:
100 try:
101 self.logger.debug(f"Trying MCP endpoint: {endpoint}")
102 response = await client.post(
103 endpoint,
104 headers=self.headers,
105 json=payload
106 )
107
108 if response.status_code == 200:
109 data = response.json()
110
111 # Handle MCP protocol response
112 if "result" in data and "tools" in data["result"]:
113 tools = data["result"]["tools"]
114
115 self.available_tools = [
116 MCPTool(
117 name=tool['name'],
118 description=tool.get('description', ''),
119 input_schema=tool.get('inputSchema', {})
120 )
121 for tool in tools
122 ]
123
124 self.logger.info(f"Found {len(self.available_tools)} tools from MCP server")
125 return self.available_tools
126
127 # Handle error response
128 elif "error" in data:
129 error = data["error"]
130 self.logger.warning(f"MCP error listing tools: {error.get('message', 'Unknown error')}")
131 continue
132
133 except httpx.HTTPStatusError as e:
134 self.logger.debug(f"Endpoint {endpoint} returned {e.response.status_code}")
135 continue
136 except Exception as e:
137 self.logger.debug(f"Endpoint {endpoint} failed: {str(e)}")
138 continue
139
140 # If MCP endpoints fail, fallback to direct Apify actor search
141 self.logger.warning("MCP endpoints failed, using fallback actor discovery")
142 await self._setup_fallback_tools()
143 return self.available_tools
144
145 except Exception as e:
146 self.logger.error(f"Error listing tools: {str(e)}")
147 # Setup minimal fallback tools for basic functionality
148 await self._setup_fallback_tools()
149 return self.available_tools
150
151 async def _setup_fallback_tools(self):
152 """Setup fallback tools when MCP server is not available."""
153 self.logger.info("Setting up fallback tools for direct Apify API usage")
154
155 # Basic tools that we can simulate without MCP server
156 fallback_tools = [
157 MCPTool(
158 name="search-actors",
159 description="Search for Apify actors",
160 input_schema={
161 "type": "object",
162 "properties": {
163 "search": {"type": "string", "description": "Search query"},
164 "limit": {"type": "integer", "description": "Max results", "default": 10}
165 },
166 "required": ["search"]
167 }
168 ),
169 MCPTool(
170 name="get-actor-details",
171 description="Get details about a specific actor",
172 input_schema={
173 "type": "object",
174 "properties": {
175 "actor": {"type": "string", "description": "Actor ID or name"}
176 },
177 "required": ["actor"]
178 }
179 ),
180 MCPTool(
181 name="run-actor",
182 description="Run an Apify actor directly",
183 input_schema={
184 "type": "object",
185 "properties": {
186 "actor_id": {"type": "string", "description": "Actor ID"},
187 "input": {"type": "object", "description": "Actor input"}
188 },
189 "required": ["actor_id", "input"]
190 }
191 )
192 ]
193
194 self.available_tools = fallback_tools
195
196 async def call_tool(self, tool_name: str, arguments: Dict[str, Any], timeout: Optional[int] = None) -> MCPCallResult:
197 """
198 Call an MCP tool.
199
200 Args:
201 tool_name: Name of the tool to call
202 arguments: Arguments to pass to the tool
203 timeout: Optional timeout override
204
205 Returns:
206 MCPCallResult with the response
207 """
208 call_timeout = timeout or self.timeout
209
210 try:
211 self.logger.info(f"Calling tool: {tool_name} with args: {arguments}")
212
213 # Check if we need to use fallback mode
214 if not any(tool.name == tool_name for tool in self.available_tools):
215 return MCPCallResult(
216 content=f"Tool {tool_name} not available",
217 is_error=True,
218 error_message=f"Tool {tool_name} not found in available tools"
219 )
220
221 # Use fallback mode by default for now
222 if self.use_fallback_only or tool_name in ["search-actors", "get-actor-details", "run-actor"]:
223 return await self._fallback_tool_call(tool_name, arguments)
224
225 # Try MCP call (currently disabled)
226 result = await self._try_mcp_call(tool_name, arguments, call_timeout)
227 if result and not result.is_error:
228 return result
229
230 # Fallback to direct API calls
231 return await self._fallback_tool_call(tool_name, arguments)
232
233 except Exception as e:
234 self.logger.error(f"Error calling tool {tool_name}: {str(e)}")
235 return MCPCallResult(
236 content=f"Tool execution error: {str(e)}",
237 is_error=True,
238 error_message=str(e)
239 )
240
241 async def _try_mcp_call(self, tool_name: str, arguments: Dict[str, Any], timeout: int) -> Optional[MCPCallResult]:
242 """Try to make an MCP tool call using proper MCP protocol."""
243 try:
244 async with httpx.AsyncClient(timeout=timeout) as client:
245 # Use proper MCP protocol format
246 payload = {
247 "jsonrpc": "2.0",
248 "id": 1,
249 "method": "tools/call",
250 "params": {
251 "name": tool_name,
252 "arguments": arguments
253 }
254 }
255
256 # Try the streamable endpoint first, then SSE fallback
257 endpoints_to_try = [
258 self.server_url, # https://mcp.apify.com
259 f"{self.server_url}/sse" # https://mcp.apify.com/sse
260 ]
261
262 for endpoint in endpoints_to_try:
263 try:
264 self.logger.debug(f"Trying MCP endpoint: {endpoint}")
265
266 response = await client.post(
267 endpoint,
268 headers=self.headers,
269 json=payload
270 )
271
272 if response.status_code == 200:
273 data = response.json()
274
275 # Handle MCP protocol response
276 if "result" in data:
277 result = data["result"]
278 content = result.get("content", [])
279
280 # Extract text content from MCP response
281 if isinstance(content, list) and content:
282 text_content = ""
283 for item in content:
284 if isinstance(item, dict) and item.get("type") == "text":
285 text_content += item.get("text", "")
286
287 return MCPCallResult(
288 content=text_content if text_content else content,
289 is_error=False
290 )
291 else:
292 return MCPCallResult(
293 content=content,
294 is_error=False
295 )
296
297 # Handle error response
298 elif "error" in data:
299 error = data["error"]
300 return MCPCallResult(
301 content=f"MCP error: {error.get('message', 'Unknown error')}",
302 is_error=True,
303 error_message=error.get('message', 'Unknown error')
304 )
305
306 # Fallback for non-standard response
307 return MCPCallResult(
308 content=data,
309 is_error=False
310 )
311
312 except Exception as e:
313 self.logger.debug(f"MCP endpoint {endpoint} failed: {str(e)}")
314 continue
315
316 return None
317
318 except Exception as e:
319 self.logger.debug(f"MCP call attempt failed: {str(e)}")
320 return None
321
322 async def _fallback_tool_call(self, tool_name: str, arguments: Dict[str, Any]) -> MCPCallResult:
323 """Handle tool calls using direct Apify API when MCP is unavailable."""
324 from apify_client import ApifyClient
325
326 try:
327 client = ApifyClient(self.apify_token)
328
329 if tool_name == "search-actors":
330 search_query = arguments.get("search", "")
331 limit = arguments.get("limit", 10)
332
333 try:
334 # Use Apify Store API to search for actors
335 async with httpx.AsyncClient() as http_client:
336 store_url = "https://api.apify.com/v2/store"
337 params = {
338 "limit": limit,
339 "search": search_query
340 }
341 headers = {"Authorization": f"Bearer {self.apify_token}"}
342
343 response = await http_client.get(store_url, params=params, headers=headers)
344
345 if response.status_code == 200:
346 data = response.json()
347 actors = data.get("data", {}).get("items", [])
348
349 # Format results similar to MCP response
350 result_content = [
351 {
352 "name": actor.get("name", ""),
353 "title": actor.get("title", ""),
354 "description": actor.get("description", ""),
355 "username": actor.get("username", ""),
356 "stats": {
357 "users": actor.get("stats", {}).get("totalUsers", 0),
358 "runs": actor.get("stats", {}).get("totalRuns", 0)
359 }
360 }
361 for actor in actors
362 ]
363
364 return MCPCallResult(content=result_content)
365 else:
366 self.logger.warning(f"Store API returned {response.status_code}")
367
368 except Exception as e:
369 self.logger.warning(f"Store API failed: {str(e)}")
370
371 # Fallback to hardcoded popular actors if API fails
372 result_content = [
373 {
374 "name": "apify/web-scraper",
375 "title": "Web Scraper",
376 "description": "Crawls arbitrary websites using the Chrome browser and extracts data from pages using a provided JavaScript code.",
377 "username": "apify",
378 "stats": {"users": 1000, "runs": 50000}
379 },
380 {
381 "name": "apify/cheerio-scraper",
382 "title": "Cheerio Scraper",
383 "description": "Crawls websites using the Cheerio library and extracts data from HTML documents.",
384 "username": "apify",
385 "stats": {"users": 800, "runs": 30000}
386 },
387 {
388 "name": "apify/website-content-crawler",
389 "title": "Website Content Crawler",
390 "description": "Crawls websites and extracts text content, metadata, and other information.",
391 "username": "apify",
392 "stats": {"users": 600, "runs": 20000}
393 },
394 {
395 "name": "apify/puppeteer-scraper",
396 "title": "Puppeteer Scraper",
397 "description": "Crawls websites using Puppeteer and extracts data from pages.",
398 "username": "apify",
399 "stats": {"users": 500, "runs": 15000}
400 }
401 ]
402
403 # Filter based on search query
404 if search_query:
405 search_lower = search_query.lower()
406 result_content = [
407 actor for actor in result_content
408 if (search_lower in actor["name"].lower() or
409 search_lower in actor["title"].lower() or
410 search_lower in actor["description"].lower())
411 ]
412
413 return MCPCallResult(content=result_content[:limit])
414
415 elif tool_name == "get-actor-details":
416 actor_id = arguments.get("actor")
417
418 try:
419 actor_info = client.actor(actor_id).get()
420
421 # Convert datetime objects to strings for JSON serialization
422 if actor_info:
423 actor_info = self._serialize_datetime_fields(actor_info)
424
425 return MCPCallResult(content=actor_info)
426 except Exception as e:
427 return MCPCallResult(
428 content=f"Actor {actor_id} not found",
429 is_error=True,
430 error_message=str(e)
431 )
432
433 elif tool_name == "run-actor":
434 actor_id = arguments.get("actor_id")
435 actor_input = arguments.get("input", {})
436
437 try:
438 self.logger.info(f"Running actor {actor_id} with input: {actor_input}")
439 run = client.actor(actor_id).call(run_input=actor_input)
440
441 if run and run.get('status') == 'SUCCEEDED':
442 # Get results from dataset or key-value store
443 dataset_id = run.get('defaultDatasetId')
444 if dataset_id:
445 items = list(client.dataset(dataset_id).iterate_items())
446 return MCPCallResult(content=items)
447
448 return MCPCallResult(content={"status": "completed", "run": run})
449 else:
450 return MCPCallResult(
451 content=f"Actor run failed: {run.get('status') if run else 'Unknown error'}",
452 is_error=True,
453 error_message="Actor execution failed"
454 )
455
456 except Exception as e:
457 return MCPCallResult(
458 content=f"Error running actor: {str(e)}",
459 is_error=True,
460 error_message=str(e)
461 )
462
463 return MCPCallResult(
464 content=f"Fallback not implemented for tool: {tool_name}",
465 is_error=True,
466 error_message="Fallback not available"
467 )
468
469 except Exception as e:
470 return MCPCallResult(
471 content=f"Fallback execution error: {str(e)}",
472 is_error=True,
473 error_message=str(e)
474 )
475
476 def get_available_tools(self) -> List[MCPTool]:
477 """Get list of available tools."""
478 return self.available_tools.copy()
479
480 def format_tools_for_claude(self) -> List[Dict[str, Any]]:
481 """Format tools for Claude API."""
482 return [
483 {
484 "name": tool.name,
485 "description": tool.description,
486 "input_schema": tool.input_schema
487 }
488 for tool in self.available_tools
489 ]
490
491 def _serialize_datetime_fields(self, obj):
492 """Recursively convert datetime objects to ISO format strings."""
493 import datetime
494
495 if isinstance(obj, dict):
496 return {key: self._serialize_datetime_fields(value) for key, value in obj.items()}
497 elif isinstance(obj, list):
498 return [self._serialize_datetime_fields(item) for item in obj]
499 elif isinstance(obj, datetime.datetime):
500 return obj.isoformat()
501 elif isinstance(obj, datetime.date):
502 return obj.isoformat()
503 else:
504 return obj
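
Usage sketch (not part of the module): with use_fallback_only set, connect() registers the three fallback tools and call_tool() routes straight to the Apify Store and Actor APIs. A minimal, hypothetical example (the APIFY_TOKEN environment variable is an assumption of this sketch):

import asyncio
import os

async def demo():
    client = MCPClient("https://mcp.apify.com", apify_token=os.environ["APIFY_TOKEN"])
    if not await client.connect():
        return
    result = await client.call_tool("search-actors", {"search": "web scraper", "limit": 3})
    if not result.is_error:
        for actor in result.content:
            print(actor["name"], "-", actor["title"])

asyncio.run(demo())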

src/llmscraper/scraping/__init__.py

1"""
2Scraping module for ScraperCodeGenerator.
3"""
4
5from .apify_runner import ApifyRunner
6from .multi_actor_scraper import MultiActorScraper
7from .actor_multi_scraper import ActorMultiScraper
8
9__all__ = ["ApifyRunner", "MultiActorScraper", "ActorMultiScraper"]

src/llmscraper/scraping/actor_multi_scraper.py

1"""
2Apify Actor-specific scraping module for running other actors from within an Apify actor.
3"""
4
5import asyncio
6import logging
7from typing import Dict, Any, List, Optional, Tuple
8from apify import Actor
9
10
11class ActorMultiScraper:
12 """Handles running multiple Apify actors from within an Apify actor context."""
13
14 def __init__(self):
15 """Initialize the actor scraper."""
16 self.logger = logging.getLogger(__name__)
17
18 async def scrape_with_multiple_actors(self, target_url: str) -> Dict[str, Optional[str]]:
19 """
20 Run multiple actors in parallel to scrape the target URL and return HTML content.
21
22 Args:
23 target_url: The URL to scrape
24
25 Returns:
26 Dictionary mapping actor names to their HTML content (or None if failed)
27 """
28 Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors called for {target_url}")
29
30 # Define actor configurations
31 actor_configs = self._get_actor_configs(target_url)
32 Actor.log.info(f"DEBUG: Will run {len(actor_configs)} actors in parallel: {list(actor_configs.keys())}")
33
34 # Create tasks for parallel execution
35 tasks = []
36 actor_names = []
37
38 for actor_name, config in actor_configs.items():
39 Actor.log.info(f"DEBUG: Creating task for {actor_name}...")
40 task = self._run_single_actor_with_name(actor_name, config)
41 tasks.append(task)
42 actor_names.append(actor_name)
43
44 # Run all actors in parallel
45 Actor.log.info("DEBUG: Starting all actors in parallel...")
46 results_list = await asyncio.gather(*tasks, return_exceptions=True)
47
48 # Process results
49 results = {}
50 for actor_name, result in zip(actor_names, results_list):
51 if isinstance(result, Exception):
52 Actor.log.error(f"DEBUG: {actor_name} failed: {str(result)}")
53 results[actor_name] = None
54 else:
55 results[actor_name] = result
56 if result:
57 Actor.log.info(f"DEBUG: {actor_name} succeeded: {len(result):,} characters")
58 else:
59 Actor.log.warning(f"DEBUG: {actor_name} returned no content")
60
61 Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors completed. Results: {list(results.keys())}")
62 Actor.log.info(f"DEBUG: Results with content: {[name for name, content in results.items() if content]}")
63 return results
64
65 async def _run_single_actor_with_name(self, actor_name: str, config: Dict[str, Any]) -> Optional[str]:
66 """
67 Run a single actor and return its HTML content.
68
69 Args:
70 actor_name: Name of the actor for logging
71 config: Actor configuration
72
73 Returns:
74 HTML content or None if failed
75 """
76 try:
77 Actor.log.info(f"DEBUG: Starting {actor_name}...")
78 return await self._run_single_actor(actor_name, config)
79 except Exception as e:
80 Actor.log.error(f"DEBUG: {actor_name} failed: {str(e)}")
81 return None
82
83 def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:
84 """Get configurations for all actors to run."""
85 return {
86 "cheerio-scraper": {
87 "actor_id": "apify/cheerio-scraper",
88 "input": {
89 "startUrls": [{"url": target_url}],
90 "maxRequestRetries": 3,
91 "requestTimeoutSecs": 30,
92 "maxPagesPerCrawl": 1,
93 "pageFunction": """
94 async function pageFunction(context) {
95 const { request, log, $ } = context;
96 try {
97 const title = $('title').text() || '';
98 const html = $('html').html() || '';
99 return {
100 url: request.url,
101 title: title,
102 html: html
103 };
104 } catch (error) {
105 log.error('Error in pageFunction:', error);
106 return {
107 url: request.url,
108 title: '',
109 html: ''
110 };
111 }
112 }
113 """,
114 "proxyConfiguration": {"useApifyProxy": True}
115 }
116 },
117 "web-scraper": {
118 "actor_id": "apify/web-scraper",
119 "input": {
120 "startUrls": [{"url": target_url}],
121 "maxRequestRetries": 3,
122 "requestTimeoutSecs": 30,
123 "maxPagesPerCrawl": 1,
124 "pageFunction": """
125 async function pageFunction(context) {
126 const { request, log, page } = context;
127 try {
128 const title = await page.title();
129 const html = await page.content();
130 return { url: request.url, title, html };
131 } catch (error) {
132 log.error('Error in pageFunction:', error);
133 return { url: request.url, title: '', html: '' };
134 }
135 }
136 """,
137 "proxyConfiguration": {"useApifyProxy": True}
138 }
139 },
140 "website-content-crawler": {
141 "actor_id": "apify/website-content-crawler",
142 "input": {
143 "startUrls": [{"url": target_url}],
144 "maxRequestsPerCrawl": 1,
145 "maxCrawlDepth": 0,
146 "htmlTransformer": "readableText",
147 "readableTextCharThreshold": 100,
148 "removeCookieWarnings": True,
149 "clickElementsCssSelector": "",
150 "proxyConfiguration": {"useApifyProxy": True}
151 }
152 }
153 }
154
155 async def _run_single_actor(self, actor_name: str, config: Dict[str, Any]) -> Optional[str]:
156 """
157 Run a single actor and extract HTML content.
158
159 Args:
160 actor_name: Name of the actor (for logging)
161 config: Actor configuration including actor_id and input
162
163 Returns:
164 HTML content as string, or None if failed
165 """
166 try:
167 actor_id = config["actor_id"]
168 actor_input = config["input"]
169
170 Actor.log.info(f"DEBUG: Calling actor {actor_id}")
171
172 # Call the actor via the Apify SDK
173 run = await Actor.call(
174 actor_id=actor_id,
175 run_input=actor_input
176 )
177
178 if not run:
179 Actor.log.error(f"DEBUG: Actor {actor_name} failed to start - run is None")
180 return None
181
182 Actor.log.info(f"DEBUG: Actor {actor_name} run created with ID: {run.id}")
183 Actor.log.info(f"DEBUG: Default dataset ID: {run.default_dataset_id}")
184
185 # Retrieve items from the run's default dataset
186 if run.default_dataset_id:
187 try:
188 Actor.log.info(f"DEBUG: Getting dataset items for {actor_name}...")
189 items = (await Actor.apify_client.dataset(run.default_dataset_id).list_items()).items
190
191 if items:
192 Actor.log.info(f"DEBUG: Found {len(items)} items in dataset for {actor_name}")
193
194 for i, item in enumerate(items):
195 Actor.log.info(f"DEBUG: Dataset item {i} keys: {list(item.keys()) if isinstance(item, dict) else type(item)}")
196
197 # Look for HTML content in the item
198 html_content = self._extract_html_from_item(item, actor_name)
199 if html_content:
200 Actor.log.info(f"DEBUG: Found HTML content in dataset item {i} for {actor_name}: {len(html_content)} characters")
201 return html_content
202 else:
203 Actor.log.info(f"DEBUG: No dataset items found for {actor_name}")
204
205 except Exception as e:
206 Actor.log.error(f"DEBUG: Dataset retrieval failed for {actor_name}: {e}")
207 import traceback
208 Actor.log.error(f"DEBUG: Dataset traceback: {traceback.format_exc()}")
209
210 # Fallback: Try key-value store (simplified)
211 if run.default_key_value_store_id:
212 try:
213 Actor.log.info(f"DEBUG: Trying key-value store as fallback for {actor_name}...")
214 kvs_client = Actor.apify_client.key_value_store(run.default_key_value_store_id)
215
216 # Try common keys that might contain HTML
217 common_keys = ['OUTPUT', 'RESULTS', 'DATA']
218 for key_name in common_keys:
219 try:
220 record = await kvs_client.get_record(key_name)
221 if record:
222 Actor.log.info(f"DEBUG: Found record for key {key_name}")
223 html_content = self._extract_html_from_record(record, actor_name)
224 if html_content:
225 Actor.log.info(f"DEBUG: Found HTML content in key {key_name} for {actor_name}")
226 return html_content
227 except Exception:
228 pass # Key doesn't exist, continue
229
230 except Exception as e:
231 Actor.log.error(f"DEBUG: Key-value store retrieval failed for {actor_name}: {e}")
232
233 Actor.log.warning(f"DEBUG: No HTML content found for {actor_name}")
234 return None
235
236 except Exception as e:
237 Actor.log.error(f"DEBUG: Error running {actor_name}: {str(e)}")
238 import traceback
239 Actor.log.error(f"DEBUG: Full traceback: {traceback.format_exc()}")
240 return None
241
242 def _extract_html_from_item(self, item: Dict[str, Any], actor_name: str) -> Optional[str]:
243 """Extract HTML content from a dataset item."""
244 Actor.log.info(f"DEBUG: Extracting HTML from item for {actor_name}, item keys: {list(item.keys()) if isinstance(item, dict) else 'not a dict'}")
245
246 # Look for HTML in common fields
247 html_fields = ['html', 'content', 'body', 'pageContent', 'text', 'data']
248
249 for field in html_fields:
250 if field in item and item[field]:
251 content = item[field]
252 Actor.log.info(f"DEBUG: Found content in field '{field}': {type(content)}, length: {len(content) if isinstance(content, str) else 'N/A'}")
253
254 if isinstance(content, str) and len(content) > 100:
255 # Check if it looks like HTML
256 if '<' in content and '>' in content:
257 Actor.log.info(f"DEBUG: Found HTML content in field '{field}' for {actor_name}")
258 return content
259 elif actor_name == "website-content-crawler":
260 # For website-content-crawler, text content is also acceptable
261 Actor.log.info(f"DEBUG: Found text content in field '{field}' for {actor_name}")
262 html_content = f"<html><body><div>{content}</div></body></html>"
263 return html_content
264
265 # For website-content-crawler, look for any text-like content
266 if actor_name == "website-content-crawler":
267 for key, value in item.items():
268 if isinstance(value, str) and len(value) > 50:
269 Actor.log.info(f"DEBUG: Using text content from field '{key}' for website-content-crawler")
270 html_content = f"<html><body><div>{value}</div></body></html>"
271 return html_content
272
273 Actor.log.info(f"DEBUG: No HTML content found in item for {actor_name}")
274 return None
275
276 def _extract_html_from_record(self, record: Any, actor_name: str) -> Optional[str]:
277 """Extract HTML content from a key-value store record."""
278 try:
279 # The record might be the content directly or wrapped in a dict
280 content = record
281
282 if hasattr(record, 'value'):
283 content = record.value
284 elif isinstance(record, dict) and 'value' in record:
285 content = record['value']
286
287 # If content is a string, check if it's HTML
288 if isinstance(content, str):
289 if len(content) > 100 and ('<' in content or actor_name == "website-content-crawler"):
290 return content
291
292 # If content is a dict, look for HTML fields
293 elif isinstance(content, dict):
294 html_content = self._extract_html_from_item(content, actor_name)
295 if html_content:
296 return html_content
297
298 return None
299
300 except Exception as e:
301 Actor.log.debug(f"DEBUG: Error extracting HTML from record for {actor_name}: {e}")
302 return None
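
Usage sketch (not part of the module): Actor.call and Actor.apify_client require an initialized Actor context, so this class only runs inside an actor's main coroutine. A minimal, hypothetical example with an illustrative URL:

from apify import Actor

async def main():
    async with Actor:
        scraper = ActorMultiScraper()
        results = await scraper.scrape_with_multiple_actors("https://example.com")
        for name, html in results.items():
            Actor.log.info(f"{name}: {len(html):,} chars" if html else f"{name}: no content")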

src/llmscraper/scraping/apify_runner.py

1"""
2Apify integration for running actors and retrieving results.
3"""
4
5import logging
6from typing import Optional, Dict, Any, List, Union
7
8from apify_client import ApifyClient
9
10
11class ApifyRunner:
12 """Handles running Apify actors and retrieving results."""
13
14 def __init__(self, api_token: str):
15 """Initialize with API token."""
16 if not api_token or not api_token.strip():
17 raise ValueError("API token cannot be empty")
18
19 self.client = ApifyClient(api_token)
20 self.logger = logging.getLogger(__name__)
21
22 def run_actor(self, actor_id: str, actor_input: dict,
23 retrieve_from: str = "auto") -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:
24 """
25 Run an Apify actor and retrieve results.
26
27 Args:
28 actor_id: The ID of the Apify actor
29 actor_input: Input configuration for the actor
30 retrieve_from: "auto", "dataset", "key-value-store", or "both"
31
32 Returns:
33 Retrieved data or None if failed
34 """
35 if not actor_id or not actor_id.strip():
36 raise ValueError("actor_id cannot be empty")
37
38 if retrieve_from not in ["auto", "dataset", "key-value-store", "both"]:
39 raise ValueError("Invalid retrieve_from option")
40
41 # Determine storage type
42 if retrieve_from == "auto":
43 retrieve_from = "key-value-store" if "website-content-crawler" in actor_id else "dataset"
44
45 try:
46 self.logger.info(f"Starting Apify actor: {actor_id}")
47
48 # Start the actor run
49 run = self.client.actor(actor_id).call(run_input=actor_input)
50
51 if not run or run.get('status') != 'SUCCEEDED':
52 self.logger.error(f"Actor run failed: {run.get('status') if run else 'No run created'}")
53 return None
54
55 run_id = run.get('id')
56 self.logger.info(f"Actor run {run_id} completed successfully")
57
58 # Retrieve results based on type
59 if retrieve_from == "dataset":
60 return self._get_dataset_items(run_id)
61 elif retrieve_from == "key-value-store":
62 return self._get_key_value_store_items(run_id)
63 elif retrieve_from == "both":
64 return {
65 "dataset": self._get_dataset_items(run_id),
66 "key_value_store": self._get_key_value_store_items(run_id)
67 }
68
69 except Exception as e:
70 self.logger.error(f"Error running actor {actor_id}: {str(e)}")
71 return None
72
73 def _get_dataset_items(self, run_id: str) -> List[Dict[str, Any]]:
74 """Get items from the dataset of a run."""
75 try:
76 dataset_id = self.client.run(run_id).get().get('defaultDatasetId')
77 if not dataset_id:
78 self.logger.warning(f"No dataset found for run {run_id}")
79 return []
80
81 dataset_items = list(self.client.dataset(dataset_id).iterate_items())
82 self.logger.info(f"Retrieved {len(dataset_items)} items from dataset")
83 return dataset_items
84
85 except Exception as e:
86 self.logger.error(f"Error retrieving dataset items: {str(e)}")
87 return []
88
89 def _get_key_value_store_items(self, run_id: str) -> Dict[str, Any]:
90 """Get items from the key-value store of a run."""
91 try:
92 kvs_id = self.client.run(run_id).get().get('defaultKeyValueStoreId')
93 if not kvs_id:
94 self.logger.warning(f"No key-value store found for run {run_id}")
95 return {}
96
97 kvs = self.client.key_value_store(kvs_id)
98 keys = kvs.list_keys().get('items', [])
99
100 items = {}
101 for key_info in keys:
102 # Handle case where key_info might be a string or dict
103 if isinstance(key_info, dict):
104 key_name = key_info.get('key')
105 else:
106 key_name = str(key_info)
107
108 if key_name:
109 try:
110 value = kvs.get_record(key_name)
111 if value:
112 # Handle case where value might be a string or dict
113 if isinstance(value, dict):
114 items[key_name] = value.get('value', value)
115 else:
116 items[key_name] = value
117 except Exception as e:
118 self.logger.warning(f"Failed to retrieve key '{key_name}': {str(e)}")
119
120 self.logger.info(f"Retrieved {len(items)} items from key-value store")
121 return items
122
123 except Exception as e:
124 self.logger.error(f"Error retrieving key-value store items: {str(e)}")
125 return {}
126
127
128# Legacy functions for backward compatibility
129def run_apify_actor(actor_id: str, actor_input: dict, *, api_token: str) -> Optional[List[Dict[str, Any]]]:
130 """Legacy function - use ApifyRunner class instead."""
131 runner = ApifyRunner(api_token)
132 result = runner.run_actor(actor_id, actor_input, "dataset")
133 return result if isinstance(result, list) else None
134
135
136def run_apify_actor_with_flexible_retrieval(
137 actor_id: str, actor_input: dict, *, api_token: str, retrieve_from: str = "auto"
138) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:
139 """Legacy function - use ApifyRunner class instead."""
140 runner = ApifyRunner(api_token)
141 return runner.run_actor(actor_id, actor_input, retrieve_from)
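
A minimal usage sketch of ApifyRunner, for reference (untested; the APIFY_TOKEN variable and target URL are placeholders):

import os

from llmscraper.scraping.apify_runner import ApifyRunner

runner = ApifyRunner(os.environ["APIFY_TOKEN"])

# "auto" resolves to the key-value store for website-content-crawler
# and to the default dataset for every other actor.
items = runner.run_actor(
    "apify/website-content-crawler",
    {"startUrls": [{"url": "https://example.com"}], "maxCrawlPages": 1},
    retrieve_from="auto",
)
if items:
    print(f"Retrieved {len(items)} record(s)")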

src/llmscraper/scraping/multi_actor_scraper.py

1"""
2Multi-actor scraping functionality.
3"""
4
5import logging
6from typing import Dict, Any, Optional
7from concurrent.futures import ThreadPoolExecutor, as_completed
8
9from .apify_runner import ApifyRunner
10from ..models import ActorConfig
11
12
13class MultiActorScraper:
14 """Scrapes websites using multiple Apify actors simultaneously."""
15
16 def __init__(self, api_token: str):
17 """Initialize with Apify API token."""
18 self.api_token = api_token
19 self.runner = ApifyRunner(api_token)
20 self.logger = logging.getLogger(__name__)
21
22 def scrape_with_multiple_actors(self, target_url: str,
23 actor_configs: Optional[Dict[str, ActorConfig]] = None) -> Dict[str, Optional[str]]:
24 """
25 Scrape a URL with multiple actors and return HTML content.
26
27 Args:
28 target_url: URL to scrape
29 actor_configs: Dictionary of actor configurations to use
30
31 Returns:
32 Dict mapping actor names to HTML content (None for actors that failed)
33 """
34 if actor_configs is None:
35 # Use default configurations for backward compatibility
36 actor_configs = self._get_default_actor_configs(target_url)
37
38 # Filter to only enabled actors
39 enabled_configs = {name: config for name, config in actor_configs.items() if config.enabled}
40
41 if not enabled_configs:
42 self.logger.warning("No enabled actors found")
43 return {}
44
45 results = {}
46
47 # Use ThreadPoolExecutor for concurrent execution
48 with ThreadPoolExecutor(max_workers=len(enabled_configs)) as executor:
49 future_to_actor = {
50 executor.submit(self._run_single_actor, name, config): name
51 for name, config in enabled_configs.items()
52 }
53
54 for future in as_completed(future_to_actor):
55 actor_name = future_to_actor[future]
56 try:
57 name, html_content = future.result()
58 results[name] = html_content
59 except Exception as e:
60 self.logger.error(f"Actor {actor_name} failed: {str(e)}")
61 results[actor_name] = None
62
63 return results
64
65 def _get_default_actor_configs(self, target_url: str) -> Dict[str, ActorConfig]:
66 """Get default actor configurations for backward compatibility."""
67 from ..models import get_default_actor_configs
68
69 configs = get_default_actor_configs()
70 # Add target URL to all configs
71 for config in configs.values():
72 config.input['startUrls'] = [{"url": target_url}]
73
74 return configs
75
76 def _run_single_actor(self, actor_name: str, config) -> tuple[str, Optional[str]]:
77 """
78 Run a single actor and extract HTML content.
79
80 Args:
81 actor_name: Name of the actor
82 config: Actor configuration (can be ActorConfig or dict for backward compatibility)
83
84 Returns:
85 Tuple of (actor_name, html_content)
86 """
87 try:
88 self.logger.info(f"Starting {actor_name}...")
89
90 # Handle both ActorConfig and dict formats
91 if hasattr(config, 'actor_id'):
92 actor_id = config.actor_id
93 actor_input = config.input
94 else:
95 actor_id = config["actor_id"]
96 actor_input = config["input"]
97
98 result = self.runner.run_actor(
99 actor_id,
100 actor_input,
101 "auto"
102 )
103
104 if not result:
105 self.logger.warning(f"{actor_name} returned no results")
106 return actor_name, None
107
108 # Extract HTML based on result type
109 html_content = self._extract_html_from_result(result, actor_name)
110
111 if html_content:
112 self.logger.info(f"{actor_name} completed successfully: {len(html_content):,} chars")
113 else:
114 self.logger.warning(f"{actor_name} returned no HTML content")
115
116 return actor_name, html_content
117
118 except Exception as e:
119 self.logger.error(f"Error running {actor_name}: {str(e)}")
120 return actor_name, None
121
122 def _extract_html_from_result(self, result: Any, actor_name: str) -> Optional[str]:
123 """Extract HTML content from actor result."""
124 try:
125 if isinstance(result, list) and result:
126 # Dataset result
127 item = result[0]
128 return item.get('html') or item.get('content', '')
129 elif isinstance(result, dict):
130 # Key-value store result
131 if 'OUTPUT' in result:
132 output = result['OUTPUT']
133 if isinstance(output, dict):
134 return output.get('html') or output.get('content', '')
135 elif isinstance(output, str):
136 return output
137
138 self.logger.warning(f"Unexpected result format from {actor_name}")
139 return None
140
141 except Exception as e:
142 self.logger.error(f"Error extracting HTML from {actor_name}: {str(e)}")
143 return None
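
A short usage sketch of MultiActorScraper (untested; assumes a valid token in APIFY_TOKEN):

import os

from llmscraper.scraping.multi_actor_scraper import MultiActorScraper

scraper = MultiActorScraper(os.environ["APIFY_TOKEN"])

# With no explicit configs, the default actor set is used; each enabled
# actor runs in its own thread, and the result maps name -> HTML or None.
html_by_actor = scraper.scrape_with_multiple_actors("https://example.com")
for name, html in html_by_actor.items():
    print(f"{name}: {len(html):,} chars" if html else f"{name}: no HTML")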

src/llmscraper/utils/__init__.py

1"""
2Utilities module for ScraperCodeGenerator.
3"""
4
5from .html_utils import is_html, prune_html, extract_text_content, validate_html_structure
6from .config import get_api_key, validate_required_keys, setup_logging
7
8__all__ = [
9 "is_html",
10 "prune_html",
11 "extract_text_content",
12 "validate_html_structure",
13 "get_api_key",
14 "validate_required_keys",
15 "setup_logging"
16]

src/llmscraper/utils/config.py

1"""
2Configuration and environment utilities.
3"""
4
5import os
6from typing import Optional
7
8
9def get_api_key(key_name: str, provided_key: Optional[str] = None) -> Optional[str]:
10 """
11 Get API key from provided value or environment variable.
12
13 Args:
14 key_name: Name of the environment variable
15 provided_key: Explicitly provided key (takes precedence)
16
17 Returns:
18 API key or None if not found
19 """
20 if provided_key and provided_key.strip():
21 return provided_key.strip()
22
23 return os.getenv(key_name)
24
25
26def validate_required_keys(**keys) -> dict[str, str]:
27 """
28 Validate that all required API keys are present.
29
30 Args:
31 **keys: Key-value pairs of key names and values
32
33 Returns:
34 Dict of validated keys
35
36 Raises:
37 ValueError: If any required key is missing
38 """
39 validated = {}
40 missing = []
41
42 for key_name, key_value in keys.items():
43 if not key_value or not key_value.strip():
44 missing.append(key_name)
45 else:
46 validated[key_name] = key_value.strip()
47
48 if missing:
49 raise ValueError(f"Missing required API keys: {', '.join(missing)}")
50
51 return validated
52
53
54def setup_logging(level: str = "INFO") -> None:
55 """
56 Set up logging configuration.
57
58 Args:
59 level: Logging level (DEBUG, INFO, WARNING, ERROR)
60 """
61 import logging
62
63 logging.basicConfig(
64 level=getattr(logging, level.upper()),
65 format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
66 handlers=[
67 logging.StreamHandler()
68 ]
69 )
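
A brief sketch of how these helpers compose (untested; the key names passed to validate_required_keys are arbitrary labels chosen for illustration):

import os

from llmscraper.utils.config import get_api_key, setup_logging, validate_required_keys

setup_logging("DEBUG")

# An explicitly provided key takes precedence over the environment variable.
claude_key = get_api_key("ANTHROPIC_API_KEY")
apify_token = get_api_key("APIFY_TOKEN", provided_key=os.getenv("APIFY_TOKEN"))

# Raises ValueError naming every missing key, not just the first one.
keys = validate_required_keys(anthropic=claude_key, apify=apify_token)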

src/llmscraper/utils/config_parser.py

1"""
2Configuration parser for the ScraperCodeGenerator pipeline.
3"""
4
5import json
6import logging
7from typing import Dict, Any, Optional, Union
8
9from ..models import (
10 PipelineConfig, ActorConfig, HTMLPruningConfig, ClaudeModel,
11 get_default_actor_configs
12)
13
14
15class ConfigurationParser:
16 """Parses and validates configuration from input data."""
17
18 def __init__(self):
19 self.logger = logging.getLogger(__name__)
20
21 def parse_from_input(self, input_data: Dict[str, Any]) -> PipelineConfig:
22 """
23 Parse configuration from input data.
24
25 Args:
26 input_data: Raw input data from Actor or CLI
27
28 Returns:
29 Parsed and validated PipelineConfig
30 """
31 config = PipelineConfig()
32
33 # Parse core settings
34 config.for_actor = input_data.get('forActor', False)
35 config.test_script = input_data.get('testScript', False)
36 config.output_script_path = input_data.get('outputScriptPath')
37
38 # Parse Claude settings
39 claude_model_str = input_data.get('claudeModel', 'claude-3-5-sonnet-20241022')
40 config.claude_model = self._parse_claude_model(claude_model_str)
41 config.claude_api_key = input_data.get('claudeApiKey')
42
43 # Parse HTML pruning settings
44 config.html_pruning = self._parse_html_pruning_config(input_data)
45
46 # Parse actor configurations
47 config.actors = self._parse_actor_configs(
48 input_data.get('actors', []), input_data.get('targetUrl', '')
49 )
50
51 # Parse execution settings
52 config.max_retries = input_data.get('maxRetries', 3)
53 config.timeout_seconds = input_data.get('timeout', 60)
54 config.concurrent_actors = input_data.get('concurrentActors', True)
55
56 return config
57
58 def _parse_claude_model(self, model_str: str) -> ClaudeModel:
59 """Parse Claude model from string."""
60 model_mapping = {
61 # Claude 4 models
62 'claude-sonnet-4-20250514': ClaudeModel.CLAUDE_4_SONNET,
63 'claude-opus-4-20250514': ClaudeModel.CLAUDE_4_OPUS,
64 'claude-sonnet-4-0': ClaudeModel.CLAUDE_4_SONNET,
65 'claude-opus-4-0': ClaudeModel.CLAUDE_4_OPUS,
66
67 # Claude 3.7 models
68 'claude-3-7-sonnet-20250219': ClaudeModel.CLAUDE_3_7_SONNET,
69 'claude-3-7-sonnet-latest': ClaudeModel.CLAUDE_3_7_SONNET,
70
71 # Claude 3.5 models
72 'claude-3-5-sonnet-20241022': ClaudeModel.CLAUDE_3_5_SONNET,
73 'claude-3-5-sonnet-latest': ClaudeModel.CLAUDE_3_5_SONNET,
74 'claude-3-5-haiku-20241022': ClaudeModel.CLAUDE_3_5_HAIKU,
75
76 # Claude 3 models
77 'claude-3-sonnet-20240229': ClaudeModel.CLAUDE_3_SONNET,
78 'claude-3-haiku-20240307': ClaudeModel.CLAUDE_3_HAIKU,
79
80 # Aliases
81 'claude-4': ClaudeModel.CLAUDE_4_SONNET,
82 'claude-4-sonnet': ClaudeModel.CLAUDE_4_SONNET,
83 'claude-4-opus': ClaudeModel.CLAUDE_4_OPUS,
84 'sonnet-4': ClaudeModel.CLAUDE_4_SONNET,
85 'opus-4': ClaudeModel.CLAUDE_4_OPUS,
86 'sonnet-3.7': ClaudeModel.CLAUDE_3_7_SONNET,
87 'sonnet-3.5': ClaudeModel.CLAUDE_3_5_SONNET,
88 'haiku-3.5': ClaudeModel.CLAUDE_3_5_HAIKU,
89 'haiku': ClaudeModel.CLAUDE_3_HAIKU,
90 'sonnet': ClaudeModel.CLAUDE_3_SONNET,
91 }
92
93 return model_mapping.get(model_str.lower(), ClaudeModel.CLAUDE_4_SONNET)
94
95 def _parse_html_pruning_config(self, input_data: Dict[str, Any]) -> HTMLPruningConfig:
96 """Parse HTML pruning configuration from flat input structure."""
97 # Convert percentage from 0-100 to 0.0-1.0 if needed
98 prune_percentage = input_data.get('htmlPrunePercentage', 80)
99 if prune_percentage > 1:
100 prune_percentage = prune_percentage / 100.0
101
102 return HTMLPruningConfig(
103 enabled=input_data.get('htmlPruningEnabled', True),
104 max_list_items=input_data.get('htmlMaxListItems', 5),
105 max_text_length=input_data.get('htmlMaxTextLength', 500),
106 prune_before_evaluation=input_data.get('htmlPruneBeforeEvaluation', True),
107 prune_percentage=prune_percentage
108 )
109
110 def _parse_actor_configs(self, actors_data: Any, target_url: str) -> Dict[str, ActorConfig]:
111 """Parse actor configurations with improved validation."""
112 # Start with default configurations
113 default_configs = get_default_actor_configs()
114
115 # Handle both array and object formats
116 if isinstance(actors_data, list):
117 # New array format: [{"name": "actor-name", "enabled": true, "input": {...}}]
118 return self._parse_actor_configs_from_array(actors_data, target_url, default_configs)
119 elif isinstance(actors_data, dict):
120 # Legacy object format: {"actor-name": true, "other-actor": {"enabled": true, "input": {...}}}
121 return self._parse_actor_configs_from_object(actors_data, target_url, default_configs)
122 else:
123 # No actor configuration provided, use defaults
124 for config in default_configs.values():
125 config.input['startUrls'] = [{"url": target_url}]
126 return default_configs
127
128 def _parse_actor_configs_from_array(self, actors_data: list, target_url: str, default_configs: Dict[str, ActorConfig]) -> Dict[str, ActorConfig]:
129 """Parse actor configurations from array format."""
130 parsed_configs = {}
131
132 for actor_item in actors_data:
133 if not isinstance(actor_item, dict):
134 self.logger.warning(f"Invalid actor configuration format: {actor_item}")
135 continue
136
137 actor_name = actor_item.get('name')
138 if not actor_name:
139 self.logger.warning(f"Actor configuration missing 'name' field: {actor_item}")
140 continue
141
142 try:
143 # Check if this is a known actor
144 if actor_name in default_configs:
145 config = default_configs[actor_name]
146 config.enabled = actor_item.get('enabled', True)
147 # Merge custom input with defaults
148 if 'input' in actor_item:
149 config.input.update(actor_item['input'])
150 else:
151 # Custom actor
152 config = ActorConfig(
153 actor_id=actor_item.get('actorId', actor_name),
154 name=actor_name,
155 description=actor_item.get('description', ''),
156 enabled=actor_item.get('enabled', True),
157 input=actor_item.get('input', {})
158 )
159
160 # Ensure startUrls is set
161 if 'startUrls' not in config.input:
162 config.input['startUrls'] = [{"url": target_url}]
163
164 parsed_configs[actor_name] = config
165
166 except Exception as e:
167 self.logger.error(f"Error parsing configuration for actor '{actor_name}': {e}")
168 continue
169
170 # If no valid configs, use defaults
171 if not parsed_configs:
172 for config in default_configs.values():
173 config.input['startUrls'] = [{"url": target_url}]
174 return default_configs
175
176 return parsed_configs
177
178 def _parse_actor_configs_from_object(self, actors_data: dict, target_url: str, default_configs: Dict[str, ActorConfig]) -> Dict[str, ActorConfig]:
179 """Parse actor configurations from legacy object format."""
180 parsed_configs = {}
181
182 for actor_name, actor_data in actors_data.items():
183 try:
184 if isinstance(actor_data, dict):
185 # Full configuration object
186 if 'actorId' in actor_data:
187 # Custom actor with explicit ID
188 config = ActorConfig(
189 actor_id=actor_data.get('actorId'),
190 name=actor_data.get('name', actor_name),
191 description=actor_data.get('description', ''),
192 enabled=actor_data.get('enabled', True),
193 input=actor_data.get('input', {})
194 )
195 else:
196 # Partial configuration for known actor
197 if actor_name in default_configs:
198 config = default_configs[actor_name]
199 config.enabled = actor_data.get('enabled', True)
200 # Merge custom input with defaults
201 if 'input' in actor_data:
202 config.input.update(actor_data['input'])
203 else:
204 self.logger.warning(f"Unknown actor '{actor_name}' with partial config, skipping")
205 continue
206
207 # Ensure startUrls is set
208 if 'startUrls' not in config.input:
209 config.input['startUrls'] = [{"url": target_url}]
210
211 # Validate actor ID
212 if not config.actor_id:
213 self.logger.error(f"Actor '{actor_name}' missing actor_id")
214 continue
215
216 parsed_configs[actor_name] = config
217
218 elif isinstance(actor_data, bool):
219 # Simple boolean enable/disable
220 if actor_name in default_configs:
221 config = default_configs[actor_name]
222 config.enabled = actor_data
223 config.input['startUrls'] = [{"url": target_url}]
224 parsed_configs[actor_name] = config
225 else:
226 self.logger.warning(f"Unknown actor '{actor_name}' with boolean config, skipping")
227
228 elif isinstance(actor_data, str):
229 # Just actor ID provided
230 config = ActorConfig(
231 actor_id=actor_data,
232 name=actor_name,
233 enabled=True,
234 input={'startUrls': [{"url": target_url}]}
235 )
236 parsed_configs[actor_name] = config
237
238 else:
239 self.logger.warning(f"Invalid configuration format for actor '{actor_name}': {type(actor_data)}")
240
241 except Exception as e:
242 self.logger.error(f"Error parsing configuration for actor '{actor_name}': {e}")
243 continue
244
245 # Ensure at least one actor is enabled
246 if not any(config.enabled for config in parsed_configs.values()):
247 self.logger.warning("No actors enabled, falling back to defaults")
248 for config in default_configs.values():
249 config.input['startUrls'] = [{"url": target_url}]
250 return default_configs
251
252 return parsed_configs
253
254 def generate_example_config(self) -> Dict[str, Any]:
255 """Generate an example configuration for documentation."""
256 return {
257 "targetUrl": "https://example.com",
258 "userGoal": "Extract product information",
259 "claudeApiKey": "sk-ant-api03-...",
260
261 # Core settings
262 "forActor": False,
263 "testScript": True,
264 "outputScriptPath": "generated_scraper.py",
265
266 # Claude model selection
267 "claudeModel": "claude-sonnet-4-20250514", # or "claude-4", "sonnet-4", "opus-4", "sonnet-3.7", "sonnet-3.5", "haiku"
268
269 # HTML pruning settings
270 "htmlPruningEnabled": True,
271 "htmlMaxListItems": 5,
272 "htmlMaxTextLength": 500,
273 "htmlPrunePercentage": 80,
274 "htmlPruneBeforeEvaluation": True,
275
276 # Actor configurations (new array format)
277 "actors": [
278 {
279 "name": "cheerio-scraper",
280 "enabled": True,
281 "input": {
282 "maxRequestRetries": 3,
283 "requestTimeoutSecs": 30,
284 "maxPagesPerCrawl": 1,
285 "pageFunction": """
286 async function pageFunction(context) {
287 const { request, log, $ } = context;
288 try {
289 const title = $('title').text() || '';
290 const html = $('html').html() || '';
291 return {
292 url: request.url,
293 title: title,
294 html: html
295 };
296 } catch (error) {
297 log.error('Error in pageFunction:', error);
298 return {
299 url: request.url,
300 title: '',
301 html: ''
302 };
303 }
304 }
305 """,
306 "proxyConfiguration": {"useApifyProxy": True}
307 }
308 },
309 {
310 "name": "web-scraper",
311 "enabled": True,
312 "input": {
313 "maxRequestRetries": 3,
314 "requestTimeoutSecs": 30,
315 "maxPagesPerCrawl": 1,
316 "pageFunction": """
317 async function pageFunction(context) {
318 const { request, log, page } = context;
319 try {
320 const title = await page.title();
321 const html = await page.content();
322 return {
323 url: request.url,
324 title: title,
325 html: html
326 };
327 } catch (error) {
328 log.error('Error in pageFunction:', error);
329 return {
330 url: request.url,
331 title: '',
332 html: ''
333 };
334 }
335 }
336 """,
337 "proxyConfiguration": {"useApifyProxy": True}
338 }
339 },
340 {
341 "name": "website-content-crawler",
342 "enabled": False,
343 "input": {
344 "maxCrawlPages": 1,
345 "crawler": "playwright",
346 "proxyConfiguration": {"useApifyProxy": True}
347 }
348 },
349 {
350 "name": "custom-scraper",
351 "actorId": "your-username/custom-scraper",
352 "description": "My custom scraping actor",
353 "enabled": True,
354 "input": {
355 "maxRequestRetries": 5,
356 "requestTimeoutSecs": 60,
357 "customParam": "value"
358 }
359 },
360 {
361 "name": "playwright-scraper",
362 "enabled": True,
363 "input": {
364 "maxRequestRetries": 2,
365 "requestTimeoutSecs": 45,
366 "maxPagesPerCrawl": 1,
367 "pageFunction": """
368 async function pageFunction(context) {
369 const { request, log, page } = context;
370 try {
371 const title = await page.title();
372 const html = await page.content();
373 return {
374 url: request.url,
375 title: title,
376 html: html
377 };
378 } catch (error) {
379 log.error('Error in pageFunction:', error);
380 return {
381 url: request.url,
382 title: '',
383 html: ''
384 };
385 }
386 }
387 """,
388 "proxyConfiguration": {"useApifyProxy": True}
389 }
390 }
391 ],
392
393 # Execution settings
394 "maxRetries": 3,
395 "timeout": 60,
396 "concurrentActors": True
397 }
398
399 def validate_config(self, config: PipelineConfig) -> bool:
400 """
401 Validate configuration and log any issues.
402
403 Args:
404 config: Configuration to validate
405
406 Returns:
407 True if configuration is valid
408 """
409 is_valid = True
410
411 # Check if at least one actor is enabled
412 enabled_actors = config.get_enabled_actors()
413 if not enabled_actors:
414 self.logger.error("No actors are enabled in configuration")
415 is_valid = False
416
417 # Check Claude API key
418 if not config.claude_api_key:
419 self.logger.error("Claude API key is required")
420 is_valid = False
421
422 # Validate HTML pruning settings
423 prune_percentage = config.html_pruning.prune_percentage
424 if prune_percentage < 0 or prune_percentage > 1:
425 self.logger.error("HTML pruning percentage must be between 0 and 1")
426 is_valid = False
427
428 if config.html_pruning.max_list_items < 1:
429 self.logger.error("Max list items must be at least 1")
430 is_valid = False
431
432 if config.html_pruning.max_text_length < 1:
433 self.logger.error("Max text length must be at least 1")
434 is_valid = False
435
436 # Validate actor configurations
437 for actor_name, actor_config in enabled_actors.items():
438 if not actor_config.actor_id:
439 self.logger.error(f"Actor '{actor_name}' missing actor_id")
440 is_valid = False
441
442 # Validate actor_id format
443 if actor_config.actor_id and '/' not in actor_config.actor_id:
444 self.logger.warning(f"Actor '{actor_name}' has unusual actor_id format: {actor_config.actor_id}")
445
446 # Validate required input fields
447 if not actor_config.input.get('startUrls'):
448 self.logger.error(f"Actor '{actor_name}' missing startUrls in input")
449 is_valid = False
450
451 return is_valid
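
A minimal sketch of parsing and validating a pipeline config (untested; assumes cheerio-scraper is among the defaults returned by get_default_actor_configs, and the API key is a placeholder):

from llmscraper.utils.config_parser import ConfigurationParser

parser = ConfigurationParser()
config = parser.parse_from_input({
    "targetUrl": "https://example.com",
    "claudeApiKey": "sk-ant-api03-...",
    "claudeModel": "sonnet-3.5",   # alias resolved by _parse_claude_model
    "htmlPrunePercentage": 80,     # accepted as 0-100 or 0.0-1.0
    "actors": [{"name": "cheerio-scraper", "enabled": True}],
})
if not parser.validate_config(config):
    raise SystemExit("Invalid pipeline configuration")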

src/llmscraper/utils/html_utils.py

1"""
2HTML utility functions for processing web content.
3"""
4
5from typing import Optional
6from bs4 import BeautifulSoup, Comment, NavigableString
7import re
8
9
10def is_html(text_content: str) -> bool:
11 """
12 Check if a string is likely HTML content.
13
14 Args:
15 text_content: The text content to check
16
17 Returns:
18 True if the content appears to be HTML
19 """
20 if not text_content or not isinstance(text_content, str):
21 return False
22
23 content_lower = text_content.lower()
24 return '<html' in content_lower and '<body' in content_lower  # also match tags that carry attributes
25
26
27def prune_html(html_content: str, max_list_items: int = 5, max_text_length: int = 500,
28 prune_percentage: float = 0.8) -> str:
29 """
30 Clean and shorten HTML content to reduce token count while preserving structure.
31
32 Args:
33 html_content: The raw HTML content to process
34 max_list_items: Maximum number of list items to keep
35 max_text_length: Maximum length of text content in any tag
36 prune_percentage: Percentage of content to keep (0.0-1.0)
37
38 Returns:
39 The cleaned and shortened HTML
40 """
41 if not html_content or not isinstance(html_content, str):
42 return ""
43
44 try:
45 soup = BeautifulSoup(html_content, 'html.parser')
46
47 # Remove unwanted tags entirely
48 unwanted_tags = ['script', 'style', 'svg', 'noscript', 'meta', 'link']
49 for tag_name in unwanted_tags:
50 for tag in soup.find_all(tag_name):
51 tag.decompose()
52
53 # Remove HTML comments
54 for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
55 comment.extract()
56
57 # Remove unwanted attributes from all tags
58 allowed_attributes = {'id', 'class', 'href', 'src', 'alt', 'title'}
59 for tag in soup.find_all(True):
60 if hasattr(tag, 'attrs'):
61 tag.attrs = {key: value for key, value in tag.attrs.items()
62 if key in allowed_attributes}
63
64 # Truncate lists and tables
65 list_and_table_tags = ['ul', 'ol', 'table', 'tbody', 'thead']
66 for tag_name in list_and_table_tags:
67 for tag in soup.find_all(tag_name):
68 children = list(tag.children)
69 # Filter out NavigableString objects (text nodes, whitespace)
70 non_text_children = [child for child in children if not isinstance(child, NavigableString)]
71
72 if len(non_text_children) > max_list_items:
73 # Keep only the first max_list_items children
74 for child in non_text_children[max_list_items:]:
75 child.decompose()
76
77 # Add a comment indicating truncation
78 if tag.name in ['ul', 'ol']:
79 truncation_notice = soup.new_tag("li")
80 truncation_notice.string = f"... ({len(non_text_children) - max_list_items} more items)"
81 tag.append(truncation_notice)
82 elif tag.name == 'table':
83 truncation_notice = soup.new_tag("tr")
84 td = soup.new_tag("td")
85 td.string = f"... ({len(non_text_children) - max_list_items} more rows)"
86 truncation_notice.append(td)
87 tag.append(truncation_notice)
88
89 # Truncate long text content
90 for element in soup.find_all(string=True):
91 if isinstance(element, NavigableString) and not isinstance(element, Comment):
92 text = str(element).strip()
93 if len(text) > max_text_length:
94 element.replace_with(text[:max_text_length] + "...")
95
96 # Apply percentage-based pruning if specified
97 if prune_percentage < 1.0:
98 # Calculate target length based on percentage
99 target_length = int(len(str(soup)) * prune_percentage)
100 current_html = str(soup)
101
102 if len(current_html) > target_length:
103 # Additional aggressive pruning to meet percentage target
104 # Remove more list items
105 for tag_name in ['ul', 'ol', 'table', 'tbody', 'thead']:
106 for tag in soup.find_all(tag_name):
107 children = list(tag.children)
108 non_text_children = [child for child in children if not isinstance(child, NavigableString)]
109
110 # Keep even fewer items if we need more aggressive pruning
111 aggressive_max = max(1, int(max_list_items * prune_percentage))
112 if len(non_text_children) > aggressive_max:
113 for child in non_text_children[aggressive_max:]:
114 child.decompose()
115
116 # More aggressive text truncation
117 aggressive_text_length = int(max_text_length * prune_percentage)
118 for element in soup.find_all(string=True):
119 if isinstance(element, NavigableString) and not isinstance(element, Comment):
120 text = str(element).strip()
121 if len(text) > aggressive_text_length:
122 element.replace_with(text[:aggressive_text_length] + "...")
123
124 # Return the cleaned HTML
125 return str(soup)
126
127 except Exception:
128 # If parsing fails, return original content truncated
129 return html_content[:max_text_length * 10] if len(html_content) > max_text_length * 10 else html_content
130
131
132def extract_text_content(html_content: str) -> str:
133 """
134 Extract clean text content from HTML.
135
136 Args:
137 html_content: HTML content to extract text from
138
139 Returns:
140 Clean text content
141 """
142 if not html_content:
143 return ""
144
145 try:
146 soup = BeautifulSoup(html_content, 'html.parser')
147 return soup.get_text(separator=' ', strip=True)
148 except Exception:
149 return html_content
150
151
152def validate_html_structure(html_content: str) -> bool:
153 """
154 Validate basic HTML structure.
155
156 Args:
157 html_content: HTML content to validate
158
159 Returns:
160 True if HTML has basic valid structure
161 """
162 if not html_content:
163 return False
164
165 try:
166 soup = BeautifulSoup(html_content, 'html.parser')
167
168 # Check for basic HTML elements
169 has_html_tag = soup.find('html') is not None
170 has_body_tag = soup.find('body') is not None
171 has_content = len(soup.get_text(strip=True)) > 0
172
173 return has_html_tag or has_body_tag or has_content
174
175 except Exception:
176 return False
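
A quick sketch of the pruning helpers on synthetic input (untested):

from llmscraper.utils.html_utils import extract_text_content, is_html, prune_html

raw = (
    "<html><body><ul>"
    + "".join(f"<li>item {i}</li>" for i in range(20))
    + "</ul></body></html>"
)

if is_html(raw):
    # Keeps the first 5 list items, appends a "... (15 more items)" notice,
    # then prunes further toward 80% of the original length if needed.
    compact = prune_html(raw, max_list_items=5, max_text_length=500, prune_percentage=0.8)
    print(extract_text_content(compact))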

src/llmscraper/utils/llm_config_parser.py

1"""
2Configuration parser for the LLM Scraper Actor.
3"""
4import os
5import logging
6from typing import Dict, Any, Optional
7
8from ..llm_scraper.models import LLMScraperInput
9
10
11class LLMScraperConfigParser:
12 """Parses and validates configuration from input data and environment variables."""
13
14 def __init__(self):
15 self.logger = logging.getLogger(__name__)
16
17 def parse_from_input(self, input_data: Dict[str, Any]) -> LLMScraperInput:
18 """
19 Parse configuration from input data with environment variable fallbacks.
20
21 Args:
22 input_data: Raw input data from Actor
23
24 Returns:
25 Parsed and validated LLMScraperInput
26 """
27 # Get required fields
28 target_url = input_data.get('targetUrl')
29 if not target_url:
30 raise ValueError("targetUrl is required")
31
32 extraction_goal = input_data.get('extractionGoal')
33 if not extraction_goal:
34 raise ValueError("extractionGoal is required")
35
36 # Get API key from input or environment
37 anthropic_api_key = (
38 input_data.get('claudeApiKey') or
39 os.getenv('ANTHROPIC_API_KEY') or
40 os.getenv('CLAUDE_API_KEY')
41 )
42
43 if not anthropic_api_key:
44 raise ValueError(
45 "Claude API key is required. Provide it via 'claudeApiKey' input field "
46 "or set ANTHROPIC_API_KEY environment variable."
47 )
48
49 # Parse optional configuration with defaults
50 config = LLMScraperInput(
51 target_url=target_url,
52 extraction_goal=extraction_goal,
53 max_actor_attempts=input_data.get('maxActorAttempts', 10),
54 max_retries_per_actor=input_data.get('maxRetriesPerActor', 3),
55 max_time_minutes=input_data.get('maxTimeMinutes', 30),
56 anthropic_api_key=anthropic_api_key,
57 mcp_url=input_data.get('mcpUrl', 'https://mcp.apify.com/sse?enableAddingActors=true'),
58 model_name=input_data.get('modelName', 'claude-3-5-haiku-latest'),
59 debug_mode=input_data.get('debugMode', False),
60 prefer_specific_actors=input_data.get('preferSpecificActors', True),
61 min_data_quality_score=input_data.get('minDataQualityScore', 0.7),
62 enable_proxy=input_data.get('enableProxy', True)
63 )
64
65 self.logger.info(f"Parsed configuration for URL: {target_url}")
66 if config.debug_mode:
67 self.logger.info(f"Configuration: {config}")
68
69 return config
70
71 def validate_config(self, config: LLMScraperInput) -> bool:
72 """
73 Validate configuration and log any issues.
74
75 Args:
76 config: Configuration to validate
77
78 Returns:
79 True if configuration is valid
80 """
81 is_valid = True
82
83 # Validate URL
84 if not config.target_url.startswith(('http://', 'https://')):
85 self.logger.error(f"Invalid target URL: {config.target_url}")
86 is_valid = False
87
88 # Validate API key format
89 if not config.anthropic_api_key.startswith('sk-ant-'):
90 self.logger.warning("API key format appears invalid (should start with 'sk-ant-')")
91
92 # Validate numeric ranges
93 if config.max_actor_attempts < 1 or config.max_actor_attempts > 50:
94 self.logger.error(f"max_actor_attempts must be between 1 and 50, got: {config.max_actor_attempts}")
95 is_valid = False
96
97 if config.max_retries_per_actor < 1 or config.max_retries_per_actor > 10:
98 self.logger.error(f"max_retries_per_actor must be between 1 and 10, got: {config.max_retries_per_actor}")
99 is_valid = False
100
101 if config.max_time_minutes < 1 or config.max_time_minutes > 240:
102 self.logger.error(f"max_time_minutes must be between 1 and 240, got: {config.max_time_minutes}")
103 is_valid = False
104
105 if config.min_data_quality_score < 0.0 or config.min_data_quality_score > 1.0:
106 self.logger.error(f"min_data_quality_score must be between 0.0 and 1.0, got: {config.min_data_quality_score}")
107 is_valid = False
108
109 # Validate model name
110 valid_models = [
111 'claude-3-5-haiku-latest',
112 'claude-3-5-sonnet-latest',
113 'claude-3-opus-latest',
114 'claude-3-haiku-20240307',
115 'claude-3-sonnet-20240229',
116 'claude-3-5-sonnet-20241022'
117 ]
118 if config.model_name not in valid_models:
119 self.logger.warning(f"Unknown model name: {config.model_name}. Valid options: {valid_models}")
120
121 return is_valid
122
123 def generate_example_config(self) -> Dict[str, Any]:
124 """Generate an example configuration for documentation."""
125 return {
126 "targetUrl": "https://books.toscrape.com/",
127 "extractionGoal": "Extract book information including title, price, rating, and availability",
128 "claudeApiKey": "sk-ant-api03-...",
129 "maxActorAttempts": 5,
130 "maxRetriesPerActor": 3,
131 "maxTimeMinutes": 20,
132 "mcpUrl": "https://mcp.apify.com/sse?enableAddingActors=true",
133 "modelName": "claude-3-5-haiku-latest",
134 "debugMode": False,
135 "preferSpecificActors": True,
136 "minDataQualityScore": 0.8,
137 "enableProxy": True
138 }
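
A minimal sketch of the LLM scraper config parser in use (untested; the API key is a placeholder):

from llmscraper.utils.llm_config_parser import LLMScraperConfigParser

parser = LLMScraperConfigParser()
config = parser.parse_from_input({
    "targetUrl": "https://books.toscrape.com/",
    "extractionGoal": "Extract book titles and prices",
    "claudeApiKey": "sk-ant-api03-...",
})
if not parser.validate_config(config):
    raise SystemExit("Invalid configuration")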