
LLMScraper
Find the best scraper for your website and the data you need.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 1
Monthly users: 1
Last modified: 6 days ago
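
The Actor is configured entirely through its run input. Below is a minimal sketch of such an input, written as a Python dict using the field names that main.py (further down) reads from Actor.get_input(); the URL, goal, and API key are illustrative placeholders, and every field other than targetUrl and extractionGoal is optional, falling back to the defaults shown in the code.

# Hypothetical example input for LLMScraper; field names taken from main.py, values are placeholders.
run_input = {
    "targetUrl": "https://example.com/products",          # page to scrape
    "extractionGoal": "Collect product names and prices",
    "claudeApiKey": "sk-ant-...",                          # or set ANTHROPIC_API_KEY in the environment
    "maxActorAttempts": 10,        # how many candidate actors to try
    "maxRetriesPerActor": 3,
    "maxTimeMinutes": 30,
    "modelName": "claude-3-5-haiku-latest",
    "minDataQualityScore": 70,     # percentage; main.py converts it to 0.70
    "preferSpecificActors": True,
    "enableProxy": True,
    "debugMode": False,
}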
.gitignore
# --- General ---
.DS_Store
.env
.env.*

# --- Logs ---
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*

# --- IDEs ---
.vscode/*
!.vscode/extensions.json
.idea/
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

# --- Python ---
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
.venv/
venv/
ENV/
env/
.env
.env.*
.pytest_cache/
.coverage
htmlcov/
.tox/
.cache
.mypy_cache/
.dmypy.json
dmypy.json

# Project specific
scraped_results/*.html

# --- Node.js / Frontend ---
frontend/node_modules/
frontend/dist/
frontend/dist-ssr/
frontend/.pnp
frontend/.pnp.js
frontend/.npm
node_modules
dist
dist-ssr
*.local

# Added by Apify CLI
storage
.venv

# --- Apify ---
storage/
apify_storage/

# --- Local test files ---
input.json
test_*
.python-version
3.10
Dockerfile
# Use the official Apify Python base image
FROM apify/actor-python:3.11

# Copy requirements and install dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy the source code
COPY . ./

# Set the entrypoint
CMD ["python3", "main.py"]
main.py
1"""2Main entry point for LLM Scraper Actor.3
4This Actor uses Claude AI to automatically discover and test the best Apify actors5for your web scraping task. No manual configuration needed!6"""7
8import asyncio9import logging10import os11import sys12from datetime import datetime13from typing import Dict, Any, Optional14
15# Add src to path for development16sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))17
18from apify import Actor19from llmscraper.llm_scraper import LLMScraperActor, LLMScraperInput, ProgressUpdate20
21
22async def main():23 """Main entry point for LLM Scraper Actor."""24 async with Actor:25 # Get input data26 actor_input = await Actor.get_input() or {}27 28 # Setup logging29 logging.basicConfig(30 level=logging.INFO if not actor_input.get('debugMode') else logging.DEBUG,31 format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'32 )33 logger = logging.getLogger(__name__)34 35 try:36 # Parse and validate input37 config = LLMScraperInput(38 target_url=actor_input.get('targetUrl', ''),39 extraction_goal=actor_input.get('extractionGoal', ''),40 anthropic_api_key=actor_input.get('claudeApiKey') or os.getenv('ANTHROPIC_API_KEY'),41 max_actor_attempts=actor_input.get('maxActorAttempts', 10),42 max_retries_per_actor=actor_input.get('maxRetriesPerActor', 3),43 max_time_minutes=actor_input.get('maxTimeMinutes', 30),44 mcp_url=actor_input.get('mcpUrl', 'https://mcp.apify.com/sse?enableAddingActors=true'),45 model_name=actor_input.get('modelName', 'claude-3-5-haiku-latest'),46 debug_mode=actor_input.get('debugMode', False),47 prefer_specific_actors=actor_input.get('preferSpecificActors', True),48 min_data_quality_score=actor_input.get('minDataQualityScore', 70) / 100.0, # Convert percentage to decimal49 enable_proxy=actor_input.get('enableProxy', True)50 )51 52 # Validate configuration53 config.validate()54 55 logger.info("🚀 LLM Scraper Actor starting...")56 logger.info(f"🎯 Target URL: {config.target_url}")57 logger.info(f"🎯 Goal: {config.extraction_goal}")58 logger.info(f"🤖 Model: {config.model_name}")59 60 # Initialize progress tracking61 progress_updates = []62 63 def progress_callback(update: ProgressUpdate):64 progress_updates.append(update)65 logger.info(f"Progress: {update.message} ({update.progress:.1%})")66 67 # Create and run LLM Scraper68 scraper = LLMScraperActor(config)69 scraper.set_progress_callback(progress_callback)70 result = await scraper.run()71 72 # Process results73 if result.success:74 logger.info(f"✅ Scraping completed successfully!")75 logger.info(f"📊 Quality Score: {result.quality_score:.2f}")76 logger.info(f"🎭 Best Actor: {result.best_actor_id}")77 logger.info(f"📦 Items extracted: {len(result.scraped_data)}")78 79 # Save data to dataset80 for item in result.scraped_data:81 await Actor.push_data({82 "url": config.target_url,83 "data": item,84 "quality_score": result.quality_score,85 "actor_used": result.best_actor_id,86 "timestamp": datetime.now().isoformat(),87 "success": True,88 "error": None,89 "extraction_goal": config.extraction_goal90 })91 92 # Save summary to key-value store93 await Actor.set_value('SCRAPING_RESULT', {94 "success": True,95 "quality_score": result.quality_score,96 "items_count": len(result.scraped_data),97 "best_actor_id": result.best_actor_id,98 "total_execution_time": result.total_execution_time,99 "attempts_made": len(result.actor_attempts),100 "target_url": config.target_url,101 "extraction_goal": config.extraction_goal,102 "model_used": config.model_name,103 "progress_updates": [104 {"message": u.message, "progress": u.progress, "timestamp": u.timestamp.isoformat()}105 for u in progress_updates106 ],107 "actor_attempts": [108 {109 "actor_id": attempt.actor_id,110 "success": attempt.success,111 "quality_score": attempt.data_quality_score,112 "execution_time": attempt.execution_time_seconds,113 "error_message": attempt.error_message,114 "attempt_number": attempt.attempt_number115 }116 for attempt in result.actor_attempts117 ]118 })119 120 else:121 error_msg = result.llm_reasoning or f"Status: {result.status}"122 logger.error(f"❌ Scraping 
failed: {error_msg}")123 124 # Save failure info to dataset125 await Actor.push_data({126 "url": config.target_url,127 "data": None,128 "quality_score": 0.0,129 "actor_used": None,130 "timestamp": datetime.now().isoformat(),131 "success": False,132 "error": error_msg,133 "extraction_goal": config.extraction_goal,134 "total_execution_time": result.total_execution_time,135 "attempts_made": len(result.actor_attempts)136 })137 138 # Save failure summary to key-value store139 await Actor.set_value('SCRAPING_RESULT', {140 "success": False,141 "error_message": error_msg,142 "status": result.status,143 "total_execution_time": result.total_execution_time,144 "attempts_made": len(result.actor_attempts),145 "target_url": config.target_url,146 "extraction_goal": config.extraction_goal,147 "model_used": config.model_name,148 "progress_updates": [149 {"message": u.message, "progress": u.progress, "timestamp": u.timestamp}150 for u in progress_updates151 ]152 })153 154 # Exit with error code155 Actor.exit(exit_code=1, status_message=f"Scraping failed: {error_msg}")156 157 except Exception as e:158 logger.error(f"💥 Fatal error: {str(e)}", exc_info=True)159 160 # Save error info161 await Actor.push_data({162 "url": actor_input.get('targetUrl', 'unknown'),163 "data": None,164 "quality_score": 0.0,165 "actor_used": None,166 "timestamp": datetime.now().isoformat(),167 "success": False,168 "error": str(e),169 "extraction_goal": actor_input.get('extractionGoal', 'unknown')170 })171 172 Actor.exit(exit_code=1, status_message=f"Fatal error: {str(e)}")173
174
175if __name__ == "__main__":176 asyncio.run(main())
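
On success, main.py pushes one record per extracted item to the run's default dataset and writes a summary under the SCRAPING_RESULT key in the default key-value store. The sketch below shows how a caller might start the Actor and read both outputs with apify-client; the Actor ID "username/llmscraper", the APIFY_TOKEN environment variable, and the input values are illustrative assumptions, not taken from this listing.

# Hypothetical client-side usage sketch; the Actor ID and input values are placeholders.
import os
from apify_client import ApifyClient

client = ApifyClient(os.environ["APIFY_TOKEN"])

# Start the Actor and wait for the run to finish.
run = client.actor("username/llmscraper").call(run_input={
    "targetUrl": "https://example.com/products",
    "extractionGoal": "Collect product names and prices",
})

# Per-item records pushed via Actor.push_data() end up in the default dataset.
for item in client.dataset(run["defaultDatasetId"]).list_items().items:
    print(item["url"], item["quality_score"], item["actor_used"])

# The run summary written via Actor.set_value('SCRAPING_RESULT', ...) sits in the key-value store.
record = client.key_value_store(run["defaultKeyValueStoreId"]).get_record("SCRAPING_RESULT")
if record:
    summary = record["value"]
    print(summary["success"], summary["attempts_made"], summary.get("best_actor_id"))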
package.json
{ "name": "llm-scraper-actor", "version": "1.0.0", "description": "AI-powered web scraper that automatically discovers and tests the best Apify actors for your scraping task using Claude AI", "main": "main.py", "scripts": { "start": "python3 main.py", "start:local": "./venv/bin/python main.py" }, "keywords": [ "web-scraping", "artificial-intelligence", "ai-powered", "data-extraction", "apify-actor", "claude-ai", "llm-scraper", "intelligent-scraping", "automated-scraping", "mcp-server" ], "dependencies": {}, "author": "", "license": "MIT"}
pyproject.toml
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "llmscraper"
version = "0.1.0"
description = "Intelligent web scraping framework using AI-powered quality evaluation and multiple scraping strategies"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "anthropic>=0.54.0",
    "apify-client>=1.11.0",
    "beautifulsoup4>=4.12.0",
    "apify>=1.5.0",
]

[project.scripts]
llmscraper = "llmscraper.main:main"

[project.optional-dependencies]
dev = [
    "pytest>=7.0",
    "pytest-asyncio>=0.21.0",
    "black>=23.0",
    "isort>=5.12.0",
    "mypy>=1.5.0",
]

[tool.hatch.build.targets.wheel]
packages = ["src/llmscraper"]

[tool.hatch.build.targets.sdist]
include = [
    "/src",
    "/tests",
    "/examples.json",
    "/README.md",
]
requirements.txt
apify>=1.5.0
apify-client>=1.11.0
anthropic>=0.54.0
beautifulsoup4>=4.12.0
httpx>=0.27.0
python-dotenv>=1.0.0
uv.lock
version = 1
revision = 2
requires-python = ">=3.10"
[[package]]name = "annotated-types"version = "0.7.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },]
[[package]]name = "anthropic"version = "0.54.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "anyio" }, { name = "distro" }, { name = "httpx" }, { name = "jiter" }, { name = "pydantic" }, { name = "sniffio" }, { name = "typing-extensions" },]sdist = { url = "https://files.pythonhosted.org/packages/89/28/80cb9bb6e7ce77d404145b51da4257455805c17f0a6be528ff3286e3882f/anthropic-0.54.0.tar.gz", hash = "sha256:5e6f997d97ce8e70eac603c3ec2e7f23addeff953fbbb76b19430562bb6ba815", size = 312376, upload-time = "2025-06-11T02:46:27.642Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/de/b9/6ffb48e82c5e97b03cecee872d134a6b6666c2767b2d32ed709f3a60a8fe/anthropic-0.54.0-py3-none-any.whl", hash = "sha256:c1062a0a905daeec17ca9c06c401e4b3f24cb0495841d29d752568a1d4018d56", size = 288774, upload-time = "2025-06-11T02:46:25.578Z" },]
[[package]]name = "anyio"version = "4.9.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, { name = "idna" }, { name = "sniffio" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" },]sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949, upload-time = "2025-03-17T00:02:54.77Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c", size = 100916, upload-time = "2025-03-17T00:02:52.713Z" },]
[[package]]name = "apify-client"version = "1.11.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "apify-shared" }, { name = "colorama" }, { name = "httpx" }, { name = "more-itertools" },]sdist = { url = "https://files.pythonhosted.org/packages/49/44/b7cae857f2129d4093bc5a0a2267fcbba7905207a0b7cc424dc3c7c90291/apify_client-1.11.0.tar.gz", hash = "sha256:c2e151754c35be9bc7c1028bf7cb127aeb1ffa2fbd1ec1ad7e97b901deb32e08", size = 346095, upload-time = "2025-06-13T11:46:39.129Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/8f/24/d3273bfe5b4a96fd60c8d554edbab99274fae8cb2347b96f2e3fa0bc4d5b/apify_client-1.11.0-py3-none-any.whl", hash = "sha256:9d691960bdbeee17624a2a82aafc4f0bfba9b48820a48f559b7eba76bf01cb3c", size = 82550, upload-time = "2025-06-13T11:46:37.483Z" },]
[[package]]name = "apify-shared"version = "1.4.1"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/b2/a6/c8e2fa0b3bdc479d3ecde778e2381af199f910cf7c8baa3c207bcfe26e47/apify_shared-1.4.1.tar.gz", hash = "sha256:16e617c840fd27bf38d980f079c0b867c7378f68c7006b3d5a7d530d43930507", size = 13871, upload-time = "2025-04-28T12:20:01.113Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/1d/f3/3446c8a7986fdc087024d4e174e4b3f587097a9b28f6f8e8c788199225b2/apify_shared-1.4.1-py3-none-any.whl", hash = "sha256:abac5712b6e8eb96693204cbb2702905e1971d9084b1716e7337852b5005290e", size = 12706, upload-time = "2025-04-28T12:19:59.792Z" },]
[[package]]name = "certifi"version = "2025.6.15"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload-time = "2025-06-15T02:45:51.329Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload-time = "2025-06-15T02:45:49.977Z" },]
[[package]]name = "colorama"version = "0.4.6"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },]
[[package]]name = "distro"version = "1.9.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },]
[[package]]name = "exceptiongroup"version = "1.3.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "typing-extensions", marker = "python_full_version < '3.13'" },]sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" },]
[[package]]name = "h11"version = "0.16.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },]
[[package]]name = "httpcore"version = "1.0.9"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "certifi" }, { name = "h11" },]sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },]
[[package]]name = "httpx"version = "0.28.1"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "anyio" }, { name = "certifi" }, { name = "httpcore" }, { name = "idna" },]sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },]
[[package]]name = "idna"version = "3.10"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" },]
[[package]]name = "jiter"version = "0.10.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/ee/9d/ae7ddb4b8ab3fb1b51faf4deb36cb48a4fbbd7cb36bad6a5fca4741306f7/jiter-0.10.0.tar.gz", hash = "sha256:07a7142c38aacc85194391108dc91b5b57093c978a9932bd86a36862759d9500", size = 162759, upload-time = "2025-05-18T19:04:59.73Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/be/7e/4011b5c77bec97cb2b572f566220364e3e21b51c48c5bd9c4a9c26b41b67/jiter-0.10.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:cd2fb72b02478f06a900a5782de2ef47e0396b3e1f7d5aba30daeb1fce66f303", size = 317215, upload-time = "2025-05-18T19:03:04.303Z" }, { url = "https://files.pythonhosted.org/packages/8a/4f/144c1b57c39692efc7ea7d8e247acf28e47d0912800b34d0ad815f6b2824/jiter-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:32bb468e3af278f095d3fa5b90314728a6916d89ba3d0ffb726dd9bf7367285e", size = 322814, upload-time = "2025-05-18T19:03:06.433Z" }, { url = "https://files.pythonhosted.org/packages/63/1f/db977336d332a9406c0b1f0b82be6f71f72526a806cbb2281baf201d38e3/jiter-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa8b3e0068c26ddedc7abc6fac37da2d0af16b921e288a5a613f4b86f050354f", size = 345237, upload-time = "2025-05-18T19:03:07.833Z" }, { url = "https://files.pythonhosted.org/packages/d7/1c/aa30a4a775e8a672ad7f21532bdbfb269f0706b39c6ff14e1f86bdd9e5ff/jiter-0.10.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:286299b74cc49e25cd42eea19b72aa82c515d2f2ee12d11392c56d8701f52224", size = 370999, upload-time = "2025-05-18T19:03:09.338Z" }, { url = "https://files.pythonhosted.org/packages/35/df/f8257abc4207830cb18880781b5f5b716bad5b2a22fb4330cfd357407c5b/jiter-0.10.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ed5649ceeaeffc28d87fb012d25a4cd356dcd53eff5acff1f0466b831dda2a7", size = 491109, upload-time = "2025-05-18T19:03:11.13Z" }, { url = "https://files.pythonhosted.org/packages/06/76/9e1516fd7b4278aa13a2cc7f159e56befbea9aa65c71586305e7afa8b0b3/jiter-0.10.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2ab0051160cb758a70716448908ef14ad476c3774bd03ddce075f3c1f90a3d6", size = 388608, upload-time = "2025-05-18T19:03:12.911Z" }, { url = "https://files.pythonhosted.org/packages/6d/64/67750672b4354ca20ca18d3d1ccf2c62a072e8a2d452ac3cf8ced73571ef/jiter-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03997d2f37f6b67d2f5c475da4412be584e1cec273c1cfc03d642c46db43f8cf", size = 352454, upload-time = "2025-05-18T19:03:14.741Z" }, { url = "https://files.pythonhosted.org/packages/96/4d/5c4e36d48f169a54b53a305114be3efa2bbffd33b648cd1478a688f639c1/jiter-0.10.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c404a99352d839fed80d6afd6c1d66071f3bacaaa5c4268983fc10f769112e90", size = 391833, upload-time = "2025-05-18T19:03:16.426Z" }, { url = "https://files.pythonhosted.org/packages/0b/de/ce4a6166a78810bd83763d2fa13f85f73cbd3743a325469a4a9289af6dae/jiter-0.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:66e989410b6666d3ddb27a74c7e50d0829704ede652fd4c858e91f8d64b403d0", size = 523646, upload-time = "2025-05-18T19:03:17.704Z" }, { url = "https://files.pythonhosted.org/packages/a2/a6/3bc9acce53466972964cf4ad85efecb94f9244539ab6da1107f7aed82934/jiter-0.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b532d3af9ef4f6374609a3bcb5e05a1951d3bf6190dc6b176fdb277c9bbf15ee", size = 514735, 
upload-time = "2025-05-18T19:03:19.44Z" }, { url = "https://files.pythonhosted.org/packages/b4/d8/243c2ab8426a2a4dea85ba2a2ba43df379ccece2145320dfd4799b9633c5/jiter-0.10.0-cp310-cp310-win32.whl", hash = "sha256:da9be20b333970e28b72edc4dff63d4fec3398e05770fb3205f7fb460eb48dd4", size = 210747, upload-time = "2025-05-18T19:03:21.184Z" }, { url = "https://files.pythonhosted.org/packages/37/7a/8021bd615ef7788b98fc76ff533eaac846322c170e93cbffa01979197a45/jiter-0.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:f59e533afed0c5b0ac3eba20d2548c4a550336d8282ee69eb07b37ea526ee4e5", size = 207484, upload-time = "2025-05-18T19:03:23.046Z" }, { url = "https://files.pythonhosted.org/packages/1b/dd/6cefc6bd68b1c3c979cecfa7029ab582b57690a31cd2f346c4d0ce7951b6/jiter-0.10.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:3bebe0c558e19902c96e99217e0b8e8b17d570906e72ed8a87170bc290b1e978", size = 317473, upload-time = "2025-05-18T19:03:25.942Z" }, { url = "https://files.pythonhosted.org/packages/be/cf/fc33f5159ce132be1d8dd57251a1ec7a631c7df4bd11e1cd198308c6ae32/jiter-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:558cc7e44fd8e507a236bee6a02fa17199ba752874400a0ca6cd6e2196cdb7dc", size = 321971, upload-time = "2025-05-18T19:03:27.255Z" }, { url = "https://files.pythonhosted.org/packages/68/a4/da3f150cf1d51f6c472616fb7650429c7ce053e0c962b41b68557fdf6379/jiter-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d613e4b379a07d7c8453c5712ce7014e86c6ac93d990a0b8e7377e18505e98d", size = 345574, upload-time = "2025-05-18T19:03:28.63Z" }, { url = "https://files.pythonhosted.org/packages/84/34/6e8d412e60ff06b186040e77da5f83bc158e9735759fcae65b37d681f28b/jiter-0.10.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f62cf8ba0618eda841b9bf61797f21c5ebd15a7a1e19daab76e4e4b498d515b2", size = 371028, upload-time = "2025-05-18T19:03:30.292Z" }, { url = "https://files.pythonhosted.org/packages/fb/d9/9ee86173aae4576c35a2f50ae930d2ccb4c4c236f6cb9353267aa1d626b7/jiter-0.10.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:919d139cdfa8ae8945112398511cb7fca58a77382617d279556b344867a37e61", size = 491083, upload-time = "2025-05-18T19:03:31.654Z" }, { url = "https://files.pythonhosted.org/packages/d9/2c/f955de55e74771493ac9e188b0f731524c6a995dffdcb8c255b89c6fb74b/jiter-0.10.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ddbc6ae311175a3b03bd8994881bc4635c923754932918e18da841632349db", size = 388821, upload-time = "2025-05-18T19:03:33.184Z" }, { url = "https://files.pythonhosted.org/packages/81/5a/0e73541b6edd3f4aada586c24e50626c7815c561a7ba337d6a7eb0a915b4/jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c440ea003ad10927a30521a9062ce10b5479592e8a70da27f21eeb457b4a9c5", size = 352174, upload-time = "2025-05-18T19:03:34.965Z" }, { url = "https://files.pythonhosted.org/packages/1c/c0/61eeec33b8c75b31cae42be14d44f9e6fe3ac15a4e58010256ac3abf3638/jiter-0.10.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dc347c87944983481e138dea467c0551080c86b9d21de6ea9306efb12ca8f606", size = 391869, upload-time = "2025-05-18T19:03:36.436Z" }, { url = "https://files.pythonhosted.org/packages/41/22/5beb5ee4ad4ef7d86f5ea5b4509f680a20706c4a7659e74344777efb7739/jiter-0.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:13252b58c1f4d8c5b63ab103c03d909e8e1e7842d302473f482915d95fefd605", size = 523741, upload-time = "2025-05-18T19:03:38.168Z" }, { url = 
"https://files.pythonhosted.org/packages/ea/10/768e8818538e5817c637b0df52e54366ec4cebc3346108a4457ea7a98f32/jiter-0.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d1bbf3c465de4a24ab12fb7766a0003f6f9bce48b8b6a886158c4d569452dc5", size = 514527, upload-time = "2025-05-18T19:03:39.577Z" }, { url = "https://files.pythonhosted.org/packages/73/6d/29b7c2dc76ce93cbedabfd842fc9096d01a0550c52692dfc33d3cc889815/jiter-0.10.0-cp311-cp311-win32.whl", hash = "sha256:db16e4848b7e826edca4ccdd5b145939758dadf0dc06e7007ad0e9cfb5928ae7", size = 210765, upload-time = "2025-05-18T19:03:41.271Z" }, { url = "https://files.pythonhosted.org/packages/c2/c9/d394706deb4c660137caf13e33d05a031d734eb99c051142e039d8ceb794/jiter-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c9c1d5f10e18909e993f9641f12fe1c77b3e9b533ee94ffa970acc14ded3812", size = 209234, upload-time = "2025-05-18T19:03:42.918Z" }, { url = "https://files.pythonhosted.org/packages/6d/b5/348b3313c58f5fbfb2194eb4d07e46a35748ba6e5b3b3046143f3040bafa/jiter-0.10.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1e274728e4a5345a6dde2d343c8da018b9d4bd4350f5a472fa91f66fda44911b", size = 312262, upload-time = "2025-05-18T19:03:44.637Z" }, { url = "https://files.pythonhosted.org/packages/9c/4a/6a2397096162b21645162825f058d1709a02965606e537e3304b02742e9b/jiter-0.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7202ae396446c988cb2a5feb33a543ab2165b786ac97f53b59aafb803fef0744", size = 320124, upload-time = "2025-05-18T19:03:46.341Z" }, { url = "https://files.pythonhosted.org/packages/2a/85/1ce02cade7516b726dd88f59a4ee46914bf79d1676d1228ef2002ed2f1c9/jiter-0.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23ba7722d6748b6920ed02a8f1726fb4b33e0fd2f3f621816a8b486c66410ab2", size = 345330, upload-time = "2025-05-18T19:03:47.596Z" }, { url = "https://files.pythonhosted.org/packages/75/d0/bb6b4f209a77190ce10ea8d7e50bf3725fc16d3372d0a9f11985a2b23eff/jiter-0.10.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:371eab43c0a288537d30e1f0b193bc4eca90439fc08a022dd83e5e07500ed026", size = 369670, upload-time = "2025-05-18T19:03:49.334Z" }, { url = "https://files.pythonhosted.org/packages/a0/f5/a61787da9b8847a601e6827fbc42ecb12be2c925ced3252c8ffcb56afcaf/jiter-0.10.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c675736059020365cebc845a820214765162728b51ab1e03a1b7b3abb70f74c", size = 489057, upload-time = "2025-05-18T19:03:50.66Z" }, { url = "https://files.pythonhosted.org/packages/12/e4/6f906272810a7b21406c760a53aadbe52e99ee070fc5c0cb191e316de30b/jiter-0.10.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c5867d40ab716e4684858e4887489685968a47e3ba222e44cde6e4a2154f959", size = 389372, upload-time = "2025-05-18T19:03:51.98Z" }, { url = "https://files.pythonhosted.org/packages/e2/ba/77013b0b8ba904bf3762f11e0129b8928bff7f978a81838dfcc958ad5728/jiter-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395bb9a26111b60141757d874d27fdea01b17e8fac958b91c20128ba8f4acc8a", size = 352038, upload-time = "2025-05-18T19:03:53.703Z" }, { url = "https://files.pythonhosted.org/packages/67/27/c62568e3ccb03368dbcc44a1ef3a423cb86778a4389e995125d3d1aaa0a4/jiter-0.10.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6842184aed5cdb07e0c7e20e5bdcfafe33515ee1741a6835353bb45fe5d1bd95", size = 391538, upload-time = "2025-05-18T19:03:55.046Z" }, { url = 
"https://files.pythonhosted.org/packages/c0/72/0d6b7e31fc17a8fdce76164884edef0698ba556b8eb0af9546ae1a06b91d/jiter-0.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:62755d1bcea9876770d4df713d82606c8c1a3dca88ff39046b85a048566d56ea", size = 523557, upload-time = "2025-05-18T19:03:56.386Z" }, { url = "https://files.pythonhosted.org/packages/2f/09/bc1661fbbcbeb6244bd2904ff3a06f340aa77a2b94e5a7373fd165960ea3/jiter-0.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:533efbce2cacec78d5ba73a41756beff8431dfa1694b6346ce7af3a12c42202b", size = 514202, upload-time = "2025-05-18T19:03:57.675Z" }, { url = "https://files.pythonhosted.org/packages/1b/84/5a5d5400e9d4d54b8004c9673bbe4403928a00d28529ff35b19e9d176b19/jiter-0.10.0-cp312-cp312-win32.whl", hash = "sha256:8be921f0cadd245e981b964dfbcd6fd4bc4e254cdc069490416dd7a2632ecc01", size = 211781, upload-time = "2025-05-18T19:03:59.025Z" }, { url = "https://files.pythonhosted.org/packages/9b/52/7ec47455e26f2d6e5f2ea4951a0652c06e5b995c291f723973ae9e724a65/jiter-0.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:a7c7d785ae9dda68c2678532a5a1581347e9c15362ae9f6e68f3fdbfb64f2e49", size = 206176, upload-time = "2025-05-18T19:04:00.305Z" }, { url = "https://files.pythonhosted.org/packages/2e/b0/279597e7a270e8d22623fea6c5d4eeac328e7d95c236ed51a2b884c54f70/jiter-0.10.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e0588107ec8e11b6f5ef0e0d656fb2803ac6cf94a96b2b9fc675c0e3ab5e8644", size = 311617, upload-time = "2025-05-18T19:04:02.078Z" }, { url = "https://files.pythonhosted.org/packages/91/e3/0916334936f356d605f54cc164af4060e3e7094364add445a3bc79335d46/jiter-0.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cafc4628b616dc32530c20ee53d71589816cf385dd9449633e910d596b1f5c8a", size = 318947, upload-time = "2025-05-18T19:04:03.347Z" }, { url = "https://files.pythonhosted.org/packages/6a/8e/fd94e8c02d0e94539b7d669a7ebbd2776e51f329bb2c84d4385e8063a2ad/jiter-0.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:520ef6d981172693786a49ff5b09eda72a42e539f14788124a07530f785c3ad6", size = 344618, upload-time = "2025-05-18T19:04:04.709Z" }, { url = "https://files.pythonhosted.org/packages/6f/b0/f9f0a2ec42c6e9c2e61c327824687f1e2415b767e1089c1d9135f43816bd/jiter-0.10.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:554dedfd05937f8fc45d17ebdf298fe7e0c77458232bcb73d9fbbf4c6455f5b3", size = 368829, upload-time = "2025-05-18T19:04:06.912Z" }, { url = "https://files.pythonhosted.org/packages/e8/57/5bbcd5331910595ad53b9fd0c610392ac68692176f05ae48d6ce5c852967/jiter-0.10.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5bc299da7789deacf95f64052d97f75c16d4fc8c4c214a22bf8d859a4288a1c2", size = 491034, upload-time = "2025-05-18T19:04:08.222Z" }, { url = "https://files.pythonhosted.org/packages/9b/be/c393df00e6e6e9e623a73551774449f2f23b6ec6a502a3297aeeece2c65a/jiter-0.10.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5161e201172de298a8a1baad95eb85db4fb90e902353b1f6a41d64ea64644e25", size = 388529, upload-time = "2025-05-18T19:04:09.566Z" }, { url = "https://files.pythonhosted.org/packages/42/3e/df2235c54d365434c7f150b986a6e35f41ebdc2f95acea3036d99613025d/jiter-0.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e2227db6ba93cb3e2bf67c87e594adde0609f146344e8207e8730364db27041", size = 350671, upload-time = "2025-05-18T19:04:10.98Z" }, { url = 
"https://files.pythonhosted.org/packages/c6/77/71b0b24cbcc28f55ab4dbfe029f9a5b73aeadaba677843fc6dc9ed2b1d0a/jiter-0.10.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:15acb267ea5e2c64515574b06a8bf393fbfee6a50eb1673614aa45f4613c0cca", size = 390864, upload-time = "2025-05-18T19:04:12.722Z" }, { url = "https://files.pythonhosted.org/packages/6a/d3/ef774b6969b9b6178e1d1e7a89a3bd37d241f3d3ec5f8deb37bbd203714a/jiter-0.10.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:901b92f2e2947dc6dfcb52fd624453862e16665ea909a08398dde19c0731b7f4", size = 522989, upload-time = "2025-05-18T19:04:14.261Z" }, { url = "https://files.pythonhosted.org/packages/0c/41/9becdb1d8dd5d854142f45a9d71949ed7e87a8e312b0bede2de849388cb9/jiter-0.10.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d0cb9a125d5a3ec971a094a845eadde2db0de85b33c9f13eb94a0c63d463879e", size = 513495, upload-time = "2025-05-18T19:04:15.603Z" }, { url = "https://files.pythonhosted.org/packages/9c/36/3468e5a18238bdedae7c4d19461265b5e9b8e288d3f86cd89d00cbb48686/jiter-0.10.0-cp313-cp313-win32.whl", hash = "sha256:48a403277ad1ee208fb930bdf91745e4d2d6e47253eedc96e2559d1e6527006d", size = 211289, upload-time = "2025-05-18T19:04:17.541Z" }, { url = "https://files.pythonhosted.org/packages/7e/07/1c96b623128bcb913706e294adb5f768fb7baf8db5e1338ce7b4ee8c78ef/jiter-0.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:75f9eb72ecb640619c29bf714e78c9c46c9c4eaafd644bf78577ede459f330d4", size = 205074, upload-time = "2025-05-18T19:04:19.21Z" }, { url = "https://files.pythonhosted.org/packages/54/46/caa2c1342655f57d8f0f2519774c6d67132205909c65e9aa8255e1d7b4f4/jiter-0.10.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:28ed2a4c05a1f32ef0e1d24c2611330219fed727dae01789f4a335617634b1ca", size = 318225, upload-time = "2025-05-18T19:04:20.583Z" }, { url = "https://files.pythonhosted.org/packages/43/84/c7d44c75767e18946219ba2d703a5a32ab37b0bc21886a97bc6062e4da42/jiter-0.10.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a4c418b1ec86a195f1ca69da8b23e8926c752b685af665ce30777233dfe070", size = 350235, upload-time = "2025-05-18T19:04:22.363Z" }, { url = "https://files.pythonhosted.org/packages/01/16/f5a0135ccd968b480daad0e6ab34b0c7c5ba3bc447e5088152696140dcb3/jiter-0.10.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d7bfed2fe1fe0e4dda6ef682cee888ba444b21e7a6553e03252e4feb6cf0adca", size = 207278, upload-time = "2025-05-18T19:04:23.627Z" }, { url = "https://files.pythonhosted.org/packages/1c/9b/1d646da42c3de6c2188fdaa15bce8ecb22b635904fc68be025e21249ba44/jiter-0.10.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:5e9251a5e83fab8d87799d3e1a46cb4b7f2919b895c6f4483629ed2446f66522", size = 310866, upload-time = "2025-05-18T19:04:24.891Z" }, { url = "https://files.pythonhosted.org/packages/ad/0e/26538b158e8a7c7987e94e7aeb2999e2e82b1f9d2e1f6e9874ddf71ebda0/jiter-0.10.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:023aa0204126fe5b87ccbcd75c8a0d0261b9abdbbf46d55e7ae9f8e22424eeb8", size = 318772, upload-time = "2025-05-18T19:04:26.161Z" }, { url = "https://files.pythonhosted.org/packages/7b/fb/d302893151caa1c2636d6574d213e4b34e31fd077af6050a9c5cbb42f6fb/jiter-0.10.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c189c4f1779c05f75fc17c0c1267594ed918996a231593a21a5ca5438445216", size = 344534, upload-time = "2025-05-18T19:04:27.495Z" }, { url = 
"https://files.pythonhosted.org/packages/01/d8/5780b64a149d74e347c5128d82176eb1e3241b1391ac07935693466d6219/jiter-0.10.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:15720084d90d1098ca0229352607cd68256c76991f6b374af96f36920eae13c4", size = 369087, upload-time = "2025-05-18T19:04:28.896Z" }, { url = "https://files.pythonhosted.org/packages/e8/5b/f235a1437445160e777544f3ade57544daf96ba7e96c1a5b24a6f7ac7004/jiter-0.10.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e4f2fb68e5f1cfee30e2b2a09549a00683e0fde4c6a2ab88c94072fc33cb7426", size = 490694, upload-time = "2025-05-18T19:04:30.183Z" }, { url = "https://files.pythonhosted.org/packages/85/a9/9c3d4617caa2ff89cf61b41e83820c27ebb3f7b5fae8a72901e8cd6ff9be/jiter-0.10.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce541693355fc6da424c08b7edf39a2895f58d6ea17d92cc2b168d20907dee12", size = 388992, upload-time = "2025-05-18T19:04:32.028Z" }, { url = "https://files.pythonhosted.org/packages/68/b1/344fd14049ba5c94526540af7eb661871f9c54d5f5601ff41a959b9a0bbd/jiter-0.10.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31c50c40272e189d50006ad5c73883caabb73d4e9748a688b216e85a9a9ca3b9", size = 351723, upload-time = "2025-05-18T19:04:33.467Z" }, { url = "https://files.pythonhosted.org/packages/41/89/4c0e345041186f82a31aee7b9d4219a910df672b9fef26f129f0cda07a29/jiter-0.10.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fa3402a2ff9815960e0372a47b75c76979d74402448509ccd49a275fa983ef8a", size = 392215, upload-time = "2025-05-18T19:04:34.827Z" }, { url = "https://files.pythonhosted.org/packages/55/58/ee607863e18d3f895feb802154a2177d7e823a7103f000df182e0f718b38/jiter-0.10.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:1956f934dca32d7bb647ea21d06d93ca40868b505c228556d3373cbd255ce853", size = 522762, upload-time = "2025-05-18T19:04:36.19Z" }, { url = "https://files.pythonhosted.org/packages/15/d0/9123fb41825490d16929e73c212de9a42913d68324a8ce3c8476cae7ac9d/jiter-0.10.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:fcedb049bdfc555e261d6f65a6abe1d5ad68825b7202ccb9692636c70fcced86", size = 513427, upload-time = "2025-05-18T19:04:37.544Z" }, { url = "https://files.pythonhosted.org/packages/d8/b3/2bd02071c5a2430d0b70403a34411fc519c2f227da7b03da9ba6a956f931/jiter-0.10.0-cp314-cp314-win32.whl", hash = "sha256:ac509f7eccca54b2a29daeb516fb95b6f0bd0d0d8084efaf8ed5dfc7b9f0b357", size = 210127, upload-time = "2025-05-18T19:04:38.837Z" }, { url = "https://files.pythonhosted.org/packages/03/0c/5fe86614ea050c3ecd728ab4035534387cd41e7c1855ef6c031f1ca93e3f/jiter-0.10.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5ed975b83a2b8639356151cef5c0d597c68376fc4922b45d0eb384ac058cfa00", size = 318527, upload-time = "2025-05-18T19:04:40.612Z" }, { url = "https://files.pythonhosted.org/packages/b3/4a/4175a563579e884192ba6e81725fc0448b042024419be8d83aa8a80a3f44/jiter-0.10.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa96f2abba33dc77f79b4cf791840230375f9534e5fac927ccceb58c5e604a5", size = 354213, upload-time = "2025-05-18T19:04:41.894Z" },]
[[package]]name = "llmscraper"version = "0.1.0"source = { virtual = "." }dependencies = [ { name = "anthropic" }, { name = "apify-client" },]
[package.metadata]requires-dist = [ { name = "anthropic", specifier = ">=0.54.0" }, { name = "apify-client", specifier = ">=1.11.0" },]
[[package]]name = "more-itertools"version = "10.7.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/ce/a0/834b0cebabbfc7e311f30b46c8188790a37f89fc8d756660346fe5abfd09/more_itertools-10.7.0.tar.gz", hash = "sha256:9fddd5403be01a94b204faadcff459ec3568cf110265d3c54323e1e866ad29d3", size = 127671, upload-time = "2025-04-22T14:17:41.838Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/2b/9f/7ba6f94fc1e9ac3d2b853fdff3035fb2fa5afbed898c4a72b8a020610594/more_itertools-10.7.0-py3-none-any.whl", hash = "sha256:d43980384673cb07d2f7d2d918c616b30c659c089ee23953f601d6609c67510e", size = 65278, upload-time = "2025-04-22T14:17:40.49Z" },]
[[package]]name = "pydantic"version = "2.11.7"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "annotated-types" }, { name = "pydantic-core" }, { name = "typing-extensions" }, { name = "typing-inspection" },]sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" },]
[[package]]name = "pydantic-core"version = "2.33.2"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "typing-extensions" },]sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/e5/92/b31726561b5dae176c2d2c2dc43a9c5bfba5d32f96f8b4c0a600dd492447/pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8", size = 2028817, upload-time = "2025-04-23T18:30:43.919Z" }, { url = "https://files.pythonhosted.org/packages/a3/44/3f0b95fafdaca04a483c4e685fe437c6891001bf3ce8b2fded82b9ea3aa1/pydantic_core-2.33.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d", size = 1861357, upload-time = "2025-04-23T18:30:46.372Z" }, { url = "https://files.pythonhosted.org/packages/30/97/e8f13b55766234caae05372826e8e4b3b96e7b248be3157f53237682e43c/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d", size = 1898011, upload-time = "2025-04-23T18:30:47.591Z" }, { url = "https://files.pythonhosted.org/packages/9b/a3/99c48cf7bafc991cc3ee66fd544c0aae8dc907b752f1dad2d79b1b5a471f/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572", size = 1982730, upload-time = "2025-04-23T18:30:49.328Z" }, { url = "https://files.pythonhosted.org/packages/de/8e/a5b882ec4307010a840fb8b58bd9bf65d1840c92eae7534c7441709bf54b/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02", size = 2136178, upload-time = "2025-04-23T18:30:50.907Z" }, { url = "https://files.pythonhosted.org/packages/e4/bb/71e35fc3ed05af6834e890edb75968e2802fe98778971ab5cba20a162315/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b", size = 2736462, upload-time = "2025-04-23T18:30:52.083Z" }, { url = "https://files.pythonhosted.org/packages/31/0d/c8f7593e6bc7066289bbc366f2235701dcbebcd1ff0ef8e64f6f239fb47d/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2", size = 2005652, upload-time = "2025-04-23T18:30:53.389Z" }, { url = "https://files.pythonhosted.org/packages/d2/7a/996d8bd75f3eda405e3dd219ff5ff0a283cd8e34add39d8ef9157e722867/pydantic_core-2.33.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a", size = 2113306, upload-time = "2025-04-23T18:30:54.661Z" }, { url = "https://files.pythonhosted.org/packages/ff/84/daf2a6fb2db40ffda6578a7e8c5a6e9c8affb251a05c233ae37098118788/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac", size = 2073720, upload-time = "2025-04-23T18:30:56.11Z" }, { url = 
"https://files.pythonhosted.org/packages/77/fb/2258da019f4825128445ae79456a5499c032b55849dbd5bed78c95ccf163/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a", size = 2244915, upload-time = "2025-04-23T18:30:57.501Z" }, { url = "https://files.pythonhosted.org/packages/d8/7a/925ff73756031289468326e355b6fa8316960d0d65f8b5d6b3a3e7866de7/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b", size = 2241884, upload-time = "2025-04-23T18:30:58.867Z" }, { url = "https://files.pythonhosted.org/packages/0b/b0/249ee6d2646f1cdadcb813805fe76265745c4010cf20a8eba7b0e639d9b2/pydantic_core-2.33.2-cp310-cp310-win32.whl", hash = "sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22", size = 1910496, upload-time = "2025-04-23T18:31:00.078Z" }, { url = "https://files.pythonhosted.org/packages/66/ff/172ba8f12a42d4b552917aa65d1f2328990d3ccfc01d5b7c943ec084299f/pydantic_core-2.33.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640", size = 1955019, upload-time = "2025-04-23T18:31:01.335Z" }, { url = "https://files.pythonhosted.org/packages/3f/8d/71db63483d518cbbf290261a1fc2839d17ff89fce7089e08cad07ccfce67/pydantic_core-2.33.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7", size = 2028584, upload-time = "2025-04-23T18:31:03.106Z" }, { url = "https://files.pythonhosted.org/packages/24/2f/3cfa7244ae292dd850989f328722d2aef313f74ffc471184dc509e1e4e5a/pydantic_core-2.33.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246", size = 1855071, upload-time = "2025-04-23T18:31:04.621Z" }, { url = "https://files.pythonhosted.org/packages/b3/d3/4ae42d33f5e3f50dd467761304be2fa0a9417fbf09735bc2cce003480f2a/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f", size = 1897823, upload-time = "2025-04-23T18:31:06.377Z" }, { url = "https://files.pythonhosted.org/packages/f4/f3/aa5976e8352b7695ff808599794b1fba2a9ae2ee954a3426855935799488/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc", size = 1983792, upload-time = "2025-04-23T18:31:07.93Z" }, { url = "https://files.pythonhosted.org/packages/d5/7a/cda9b5a23c552037717f2b2a5257e9b2bfe45e687386df9591eff7b46d28/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de", size = 2136338, upload-time = "2025-04-23T18:31:09.283Z" }, { url = "https://files.pythonhosted.org/packages/2b/9f/b8f9ec8dd1417eb9da784e91e1667d58a2a4a7b7b34cf4af765ef663a7e5/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a", size = 2730998, upload-time = "2025-04-23T18:31:11.7Z" }, { url = "https://files.pythonhosted.org/packages/47/bc/cd720e078576bdb8255d5032c5d63ee5c0bf4b7173dd955185a1d658c456/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef", size = 2003200, upload-time = 
"2025-04-23T18:31:13.536Z" }, { url = "https://files.pythonhosted.org/packages/ca/22/3602b895ee2cd29d11a2b349372446ae9727c32e78a94b3d588a40fdf187/pydantic_core-2.33.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e", size = 2113890, upload-time = "2025-04-23T18:31:15.011Z" }, { url = "https://files.pythonhosted.org/packages/ff/e6/e3c5908c03cf00d629eb38393a98fccc38ee0ce8ecce32f69fc7d7b558a7/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d", size = 2073359, upload-time = "2025-04-23T18:31:16.393Z" }, { url = "https://files.pythonhosted.org/packages/12/e7/6a36a07c59ebefc8777d1ffdaf5ae71b06b21952582e4b07eba88a421c79/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30", size = 2245883, upload-time = "2025-04-23T18:31:17.892Z" }, { url = "https://files.pythonhosted.org/packages/16/3f/59b3187aaa6cc0c1e6616e8045b284de2b6a87b027cce2ffcea073adf1d2/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf", size = 2241074, upload-time = "2025-04-23T18:31:19.205Z" }, { url = "https://files.pythonhosted.org/packages/e0/ed/55532bb88f674d5d8f67ab121a2a13c385df382de2a1677f30ad385f7438/pydantic_core-2.33.2-cp311-cp311-win32.whl", hash = "sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51", size = 1910538, upload-time = "2025-04-23T18:31:20.541Z" }, { url = "https://files.pythonhosted.org/packages/fe/1b/25b7cccd4519c0b23c2dd636ad39d381abf113085ce4f7bec2b0dc755eb1/pydantic_core-2.33.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab", size = 1952909, upload-time = "2025-04-23T18:31:22.371Z" }, { url = "https://files.pythonhosted.org/packages/49/a9/d809358e49126438055884c4366a1f6227f0f84f635a9014e2deb9b9de54/pydantic_core-2.33.2-cp311-cp311-win_arm64.whl", hash = "sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65", size = 1897786, upload-time = "2025-04-23T18:31:24.161Z" }, { url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" }, { url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" }, { url = "https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" }, { url = "https://files.pythonhosted.org/packages/ec/6b/1ec2c03837ac00886ba8160ce041ce4e325b41d06a034adbef11339ae422/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011", size = 1964199, upload-time = "2025-04-23T18:31:31.025Z" }, { url 
= "https://files.pythonhosted.org/packages/2d/1d/6bf34d6adb9debd9136bd197ca72642203ce9aaaa85cfcbfcf20f9696e83/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f", size = 2120296, upload-time = "2025-04-23T18:31:32.514Z" }, { url = "https://files.pythonhosted.org/packages/e0/94/2bd0aaf5a591e974b32a9f7123f16637776c304471a0ab33cf263cf5591a/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88", size = 2676109, upload-time = "2025-04-23T18:31:33.958Z" }, { url = "https://files.pythonhosted.org/packages/f9/41/4b043778cf9c4285d59742281a769eac371b9e47e35f98ad321349cc5d61/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1", size = 2002028, upload-time = "2025-04-23T18:31:39.095Z" }, { url = "https://files.pythonhosted.org/packages/cb/d5/7bb781bf2748ce3d03af04d5c969fa1308880e1dca35a9bd94e1a96a922e/pydantic_core-2.33.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b", size = 2100044, upload-time = "2025-04-23T18:31:41.034Z" }, { url = "https://files.pythonhosted.org/packages/fe/36/def5e53e1eb0ad896785702a5bbfd25eed546cdcf4087ad285021a90ed53/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1", size = 2058881, upload-time = "2025-04-23T18:31:42.757Z" }, { url = "https://files.pythonhosted.org/packages/01/6c/57f8d70b2ee57fc3dc8b9610315949837fa8c11d86927b9bb044f8705419/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6", size = 2227034, upload-time = "2025-04-23T18:31:44.304Z" }, { url = "https://files.pythonhosted.org/packages/27/b9/9c17f0396a82b3d5cbea4c24d742083422639e7bb1d5bf600e12cb176a13/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea", size = 2234187, upload-time = "2025-04-23T18:31:45.891Z" }, { url = "https://files.pythonhosted.org/packages/b0/6a/adf5734ffd52bf86d865093ad70b2ce543415e0e356f6cacabbc0d9ad910/pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290", size = 1892628, upload-time = "2025-04-23T18:31:47.819Z" }, { url = "https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2", size = 1955866, upload-time = "2025-04-23T18:31:49.635Z" }, { url = "https://files.pythonhosted.org/packages/0d/24/8b11e8b3e2be9dd82df4b11408a67c61bb4dc4f8e11b5b0fc888b38118b5/pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab", size = 1888894, upload-time = "2025-04-23T18:31:51.609Z" }, { url = "https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" }, { url = 
"https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" }, { url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" }, { url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" }, { url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" }, { url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" }, { url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" }, { url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" }, { url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" }, { url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" }, { url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" }, { url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, 
upload-time = "2025-04-23T18:32:14.034Z" }, { url = "https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" }, { url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" }, { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" }, { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" }, { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, { url = "https://files.pythonhosted.org/packages/30/68/373d55e58b7e83ce371691f6eaa7175e3a24b956c44628eb25d7da007917/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa", size = 2023982, upload-time = "2025-04-23T18:32:53.14Z" }, { url = "https://files.pythonhosted.org/packages/a4/16/145f54ac08c96a63d8ed6442f9dec17b2773d19920b627b18d4f10a061ea/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29", size = 1858412, upload-time = "2025-04-23T18:32:55.52Z" }, { url = "https://files.pythonhosted.org/packages/41/b1/c6dc6c3e2de4516c0bb2c46f6a373b91b5660312342a0cf5826e38ad82fa/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d", size = 1892749, upload-time = "2025-04-23T18:32:57.546Z" }, { url = "https://files.pythonhosted.org/packages/12/73/8cd57e20afba760b21b742106f9dbdfa6697f1570b189c7457a1af4cd8a0/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e", size = 2067527, upload-time = "2025-04-23T18:32:59.771Z" }, { url = "https://files.pythonhosted.org/packages/e3/d5/0bb5d988cc019b3cba4a78f2d4b3854427fc47ee8ec8e9eaabf787da239c/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c", size = 2108225, upload-time = "2025-04-23T18:33:04.51Z" }, { url = "https://files.pythonhosted.org/packages/f1/c5/00c02d1571913d496aabf146106ad8239dc132485ee22efe08085084ff7c/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = 
"sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec", size = 2069490, upload-time = "2025-04-23T18:33:06.391Z" }, { url = "https://files.pythonhosted.org/packages/22/a8/dccc38768274d3ed3a59b5d06f59ccb845778687652daa71df0cab4040d7/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052", size = 2237525, upload-time = "2025-04-23T18:33:08.44Z" }, { url = "https://files.pythonhosted.org/packages/d4/e7/4f98c0b125dda7cf7ccd14ba936218397b44f50a56dd8c16a3091df116c3/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c", size = 2238446, upload-time = "2025-04-23T18:33:10.313Z" }, { url = "https://files.pythonhosted.org/packages/ce/91/2ec36480fdb0b783cd9ef6795753c1dea13882f2e68e73bce76ae8c21e6a/pydantic_core-2.33.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808", size = 2066678, upload-time = "2025-04-23T18:33:12.224Z" }, { url = "https://files.pythonhosted.org/packages/7b/27/d4ae6487d73948d6f20dddcd94be4ea43e74349b56eba82e9bdee2d7494c/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8", size = 2025200, upload-time = "2025-04-23T18:33:14.199Z" }, { url = "https://files.pythonhosted.org/packages/f1/b8/b3cb95375f05d33801024079b9392a5ab45267a63400bf1866e7ce0f0de4/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593", size = 1859123, upload-time = "2025-04-23T18:33:16.555Z" }, { url = "https://files.pythonhosted.org/packages/05/bc/0d0b5adeda59a261cd30a1235a445bf55c7e46ae44aea28f7bd6ed46e091/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612", size = 1892852, upload-time = "2025-04-23T18:33:18.513Z" }, { url = "https://files.pythonhosted.org/packages/3e/11/d37bdebbda2e449cb3f519f6ce950927b56d62f0b84fd9cb9e372a26a3d5/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7", size = 2067484, upload-time = "2025-04-23T18:33:20.475Z" }, { url = "https://files.pythonhosted.org/packages/8c/55/1f95f0a05ce72ecb02a8a8a1c3be0579bbc29b1d5ab68f1378b7bebc5057/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e", size = 2108896, upload-time = "2025-04-23T18:33:22.501Z" }, { url = "https://files.pythonhosted.org/packages/53/89/2b2de6c81fa131f423246a9109d7b2a375e83968ad0800d6e57d0574629b/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8", size = 2069475, upload-time = "2025-04-23T18:33:24.528Z" }, { url = "https://files.pythonhosted.org/packages/b8/e9/1f7efbe20d0b2b10f6718944b5d8ece9152390904f29a78e68d4e7961159/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf", size = 2239013, upload-time = "2025-04-23T18:33:26.621Z" }, { url = 
"https://files.pythonhosted.org/packages/3c/b2/5309c905a93811524a49b4e031e9851a6b00ff0fb668794472ea7746b448/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb", size = 2238715, upload-time = "2025-04-23T18:33:28.656Z" }, { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" },]
[[package]]name = "sniffio"version = "1.3.1"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },]
[[package]]name = "typing-extensions"version = "4.14.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/d1/bc/51647cd02527e87d05cb083ccc402f93e441606ff1f01739a62c8ad09ba5/typing_extensions-4.14.0.tar.gz", hash = "sha256:8676b788e32f02ab42d9e7c61324048ae4c6d844a399eebace3d4979d75ceef4", size = 107423, upload-time = "2025-06-02T14:52:11.399Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/69/e0/552843e0d356fbb5256d21449fa957fa4eff3bbc135a74a691ee70c7c5da/typing_extensions-4.14.0-py3-none-any.whl", hash = "sha256:a1514509136dd0b477638fc68d6a91497af5076466ad0fa6c338e44e359944af", size = 43839, upload-time = "2025-06-02T14:52:10.026Z" },]
[[package]]name = "typing-inspection"version = "0.4.1"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "typing-extensions" },]sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" },]
.actor/README.md
# 🤖 LLM-Powered Web Scraper

An intelligent Apify Actor that uses Claude AI to automatically discover, test, and select the best Apify actors for your web scraping tasks. No manual configuration needed!

## ✨ Features

- **🧠 AI-Powered Actor Discovery**: Uses Claude AI to automatically find and test the best Apify actors for your target website
- **🔄 Smart Retry Logic**: Automatically adjusts parameters and retries failed attempts with different actors
- **📊 Quality Assessment**: Evaluates scraped data quality across multiple dimensions (completeness, relevance, structure, volume)
- **🎯 Priority-Based Testing**: Tests domain-specific actors first, then falls back to general-purpose ones
- **📈 Real-time Progress**: Tracks and reports scraping progress with detailed logging
- **🔗 MCP Integration**: Connects to the Apify MCP Server for dynamic actor discovery and execution
- **⚙️ Flexible Configuration**: Extensive customization options for timeouts, quality thresholds, and model selection
- **🛡️ Error Handling**: Robust error handling with detailed logging and graceful fallbacks

## 🚀 Quick Start

1. **Set up your Claude API key** in the Actor input or as an environment variable
2. **Provide your target URL** and describe what data you want to extract
3. **Run the Actor** - it will automatically find and test the best scraping approach

### Example Input

```json
{
  "targetUrl": "https://books.toscrape.com/",
  "extractionGoal": "Extract book information including title, price, star rating, and availability",
  "claudeApiKey": "sk-ant-api03-...",
  "maxActorAttempts": 5,
  "maxTimeMinutes": 20
}
```
## 📝 Input Configuration

### Required Fields

- **`targetUrl`**: The URL of the website you want to scrape
- **`extractionGoal`**: Describe what data you want to extract from the website
- **`claudeApiKey`**: Your Anthropic Claude API key for AI-powered analysis

### Optional Configuration

- **`maxActorAttempts`** (default: 10): Maximum number of different actors to try
- **`maxRetriesPerActor`** (default: 3): Maximum retry attempts per actor
- **`maxTimeMinutes`** (default: 30): Maximum total execution time in minutes
- **`modelName`** (default: `"claude-3-5-haiku-latest"`): Claude model to use
- **`debugMode`** (default: false): Enable detailed logging
- **`preferSpecificActors`** (default: true): Prioritize domain-specific actors
- **`minDataQualityScore`** (default: 70): Minimum quality score (0-100) to accept results
- **`enableProxy`** (default: true): Use proxy for scraping requests
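Putting the required and optional fields together, a complete input payload with every optional value left at its documented default looks like this (shown as a Python dict; the API key is a placeholder):

```python
run_input = {
    # Required
    "targetUrl": "https://books.toscrape.com/",
    "extractionGoal": "Extract book titles, prices, ratings, and availability",
    "claudeApiKey": "sk-ant-api03-...",        # placeholder
    # Optional (documented defaults)
    "maxActorAttempts": 10,
    "maxRetriesPerActor": 3,
    "maxTimeMinutes": 30,
    "modelName": "claude-3-5-haiku-latest",
    "debugMode": False,
    "preferSpecificActors": True,
    "minDataQualityScore": 70,
    "enableProxy": True,
}
```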
### Available Claude Models

- `claude-3-5-haiku-latest` - Fast & cost-effective (recommended)
- `claude-3-5-sonnet-latest` - Balanced performance and quality
- `claude-3-opus-latest` - Maximum quality (slower, more expensive)
## 📊 Output

The Actor saves results to:

### Dataset

Each scraped item with metadata:

```json
{
  "url": "https://example.com",
  "data": {...},
  "quality_score": 0.85,
  "actor_used": "apify/web-scraper",
  "timestamp": "2025-07-24T11:30:00Z",
  "success": true,
  "extraction_goal": "Extract product information",
  "total_execution_time": 45.2,
  "attempts_made": 3
}
```

### Key-Value Store

Summary information in `SCRAPING_RESULT`:

```json
{
  "success": true,
  "quality_score": 0.85,
  "items_count": 25,
  "best_actor_id": "apify/web-scraper",
  "total_execution_time": 45.2,
  "attempts_made": 3,
  "progress_updates": [...],
  "actor_attempts": [...]
}
```
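If you start the Actor from Python, both stores can be read back with the Apify Python client. Treat the sketch below as an illustration: the actor ID is a placeholder, and the field names follow the examples above.

```python
from apify_client import ApifyClient

client = ApifyClient("<APIFY_TOKEN>")

# Start a run (placeholder actor ID) and wait for it to finish.
run = client.actor("your-username/llmscraper").call(run_input={
    "targetUrl": "https://books.toscrape.com/",
    "extractionGoal": "Extract book titles and prices",
    "claudeApiKey": "sk-ant-api03-...",
})

# Dataset: one record per scraped item.
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item["actor_used"], item["quality_score"], item["data"])

# Key-value store: the run summary stored under SCRAPING_RESULT.
record = client.key_value_store(run["defaultKeyValueStoreId"]).get_record("SCRAPING_RESULT")
if record:
    summary = record["value"]
    print(summary["best_actor_id"], summary["items_count"])
```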
## 🔧 How It Works

1. **Actor Discovery**: Connects to the Apify MCP Server to discover available actors
2. **AI Analysis**: Uses Claude to analyze the target website and select appropriate actors
3. **Smart Testing**: Tests actors in priority order with intelligent parameter adjustment
4. **Quality Evaluation**: Assesses data quality using multiple metrics
5. **Retry Logic**: Automatically retries with different parameters if needed
6. **Result Selection**: Returns the best results based on quality scores

## 🏗️ Architecture

The Actor consists of several key components:

- **MCP Client** (`src/llmscraper/mcp/`): Handles communication with the Apify MCP Server
- **Claude Manager** (`src/llmscraper/claude/`): Manages AI conversations and tool calls
- **LLM Scraper Actor** (`src/llmscraper/llm_scraper/`): Main orchestration logic
- **Retry Logic** (`src/llmscraper/llm_scraper/retry_logic.py`): Intelligent parameter adjustment
- **Quality Evaluator** (`src/llmscraper/llm_scraper/quality_evaluator.py`): Data quality assessment

## 🔑 Environment Variables

- `ANTHROPIC_API_KEY`: Your Anthropic Claude API key (alternative to the input field)
- `APIFY_TOKEN`: Automatically provided by the Apify platform
- `MCP_SERVER_URL`: Custom MCP server URL (optional)
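The snippet below is an illustrative sketch of how these variables are typically resolved; the authoritative lookup order lives in the Actor code, and the helper name is made up for the example.

```python
import os


def resolve_claude_api_key(actor_input: dict) -> str | None:
    """claudeApiKey from the Actor input wins; ANTHROPIC_API_KEY is the fallback."""
    return actor_input.get("claudeApiKey") or os.getenv("ANTHROPIC_API_KEY")


# MCP_SERVER_URL may override the default MCP endpoint.
mcp_url = os.getenv("MCP_SERVER_URL", "https://mcp.apify.com/sse?enableAddingActors=true")
```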
## ⚡ Performance Tips

1. **Use Haiku Model**: For most tasks, `claude-3-5-haiku-latest` provides the best speed/cost ratio
2. **Adjust Attempts**: Reduce `maxActorAttempts` for faster results, increase it for better coverage
3. **Quality Threshold**: Lower `minDataQualityScore` if you're getting no results
4. **Time Limits**: Set an appropriate `maxTimeMinutes` based on your needs
## 🛠️ Development

### Local Testing

```bash
# Install dependencies (using virtual environment)
pip install -r requirements.txt

# Or if you have the project's virtual environment:
./venv/bin/pip install -r requirements.txt

# Set up environment
export ANTHROPIC_API_KEY=your_key_here

# Run the actor locally
python3 main.py

# Or using npm scripts:
npm run start        # Uses system python3
npm run start:local  # Uses project virtual environment
```

### Project Structure

```text
LLMScraper/
├── main.py                       # Actor entry point
├── src/llmscraper/
│   ├── mcp/                      # MCP client implementation
│   ├── claude/                   # Claude AI integration
│   ├── llm_scraper/              # Main scraper logic
│   │   ├── actor.py              # Main LLMScraperActor class
│   │   ├── models.py             # Input/output models
│   │   ├── retry_logic.py        # Intelligent retry logic
│   │   └── quality_evaluator.py  # Data quality assessment
│   ├── scraping/                 # Apify actor integrations
│   └── utils/                    # Configuration and utilities
├── .actor/
│   ├── actor.json                # Actor metadata
│   ├── input_schema.json         # Input validation schema
│   └── README.md                 # This file
├── Dockerfile                    # Container configuration
├── requirements.txt              # Python dependencies
├── package.json                  # Node.js metadata
└── pyproject.toml                # Python packaging configuration
```
## 📚 API Reference

### Main Function

```python
from llmscraper.llm_scraper import LLMScraperActor, LLMScraperInput

# Create configuration
config = LLMScraperInput(
    target_url="https://example.com",
    extraction_goal="Extract product data",
    anthropic_api_key="sk-ant-..."
)

# Run the scraper
scraper = LLMScraperActor(config)
result = await scraper.run(progress_callback=None)
```
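Because `run()` is a coroutine, it must be awaited inside an event loop. A minimal standalone sketch is shown below; the result attributes (`success`, `best_actor_id`, `quality_score`, `scraped_data`) are assumed to mirror the output fields documented above and should be checked against `LLMScraperOutput`.

```python
import asyncio

from llmscraper.llm_scraper import LLMScraperActor, LLMScraperInput


async def main() -> None:
    config = LLMScraperInput(
        target_url="https://books.toscrape.com/",
        extraction_goal="Extract book titles and prices",
        anthropic_api_key="sk-ant-...",   # or read it from ANTHROPIC_API_KEY
    )

    scraper = LLMScraperActor(config)
    result = await scraper.run(progress_callback=None)

    if result.success:
        print(f"Best actor: {result.best_actor_id}")
        print(f"Quality score: {result.quality_score:.2f}")
        print(f"Items extracted: {len(result.scraped_data)}")


if __name__ == "__main__":
    asyncio.run(main())
```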
### Configuration

```python
from llmscraper.llm_scraper.models import LLMScraperInput

config = LLMScraperInput(
    target_url="https://example.com",
    extraction_goal="Extract product data",
    anthropic_api_key="sk-ant-...",
    max_actor_attempts=10,
    max_retries_per_actor=3,
    max_time_minutes=30,
    model_name="claude-3-5-haiku-latest",
    debug_mode=False,
    prefer_specific_actors=True,
    min_data_quality_score=0.7,  # Note: API expects 0.0-1.0, input form uses 0-100
    enable_proxy=True
)
```
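Note the scale difference called out in the comment above: the Actor input form takes an integer percentage, while the Python model expects a fraction, so a form value has to be divided by 100 before it reaches `LLMScraperInput`. For example:

```python
# Input form value (0-100) -> Python API value (0.0-1.0)
min_data_quality_score = 70 / 100.0   # 0.7
```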
## 🤝 Contributing

1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Add tests
5. Submit a pull request

## 📄 License

MIT License - see LICENSE file for details.

## 🆘 Support & Troubleshooting

### Common Issues

- **API Key Issues**: Ensure your Claude API key is valid and has sufficient credits
- **No Results Found**: Try reducing `minDataQualityScore` or increasing `maxActorAttempts`
- **Timeout Errors**: Increase `maxTimeMinutes` for complex websites
- **Quality Score Too Low**: Adjust your `extractionGoal` to be more specific

### Debugging

- Enable `debugMode: true` for detailed logging
- Check the Actor logs for step-by-step execution details
- Verify the target URL is accessible and returns content
- Monitor the progress updates in the key-value store

### Performance Optimization

- Use `claude-3-5-haiku-latest` for faster, cost-effective processing
- Set an appropriate `maxActorAttempts` based on your time/quality requirements
- Enable `preferSpecificActors` to prioritize domain-specific solutions

## 🔄 Version History

- **v1.0.0** (July 2025): Initial release with MCP integration, Claude AI, and intelligent retry logic
  - AI-powered actor discovery and testing
  - Multi-dimensional quality assessment
  - Real-time progress tracking
  - Comprehensive error handling and retry logic
  - Support for all major Claude models
.actor/actor.json
{ "actorSpecification": 1, "name": "LLMScraper", "title": "� LLM-Powered Web Scraper", "description": "AI-powered web scraper that automatically discovers and tests the best Apify actors for your scraping task using Claude AI. No manual configuration needed!", "version": "1.0", "buildTag": "latest", "environmentVariables": { "ANTHROPIC_API_KEY": "@claudeApiKey" }, "dockerfile": "./Dockerfile", "input": "./input_schema.json", "storages": { "dataset": { "actorSpecification": 1, "views": { "scraped_data": { "title": "Scraped Data", "transformation": {}, "display": { "component": "table", "properties": { "url": { "label": "Source URL", "format": "link" }, "data": { "label": "Extracted Data", "format": "object" }, "quality_score": { "label": "Quality Score", "format": "number" }, "actor_used": { "label": "Actor Used", "format": "text" }, "timestamp": { "label": "Scraped At", "format": "datetime" }, "success": { "label": "Success", "format": "boolean" }, "error": { "label": "Error Message", "format": "text" } } } } } } }}
.actor/input_schema.json
{ "title": "LLM Scraper Configuration", "description": "AI-powered web scraper that automatically discovers and tests the best Apify actors for your scraping task using Claude AI", "type": "object", "schemaVersion": 1, "properties": { "targetUrl": { "title": "Target URL", "description": "The URL of the website you want to scrape", "type": "string", "editor": "textfield", "prefill": "https://books.toscrape.com/" }, "extractionGoal": { "title": "Extraction Goal", "description": "Describe what data you want to extract from the website", "type": "string", "editor": "textarea", "prefill": "Extract product information including title, price, rating, and availability from the book listings" }, "claudeApiKey": { "title": "Claude API Key", "description": "Your Anthropic Claude API key for AI-powered actor discovery and testing", "type": "string", "editor": "textfield", "isSecret": true }, "maxActorAttempts": { "title": "Max Actor Attempts", "description": "Maximum number of different actors to try", "type": "integer", "editor": "number", "minimum": 1, "maximum": 20, "default": 10 }, "maxRetriesPerActor": { "title": "Max Retries per Actor", "description": "Maximum number of retry attempts per actor", "type": "integer", "editor": "number", "minimum": 1, "maximum": 10, "default": 3 }, "maxTimeMinutes": { "title": "Max Time (minutes)", "description": "Maximum total execution time in minutes", "type": "integer", "editor": "number", "minimum": 5, "maximum": 120, "default": 30, "unit": "minutes" }, "mcpUrl": { "title": "MCP Server URL", "description": "URL of the Apify MCP server for actor discovery", "type": "string", "editor": "textfield", "default": "https://mcp.apify.com/sse?enableAddingActors=true" }, "modelName": { "title": "Claude Model", "description": "Choose which Claude model to use for AI analysis", "type": "string", "editor": "select", "default": "claude-3-5-haiku-latest", "enum": [ "claude-3-5-haiku-latest", "claude-3-5-sonnet-latest", "claude-3-opus-latest", "claude-3-haiku-20240307", "claude-3-sonnet-20240229", "claude-3-5-sonnet-20241022" ], "enumTitles": [ "Claude 3.5 Haiku (Fast & Cost-Effective)", "Claude 3.5 Sonnet (Balanced)", "Claude 3 Opus (Maximum Quality)", "Claude 3 Haiku (Legacy)", "Claude 3 Sonnet (Legacy)", "Claude 3.5 Sonnet (Legacy)" ] }, "debugMode": { "title": "Debug Mode", "description": "Enable detailed logging for debugging", "type": "boolean", "default": false, "editor": "checkbox", "sectionCaption": "Advanced Settings", "sectionDescription": "Configure advanced scraping behavior and quality settings" }, "preferSpecificActors": { "title": "Prefer Specific Actors", "description": "Prioritize domain-specific actors over general-purpose ones", "type": "boolean", "default": true, "editor": "checkbox" }, "minDataQualityScore": { "title": "Minimum Data Quality Score", "description": "Minimum quality score (0-100) to accept results", "type": "integer", "editor": "number", "minimum": 0, "maximum": 100, "default": 70 }, "enableProxy": { "title": "Enable Proxy", "description": "Use proxy for scraping requests", "type": "boolean", "default": true, "editor": "checkbox" } }, "required": ["targetUrl", "extractionGoal", "claudeApiKey"]}
src/llmscraper/__init__.py
1"""2ScraperCodeGenerator - Intelligent Web Scraping with AI3
4A smart web scraping framework that uses multiple scraping strategies5and AI-powered quality evaluation to extract data from websites.6Includes LLM Scraper for automated actor discovery and testing.7"""8
9from .pipeline import IntelligentScraperPipeline, run_intelligent_scraper10from .models import ScrapingResult, GoalExtractionResult, PipelineConfig, ClaudeModel11from .utils.config_parser import ConfigurationParser12from .scraping.actor_multi_scraper import ActorMultiScraper13
14# LLM Scraper functionality15from .llm_scraper import (16 LLMScraperActor, run_llm_scraper_actor, run_llm_scraper,17 LLMScraperInput, LLMScraperOutput18)19
20# MCP and Claude functionality21from .mcp import MCPClient22from .claude import ClaudeManager23
24__version__ = "0.1.0"25__all__ = [26 "IntelligentScraperPipeline",27 "run_intelligent_scraper",28 "ScrapingResult",29 "GoalExtractionResult",30 "PipelineConfig",31 "ClaudeModel",32 "ConfigurationParser",33 "ActorMultiScraper",34 # LLM Scraper35 "LLMScraperActor",36 "run_llm_scraper_actor", 37 "run_llm_scraper",38 "LLMScraperInput",39 "LLMScraperOutput",40 # MCP and Claude41 "MCPClient",42 "ClaudeManager"43 "ConfigurationParser",44 "ActorMultiScraper"45]
src/llmscraper/models.py
1"""2Data models for the ScraperCodeGenerator pipeline.3"""4
5from dataclasses import dataclass, field6from typing import Dict, Any, Optional, List7from enum import Enum8
9
10class ClaudeModel(Enum):11 """Available Claude model versions."""12 # Claude 4 models (latest)13 CLAUDE_4_SONNET = "claude-sonnet-4-20250514"14 CLAUDE_4_OPUS = "claude-opus-4-20250514"15 16 # Claude 3.7 models17 CLAUDE_3_7_SONNET = "claude-3-7-sonnet-20250219"18 19 # Claude 3.5 models20 CLAUDE_3_5_SONNET = "claude-3-5-sonnet-20241022"21 CLAUDE_3_5_HAIKU = "claude-3-5-haiku-20241022"22 23 # Claude 3 models24 CLAUDE_3_SONNET = "claude-3-sonnet-20240229"25 CLAUDE_3_HAIKU = "claude-3-haiku-20240307"26
27
28@dataclass29class ActorConfig:30 """Configuration for an individual Apify actor."""31 actor_id: str32 enabled: bool = True33 input: Dict[str, Any] = field(default_factory=dict)34 name: Optional[str] = None35 description: Optional[str] = None36
37
38@dataclass39class HTMLPruningConfig:40 """Configuration for HTML pruning behavior."""41 enabled: bool = True42 max_list_items: int = 543 max_text_length: int = 50044 prune_before_evaluation: bool = True45 prune_percentage: float = 0.8 # Keep 80% of content, remove 20%46
47
48@dataclass49class PipelineConfig:50 """Complete pipeline configuration."""51 # Core settings52 for_actor: bool = False53 test_script: bool = False54 output_script_path: Optional[str] = None55 56 # Claude settings57 claude_model: ClaudeModel = ClaudeModel.CLAUDE_4_SONNET58 claude_api_key: Optional[str] = None59 60 # HTML processing settings61 html_pruning: HTMLPruningConfig = field(default_factory=HTMLPruningConfig)62 63 # Actor configurations64 actors: Dict[str, ActorConfig] = field(default_factory=dict)65 66 # Execution settings67 max_retries: int = 368 timeout_seconds: int = 6069 concurrent_actors: bool = True70 71 def get_enabled_actors(self) -> Dict[str, ActorConfig]:72 """Get only enabled actors."""73 return {name: config for name, config in self.actors.items() if config.enabled}74
75
76@dataclass77class ScrapingResult:78 """Result of the complete scraping pipeline."""79 success: bool80 generated_script: Optional[str] = None81 best_actor: Optional[str] = None82 schema: Optional[Dict[str, Any]] = None83 error_message: Optional[str] = None84 quality_scores: Optional[Dict[str, int]] = None85 extracted_data: Optional[List[Dict[str, Any]]] = None86
87
88@dataclass89class EvaluationResult:90 """Result of HTML quality evaluation."""91 score: int # 1-10 scale92 reasoning: str93
94
95@dataclass96class PreEvaluationResult:97 """Result of pre-evaluation checks before sending to Claude."""98 is_valid_html: bool99 score: Optional[int] = None # If we can determine score without Claude100 reasoning: Optional[str] = None101 should_continue_to_claude: bool = True102
103
104@dataclass105class GoalExtractionResult:106 """Result of extracting goal from natural language prompt."""107 goal: str108 url: str109 success: bool110 error_message: Optional[str] = None111
112
113def get_default_actor_configs() -> Dict[str, ActorConfig]:114 """Get default actor configurations with common Apify actors."""115 return {116 "cheerio-scraper": ActorConfig(117 actor_id="apify/cheerio-scraper",118 name="Cheerio Scraper",119 description="Fast jQuery-like server-side scraping",120 enabled=True,121 input={122 "maxRequestRetries": 3,123 "requestTimeoutSecs": 30,124 "maxRequestsPerCrawl": 1,125 "pseudoUrls": [],126 "linkSelector": "",127 "pageFunction": """128 async function pageFunction(context) {129 const { request, log, skipLinks, $ } = context;130 return {131 url: request.url,132 title: $('title').text(),133 html: $('html').html()134 };135 }136 """,137 "proxyConfiguration": {"useApifyProxy": True}138 }139 ),140 "web-scraper": ActorConfig(141 actor_id="apify/web-scraper",142 name="Web Scraper",143 description="Versatile web scraper with JavaScript support",144 enabled=True,145 input={146 "maxRequestRetries": 3,147 "requestTimeoutSecs": 30,148 "maxPagesPerCrawl": 1,149 "pageFunction": """150 async function pageFunction(context) {151 const { request, log, skipLinks, $ } = context;152 return {153 url: request.url,154 title: $('title').text(),155 html: $('html').html()156 };157 }158 """,159 "proxyConfiguration": {"useApifyProxy": True}160 }161 ),162 "website-content-crawler": ActorConfig(163 actor_id="apify/website-content-crawler",164 name="Website Content Crawler",165 description="Advanced crawler with Playwright support",166 enabled=True,167 input={168 "maxCrawlPages": 1,169 "crawler": "playwright",170 "proxyConfiguration": {"useApifyProxy": True}171 }172 ),173 "playwright-scraper": ActorConfig(174 actor_id="apify/playwright-scraper",175 name="Playwright Scraper",176 description="Modern browser automation with Playwright",177 enabled=False,178 input={179 "maxRequestRetries": 3,180 "requestTimeoutSecs": 30,181 "maxPagesPerCrawl": 1,182 "pageFunction": """183 async function pageFunction(context) {184 const { request, log, page } = context;185 const title = await page.title();186 const html = await page.content();187 return {188 url: request.url,189 title: title,190 html: html191 };192 }193 """,194 "proxyConfiguration": {"useApifyProxy": True}195 }196 ),197 "puppeteer-scraper": ActorConfig(198 actor_id="apify/puppeteer-scraper",199 name="Puppeteer Scraper",200 description="Chrome-based scraping with Puppeteer",201 enabled=False,202 input={203 "maxRequestRetries": 3,204 "requestTimeoutSecs": 30,205 "maxPagesPerCrawl": 1,206 "pageFunction": """207 async function pageFunction(context) {208 const { request, log, page } = context;209 const title = await page.title();210 const html = await page.content();211 return {212 url: request.url,213 title: title,214 html: html215 };216 }217 """,218 "proxyConfiguration": {"useApifyProxy": True}219 }220 ),221 "jsdom-scraper": ActorConfig(222 actor_id="apify/jsdom-scraper",223 name="JSDOM Scraper",224 description="Lightweight JavaScript DOM scraping",225 enabled=False,226 input={227 "maxRequestRetries": 3,228 "requestTimeoutSecs": 30,229 "maxPagesPerCrawl": 1,230 "pageFunction": """231 async function pageFunction(context) {232 const { request, log, window } = context;233 const $ = window.$;234 return {235 url: request.url,236 title: $('title').text(),237 html: $('html').html()238 };239 }240 """,241 "proxyConfiguration": {"useApifyProxy": True}242 }243 )244 }
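The configuration dataclasses above can be combined directly. A brief usage sketch, assuming the package is importable as `llmscraper` and using only the fields defined in this module:

```python
from llmscraper.models import ClaudeModel, PipelineConfig, get_default_actor_configs

# Build a pipeline configuration from the default actor set.
config = PipelineConfig(
    claude_model=ClaudeModel.CLAUDE_3_5_HAIKU,
    actors=get_default_actor_configs(),
    max_retries=2,
)

# Only actors with enabled=True are returned (cheerio-scraper, web-scraper,
# and website-content-crawler in the defaults above).
for name, actor in config.get_enabled_actors().items():
    print(name, "->", actor.actor_id)
```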
src/llmscraper/pipeline.py
1"""2Main pipeline for intelligent web scraping.3"""4
5import logging6from typing import Optional7
8from .models import ScrapingResult, PipelineConfig9from .scraping import MultiActorScraper10from .scraping.actor_multi_scraper import ActorMultiScraper11from .evaluation import HTMLQualityEvaluator12from .generation import ScriptGenerator, ScriptExecutor13from .utils import prune_html, validate_required_keys, get_api_key14
15
16class IntelligentScraperPipeline:17 """Main pipeline class that orchestrates the intelligent web scraping process."""18 19 def __init__(self, apify_token: str, claude_api_key: str, actor_logger=None, config: Optional[PipelineConfig] = None):20 """21 Initialize the pipeline with required API tokens.22 23 Args:24 apify_token: Apify API token for web scraping25 claude_api_key: Anthropic Claude API key for AI analysis26 actor_logger: Optional Actor logger for actor mode27 config: Optional pipeline configuration28 """29 # Validate API keys30 validated_keys = validate_required_keys(31 apify_token=apify_token,32 claude_api_key=claude_api_key33 )34 35 self.apify_token = validated_keys['apify_token']36 self.claude_api_key = validated_keys['claude_api_key']37 self.config = config or PipelineConfig()38 39 # Initialize components with configuration40 self.multi_scraper = MultiActorScraper(self.apify_token)41 self.actor_scraper = ActorMultiScraper() # For actor-to-actor communication42 self.quality_evaluator = HTMLQualityEvaluator(self.claude_api_key, self.config.claude_model)43 self.script_generator = ScriptGenerator(self.claude_api_key, self.config.claude_model)44 self.script_executor = ScriptExecutor()45 46 # Setup logging - use Actor logger if provided, otherwise standard logging47 self.logger = actor_logger if actor_logger else logging.getLogger(__name__)48 self.is_actor_mode = actor_logger is not None49 50 async def run_complete_pipeline(self, target_url: str, user_goal: str, 51 output_script_path: Optional[str] = None,52 prune_before_evaluation: bool = True,53 test_script: bool = False,54 for_actor: bool = False) -> ScrapingResult:55 """56 Run the complete intelligent scraping pipeline.57 58 Args:59 target_url: The URL to scrape60 user_goal: Natural language description of what to extract61 output_script_path: Path where to save the generated script (None for actor mode)62 prune_before_evaluation: If True, prune HTML before quality evaluation63 test_script: If True, test the generated script before finalizing64 for_actor: If True, generate script for Apify actor format65 66 Returns:67 ScrapingResult containing the outcome and generated artifacts68 """69 self.logger.info(f"PIPELINE: Starting intelligent scraping pipeline for: {target_url}")70 self.logger.info(f"PIPELINE: User goal: {user_goal}")71 self.logger.info(f"PIPELINE: Actor mode: {for_actor}")72 73 try:74 # Step 1: Run multiple actors to scrape the website75 self.logger.info("PIPELINE: Step 1: Running multi-actor scraping...")76 77 # Use actor-aware scraper if running inside an Apify actor78 if for_actor:79 self.logger.info("PIPELINE: Using actor-to-actor communication...")80 scraping_results = await self.actor_scraper.scrape_with_multiple_actors(target_url)81 else:82 self.logger.info("PIPELINE: Using client-based scraping...")83 # Use configured actors instead of hardcoded ones84 enabled_actors = self.config.get_enabled_actors()85 if enabled_actors:86 scraping_results = self.multi_scraper.scrape_with_multiple_actors(target_url, enabled_actors)87 else:88 scraping_results = self.multi_scraper.scrape_with_multiple_actors(target_url)89 90 if not any(content for content in scraping_results.values() if content):91 return ScrapingResult(92 success=False,93 error_message="All scraping actors failed to retrieve content"94 )95 96 # Step 2: Evaluate quality of each result97 self.logger.info("PIPELINE: Step 2: Evaluating HTML quality for each actor...")98 quality_scores, best_actor, best_html = self._evaluate_html_quality(99 scraping_results, 
user_goal, prune_before_evaluation100 )101 102 if not best_html:103 return ScrapingResult(104 success=False,105 error_message="No actor produced quality HTML content",106 quality_scores=quality_scores107 )108 109 self.logger.info(f"PIPELINE: Best actor selected: {best_actor} with score {quality_scores[best_actor]}/10")110 111 # Step 3: Prune the best HTML to reduce token count112 self.logger.info("PIPELINE: Step 3: Pruning HTML content...")113 114 # Use configuration for pruning settings115 if self.config.html_pruning.enabled:116 pruned_html = prune_html(117 best_html, 118 max_list_items=self.config.html_pruning.max_list_items, 119 max_text_length=self.config.html_pruning.max_text_length,120 prune_percentage=self.config.html_pruning.prune_percentage121 )122 else:123 pruned_html = best_html124 125 original_length = len(best_html)126 pruned_length = len(pruned_html)127 reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0128 129 self.logger.info(f"PIPELINE: HTML pruned: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")130 131 # Step 4: Generate Python scraping script132 self.logger.info("PIPELINE: Step 4: Generating Python scraping script...")133 generated_script = self.script_generator.generate_scraping_script(134 target_url, best_actor, pruned_html, user_goal, for_actor135 )136 137 if not generated_script:138 return ScrapingResult(139 success=False,140 error_message="Failed to generate scraping script",141 best_actor=best_actor,142 quality_scores=quality_scores143 )144 145 # Step 5: Test the script if requested146 extracted_data = None147 if test_script:148 self.logger.info("PIPELINE: Step 5: Testing generated script...")149 test_result = self.script_executor.test_script(generated_script, best_html)150 151 if test_result["success"]:152 self.logger.info(f"PIPELINE: ✅ Script test passed! 
Extracted {test_result.get('item_count', 0)} items")153 extracted_data = test_result["data"]154 else:155 self.logger.warning(f"PIPELINE: ⚠️ Script test failed: {test_result['error']}")156 # Continue anyway, but log the issue157 158 # Step 6: Save the generated script (only if not actor mode)159 if output_script_path and not for_actor:160 self.logger.info(f"PIPELINE: Step 6: Saving generated script to {output_script_path}")161 with open(output_script_path, 'w', encoding='utf-8') as f:162 f.write(generated_script)163 164 self.logger.info("PIPELINE: ✅ Pipeline completed successfully!")165 166 return ScrapingResult(167 success=True,168 generated_script=generated_script,169 best_actor=best_actor,170 quality_scores=quality_scores,171 extracted_data=extracted_data172 )173 174 except Exception as e:175 self.logger.error(f"PIPELINE: Pipeline failed with error: {str(e)}")176 return ScrapingResult(177 success=False,178 error_message=f"Pipeline error: {str(e)}"179 )180 181 def _evaluate_html_quality(self, scraping_results: dict, user_goal: str, 182 prune_before_evaluation: bool) -> tuple[dict, str, str]:183 """Evaluate HTML quality for each scraping result."""184 quality_scores = {}185 best_actor = None186 best_html = None187 best_score = 0188 189 for actor_name, html_content in scraping_results.items():190 if html_content:191 self.logger.info(f"PIPELINE: Evaluating {actor_name}...")192 193 # Optionally prune HTML before evaluation194 evaluation_html = html_content195 if prune_before_evaluation:196 original_length = len(html_content)197 # Use more aggressive pruning for evaluation198 evaluation_html = prune_html(199 html_content, 200 max_list_items=3, 201 max_text_length=100,202 prune_percentage=0.5 # More aggressive for evaluation203 )204 pruned_length = len(evaluation_html)205 reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0206 self.logger.info(f"PIPELINE: {actor_name} HTML pruned for evaluation: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")207 208 evaluation = self.quality_evaluator.evaluate_html_quality(user_goal, evaluation_html)209 210 if evaluation:211 quality_scores[actor_name] = evaluation.score212 self.logger.info(f"PIPELINE: {actor_name} quality score: {evaluation.score}/10 - {evaluation.reasoning}")213 214 if evaluation.score > best_score:215 best_score = evaluation.score216 best_actor = actor_name217 best_html = html_content # Keep original HTML, not pruned version218 else:219 quality_scores[actor_name] = 0220 self.logger.warning(f"PIPELINE: Failed to evaluate {actor_name}")221 else:222 quality_scores[actor_name] = 0223 self.logger.warning(f"PIPELINE: {actor_name} returned no content")224 225 return quality_scores, best_actor, best_html226
227
228async def run_intelligent_scraper(target_url: str, user_goal: str, 229 apify_token: Optional[str] = None,230 claude_api_key: Optional[str] = None,231 output_path: Optional[str] = "generated_scraper.py",232 prune_before_evaluation: bool = True,233 test_script: bool = False,234 for_actor: bool = False,235 actor_logger=None,236 config: Optional[PipelineConfig] = None) -> ScrapingResult:237 """238 Convenience function to run the complete intelligent scraping pipeline.239 240 Args:241 target_url: URL to scrape242 user_goal: Natural language description of extraction goal243 apify_token: Apify API token (uses APIFY_TOKEN env var if not provided)244 claude_api_key: Claude API key (uses CLAUDE_API_KEY env var if not provided)245 output_path: Path to save the generated script (None for actor mode)246 prune_before_evaluation: If True, prune HTML before quality evaluation247 test_script: If True, test the generated script before finalizing248 for_actor: If True, generate script for Apify actor format249 actor_logger: Optional Actor logger for actor mode250 config: Optional pipeline configuration251 252 Returns:253 ScrapingResult with the outcome254 """255 # Get tokens from environment if not provided256 if not apify_token:257 apify_token = get_api_key("APIFY_TOKEN")258 if not claude_api_key:259 claude_api_key = get_api_key("CLAUDE_API_KEY")260 261 if not apify_token:262 return ScrapingResult(263 success=False,264 error_message="APIFY_TOKEN not provided and not found in environment variables"265 )266 267 if not claude_api_key:268 return ScrapingResult(269 success=False,270 error_message="CLAUDE_API_KEY not provided and not found in environment variables"271 )272 273 # Create and run pipeline274 pipeline = IntelligentScraperPipeline(apify_token, claude_api_key, actor_logger, config)275 return await pipeline.run_complete_pipeline(276 target_url, user_goal, output_path, prune_before_evaluation, test_script, for_actor277 )
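A hedged usage sketch of the convenience function defined above; when `apify_token` and `claude_api_key` are omitted it falls back to the `APIFY_TOKEN` and `CLAUDE_API_KEY` environment variables:

```python
import asyncio

from llmscraper.pipeline import run_intelligent_scraper


async def main() -> None:
    # APIFY_TOKEN and CLAUDE_API_KEY are read from the environment if not passed in.
    result = await run_intelligent_scraper(
        target_url="https://books.toscrape.com/",
        user_goal="Extract book titles and prices",
        output_path="generated_scraper.py",
        test_script=True,
    )

    if result.success:
        print("Best actor:", result.best_actor)
        print("Quality scores:", result.quality_scores)
    else:
        print("Pipeline failed:", result.error_message)


if __name__ == "__main__":
    asyncio.run(main())
```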
src/llmscraper/claude/__init__.py
1"""2Claude conversation management package.3"""4
5from .manager import ClaudeManager, ToolCall, ConversationResult6
7__all__ = ['ClaudeManager', 'ToolCall', 'ConversationResult']
src/llmscraper/claude/manager.py
1"""2Claude conversation manager for LLM interactions and tool calling.3"""4
5import asyncio6import logging7from typing import Dict, List, Any, Optional, Callable8from dataclasses import dataclass9import json10import time11
12import anthropic13from anthropic.types import MessageParam, ContentBlockParam, Message14
15from ..mcp import MCPClient, MCPCallResult16
17
18@dataclass19class ToolCall:20 """Represents a tool call to be executed."""21 id: str22 name: str23 arguments: Dict[str, Any]24
25
26@dataclass27class ConversationResult:28 """Result from a conversation step."""29 message: str30 tool_calls: List[ToolCall]31 reasoning: str32 is_final: bool = False33 error: Optional[str] = None34
35
36class ClaudeManager:37 """Manages conversations with Claude and tool execution."""38 39 def __init__(self, api_key: str, model: str = "claude-3-5-haiku-latest", 40 max_tokens: int = 2048, max_tool_calls_per_round: int = 10):41 """42 Initialize Claude manager.43 44 Args:45 api_key: Anthropic API key46 model: Claude model to use47 max_tokens: Maximum tokens per response48 max_tool_calls_per_round: Maximum tool calls per conversation round49 """50 self.client = anthropic.Anthropic(api_key=api_key)51 self.model = model52 self.max_tokens = max_tokens53 self.max_tool_calls_per_round = max_tool_calls_per_round54 self.conversation: List[MessageParam] = []55 self.logger = logging.getLogger(__name__)56 57 # System prompt for LLM scraper58 self.system_prompt = ""59 60 def set_system_prompt(self, target_url: str, extraction_goal: str, 61 max_attempts: int, max_retries: int, max_time: int):62 """Set the system prompt with specific parameters."""63 self.system_prompt = f"""You are an expert web scraping agent that systematically tests Apify Actors to find the best one for a specific task.64
65TARGET: {target_url}66GOAL: {extraction_goal}67LIMITS: {max_attempts} attempts, {max_retries} retries per actor, {max_time} minutes68
69STRATEGY:701. Search for relevant actors using search-actors712. From search results, extract a prioritized list of actor names to test723. For each actor: get details, configure input, test it734. Analyze results and try next actor if current one fails745. Stop when you find an actor that successfully extracts the target data75
76ACTOR SELECTION PRIORITY:77- Domain-specific scrapers first (e.g., "linkedin-scraper" for LinkedIn URLs)78- Popular scrapers with high user counts and runs79- General web scrapers as fallback (web-scraper, cheerio-scraper, website-content-crawler, rag-web-browser)80
81TESTING APPROACH:82- Start with simple, minimal input configurations83- If actor fails, try different input parameters (proxy settings, timeouts, formats)84- Analyze failure patterns and adjust accordingly85- Don't waste attempts on obviously unsuitable actors86
87RESULT ANALYSIS:88- Look for actual extracted data relevant to the goal89- Check data quality and completeness90- Prefer actors that return structured, relevant data91- Stop testing when you find a working solution92
93IMPORTANT:94- Be systematic and efficient with your attempts95- Extract actor names from search results as a prioritized list96- Test actors one by one until success97- Focus on finding ANY working solution, then optimize if needed98- Provide clear reasoning for each decision99
100Start by searching for actors, then systematically test them."""101
102 async def process_query(self, query: str, mcp_client: MCPClient, 103 on_progress: Optional[Callable[[str, str], None]] = None) -> ConversationResult:104 """105 Process a user query with Claude, handling tool calls.106 107 Args:108 query: The user query/request109 mcp_client: MCP client for tool execution110 on_progress: Optional callback for progress updates (role, content)111 112 Returns:113 ConversationResult with the response and any tool calls made114 """115 try:116 self.logger.info(f"Processing query: {query[:100]}...")117 self.logger.debug(f"Full query: {query}")118 119 # Add user message to conversation120 self.conversation.append({"role": "user", "content": query})121 self.logger.debug(f"Added user message to conversation. Total messages: {len(self.conversation)}")122 123 # Get available tools124 tools = mcp_client.format_tools_for_claude()125 126 # Start conversation loop127 total_tool_calls = 0128 reasoning_parts = []129 all_tool_calls = []130 131 while total_tool_calls < self.max_tool_calls_per_round:132 # Create message with Claude133 response = await self._create_message(tools)134 135 if not response:136 break137 138 # Process response blocks139 assistant_content = []140 tool_use_blocks = []141 text_content = ""142 143 for block in response.content:144 if block.type == 'text':145 assistant_content.append(block)146 text_content += block.text147 reasoning_parts.append(block.text)148 149 # Log Claude's text responses150 self.logger.info(f"Claude says: {block.text}")151 152 if on_progress:153 on_progress("assistant", block.text)154 155 elif block.type == 'tool_use':156 assistant_content.append(block)157 tool_use_blocks.append(block)158 159 # Log tool usage160 self.logger.info(f"Claude wants to use tool: {block.name} with args: {block.input}")161 162 tool_call = ToolCall(163 id=block.id,164 name=block.name,165 arguments=block.input166 )167 all_tool_calls.append(tool_call)168 169 if on_progress:170 on_progress("tool_call", f"Calling {block.name} with {block.input}")171 172 # Add assistant message to conversation173 self.conversation.append({174 "role": "assistant", 175 "content": assistant_content176 })177 self.logger.debug(f"Added assistant message to conversation. 
Total messages: {len(self.conversation)}")178 179 # If no tool calls, we're done180 if not tool_use_blocks:181 self.logger.info("No tool calls in response, conversation complete")182 return ConversationResult(183 message=text_content,184 tool_calls=all_tool_calls,185 reasoning=" ".join(reasoning_parts),186 is_final=True187 )188 189 # Execute tool calls190 tool_results = []191 for block in tool_use_blocks:192 total_tool_calls += 1193 194 # Check limit195 if total_tool_calls > self.max_tool_calls_per_round:196 result_content = f"Tool call limit reached ({self.max_tool_calls_per_round})"197 break198 199 # Execute tool200 try:201 self.logger.info(f"Executing tool: {block.name} with input: {block.input}")202 result = await mcp_client.call_tool(block.name, block.input)203 204 # Format result content205 if isinstance(result.content, list):206 # Convert list to readable string207 result_content = json.dumps(result.content, indent=2)208 elif isinstance(result.content, dict):209 result_content = json.dumps(result.content, indent=2)210 else:211 result_content = str(result.content)212 213 # Log the raw result214 self.logger.info(f"Tool {block.name} returned {len(result_content)} chars")215 if result.is_error:216 self.logger.warning(f"Tool {block.name} failed: {result_content[:200]}...")217 else:218 self.logger.info(f"Tool {block.name} succeeded. Result preview: {result_content[:200]}...")219 220 # Truncate only if it would exceed context limits221 result_content = self._truncate_result_if_needed(result_content, block.name)222 223 if result.is_error:224 result_content = f"Error: {result_content}"225 226 except Exception as e:227 self.logger.error(f"Tool execution failed for {block.name}: {str(e)}")228 result_content = f"Tool execution failed: {str(e)}"229 result = MCPCallResult(content=result_content, is_error=True)230 231 # Create tool result block232 tool_result = {233 "type": "tool_result",234 "tool_use_id": block.id,235 "content": result_content,236 "is_error": getattr(result, 'is_error', False)237 }238 239 tool_results.append(tool_result)240 241 if on_progress:242 status = "ERROR" if getattr(result, 'is_error', False) else "SUCCESS"243 on_progress("tool_result", f"{block.name}: {status}")244 245 # Add tool results to conversation246 if tool_results:247 self.conversation.append({248 "role": "user",249 "content": tool_results250 })251 self.logger.debug(f"Added tool results to conversation. 
Total messages: {len(self.conversation)}")252 253 # Continue conversation loop254 await asyncio.sleep(0.1) # Small delay to prevent rate limiting255 256 # Return final result257 return ConversationResult(258 message=" ".join(reasoning_parts),259 tool_calls=all_tool_calls,260 reasoning=" ".join(reasoning_parts),261 is_final=total_tool_calls >= self.max_tool_calls_per_round,262 error="Tool call limit reached" if total_tool_calls >= self.max_tool_calls_per_round else None263 )264 265 except Exception as e:266 self.logger.error(f"Error processing query: {str(e)}")267 return ConversationResult(268 message=f"Error processing query: {str(e)}",269 tool_calls=[],270 reasoning="",271 error=str(e)272 )273 274 async def _create_message(self, tools: List[Dict[str, Any]]) -> Optional[Message]:275 """Create a message with Claude."""276 try:277 response = await asyncio.to_thread(278 self.client.messages.create,279 model=self.model,280 max_tokens=self.max_tokens,281 system=self.system_prompt,282 messages=self.conversation,283 tools=tools if tools else None284 )285 return response286 287 except Exception as e:288 self.logger.error(f"Error creating message: {str(e)}")289 return None290 291 def _truncate_result_if_needed(self, content: str, tool_name: str) -> str:292 """Truncate results only if they would exceed Claude's context limits."""293 # Claude models have different context limits:294 # Haiku: ~200K tokens (~800K chars)295 # Sonnet: ~200K tokens (~800K chars) 296 # Opus: ~200K tokens (~800K chars)297 298 # Conservative limit to leave room for conversation history and system prompt299 max_chars_per_result = 50000 # ~12.5K tokens300 301 if len(content) <= max_chars_per_result:302 return content303 304 self.logger.warning(f"Truncating {tool_name} result from {len(content)} to {max_chars_per_result} chars")305 306 # For search results, keep the structure but limit items307 if tool_name == "search-actors" and content.startswith('['):308 try:309 data = json.loads(content)310 if isinstance(data, list) and len(data) > 10:311 # Keep first 10 results for search instead of 5312 truncated = data[:10]313 result = json.dumps(truncated, indent=2)314 if len(result) <= max_chars_per_result:315 return result + f"\n... (showing 10 of {len(data)} results)"316 except:317 pass318 319 # For actor execution results, try to preserve key data320 if tool_name in ["run-actor", "execute-actor"]:321 try:322 data = json.loads(content)323 if isinstance(data, list) and len(data) > 5:324 # Keep first 5 items for actor results325 truncated = data[:5]326 result = json.dumps(truncated, indent=2)327 if len(result) <= max_chars_per_result:328 return result + f"\n... [Showing 5 of {len(data)} scraped items. Total result was {len(content)} chars.]"329 except:330 pass331 332 # Generic truncation with smart ending333 truncated = content[:max_chars_per_result]334 335 # Try to end at a complete JSON object or line336 if content.startswith('{') or content.startswith('['):337 # Find last complete line338 last_newline = truncated.rfind('\n')339 if last_newline > max_chars_per_result // 2:340 truncated = truncated[:last_newline]341 342 if tool_name in ["run-actor", "execute-actor"]:343 truncated += f"\n\n... [Result truncated - original length: {len(content)} chars. Contains scraped data that was cut for context management.]"344 else:345 truncated += f"\n\n... 
[Truncated - original length: {len(content)} chars]"346 347 return truncated348 349 def reset_conversation(self):350 """Reset the conversation history."""351 self.conversation = []352 353 def get_conversation_summary(self) -> str:354 """Get a summary of the conversation."""355 summary_parts = []356 357 for msg in self.conversation:358 role = msg["role"]359 content = msg["content"]360 361 if isinstance(content, str):362 summary_parts.append(f"{role}: {content[:100]}...")363 elif isinstance(content, list):364 text_parts = []365 for block in content:366 if isinstance(block, dict):367 if block.get("type") == "text":368 text_parts.append(block.get("text", "")[:50])369 elif block.get("type") == "tool_use":370 text_parts.append(f"[TOOL: {block.get('name', 'unknown')}]")371 elif block.get("type") == "tool_result":372 text_parts.append("[TOOL_RESULT]")373 374 summary_parts.append(f"{role}: {' '.join(text_parts)}")375 376 return "\n".join(summary_parts)
src/llmscraper/evaluation/__init__.py
1"""2Evaluation module for ScraperCodeGenerator.3"""4
5from .html_quality_evaluator import HTMLQualityEvaluator6
7__all__ = ["HTMLQualityEvaluator"]
src/llmscraper/evaluation/html_quality_evaluator.py
1"""2HTML quality evaluation using Claude AI.3"""4
5import json6import logging7import re8from typing import Optional9
10import anthropic11
12from ..models import EvaluationResult, PreEvaluationResult, ClaudeModel13
14
15class HTMLQualityEvaluator:16 """Evaluates HTML quality for web scraping using Claude AI."""17 18 def __init__(self, claude_api_key: str, claude_model: ClaudeModel = ClaudeModel.CLAUDE_3_5_SONNET):19 """Initialize with Claude API key and model."""20 if not claude_api_key or not claude_api_key.strip():21 raise ValueError("Claude API key cannot be empty")22 23 self.client = anthropic.Anthropic(api_key=claude_api_key)24 self.claude_model = claude_model25 self.logger = logging.getLogger(__name__)26 27 def evaluate_html_quality(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:28 """29 Evaluate HTML quality for data extraction.30 31 Args:32 user_goal: User's extraction goal33 html_content: HTML content to evaluate34 35 Returns:36 EvaluationResult or None if evaluation fails37 """38 try:39 # Pre-evaluation checks40 pre_eval = self._pre_evaluate_html(html_content)41 if not pre_eval.should_continue_to_claude:42 if pre_eval.score is not None:43 return EvaluationResult(score=pre_eval.score, reasoning=pre_eval.reasoning)44 return None45 46 # Claude evaluation47 return self._evaluate_with_claude(user_goal, html_content)48 49 except Exception as e:50 self.logger.error(f"Error evaluating HTML quality: {str(e)}")51 return None52 53 def _pre_evaluate_html(self, html_content: str) -> PreEvaluationResult:54 """Perform basic HTML validation checks."""55 if not html_content or not html_content.strip():56 return PreEvaluationResult(57 is_valid_html=False,58 score=1,59 reasoning="Empty or whitespace-only HTML content",60 should_continue_to_claude=False61 )62 63 # Check for common failure indicators64 content_lower = html_content.lower()65 66 # Bot detection/blocking indicators67 blocking_indicators = [68 'please verify you are a human',69 'access denied',70 'blocked',71 'captcha',72 'cloudflare',73 'ddos protection',74 'security check',75 'bot detected'76 ]77 78 for indicator in blocking_indicators:79 if indicator in content_lower:80 return PreEvaluationResult(81 is_valid_html=False,82 score=1,83 reasoning=f"HTML appears to be blocked/bot-detected (found: '{indicator}')",84 should_continue_to_claude=False85 )86 87 # Check for minimal HTML structure88 if not re.search(r'<html|<body|<div|<p|<span', content_lower):89 return PreEvaluationResult(90 is_valid_html=False,91 score=2,92 reasoning="HTML lacks basic structural elements",93 should_continue_to_claude=False94 )95 96 return PreEvaluationResult(97 is_valid_html=True,98 should_continue_to_claude=True99 )100 101 def _evaluate_with_claude(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:102 """Evaluate HTML using Claude AI."""103 try:104 prompt = self._create_evaluation_prompt(user_goal, html_content)105 106 response = self.client.messages.create(107 model=self.claude_model.value,108 max_tokens=500,109 messages=[{"role": "user", "content": prompt}]110 )111 112 content = response.content[0].text113 return self._parse_evaluation_response(content)114 115 except Exception as e:116 self.logger.error(f"Error in Claude evaluation: {str(e)}")117 return None118 119 def _create_evaluation_prompt(self, user_goal: str, html_content: str) -> str:120 """Create the evaluation prompt for Claude."""121 return f"""You are an expert web scraper evaluator. Analyze the provided HTML and determine how suitable it is for extracting the requested data.122
123USER EXTRACTION GOAL:124{user_goal}125
126HTML CONTENT TO EVALUATE:127{html_content}128
129Evaluate the HTML on a scale of 1-10 based on:1301. Presence of the target data elements1312. HTML structure quality and accessibility1323. Whether the page loaded correctly (not blocked, error page, etc.)1334. How easy it would be to extract the requested data134
135Return your evaluation in this EXACT JSON format:136{{137 "score": [1-10 integer],138 "reasoning": "[brief explanation of the score]"139}}140
141Only return the JSON, no other text.142"""143 144 def _parse_evaluation_response(self, response: str) -> Optional[EvaluationResult]:145 """Parse Claude's evaluation response."""146 try:147 # Extract JSON from response148 json_match = re.search(r'\{.*\}', response, re.DOTALL)149 if not json_match:150 raise ValueError("No JSON found in response")151 152 data = json.loads(json_match.group())153 154 score = data.get('score')155 reasoning = data.get('reasoning', '')156 157 if not isinstance(score, int) or score < 1 or score > 10:158 raise ValueError(f"Invalid score: {score}")159 160 return EvaluationResult(score=score, reasoning=reasoning)161 162 except Exception as e:163 self.logger.error(f"Error parsing evaluation response: {str(e)}")164 return None
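A minimal usage sketch for HTMLQualityEvaluator might look like the following; the `llmscraper.evaluation.quality_evaluator` import path is an assumption based on the package layout in this listing, and the call only reaches Claude after the cheap pre-checks above pass (a valid Anthropic API key is required):

```python
import os

# Import path assumed from the package layout shown in this listing.
from llmscraper.evaluation.quality_evaluator import HTMLQualityEvaluator

evaluator = HTMLQualityEvaluator(claude_api_key=os.environ["ANTHROPIC_API_KEY"])

html = "<html><body><div class='product'>Widget - $9.99</div></body></html>"
result = evaluator.evaluate_html_quality(
    user_goal="Extract product names and prices",
    html_content=html,
)
if result is not None:
    print(result.score, result.reasoning)  # integer 1-10 plus a short explanation
```

Blocked or structurally empty pages are scored 1-2 by `_pre_evaluate_html` without spending an API call.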
src/llmscraper/generation/__init__.py
1"""2Generation module for ScraperCodeGenerator.3"""4
5from .script_generator import ScriptGenerator6from .script_executor import ScriptExecutor7
8__all__ = ["ScriptGenerator", "ScriptExecutor"]
src/llmscraper/generation/script_executor.py
1"""2Script execution and testing functionality.3"""4
5import subprocess6import tempfile7import os8import json9import logging10from typing import Dict, Any, Optional11import ast12import traceback13
14
15class ScriptExecutor:16 """Executes and tests generated scraping scripts."""17 18 def __init__(self):19 """Initialize the script executor."""20 self.logger = logging.getLogger(__name__)21 22 def test_script(self, script_content: str, html_content: str) -> Dict[str, Any]:23 """24 Test a scraping script against sample HTML content.25 26 Args:27 script_content: The Python script to test28 html_content: Sample HTML to test against29 30 Returns:31 Dict with test results including success, data, and errors32 """33 try:34 # Extract the extract_data function from the script35 extract_function = self._extract_function_from_script(script_content, 'extract_data')36 37 if not extract_function:38 return {39 "success": False,40 "error": "Could not find extract_data function in script",41 "data": None42 }43 44 # Create a safe execution environment45 safe_globals = {46 '__builtins__': {47 'len': len,48 'str': str,49 'int': int,50 'float': float,51 'bool': bool,52 'list': list,53 'dict': dict,54 'range': range,55 'enumerate': enumerate,56 'zip': zip,57 'isinstance': isinstance,58 'hasattr': hasattr,59 'getattr': getattr,60 'print': print,61 '__import__': __import__,62 }63 }64 65 # Import necessary modules into the environment66 exec("from bs4 import BeautifulSoup", safe_globals)67 exec("import re", safe_globals)68 exec("import json", safe_globals)69 70 # Execute the function definition71 exec(extract_function, safe_globals)72 73 # Call the function with the HTML content74 extracted_data = safe_globals['extract_data'](html_content)75 76 return {77 "success": True,78 "data": extracted_data,79 "error": None,80 "data_type": type(extracted_data).__name__,81 "item_count": len(extracted_data) if isinstance(extracted_data, (list, dict)) else 182 }83 84 except Exception as e:85 self.logger.error(f"Error testing script: {str(e)}")86 return {87 "success": False,88 "error": str(e),89 "data": None,90 "traceback": traceback.format_exc()91 }92 93 def _extract_function_from_script(self, script_content: str, function_name: str) -> Optional[str]:94 """Extract a specific function from a script."""95 try:96 # Parse the script into an AST97 tree = ast.parse(script_content)98 99 # Find the function definition100 for node in ast.walk(tree):101 if isinstance(node, ast.FunctionDef) and node.name == function_name:102 # Get the source code of the function103 lines = script_content.split('\n')104 start_line = node.lineno - 1105 106 # Find the end of the function107 end_line = start_line + 1108 while end_line < len(lines):109 line = lines[end_line]110 # Check if this line starts a new function or class111 if line.strip() and not line.startswith(' ') and not line.startswith('\t'):112 break113 end_line += 1114 115 return '\n'.join(lines[start_line:end_line])116 117 return None118 119 except Exception as e:120 self.logger.error(f"Error extracting function: {str(e)}")121 return None122 123 def validate_script_syntax(self, script_content: str) -> Dict[str, Any]:124 """125 Validate the syntax of a Python script.126 127 Args:128 script_content: The Python script to validate129 130 Returns:131 Dict with validation results132 """133 try:134 # Try to parse the script135 ast.parse(script_content)136 137 return {138 "valid": True,139 "error": None140 }141 142 except SyntaxError as e:143 return {144 "valid": False,145 "error": f"Syntax error: {str(e)}",146 "line": e.lineno,147 "offset": e.offset148 }149 except Exception as e:150 return {151 "valid": False,152 "error": f"Parse error: {str(e)}"153 }154 155 def run_script_in_sandbox(self, 
script_content: str, timeout: int = 60) -> Dict[str, Any]:156 """157 Run a complete script in a sandboxed environment.158 159 Args:160 script_content: The complete Python script161 timeout: Maximum execution time in seconds162 163 Returns:164 Dict with execution results165 """166 try:167 # Create a temporary file168 with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as temp_file:169 temp_file.write(script_content)170 temp_file_path = temp_file.name171 172 try:173 # Run the script174 result = subprocess.run(175 ['python', temp_file_path],176 capture_output=True,177 text=True,178 timeout=timeout,179 cwd=os.path.dirname(temp_file_path)180 )181 182 return {183 "success": result.returncode == 0,184 "stdout": result.stdout,185 "stderr": result.stderr,186 "return_code": result.returncode187 }188 189 finally:190 # Clean up the temporary file191 os.unlink(temp_file_path)192 193 except subprocess.TimeoutExpired:194 return {195 "success": False,196 "stdout": "",197 "stderr": f"Script execution timed out after {timeout} seconds",198 "return_code": -1199 }200 except Exception as e:201 return {202 "success": False,203 "stdout": "",204 "stderr": str(e),205 "return_code": -1206 }
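Because `test_script` only needs the `extract_data` function and a sample HTML string, the executor can be exercised locally without touching Apify. A rough sketch, assuming `beautifulsoup4` is installed and the package is importable as `llmscraper`:

```python
from llmscraper.generation import ScriptExecutor

executor = ScriptExecutor()

# A tiny generated-style script containing only the extract_data function.
generated = '''
def extract_data(html_content):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    return [{"title": h1.get_text(strip=True)} for h1 in soup.find_all("h1")]
'''

syntax = executor.validate_script_syntax(generated)
print(syntax)  # {'valid': True, 'error': None} when the script parses

result = executor.test_script(generated, "<html><body><h1>Hello</h1></body></html>")
print(result["success"], result["item_count"], result["data"])  # True, 1 item
```

`run_script_in_sandbox`, by contrast, writes the full script to a temporary file and runs it in a subprocess with a timeout.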
src/llmscraper/generation/script_generator.py
1"""2Code generation functionality for creating scraping scripts.3"""4
5import logging6from typing import Optional7import re8
9import anthropic10from ..models import ClaudeModel11
12
13class ScriptGenerator:14 """Generates Python scraping scripts using Claude AI."""15 16 def __init__(self, claude_api_key: str, claude_model: ClaudeModel = ClaudeModel.CLAUDE_3_5_SONNET):17 """Initialize with Claude API key and model."""18 if not claude_api_key or not claude_api_key.strip():19 raise ValueError("Claude API key cannot be empty")20 21 self.client = anthropic.Anthropic(api_key=claude_api_key)22 self.claude_model = claude_model23 self.logger = logging.getLogger(__name__)24 25 def generate_scraping_script(self, target_url: str, best_actor: str, 26 pruned_html: str, user_goal: str, 27 for_actor: bool = False) -> Optional[str]:28 """29 Generate a complete Python scraping script.30 31 Args:32 target_url: The target URL to scrape33 best_actor: Name of the best performing actor34 pruned_html: Sample HTML content for reference35 user_goal: User's extraction goal36 for_actor: If True, generate for Apify actor (key-value store output)37 38 Returns:39 Complete Python script as string, or None if generation fails40 """41 try:42 # Generate the HTML parsing code from Claude43 parsing_code = self._generate_html_parsing_code(pruned_html, user_goal)44 45 if not parsing_code:46 self.logger.error("Failed to generate HTML parsing code")47 return None48 49 # Create the complete script50 if for_actor:51 return self._create_actor_script(target_url, best_actor, parsing_code, user_goal)52 else:53 return self._create_standalone_script(target_url, best_actor, parsing_code, user_goal)54 55 except Exception as e:56 self.logger.error(f"Error generating script: {str(e)}")57 return None58 59 def _generate_html_parsing_code(self, pruned_html: str, user_goal: str) -> Optional[str]:60 """Generate HTML parsing/extraction code using Claude."""61 try:62 prompt = f"""Generate Python code that parses HTML content and extracts data based on the user's goal. You should generate ONLY the extraction logic, not a complete script.63
64## USER GOAL:65{user_goal}66
67## SAMPLE HTML (for reference):68{pruned_html}69
70## REQUIREMENTS:711. Create a function called `extract_data(html_content)` that takes HTML string as input722. Use BeautifulSoup to parse the HTML733. Extract the data according to the user's goal using CSS selectors, attributes, text content, etc.744. Return the extracted data as a Python dictionary or list of dictionaries755. Handle missing or malformed data gracefully766. Include appropriate error handling77
78## EXAMPLE OUTPUT FORMAT:79```python80def extract_data(html_content):81 from bs4 import BeautifulSoup82 83 soup = BeautifulSoup(html_content, 'html.parser')84 results = []85 86 # Your extraction logic here87 # Use soup.find(), soup.find_all(), CSS selectors, etc.88 89 return results90```91
92Generate ONLY the `extract_data` function and any helper functions needed. Do not include imports outside the function, full scripts, or other boilerplate code."""93
94 self.logger.info("Requesting HTML parsing code generation from Claude...")95 96 response = self.client.messages.create(97 model=self.claude_model.value,98 max_tokens=2000,99 messages=[{"role": "user", "content": prompt}]100 )101 102 parsing_code = response.content[0].text103 104 # Extract Python code from response if wrapped in code blocks105 if "```python" in parsing_code:106 code_match = re.search(r'```python\n(.*?)\n```', parsing_code, re.DOTALL)107 if code_match:108 parsing_code = code_match.group(1)109 110 return parsing_code111 112 except Exception as e:113 self.logger.error(f"Error generating HTML parsing code: {str(e)}")114 return None115 116 def _create_standalone_script(self, target_url: str, best_actor: str, 117 parsing_code: str, user_goal: str) -> str:118 """Create a standalone Python script."""119 return f'''#!/usr/bin/env python3120"""121Generated Web Scraper122Target: {target_url}123Goal: {user_goal}124Best Actor: {best_actor}125Generated by: ScraperCodeGenerator126
127This script is completely standalone and does not require the original ScraperCodeGenerator project.128"""129
130import os131import json132import logging133from typing import Dict, Any, List, Optional134
135# Check and import required libraries136try:137 import requests138except ImportError:139 raise ImportError("requests not installed. Please install using: pip install requests")140
141try:142 from bs4 import BeautifulSoup143except ImportError:144 raise ImportError("beautifulsoup4 not installed. Please install using: pip install beautifulsoup4")145
146try:147 from apify_client import ApifyClient148except ImportError:149 raise ImportError("apify-client not installed. Please install using: pip install apify-client")150
151
152{parsing_code}153
154
155def run_actor_scraping(target_url: str, apify_token: str) -> Optional[str]:156 """157 Run the best performing actor to get HTML content.158 159 Args:160 target_url: URL to scrape161 apify_token: Apify API token162 163 Returns:164 HTML content or None if failed165 """166 client = ApifyClient(apify_token)167 168 # Actor configuration for {best_actor}169 actor_input = {{170 "startUrls": [{{"url": target_url}}],171 "maxRequestRetries": 3,172 "requestTimeoutSecs": 30,173 "maxPagesPerCrawl": 1,174 }}175 176 # Add actor-specific configuration177 if "{best_actor}" == "cheerio-scraper":178 actor_input.update(\{{179 "pageFunction": \'\'\'180 async function pageFunction(context) {{181 const {{ request, log, $ }} = context;182 try {{183 const title = $('title').text() || '';184 const html = $('html').html() || '';185 return {{186 url: request.url,187 title: title,188 html: html189 }};190 }} catch (error) {{191 log.error('Error in pageFunction:', error);192 return {{193 url: request.url,194 title: '',195 html: ''196 }};197 }}198 }}199 \'\'\',200 "proxyConfiguration": {{"useApifyProxy": True}}201 }})202 actor_id = "apify/cheerio-scraper"203 elif "{best_actor}" == "web-scraper":204 actor_input.update({{205 "pageFunction": \'\'\'206 async function pageFunction(context) {{207 const {{ request, log, page }} = context;208 try {{209 const title = await page.title();210 const html = await page.content();211 return {{212 url: request.url,213 title: title,214 html: html215 }};216 }} catch (error) {{217 log.error('Error in pageFunction:', error);218 return {{219 url: request.url,220 title: '',221 html: ''222 }};223 }}224 }}225 \'\'\',226 "proxyConfiguration": {{"useApifyProxy": True}}227 }})228 actor_id = "apify/web-scraper"229 elif "{best_actor}" == "website-content-crawler":230 actor_input = {{231 "startUrls": [{{"url": target_url}}],232 "maxCrawlPages": 1,233 "crawler": "playwright",234 "proxyConfiguration": {{"useApifyProxy": True}}235 }}236 actor_id = "apify/website-content-crawler"237 else:238 # Fallback to simple requests if actor not recognized239 logging.warning(f"Unknown actor '{best_actor}', falling back to requests")240 try:241 response = requests.get(target_url, timeout=30)242 response.raise_for_status()243 return response.text244 except Exception as e:245 logging.error(f"Failed to fetch with requests: {{e}}")246 return None247 248 try:249 # Run the actor250 logging.info(f"Running {{actor_id}} actor...")251 run = client.actor(actor_id).call(run_input=actor_input)252 253 # Get the dataset items254 dataset_client = client.dataset(run["defaultDatasetId"])255 items = list(dataset_client.iterate_items())256 257 if not items:258 logging.warning("No items returned from actor")259 return None260 261 # Extract HTML content262 item = items[0]263 html_content = item.get('html') or item.get('text') or item.get('markdown', '')264 265 if not html_content:266 logging.warning("No HTML content found in actor result")267 return None268 269 return html_content270 271 except Exception as e:272 logging.error(f"Error running actor: {{e}}")273 return None274
275
276def main():277 """Main function to run the scraper."""278 # Setup logging279 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')280 logger = logging.getLogger(__name__)281 282 # Configuration283 target_url = "{target_url}"284 apify_token = os.getenv("APIFY_TOKEN")285 286 if not apify_token:287 logger.error("APIFY_TOKEN environment variable not set")288 logger.info("Please set your Apify API token: export APIFY_TOKEN='your_token_here'")289 logger.info("Get your token at: https://console.apify.com/")290 return291 292 try:293 logger.info(f"🚀 Starting scraper for: {{target_url}}")294 logger.info(f"📝 Goal: {user_goal}")295 logger.info(f"🏆 Using best actor: {best_actor}")296 297 # Get HTML content using the best performing actor298 html_content = run_actor_scraping(target_url, apify_token)299 300 if not html_content:301 logger.error("Failed to get HTML content")302 return303 304 logger.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")305 306 # Extract data using the generated parsing code307 logger.info("🔍 Extracting data from HTML...")308 extracted_data = extract_data(html_content)309 310 if not extracted_data:311 logger.warning("No data was extracted from the HTML")312 return313 314 # Prepare final results315 results = {{316 "target_url": target_url,317 "extraction_goal": "{user_goal}",318 "actor_used": "{best_actor}",319 "data": extracted_data,320 "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1321 }}322 323 # Output results324 print("\\n" + "="*60)325 print("📊 EXTRACTION RESULTS")326 print("="*60)327 print(json.dumps(results, indent=2, ensure_ascii=False))328 329 # Save to file330 output_file = "extracted_data.json"331 with open(output_file, 'w', encoding='utf-8') as f:332 json.dump(results, f, indent=2, ensure_ascii=False)333 334 logger.info(f"💾 Results saved to {{output_file}}")335 logger.info(f"🎉 Successfully extracted {{results['total_items']}} items!")336 337 except Exception as e:338 logger.error(f"❌ Scraping failed: {{e}}")339 import traceback340 traceback.print_exc()341
342
343if __name__ == "__main__":344 main()345'''346 347 def _create_actor_script(self, target_url: str, best_actor: str, 348 parsing_code: str, user_goal: str) -> str:349 """Create a script for Apify actor."""350 return f'''"""351Apify Actor Script352Target: {target_url}353Goal: {user_goal}354Best Actor: {best_actor}355Generated by: ScraperCodeGenerator356
357This script is completely standalone and does not require the original ScraperCodeGenerator project.358"""359
360import json361from typing import Optional362
363# Check and import required libraries364try:365 from apify import Actor366except ImportError:367 raise ImportError("apify not installed. Please install using: pip install apify")368
369try:370 from bs4 import BeautifulSoup371except ImportError:372 raise ImportError("beautifulsoup4 not installed. Please install using: pip install beautifulsoup4")373
374try:375 from apify_client import ApifyClient376except ImportError:377 raise ImportError("apify-client not installed. Please install using: pip install apify-client")378
379
380{parsing_code}381
382
383async def run_actor_scraping(target_url: str, apify_token: str) -> Optional[str]:384 """385 Run the best performing actor to get HTML content.386 387 Args:388 target_url: URL to scrape389 apify_token: Apify API token390 391 Returns:392 HTML content or None if failed393 """394 client = ApifyClient(apify_token)395 396 # Actor configuration for {best_actor}397 actor_input = {{398 "startUrls": [{{"url": target_url}}],399 "maxRequestRetries": 3,400 "requestTimeoutSecs": 30,401 "maxPagesPerCrawl": 1,402 }}403 404 # Add actor-specific configuration405 if "{best_actor}" == "cheerio-scraper":406 actor_input.update({{407 "pageFunction": \'\'\'408 async function pageFunction(context) {{409 const {{ request, log, $ }} = context;410 try {{411 const title = $('title').text() || '';412 const html = $('html').html() || '';413 return {{414 url: request.url,415 title: title,416 html: html417 }};418 }} catch (error) {{419 log.error('Error in pageFunction:', error);420 return {{421 url: request.url,422 title: '',423 html: ''424 }};425 }}426 }}427 \'\'\',428 "proxyConfiguration": {{"useApifyProxy": True}}429 }})430 actor_id = "apify/cheerio-scraper"431 elif "{best_actor}" == "web-scraper":432 actor_input.update({{433 "pageFunction": \'\'\'434 async function pageFunction(context) {{435 const {{ request, log, page }} = context;436 try {{437 const title = await page.title();438 const html = await page.content();439 return {{440 url: request.url,441 title: title,442 html: html443 }};444 }} catch (error) {{445 log.error('Error in pageFunction:', error);446 return {{447 url: request.url,448 title: '',449 html: ''450 }};451 }}452 }}453 \'\'\',454 "proxyConfiguration": {{"useApifyProxy": True}}455 }})456 actor_id = "apify/web-scraper"457 elif "{best_actor}" == "website-content-crawler":458 actor_input = {{459 "startUrls": [{{"url": target_url}}],460 "maxCrawlPages": 1,461 "crawler": "playwright",462 "proxyConfiguration": {{"useApifyProxy": True}}463 }}464 actor_id = "apify/website-content-crawler"465 else:466 Actor.log.error(f"Unknown actor: {best_actor}")467 return None468 469 try:470 # Run the actor471 Actor.log.info(f"Running {{actor_id}} actor...")472 run = client.actor(actor_id).call(run_input=actor_input)473 474 # Get the dataset items475 dataset_client = client.dataset(run["defaultDatasetId"])476 items = list(dataset_client.iterate_items())477 478 if not items:479 Actor.log.warning("No items returned from actor")480 return None481 482 # Extract HTML content483 item = items[0]484 html_content = item.get('html') or item.get('text') or item.get('markdown', '')485 486 if not html_content:487 Actor.log.warning("No HTML content found in actor result")488 return None489 490 return html_content491 492 except Exception as e:493 Actor.log.error(f"Error running actor: {{e}}")494 return None495
496
497async def main():498 """Main actor function."""499 async with Actor:500 # Get input501 actor_input = await Actor.get_input() or {{}}502 target_url = actor_input.get('targetUrl', '{target_url}')503 user_goal = actor_input.get('userGoal', '{user_goal}')504 apify_token = actor_input.get('apifyToken') or Actor.config.token505 506 Actor.log.info(f"🚀 Starting scraper for: {{target_url}}")507 Actor.log.info(f"📝 Goal: {{user_goal}}")508 Actor.log.info(f"🏆 Using best actor: {best_actor}")509 510 try:511 # Get HTML content using the best performing actor512 html_content = await run_actor_scraping(target_url, apify_token)513 514 if not html_content:515 await Actor.fail(f"Failed to get HTML content from {best_actor} actor")516 return517 518 Actor.log.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")519 520 # Extract data using the generated parsing code521 Actor.log.info("🔍 Extracting data from HTML...")522 extracted_data = extract_data(html_content)523 524 if not extracted_data:525 Actor.log.warning("No data was extracted from the HTML")526 extracted_data = []527 528 # Prepare final results529 results = {{530 "target_url": target_url,531 "extraction_goal": user_goal,532 "actor_used": "{best_actor}",533 "data": extracted_data,534 "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1535 }}536 537 # Save to key-value store538 await Actor.set_value('OUTPUT', results)539 540 Actor.log.info(f"🎉 Successfully extracted {{results['total_items']}} items!")541 Actor.log.info("💾 Results saved to key-value store as 'OUTPUT'")542 543 except Exception as e:544 Actor.log.error(f"❌ Scraping failed: {{e}}")545 await Actor.fail(str(e))546
547
548if __name__ == "__main__":549 import asyncio550 asyncio.run(main())551'''
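A hedged sketch of driving the generator end to end (this performs a live Claude call; the ANTHROPIC_API_KEY environment variable is an assumption here, since the class simply takes whatever key string it is given):

```python
import os

from llmscraper.generation import ScriptGenerator

generator = ScriptGenerator(claude_api_key=os.environ["ANTHROPIC_API_KEY"])

script = generator.generate_scraping_script(
    target_url="https://example.com/products",
    best_actor="cheerio-scraper",
    pruned_html="<div class='product'><h2>Widget</h2><span class='price'>$9.99</span></div>",
    user_goal="Extract product names and prices",
    for_actor=False,  # True emits the Apify Actor variant instead
)

if script:
    with open("generated_scraper.py", "w", encoding="utf-8") as f:
        f.write(script)
```

The standalone variant writes its results to extracted_data.json, while the actor variant stores them in the key-value store under OUTPUT.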
src/llmscraper/llm_scraper/__init__.py
1"""2LLM Scraper package - Intelligent web scraping using Claude and Apify MCP.3"""4
5from .actor import LLMScraperActor, run_llm_scraper_actor, run_llm_scraper6from .models import (7 LLMScraperInput, LLMScraperOutput, ActorAttempt, 8 ScrapingStrategy, ProgressUpdate9)10from .retry_logic import RetryManager11from .quality_evaluator import DataQualityEvaluator, QualityMetrics12
13__all__ = [14 'LLMScraperActor',15 'run_llm_scraper_actor',16 'run_llm_scraper',17 'LLMScraperInput',18 'LLMScraperOutput',19 'ActorAttempt',20 'ScrapingStrategy',21 'ProgressUpdate',22 'RetryManager',23 'DataQualityEvaluator',24 'QualityMetrics'25]
src/llmscraper/llm_scraper/actor.py
1"""2Main LLM Scraper Actor implementation.3"""4
5import asyncio6import logging7import time8from datetime import datetime, timedelta9from typing import Dict, List, Any, Optional, Callable10from urllib.parse import urlparse11import json12import os13
14from apify import Actor15
16from ..mcp import MCPClient17from ..claude import ClaudeManager, ConversationResult18from .models import (19 LLMScraperInput, LLMScraperOutput, ActorAttempt, 20 ScrapingStrategy, ProgressUpdate21)22from .retry_logic import RetryManager23from .quality_evaluator import DataQualityEvaluator24
25
26class LLMScraperActor:27 """28 Main LLM Scraper Actor that uses Claude to discover and test Apify actors.29 """30 31 def __init__(self, input_config: LLMScraperInput):32 """Initialize the LLM Scraper Actor."""33 self.config = input_config34 self.start_time = datetime.now()35 self.logger = logging.getLogger(__name__)36 37 # Initialize components38 self.mcp_client: Optional[MCPClient] = None39 self.claude_manager: Optional[ClaudeManager] = None40 self.retry_manager = RetryManager(41 max_retries_per_actor=input_config.max_retries_per_actor42 )43 self.quality_evaluator = DataQualityEvaluator()44 45 # State tracking46 self.output = LLMScraperOutput(47 success=False,48 status="initializing",49 claude_model_used=input_config.model_name,50 mcp_server_used=input_config.mcp_url51 )52 self.current_attempts = 053 self.tested_actors: Dict[str, List[ActorAttempt]] = {}54 55 # Progress callback56 self.on_progress: Optional[Callable[[ProgressUpdate], None]] = None57 58 async def run(self) -> LLMScraperOutput:59 """60 Main execution method.61 62 Returns:63 LLMScraperOutput with results64 """65 try:66 self.logger.info(f"🚀 Starting LLM Scraper for {self.config.target_url}")67 68 # Validate input69 self.config.validate()70 71 # Initialize connections72 await self._initialize_connections()73 74 # Main scraping process75 await self._execute_scraping_process()76 77 # Finalize results78 self._finalize_results()79 80 self.logger.info(f"✅ LLM Scraper completed: {self.output.status}")81 return self.output82 83 except Exception as e:84 self.logger.error(f"❌ LLM Scraper failed: {str(e)}")85 self.output.success = False86 self.output.status = "failed"87 self.output.llm_reasoning = f"Execution failed: {str(e)}"88 self._finalize_results()89 return self.output90 91 async def _initialize_connections(self):92 """Initialize MCP client and Claude manager."""93 self._emit_progress("initializing", "Setting up connections...")94 95 # Get API keys96 anthropic_key = (self.config.anthropic_api_key or 97 os.getenv('CLAUDE_API_KEY') or98 os.getenv('CLAUDE_API_KEY'))99 100 if not anthropic_key:101 raise ValueError("Anthropic API key is required (anthropic_api_key or CLAUDE_API_KEY env var)")102 103 apify_token = os.getenv('APIFY_TOKEN')104 if not apify_token:105 raise ValueError("APIFY_TOKEN environment variable is required")106 107 # Initialize MCP client108 self.mcp_client = MCPClient(109 server_url=self.config.mcp_url,110 apify_token=apify_token111 )112 113 connection_success = await self.mcp_client.connect()114 if not connection_success:115 self.logger.warning("MCP server connection failed, using fallback mode")116 117 # Initialize Claude manager118 self.claude_manager = ClaudeManager(119 api_key=anthropic_key,120 model=self.config.model_name,121 max_tool_calls_per_round=min(self.config.max_actor_attempts, 15)122 )123 124 # Set system prompt with specific parameters125 self.claude_manager.set_system_prompt(126 target_url=self.config.target_url,127 extraction_goal=self.config.extraction_goal,128 max_attempts=self.config.max_actor_attempts,129 max_retries=self.config.max_retries_per_actor,130 max_time=self.config.max_time_minutes131 )132 133 self.logger.info("✅ Connections initialized successfully")134 135 async def _execute_scraping_process(self):136 """Execute the main scraping discovery and testing process."""137 # Phase 1: Actor Discovery138 self._emit_progress("discovery", "Discovering suitable actors...")139 candidate_actors = await self._discover_actors()140 141 if not candidate_actors:142 self.output.status = 
"no_actors_found"143 self.output.llm_reasoning = "No suitable actors found for the given task"144 return145 146 self.logger.info(f"Found {len(candidate_actors)} candidate actors to test")147 for i, actor in enumerate(candidate_actors[:10], 1):148 self.logger.info(f" {i}. {actor['name']} - {actor['title']} (priority: {actor['priority']:.2f})")149 if len(candidate_actors) > 10:150 self.logger.info(f" ... and {len(candidate_actors) - 10} more actors")151 152 # Phase 2: Actor Testing153 self._emit_progress("testing", f"Testing {len(candidate_actors)} actors...")154 await self._test_actors(candidate_actors)155 156 # Determine final status157 if self._has_successful_attempts():158 self.output.success = True159 self.output.status = "completed"160 elif self._time_exceeded():161 self.output.status = "timeout"162 elif self.current_attempts >= self.config.max_actor_attempts:163 self.output.status = "limit_reached"164 else:165 self.output.status = "no_successful_actors"166 167 def _build_discovery_query(self) -> str:168 """Build the initial query for Claude."""169 domain = urlparse(self.config.target_url).netloc170 171 query = f"""I need to find the best Apify actors to scrape this website:172
173TARGET URL: {self.config.target_url}174DOMAIN: {domain}175EXTRACTION GOAL: {self.config.extraction_goal}176
177Please search for relevant actors and provide me with a prioritized list of actor names to test.178
179Process:1801. Use search-actors to find actors specifically for this domain/platform1812. If no specific actors found, search for general web scraping actors1823. Analyze the search results and extract the most promising actor names1834. Provide a prioritized list based on relevance, popularity, and suitability184
185Focus on finding actor NAMES/IDs that I can test, not on executing them yet.186Start by searching for actors relevant to this task."""187 188 return query189 190 async def _discover_actors(self) -> List[Dict[str, Any]]:191 """192 Phase 1: Discover candidate actors for the scraping task.193 194 Returns:195 List of candidate actors with metadata196 """197 # Create discovery query198 query = self._build_discovery_query()199 200 # Process with Claude for discovery only201 result = await self.claude_manager.process_query(202 query=query,203 mcp_client=self.mcp_client,204 on_progress=self._handle_claude_progress205 )206 207 # Extract candidate actors from tool calls208 candidate_actors = []209 for tool_call in result.tool_calls:210 if tool_call.name == "search-actors":211 # Get the actual result from MCP client212 search_result = await self.mcp_client.call_tool(213 tool_call.name, 214 tool_call.arguments215 )216 217 if not search_result.is_error and isinstance(search_result.content, list):218 for actor_data in search_result.content:219 candidate_actors.append({220 "name": actor_data.get("name", ""),221 "title": actor_data.get("title", ""),222 "description": actor_data.get("description", ""),223 "username": actor_data.get("username", ""),224 "stats": actor_data.get("stats", {}),225 "priority": self._calculate_actor_priority(actor_data)226 })227 228 # Sort by priority (higher is better)229 candidate_actors.sort(key=lambda x: x["priority"], reverse=True)230 231 # Limit to max attempts232 candidate_actors = candidate_actors[:self.config.max_actor_attempts]233 234 self.logger.info(f"Discovered {len(candidate_actors)} candidate actors")235 for i, actor in enumerate(candidate_actors[:5]):236 self.logger.info(f" {i+1}. {actor['name']} (priority: {actor['priority']:.2f})")237 238 return candidate_actors239 240 def _calculate_actor_priority(self, actor_data: Dict[str, Any]) -> float:241 """Calculate priority score for an actor based on relevance and popularity."""242 score = 0.0243 244 name = actor_data.get("name", "").lower()245 description = actor_data.get("description", "").lower()246 stats = actor_data.get("stats", {})247 248 # Domain-specific bonus249 domain = urlparse(self.config.target_url).netloc.lower()250 if domain.replace("www.", "") in name:251 score += 10.0252 253 # Platform-specific bonuses254 platform_keywords = {255 "linkedin": ["linkedin"],256 "facebook": ["facebook", "fb"],257 "instagram": ["instagram", "insta"],258 "twitter": ["twitter", "x.com"],259 "youtube": ["youtube"],260 "amazon": ["amazon"],261 "shopify": ["shopify"],262 "ecommerce": ["shop", "store", "product", "ecommerce"]263 }264 265 for platform, keywords in platform_keywords.items():266 if any(keyword in domain for keyword in keywords):267 if any(keyword in name for keyword in keywords):268 score += 8.0269 elif any(keyword in description for keyword in keywords):270 score += 5.0271 272 # General scraper types273 if "web-scraper" in name:274 score += 4.0275 elif "cheerio-scraper" in name:276 score += 4.5277 elif "website-content-crawler" in name:278 score += 3.0279 elif "puppeteer" in name:280 score += 3.5281 282 # Popularity bonus283 users = stats.get("users", 0)284 runs = stats.get("runs", 0)285 286 if users > 1000:287 score += 2.0288 elif users > 100:289 score += 1.0290 291 if runs > 10000:292 score += 2.0293 elif runs > 1000:294 score += 1.0295 296 return score297 298 async def _test_actors(self, candidate_actors: List[Dict[str, Any]]):299 """300 Phase 2: Test each actor individually in separate conversations.301 302 
Args:303 candidate_actors: List of actors to test304 """305 for i, actor in enumerate(candidate_actors):306 if self.current_attempts >= self.config.max_actor_attempts:307 self.logger.info("Reached maximum attempts limit")308 break309 310 if self._time_exceeded():311 self.logger.info("Reached time limit")312 break313 314 actor_name = actor["name"]315 self.logger.info(f"Testing actor {i+1}/{len(candidate_actors)}: {actor_name}")316 317 # Test this actor in a separate conversation318 success = await self._test_single_actor(actor)319 320 if success and self.config.min_data_quality_score > 0:321 # If we found a good enough actor, we might stop here322 best_attempt = self._get_best_attempt()323 if (best_attempt and 324 best_attempt.data_quality_score >= self.config.min_data_quality_score):325 self.logger.info(f"Found satisfactory actor: {actor_name} " +326 f"(quality: {best_attempt.data_quality_score:.2f})")327 break328 329 async def _test_single_actor(self, actor_info: Dict[str, Any]) -> bool:330 """331 Test a single actor in its own conversation.332 333 Args:334 actor_info: Actor information from discovery phase335 336 Returns:337 True if actor was successful, False otherwise338 """339 actor_name = actor_info["name"]340 341 # Create a fresh Claude manager for this actor test342 test_claude = ClaudeManager(343 api_key=self.claude_manager.client.api_key,344 model=self.config.model_name,345 max_tool_calls_per_round=self.config.max_retries_per_actor + 2346 )347 348 # Set specialized system prompt for testing this specific actor349 test_claude.system_prompt = f"""You are testing a specific Apify actor to see if it can scrape the target data.350
351ACTOR TO TEST: {actor_name}352ACTOR DESCRIPTION: {actor_info.get('description', 'No description')}353
354TARGET URL: {self.config.target_url}355EXTRACTION GOAL: {self.config.extraction_goal}356
357YOUR TASK:3581. Get details about this actor using get-actor-details3592. Understand its input schema and capabilities3603. Configure optimal input parameters for the target URL and goal3614. Run the actor with those parameters3625. Analyze the results to determine if it successfully extracted the target data3636. If it fails, try up to {self.config.max_retries_per_actor} different input configurations364
365IMPORTANT:366- Focus ONLY on testing this specific actor367- Try different input configurations if the first attempt fails368- Look for the target data in the results369- Determine success based on whether relevant data was extracted370
371Start by getting details about the actor, then test it."""372 373 # Create test query374 test_query = f"""Please test the actor '{actor_name}' for my scraping task.375
376Target URL: {self.config.target_url}377Goal: {self.config.extraction_goal}378
379Test this actor systematically and determine if it can extract the required data."""380 381 # Run the test conversation382 self._emit_progress(383 "testing", 384 f"Testing {actor_name}...",385 actor_name=actor_name.split('/')[-1],386 attempt_number=1387 )388 389 try:390 result = await test_claude.process_query(391 query=test_query,392 mcp_client=self.mcp_client,393 on_progress=lambda role, content: self._handle_test_progress(actor_name, role, content)394 )395 396 # Analyze the tool calls to extract actor execution results397 success = await self._analyze_single_actor_test(actor_name, result.tool_calls)398 399 return success400 401 except Exception as e:402 self.logger.error(f"Error testing actor {actor_name}: {str(e)}")403 return False404 405 async def _analyze_single_actor_test(self, actor_name: str, tool_calls: List) -> bool:406 """407 Analyze the results of testing a single actor.408 409 Args:410 actor_name: Name of the actor being tested411 tool_calls: Tool calls made during the test412 413 Returns:414 True if the actor was successful415 """416 actor_executions = []417 418 # Find all actor execution attempts419 for tool_call in tool_calls:420 if tool_call.name in ["run-actor", "execute-actor"]:421 if tool_call.arguments.get('actor_id') == actor_name or tool_call.arguments.get('actor') == actor_name:422 actor_executions.append(tool_call)423 424 if not actor_executions:425 self.logger.warning(f"No executions found for {actor_name}")426 return False427 428 # Process each execution attempt429 best_attempt = None430 for i, execution in enumerate(actor_executions):431 attempt = await self._create_attempt_from_execution(actor_name, execution, i + 1)432 433 # Add to tracking434 if actor_name not in self.tested_actors:435 self.tested_actors[actor_name] = []436 self.tested_actors[actor_name].append(attempt)437 self.output.add_attempt(attempt)438 self.current_attempts += 1439 440 if attempt.success and (not best_attempt or attempt.data_quality_score > best_attempt.data_quality_score):441 best_attempt = attempt442 443 return best_attempt is not None and best_attempt.success444 445 async def _create_attempt_from_execution(self, actor_name: str, execution_tool_call, attempt_number: int) -> ActorAttempt:446 """Create an ActorAttempt from a tool call execution."""447 input_config = execution_tool_call.arguments.get('input', {})448 449 # Execute the actor to get real results450 result = await self.mcp_client.call_tool(execution_tool_call.name, execution_tool_call.arguments)451 452 attempt = ActorAttempt(453 actor_id=actor_name,454 actor_name=actor_name.split('/')[-1] if '/' in actor_name else actor_name,455 attempt_number=attempt_number,456 input_config=input_config,457 timestamp=datetime.now().isoformat()458 )459 460 if result.is_error:461 attempt.success = False462 attempt.error_message = str(result.content)463 attempt.error_type = "execution_error"464 self.logger.warning(f"Actor {actor_name} failed: {attempt.error_message}")465 else:466 # Analyze the results to determine success467 extracted_data = self._extract_data_from_result(result.content)468 469 if extracted_data:470 attempt.success = True471 attempt.extracted_data = extracted_data472 attempt.result_count = len(extracted_data) if isinstance(extracted_data, list) else 1473 attempt.data_quality_score = self._evaluate_data_quality(extracted_data)474 attempt.execution_time_seconds = 30.0 # Placeholder475 476 self.logger.info(f"Actor {actor_name} succeeded: {attempt.result_count} items, " +477 f"quality: {attempt.data_quality_score:.2f}")478 
else:479 attempt.success = False480 attempt.error_message = "No relevant data extracted"481 attempt.error_type = "no_data"482 self.logger.warning(f"Actor {actor_name} returned no relevant data")483 484 return attempt485 486 def _extract_data_from_result(self, result_content) -> List[Dict[str, Any]]:487 """Extract meaningful data from actor execution result."""488 if isinstance(result_content, list):489 return result_content490 elif isinstance(result_content, dict):491 # Look for common data fields492 if "items" in result_content:493 return result_content["items"]494 elif "data" in result_content:495 return result_content["data"]496 elif "results" in result_content:497 return result_content["results"]498 else:499 return [result_content]500 else:501 return []502 503 def _evaluate_data_quality(self, data: List[Dict[str, Any]]) -> float:504 """Evaluate the quality of extracted data."""505 if not data:506 return 0.0507 508 # Simple quality scoring based on data completeness509 total_score = 0.0510 511 for item in data:512 item_score = 0.0513 514 # Check for common useful fields515 if "title" in item or "name" in item:516 item_score += 0.3517 if "price" in item or "cost" in item:518 item_score += 0.2519 if "url" in item or "link" in item:520 item_score += 0.2521 if "description" in item:522 item_score += 0.2523 if len(item.keys()) > 3: # Has multiple fields524 item_score += 0.1525 526 total_score += min(item_score, 1.0)527 528 return min(total_score / len(data), 1.0)529 530 def _handle_test_progress(self, actor_name: str, role: str, content: str):531 """Handle progress updates during individual actor testing."""532 if role == "tool_call":533 self.logger.debug(f"[{actor_name}] Tool call: {content}")534 elif role == "tool_result":535 self.logger.debug(f"[{actor_name}] Tool result: {content[:100]}...")536 537 def _get_best_attempt(self) -> Optional[ActorAttempt]:538 """Get the best successful attempt so far."""539 best_attempt = None540 541 for attempts in self.tested_actors.values():542 for attempt in attempts:543 if attempt.success and (not best_attempt or 544 attempt.data_quality_score > best_attempt.data_quality_score):545 best_attempt = attempt546 547 return best_attempt548 549 def _handle_claude_progress(self, role: str, content: str):550 """Handle progress updates from Claude conversation."""551 if role == "tool_call":552 self._emit_progress("testing", content)553 elif role == "tool_result":554 self._emit_progress("analyzing", f"Tool result: {content}")555 556 def _has_successful_attempts(self) -> bool:557 """Check if we have any successful attempts."""558 return any(559 attempt.success for attempts in self.tested_actors.values() 560 for attempt in attempts561 )562 563 def _time_exceeded(self) -> bool:564 """Check if maximum execution time has been exceeded."""565 elapsed = datetime.now() - self.start_time566 return elapsed.total_seconds() > (self.config.max_time_minutes * 60)567 568 def _finalize_results(self):569 """Finalize the output results."""570 end_time = datetime.now()571 self.output.total_execution_time_seconds = (end_time - self.start_time).total_seconds()572 573 # Add recommendations based on results574 if self.output.success:575 self.output.add_recommendation(576 f"Use {self.output.best_actor_name} with the provided optimal configuration for best results"577 )578 else:579 self.output.add_recommendation(580 "Consider trying with different extraction goals or using manual actor configuration"581 )582 if self.current_attempts >= self.config.max_actor_attempts:583 
self.output.add_recommendation(584 "Increase max_actor_attempts limit to test more actors"585 )586 if self._time_exceeded():587 self.output.add_recommendation(588 "Increase max_time_minutes to allow more thorough testing"589 )590 591 self.output.calculate_performance_summary()592 593 self.logger.info(f"Final results: {self.output.total_attempts_made} attempts, "594 f"{self.output.unique_actors_tested} actors tested, "595 f"success: {self.output.success}")596 597 def _emit_progress(self, stage: str, message: str, **kwargs):598 """Emit a progress update."""599 # Calculate progress based on stage and current attempts600 progress = 0.0601 if stage == "initializing":602 progress = 0.1603 elif stage == "discovery":604 progress = 0.2605 elif stage == "testing":606 # Progress through testing based on attempts made607 max_attempts = self.config.max_actor_attempts608 progress = 0.2 + (0.6 * (self.current_attempts / max_attempts))609 elif stage == "analyzing":610 progress = 0.8611 elif stage == "completed":612 progress = 1.0613 614 update = ProgressUpdate(615 timestamp=datetime.now().isoformat(),616 stage=stage,617 message=message,618 progress=progress,619 **kwargs620 )621 622 self.logger.info(f"[{stage.upper()}] {message}")623 624 if self.on_progress:625 self.on_progress(update)626 627 def set_progress_callback(self, callback: Callable[[ProgressUpdate], None]):628 """Set progress callback for real-time updates."""629 self.on_progress = callback630
631
632# Actor entry point function633async def run_llm_scraper_actor() -> LLMScraperOutput:634 """635 Main entry point for running as an Apify Actor.636 637 Returns:638 LLMScraperOutput with results639 """640 async with Actor:641 Actor.log.info("🚀 LLM Scraper Actor starting...")642 643 # Get input644 actor_input = await Actor.get_input() or {}645 646 # Parse input647 try:648 input_config = LLMScraperInput(649 target_url=actor_input.get('targetUrl', ''),650 extraction_goal=actor_input.get('userGoal', ''),651 max_actor_attempts=actor_input.get('max_actor_attempts', 10),652 max_retries_per_actor=actor_input.get('max_retries_per_actor', 3),653 max_time_minutes=actor_input.get('max_time_minutes', 30),654 anthropic_api_key=actor_input.get('claudeApiKey'),655 mcp_url=actor_input.get('mcp_url', "https://mcp.apify.com/sse?enableAddingActors=true"),656 model_name=actor_input.get('model_name', "claude-3-5-haiku-latest"),657 debug_mode=actor_input.get('debug_mode', False),658 prefer_specific_actors=actor_input.get('prefer_specific_actors', True),659 min_data_quality_score=actor_input.get('min_data_quality_score', 0.7),660 enable_proxy=actor_input.get('enable_proxy', True)661 )662 except Exception as e:663 Actor.log.error(f"Invalid input: {str(e)}")664 await Actor.fail(f"Invalid input: {str(e)}")665 return LLMScraperOutput(success=False, status="failed", llm_reasoning=f"Input validation failed: {str(e)}")666 667 # Create and run scraper668 scraper = LLMScraperActor(input_config)669 670 # Set up progress callback to emit to Actor log671 def progress_callback(update: ProgressUpdate):672 Actor.log.info(f"[{update.stage.upper()}] {update.message}")673 # Push progress updates to dataset for real-time monitoring674 asyncio.create_task(Actor.push_data({675 "type": "progress",676 "timestamp": update.timestamp,677 "stage": update.stage,678 "message": update.message,679 "details": update.to_dict()680 }))681 682 scraper.set_progress_callback(progress_callback)683 684 # Run the scraper685 result = await scraper.run()686 687 # Push final results688 await Actor.push_data({689 "type": "final_result",690 **result.to_dict()691 })692 693 Actor.log.info(f"✅ LLM Scraper completed with status: {result.status}")694 695 return result696
697
698# Standalone function for use outside of Actor context699async def run_llm_scraper(input_config: LLMScraperInput, 700 progress_callback: Optional[Callable[[ProgressUpdate], None]] = None) -> LLMScraperOutput:701 """702 Run LLM Scraper outside of Apify Actor context.703 704 Args:705 input_config: Input configuration706 progress_callback: Optional callback for progress updates707 708 Returns:709 LLMScraperOutput with results710 """711 scraper = LLMScraperActor(input_config)712 713 if progress_callback:714 scraper.set_progress_callback(progress_callback)715 716 return await scraper.run()
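For completeness, a small sketch of the standalone path; it assumes APIFY_TOKEN and either CLAUDE_API_KEY or an explicit anthropic_api_key are available, as required by `_initialize_connections`:

```python
import asyncio

from llmscraper.llm_scraper import LLMScraperInput, run_llm_scraper

async def demo() -> None:
    config = LLMScraperInput(
        target_url="https://example.com/products",
        extraction_goal="Extract product names, prices, and product page URLs",
    )
    result = await run_llm_scraper(
        config,
        # ProgressUpdate carries stage, message and a 0.0-1.0 progress value.
        progress_callback=lambda update: print(f"[{update.stage}] {update.message}"),
    )
    print(result.status, result.best_actor_id, result.best_data_quality_score)

asyncio.run(demo())
```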
src/llmscraper/llm_scraper/models.py
1"""2Input and output models for LLM Scraper Actor.3"""4
5from dataclasses import dataclass, field6from typing import Dict, List, Any, Optional, Union7from datetime import datetime8import json9
10
11@dataclass12class LLMScraperInput:13 """Input schema for LLM Scraper Actor."""14 15 # Required fields16 target_url: str17 extraction_goal: str18 19 # Optional configuration20 max_actor_attempts: int = 1021 max_retries_per_actor: int = 322 max_time_minutes: int = 3023 24 # API keys (optional - can use environment variables)25 anthropic_api_key: Optional[str] = None26 27 # MCP configuration28 mcp_url: str = "https://mcp.apify.com/sse?enableAddingActors=true"29 model_name: str = "claude-3-5-haiku-latest"30 31 # Advanced options32 debug_mode: bool = False33 prefer_specific_actors: bool = True34 min_data_quality_score: float = 0.735 enable_proxy: bool = True36 37 def validate(self) -> bool:38 """Validate input parameters."""39 if not self.target_url or not self.target_url.startswith(('http://', 'https://')):40 raise ValueError("target_url must be a valid HTTP/HTTPS URL")41 42 if not self.extraction_goal or len(self.extraction_goal.strip()) < 10:43 raise ValueError("extraction_goal must be at least 10 characters describing what to extract")44 45 if self.max_actor_attempts < 1 or self.max_actor_attempts > 50:46 raise ValueError("max_actor_attempts must be between 1 and 50")47 48 if self.max_retries_per_actor < 1 or self.max_retries_per_actor > 10:49 raise ValueError("max_retries_per_actor must be between 1 and 10")50 51 if self.max_time_minutes < 1 or self.max_time_minutes > 120:52 raise ValueError("max_time_minutes must be between 1 and 120")53 54 return True55
56
57@dataclass58class ActorAttempt:59 """Represents a single actor execution attempt."""60 61 actor_id: str62 actor_name: str63 attempt_number: int64 input_config: Dict[str, Any]65 timestamp: str66 67 # Results68 success: bool = False69 execution_time_seconds: float = 0.070 result_count: int = 071 data_quality_score: float = 0.072 extracted_data: Optional[List[Dict[str, Any]]] = None73 74 # Error details75 error_message: Optional[str] = None76 error_type: Optional[str] = None77 78 # Metadata79 run_id: Optional[str] = None80 dataset_id: Optional[str] = None81 82 def to_dict(self) -> Dict[str, Any]:83 """Convert to dictionary for JSON serialization."""84 return {85 "actor_id": self.actor_id,86 "actor_name": self.actor_name,87 "attempt_number": self.attempt_number,88 "input_config": self.input_config,89 "timestamp": self.timestamp,90 "success": self.success,91 "execution_time_seconds": self.execution_time_seconds,92 "result_count": self.result_count,93 "data_quality_score": self.data_quality_score,94 "error_message": self.error_message,95 "error_type": self.error_type,96 "run_id": self.run_id,97 "dataset_id": self.dataset_id,98 "has_data": bool(self.extracted_data)99 }100
101
102@dataclass103class ScrapingStrategy:104 """Represents a scraping strategy (actor + configuration)."""105 106 actor_id: str107 actor_name: str108 priority: int # 1 = highest priority109 input_template: Dict[str, Any]110 expected_data_fields: List[str]111 reasoning: str112 113 # Success metrics114 success_rate: float = 0.0115 avg_quality_score: float = 0.0116 avg_execution_time: float = 0.0117 118 def to_dict(self) -> Dict[str, Any]:119 """Convert to dictionary."""120 return {121 "actor_id": self.actor_id,122 "actor_name": self.actor_name,123 "priority": self.priority,124 "input_template": self.input_template,125 "expected_data_fields": self.expected_data_fields,126 "reasoning": self.reasoning,127 "success_rate": self.success_rate,128 "avg_quality_score": self.avg_quality_score,129 "avg_execution_time": self.avg_execution_time130 }131
132
133@dataclass134class LLMScraperOutput:135 """Output schema for LLM Scraper Actor."""136 137 # Overall success138 success: bool139 status: str # "completed", "failed", "timeout", "limit_reached"140 141 # Best result142 best_actor_id: Optional[str] = None143 best_actor_name: Optional[str] = None144 optimal_input_config: Optional[Dict[str, Any]] = None145 final_extracted_data: Optional[List[Dict[str, Any]]] = None146 147 # Execution details148 total_execution_time_seconds: float = 0.0149 total_attempts_made: int = 0150 unique_actors_tested: int = 0151 152 # Quality metrics153 best_data_quality_score: float = 0.0154 best_result_count: int = 0155 156 # Strategy and reasoning157 llm_reasoning: str = ""158 strategies_discovered: List[ScrapingStrategy] = field(default_factory=list)159 all_attempts: List[ActorAttempt] = field(default_factory=list)160 161 # Performance analysis162 performance_summary: Dict[str, Any] = field(default_factory=dict)163 recommendations: List[str] = field(default_factory=list)164 165 # Metadata166 completion_timestamp: str = ""167 claude_model_used: str = ""168 mcp_server_used: str = ""169 170 def __post_init__(self):171 """Set completion timestamp."""172 if not self.completion_timestamp:173 self.completion_timestamp = datetime.now().isoformat()174 175 def add_attempt(self, attempt: ActorAttempt):176 """Add an attempt to the results."""177 self.all_attempts.append(attempt)178 self.total_attempts_made = len(self.all_attempts)179 180 # Update best result if this is better181 if (attempt.success and 182 attempt.data_quality_score > self.best_data_quality_score):183 184 self.best_actor_id = attempt.actor_id185 self.best_actor_name = attempt.actor_name186 self.optimal_input_config = attempt.input_config187 self.final_extracted_data = attempt.extracted_data188 self.best_data_quality_score = attempt.data_quality_score189 self.best_result_count = attempt.result_count190 191 # Update unique actors count192 unique_actors = set(a.actor_id for a in self.all_attempts)193 self.unique_actors_tested = len(unique_actors)194 195 def calculate_performance_summary(self):196 """Calculate performance metrics."""197 if not self.all_attempts:198 return199 200 successful_attempts = [a for a in self.all_attempts if a.success]201 202 self.performance_summary = {203 "total_attempts": len(self.all_attempts),204 "successful_attempts": len(successful_attempts),205 "success_rate": len(successful_attempts) / len(self.all_attempts),206 "avg_execution_time": sum(a.execution_time_seconds for a in self.all_attempts) / len(self.all_attempts),207 "avg_quality_score": sum(a.data_quality_score for a in successful_attempts) / len(successful_attempts) if successful_attempts else 0,208 "actors_tested": self.unique_actors_tested,209 "best_actor": self.best_actor_name,210 "total_runtime": self.total_execution_time_seconds211 }212 213 def add_recommendation(self, recommendation: str):214 """Add a recommendation for future improvements."""215 if recommendation not in self.recommendations:216 self.recommendations.append(recommendation)217 218 def to_dict(self) -> Dict[str, Any]:219 """Convert to dictionary for JSON output."""220 self.calculate_performance_summary()221 222 return {223 "success": self.success,224 "status": self.status,225 "best_actor_id": self.best_actor_id,226 "best_actor_name": self.best_actor_name,227 "optimal_input_config": self.optimal_input_config,228 "final_extracted_data": self.final_extracted_data,229 "total_execution_time_seconds": self.total_execution_time_seconds,230 "total_attempts_made": 
self.total_attempts_made,231 "unique_actors_tested": self.unique_actors_tested,232 "best_data_quality_score": self.best_data_quality_score,233 "best_result_count": self.best_result_count,234 "llm_reasoning": self.llm_reasoning,235 "strategies_discovered": [s.to_dict() for s in self.strategies_discovered],236 "all_attempts": [a.to_dict() for a in self.all_attempts],237 "performance_summary": self.performance_summary,238 "recommendations": self.recommendations,239 "completion_timestamp": self.completion_timestamp,240 "claude_model_used": self.claude_model_used,241 "mcp_server_used": self.mcp_server_used242 }243 244 def to_json(self, indent: int = 2) -> str:245 """Convert to JSON string."""246 return json.dumps(self.to_dict(), indent=indent, default=str)247 248 # Convenience properties for backward compatibility249 @property250 def quality_score(self) -> float:251 """Alias for best_data_quality_score."""252 return self.best_data_quality_score253 254 @property255 def scraped_data(self) -> Optional[List[Dict[str, Any]]]:256 """Alias for final_extracted_data."""257 return self.final_extracted_data or []258 259 @property260 def total_execution_time(self) -> float:261 """Alias for total_execution_time_seconds."""262 return self.total_execution_time_seconds263 264 @property265 def actor_attempts(self) -> List[ActorAttempt]:266 """Alias for all_attempts."""267 return self.all_attempts268
269
270@dataclass271class ProgressUpdate:272 """Progress update for real-time status reporting."""273 274 timestamp: str275 stage: str # "discovery", "testing", "retrying", "analyzing", "completed"276 message: str277 progress: float = 0.0 # 0.0 to 1.0 representing completion percentage278 actor_name: Optional[str] = None279 attempt_number: Optional[int] = None280 success: Optional[bool] = None281 details: Optional[Dict[str, Any]] = None282 283 def __post_init__(self):284 """Set timestamp if not provided."""285 if not self.timestamp:286 self.timestamp = datetime.now().isoformat()287 288 def to_dict(self) -> Dict[str, Any]:289 """Convert to dictionary."""290 return {291 "timestamp": self.timestamp,292 "stage": self.stage,293 "message": self.message,294 "progress": self.progress,295 "actor_name": self.actor_name,296 "attempt_number": self.attempt_number,297 "success": self.success,298 "details": self.details or {}299 }
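The dataclasses above are plain Python, so their validation can be checked in isolation. A small sketch of LLMScraperInput.validate() behaviour, importing from the package-level exports shown earlier:

```python
from llmscraper.llm_scraper import LLMScraperInput

config = LLMScraperInput(
    target_url="https://example.com/blog",
    extraction_goal="Extract article headlines and publication dates",
    max_actor_attempts=5,
)
config.validate()  # returns True when all constraints pass

bad = LLMScraperInput(
    target_url="example.com",  # missing scheme
    extraction_goal="Extract article headlines",
)
try:
    bad.validate()
except ValueError as err:
    print(err)  # "target_url must be a valid HTTP/HTTPS URL"
```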
src/llmscraper/llm_scraper/quality_evaluator.py
1"""2Data quality evaluator for assessing scraping results.3"""4
5import logging6from typing import Dict, List, Any, Optional, Tuple7from dataclasses import dataclass8import re9import json10
@dataclass
class QualityMetrics:
    """Quality metrics for scraped data."""

    completeness_score: float  # 0-1, how complete the data is
    relevance_score: float     # 0-1, how relevant to extraction goal
    structure_score: float     # 0-1, how well-structured the data is
    volume_score: float        # 0-1, appropriate amount of data
    overall_score: float       # 0-1, weighted overall score

    # Detailed metrics
    total_items: int = 0
    non_empty_fields: int = 0
    total_fields: int = 0
    unique_items: int = 0

    # Quality indicators
    has_required_fields: bool = False
    has_duplicates: bool = False
    has_errors: bool = False

    feedback: Optional[List[str]] = None

    def __post_init__(self):
        if self.feedback is None:
            self.feedback = []
40class DataQualityEvaluator:41 """Evaluates the quality of scraped data."""42 43 def __init__(self):44 """Initialize the quality evaluator."""45 self.logger = logging.getLogger(__name__)46 47 # Common field names that indicate good data48 self.valuable_fields = {49 'title', 'name', 'price', 'cost', 'amount', 'value',50 'description', 'text', 'content', 'body', 'summary',51 'url', 'link', 'href', 'address', 'location',52 'date', 'time', 'timestamp', 'created', 'updated',53 'rating', 'score', 'review', 'feedback',54 'category', 'type', 'tag', 'label', 'status',55 'id', 'identifier', 'key', 'reference',56 'email', 'phone', 'contact', 'author', 'user'57 }58 59 def evaluate_data_quality(self, data: List[Dict[str, Any]], 60 extraction_goal: str,61 target_url: str) -> QualityMetrics:62 """63 Evaluate the quality of scraped data.64 65 Args:66 data: List of scraped data items67 extraction_goal: Original extraction goal68 target_url: Target URL that was scraped69 70 Returns:71 QualityMetrics with detailed quality assessment72 """73 if not data or not isinstance(data, list):74 return QualityMetrics(75 completeness_score=0.0,76 relevance_score=0.0,77 structure_score=0.0,78 volume_score=0.0,79 overall_score=0.0,80 feedback=["No data found or data is not in expected list format"]81 )82 83 # Calculate individual metrics84 completeness = self._calculate_completeness(data)85 relevance = self._calculate_relevance(data, extraction_goal)86 structure = self._calculate_structure_quality(data)87 volume = self._calculate_volume_quality(data, extraction_goal)88 89 # Calculate overall score (weighted average)90 overall = (91 completeness * 0.3 + # 30% weight on completeness92 relevance * 0.35 + # 35% weight on relevance93 structure * 0.2 + # 20% weight on structure94 volume * 0.15 # 15% weight on volume95 )96 97 # Detailed analysis98 total_items = len(data)99 unique_items = len(set(json.dumps(item, sort_keys=True) for item in data))100 has_duplicates = unique_items < total_items101 102 # Count fields103 all_fields = set()104 non_empty_count = 0105 total_field_count = 0106 107 for item in data:108 if isinstance(item, dict):109 all_fields.update(item.keys())110 for key, value in item.items():111 total_field_count += 1112 if value and str(value).strip():113 non_empty_count += 1114 115 # Check for required fields based on extraction goal116 has_required_fields = self._check_required_fields(data, extraction_goal)117 118 # Check for errors in data119 has_errors = self._check_for_errors(data)120 121 # Generate feedback122 feedback = self._generate_feedback(data, extraction_goal, completeness, relevance, structure, volume)123 124 return QualityMetrics(125 completeness_score=completeness,126 relevance_score=relevance,127 structure_score=structure,128 volume_score=volume,129 overall_score=overall,130 total_items=total_items,131 non_empty_fields=non_empty_count,132 total_fields=total_field_count,133 unique_items=unique_items,134 has_required_fields=has_required_fields,135 has_duplicates=has_duplicates,136 has_errors=has_errors,137 feedback=feedback138 )139 140 def _calculate_completeness(self, data: List[Dict[str, Any]]) -> float:141 """Calculate how complete the data is (non-empty fields)."""142 if not data:143 return 0.0144 145 total_fields = 0146 filled_fields = 0147 148 for item in data:149 if isinstance(item, dict):150 for key, value in item.items():151 total_fields += 1152 if value and str(value).strip() and str(value).strip() not in ['', 'null', 'None', 'undefined']:153 filled_fields += 1154 155 return filled_fields / 
total_fields if total_fields > 0 else 0.0156 157 def _calculate_relevance(self, data: List[Dict[str, Any]], extraction_goal: str) -> float:158 """Calculate how relevant the data is to the extraction goal."""159 if not data or not extraction_goal:160 return 0.0161 162 goal_keywords = set(re.findall(r'\w+', extraction_goal.lower()))163 164 # Remove common stop words165 stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'among', 'under', 'over', 'within', 'without', 'across', 'around', 'near', 'beyond', 'behind', 'except', 'until', 'since', 'while', 'although', 'because', 'if', 'when', 'where', 'how', 'what', 'who', 'which', 'why', 'extract', 'get', 'find', 'scrape', 'data', 'information', 'from', 'website', 'page'}166 goal_keywords -= stop_words167 168 if not goal_keywords:169 return 0.5 # Neutral score if no meaningful keywords170 171 relevance_scores = []172 173 for item in data:174 if isinstance(item, dict):175 item_text = ' '.join(str(v).lower() for v in item.values() if v)176 item_keywords = set(re.findall(r'\w+', item_text))177 178 # Calculate keyword overlap179 overlap = len(goal_keywords & item_keywords)180 score = overlap / len(goal_keywords)181 relevance_scores.append(score)182 183 return sum(relevance_scores) / len(relevance_scores) if relevance_scores else 0.0184 185 def _calculate_structure_quality(self, data: List[Dict[str, Any]]) -> float:186 """Calculate how well-structured the data is."""187 if not data:188 return 0.0189 190 structure_scores = []191 192 # Check consistency of fields across items193 all_fields = set()194 for item in data:195 if isinstance(item, dict):196 all_fields.update(item.keys())197 198 if not all_fields:199 return 0.0200 201 for item in data:202 if isinstance(item, dict):203 # Score based on:204 # 1. Field consistency (has common fields)205 # 2. Field name quality (meaningful names)206 # 3. 
Data type consistency207 208 field_score = len(item.keys()) / len(all_fields) if all_fields else 0209 210 # Check for meaningful field names211 meaningful_fields = 0212 for field in item.keys():213 field_lower = field.lower()214 if any(valuable in field_lower for valuable in self.valuable_fields):215 meaningful_fields += 1216 217 meaning_score = meaningful_fields / len(item.keys()) if item.keys() else 0218 219 # Combined structure score for this item220 item_score = (field_score + meaning_score) / 2221 structure_scores.append(item_score)222 223 return sum(structure_scores) / len(structure_scores) if structure_scores else 0.0224 225 def _calculate_volume_quality(self, data: List[Dict[str, Any]], extraction_goal: str) -> float:226 """Calculate if the volume of data is appropriate."""227 if not data:228 return 0.0229 230 data_count = len(data)231 232 # Determine expected volume based on extraction goal233 goal_lower = extraction_goal.lower()234 235 if any(word in goal_lower for word in ['all', 'every', 'each', 'list']):236 # Expecting larger dataset237 if data_count >= 10:238 return 1.0239 elif data_count >= 5:240 return 0.8241 elif data_count >= 2:242 return 0.6243 else:244 return 0.3245 elif any(word in goal_lower for word in ['first', 'top', 'main', 'primary']):246 # Expecting smaller, focused dataset247 if 1 <= data_count <= 5:248 return 1.0249 elif data_count <= 10:250 return 0.8251 else:252 return 0.6253 else:254 # General expectation255 if 3 <= data_count <= 20:256 return 1.0257 elif 1 <= data_count <= 30:258 return 0.8259 elif data_count > 30:260 return 0.7261 else:262 return 0.4263 264 def _check_required_fields(self, data: List[Dict[str, Any]], extraction_goal: str) -> bool:265 """Check if data contains fields that seem required based on the goal."""266 if not data:267 return False268 269 goal_lower = extraction_goal.lower()270 required_patterns = []271 272 # Map goal keywords to expected fields273 if any(word in goal_lower for word in ['price', 'cost', 'amount']):274 required_patterns.extend(['price', 'cost', 'amount', 'value', '$'])275 if any(word in goal_lower for word in ['title', 'name', 'product']):276 required_patterns.extend(['title', 'name', 'product'])277 if any(word in goal_lower for word in ['description', 'text', 'content']):278 required_patterns.extend(['description', 'text', 'content', 'body'])279 if any(word in goal_lower for word in ['rating', 'review', 'score']):280 required_patterns.extend(['rating', 'review', 'score', 'star'])281 if any(word in goal_lower for word in ['url', 'link']):282 required_patterns.extend(['url', 'link', 'href'])283 284 if not required_patterns:285 return True # No specific requirements identified286 287 # Check if any item has fields matching the patterns288 for item in data:289 if isinstance(item, dict):290 item_fields = ' '.join(item.keys()).lower()291 item_values = ' '.join(str(v) for v in item.values()).lower()292 293 for pattern in required_patterns:294 if pattern in item_fields or pattern in item_values:295 return True296 297 return False298 299 def _check_for_errors(self, data: List[Dict[str, Any]]) -> bool:300 """Check if data contains obvious errors."""301 error_indicators = [302 'error', 'exception', 'failed', 'null', 'undefined',303 'not found', '404', '500', 'blocked', 'denied'304 ]305 306 for item in data:307 if isinstance(item, dict):308 item_text = ' '.join(str(v).lower() for v in item.values() if v)309 if any(error in item_text for error in error_indicators):310 return True311 312 return False313 314 def 
_generate_feedback(self, data: List[Dict[str, Any]], extraction_goal: str,315 completeness: float, relevance: float, 316 structure: float, volume: float) -> List[str]:317 """Generate human-readable feedback about data quality."""318 feedback = []319 320 # Overall assessment321 overall = (completeness * 0.3 + relevance * 0.35 + structure * 0.2 + volume * 0.15)322 323 if overall >= 0.8:324 feedback.append("Excellent data quality - this actor produced high-quality results")325 elif overall >= 0.6:326 feedback.append("Good data quality - results are usable with minor issues")327 elif overall >= 0.4:328 feedback.append("Fair data quality - results have some issues but may be usable")329 else:330 feedback.append("Poor data quality - consider trying a different actor or configuration")331 332 # Specific feedback333 if completeness < 0.5:334 feedback.append("Data has many empty fields - try adjusting extraction settings")335 336 if relevance < 0.5:337 feedback.append("Data doesn't closely match extraction goal - consider more specific actor or different parameters")338 339 if structure < 0.5:340 feedback.append("Data structure is inconsistent - try a different actor or modify extraction logic")341 342 if volume < 0.5:343 feedback.append("Data volume is suboptimal - consider adjusting maxResults or crawl settings")344 345 # Positive feedback346 if completeness > 0.8:347 feedback.append("Data fields are well-populated")348 349 if relevance > 0.8:350 feedback.append("Data is highly relevant to extraction goal")351 352 if structure > 0.8:353 feedback.append("Data has consistent, well-structured format")354 355 return feedback356 357 def compare_results(self, results1: QualityMetrics, results2: QualityMetrics) -> str:358 """Compare two quality results and recommend the better one."""359 if results1.overall_score > results2.overall_score:360 winner = "first"361 score_diff = results1.overall_score - results2.overall_score362 elif results2.overall_score > results1.overall_score:363 winner = "second"364 score_diff = results2.overall_score - results1.overall_score365 else:366 return "Both results have equal overall quality scores"367 368 comparison = f"The {winner} result is better with {score_diff:.2f} higher overall score"369 370 # Add specific reasons371 reasons = []372 if abs(results1.relevance_score - results2.relevance_score) > 0.1:373 if results1.relevance_score > results2.relevance_score:374 reasons.append("first result is more relevant to extraction goal")375 else:376 reasons.append("second result is more relevant to extraction goal")377 378 if abs(results1.completeness_score - results2.completeness_score) > 0.1:379 if results1.completeness_score > results2.completeness_score:380 reasons.append("first result has more complete data")381 else:382 reasons.append("second result has more complete data")383 384 if reasons:385 comparison += f" because {' and '.join(reasons)}"386 387 return comparison
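A minimal usage sketch for DataQualityEvaluator (illustrative; the sample items and URL are made up, and the import path is assumed):

evaluator = DataQualityEvaluator()
items = [
    {"title": "Laptop", "price": "$999", "url": "https://example.com/laptop"},
    {"title": "Phone", "price": "$499", "url": "https://example.com/phone"},
]
metrics = evaluator.evaluate_data_quality(
    data=items,
    extraction_goal="extract product titles and prices",
    target_url="https://example.com",
)
print(f"overall={metrics.overall_score:.2f} "
      f"relevance={metrics.relevance_score:.2f} "
      f"completeness={metrics.completeness_score:.2f}")
for note in metrics.feedback:
    print("-", note)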
src/llmscraper/llm_scraper/retry_logic.py
1"""2Retry logic for intelligent actor execution with parameter adjustment.3"""4
5import asyncio6import logging7from typing import Dict, List, Any, Optional, Tuple8from dataclasses import dataclass9from datetime import datetime10import copy11
12
13@dataclass14class RetryAttempt:15 """Represents a retry attempt with adjusted parameters."""16 attempt_number: int17 adjusted_input: Dict[str, Any]18 reasoning: str19 timestamp: str20
21
22class RetryManager:23 """Manages intelligent retry logic with parameter adjustments."""24 25 def __init__(self, max_retries_per_actor: int = 3):26 """Initialize retry manager."""27 self.max_retries_per_actor = max_retries_per_actor28 self.logger = logging.getLogger(__name__)29 30 # Track retry history31 self.retry_history: Dict[str, List[RetryAttempt]] = {}32 33 def should_retry(self, actor_id: str, error_message: str) -> bool:34 """35 Determine if an actor should be retried based on error and history.36 37 Args:38 actor_id: ID of the actor that failed39 error_message: Error message from the failed attempt40 41 Returns:42 True if should retry, False otherwise43 """44 current_attempts = len(self.retry_history.get(actor_id, []))45 46 if current_attempts >= self.max_retries_per_actor:47 self.logger.debug(f"Max retries reached for {actor_id}")48 return False49 50 # Check if error is retryable51 retryable_errors = [52 "timeout", "rate limit", "proxy", "network", "temporary",53 "service unavailable", "too many requests", "blocked"54 ]55 56 error_lower = error_message.lower()57 is_retryable = any(err in error_lower for err in retryable_errors)58 59 if not is_retryable:60 self.logger.debug(f"Error not retryable for {actor_id}: {error_message}")61 return False62 63 self.logger.info(f"Will retry {actor_id} (attempt {current_attempts + 1}/{self.max_retries_per_actor})")64 return True65 66 def adjust_input_for_retry(self, actor_id: str, base_input: Dict[str, Any], 67 error_message: str, attempt_number: int) -> Tuple[Dict[str, Any], str]:68 """69 Adjust input parameters for retry based on the error and attempt number.70 71 Args:72 actor_id: ID of the actor73 base_input: Original input configuration74 error_message: Error message from failed attempt75 attempt_number: Current attempt number (1-based)76 77 Returns:78 Tuple of (adjusted_input, reasoning)79 """80 adjusted_input = copy.deepcopy(base_input)81 adjustments = []82 83 error_lower = error_message.lower()84 85 # Attempt-based adjustments86 if attempt_number == 1:87 # First retry: Basic adjustments88 if "timeout" in error_lower or "slow" in error_lower:89 if "requestTimeoutSecs" in adjusted_input:90 adjusted_input["requestTimeoutSecs"] = adjusted_input.get("requestTimeoutSecs", 30) * 291 adjustments.append("increased request timeout")92 93 if "maxRequestRetries" in adjusted_input:94 adjusted_input["maxRequestRetries"] = min(adjusted_input.get("maxRequestRetries", 3) + 2, 10)95 adjustments.append("increased retry attempts")96 97 if "rate limit" in error_lower or "blocked" in error_lower:98 # Enable proxy if not already enabled99 if "proxyConfiguration" not in adjusted_input:100 adjusted_input["proxyConfiguration"] = {"useApifyProxy": True}101 adjustments.append("enabled proxy")102 elif not adjusted_input["proxyConfiguration"].get("useApifyProxy"):103 adjusted_input["proxyConfiguration"]["useApifyProxy"] = True104 adjustments.append("enabled Apify proxy")105 106 # Reduce load if memory/resource issues107 if "memory" in error_lower or "resource" in error_lower:108 if "maxResults" in adjusted_input:109 adjusted_input["maxResults"] = max(adjusted_input.get("maxResults", 10) // 2, 1)110 adjustments.append("reduced max results")111 112 if "maxPagesPerCrawl" in adjusted_input:113 adjusted_input["maxPagesPerCrawl"] = max(adjusted_input.get("maxPagesPerCrawl", 10) // 2, 1)114 adjustments.append("reduced pages per crawl")115 116 elif attempt_number == 2:117 # Second retry: More aggressive adjustments118 if "proxyConfiguration" in adjusted_input:119 
proxy_config = adjusted_input["proxyConfiguration"]120 if "apifyProxyGroups" not in proxy_config:121 proxy_config["apifyProxyGroups"] = ["RESIDENTIAL"]122 adjustments.append("switched to residential proxies")123 elif "RESIDENTIAL" not in proxy_config.get("apifyProxyGroups", []):124 proxy_config["apifyProxyGroups"] = ["RESIDENTIAL"]125 adjustments.append("switched to residential proxies")126 127 # Reduce concurrent operations128 if "maxConcurrency" in adjusted_input:129 adjusted_input["maxConcurrency"] = max(adjusted_input.get("maxConcurrency", 5) // 2, 1)130 adjustments.append("reduced concurrency")131 132 # Increase wait times133 if "dynamicContentWaitSecs" in adjusted_input:134 adjusted_input["dynamicContentWaitSecs"] = min(adjusted_input.get("dynamicContentWaitSecs", 10) * 2, 60)135 adjustments.append("increased wait time for dynamic content")136 137 elif attempt_number >= 3:138 # Final retry: Conservative settings139 adjustments.append("using conservative settings")140 141 # Minimal resource usage142 if "maxResults" in adjusted_input:143 adjusted_input["maxResults"] = min(adjusted_input.get("maxResults", 10), 5)144 if "maxPagesPerCrawl" in adjusted_input:145 adjusted_input["maxPagesPerCrawl"] = 1146 if "maxConcurrency" in adjusted_input:147 adjusted_input["maxConcurrency"] = 1148 149 # Maximum timeouts and retries150 if "requestTimeoutSecs" in adjusted_input:151 adjusted_input["requestTimeoutSecs"] = 120152 if "maxRequestRetries" in adjusted_input:153 adjusted_input["maxRequestRetries"] = 10154 155 # Force proxy usage156 adjusted_input["proxyConfiguration"] = {157 "useApifyProxy": True,158 "apifyProxyGroups": ["RESIDENTIAL"]159 }160 161 # Actor-specific adjustments162 if "web-scraper" in actor_id:163 self._adjust_web_scraper_input(adjusted_input, error_lower, adjustments)164 elif "cheerio-scraper" in actor_id:165 self._adjust_cheerio_scraper_input(adjusted_input, error_lower, adjustments)166 elif "website-content-crawler" in actor_id:167 self._adjust_website_crawler_input(adjusted_input, error_lower, adjustments)168 169 # Record this retry attempt170 retry_attempt = RetryAttempt(171 attempt_number=attempt_number,172 adjusted_input=adjusted_input,173 reasoning=f"Retry {attempt_number}: {', '.join(adjustments) if adjustments else 'no specific adjustments'}",174 timestamp=datetime.now().isoformat()175 )176 177 if actor_id not in self.retry_history:178 self.retry_history[actor_id] = []179 self.retry_history[actor_id].append(retry_attempt)180 181 return adjusted_input, retry_attempt.reasoning182 183 def _adjust_web_scraper_input(self, input_config: Dict[str, Any], error_lower: str, adjustments: List[str]):184 """Apply web-scraper specific adjustments."""185 if "javascript" in error_lower or "js" in error_lower:186 # Disable JavaScript if causing issues187 if "useChrome" in input_config:188 input_config["useChrome"] = False189 adjustments.append("disabled Chrome/JavaScript")190 191 if "page function" in error_lower:192 # Simplify page function if it exists193 if "pageFunction" in input_config:194 # Use a minimal page function195 input_config["pageFunction"] = """196 async function pageFunction(context) {197 const { request, log, $ } = context;198 return {199 url: request.url,200 title: $('title').text(),201 html: $('body').html()202 };203 }204 """205 adjustments.append("simplified page function")206 207 def _adjust_cheerio_scraper_input(self, input_config: Dict[str, Any], error_lower: str, adjustments: List[str]):208 """Apply cheerio-scraper specific adjustments."""209 if "selector" in 
error_lower or "parse" in error_lower:210 # Simplify selectors if parsing issues211 if "pageFunction" in input_config:212 input_config["pageFunction"] = """213 async function pageFunction(context) {214 const { request, $ } = context;215 return {216 url: request.url,217 title: $('title').text(),218 content: $('body').text()219 };220 }221 """222 adjustments.append("simplified cheerio selectors")223 224 def _adjust_website_crawler_input(self, input_config: Dict[str, Any], error_lower: str, adjustments: List[str]):225 """Apply website-content-crawler specific adjustments."""226 if "content" in error_lower or "extraction" in error_lower:227 # Adjust content extraction settings228 if "htmlTransformer" in input_config:229 input_config["htmlTransformer"] = "readableText"230 adjustments.append("switched to readable text extraction")231 232 if "readableTextCharThreshold" in input_config:233 input_config["readableTextCharThreshold"] = 50 # Lower threshold234 adjustments.append("lowered text threshold")235 236 if "crawler" in error_lower or "navigation" in error_lower:237 # Simplify crawler settings238 if "maxCrawlDepth" in input_config:239 input_config["maxCrawlDepth"] = 0 # Only crawl start page240 adjustments.append("limited to single page")241 242 if "crawlerType" in input_config:243 input_config["crawlerType"] = "cheerio" # Use simpler crawler244 adjustments.append("switched to Cheerio crawler")245 246 def get_retry_history(self, actor_id: str) -> List[RetryAttempt]:247 """Get retry history for an actor."""248 return self.retry_history.get(actor_id, [])249 250 def reset_history(self, actor_id: Optional[str] = None):251 """Reset retry history for a specific actor or all actors."""252 if actor_id:253 self.retry_history.pop(actor_id, None)254 else:255 self.retry_history.clear()256 257 def get_total_retries(self) -> int:258 """Get total number of retries across all actors."""259 return sum(len(attempts) for attempts in self.retry_history.values())260 261 def calculate_retry_success_rate(self) -> float:262 """Calculate overall retry success rate (placeholder for future implementation)."""263 # This would be implemented based on actual success tracking264 total_retries = self.get_total_retries()265 if total_retries == 0:266 return 0.0267 268 # Placeholder calculation269 return 0.6 # 60% success rate on retries
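A short sketch of the retry flow (illustrative; the input keys are examples of parameters the adjuster looks for):

manager = RetryManager(max_retries_per_actor=3)
base_input = {
    "startUrls": [{"url": "https://example.com"}],
    "requestTimeoutSecs": 30,
    "maxResults": 20,
}

error = "Request timeout after 30 seconds"
if manager.should_retry("apify/web-scraper", error):
    new_input, reason = manager.adjust_input_for_retry(
        "apify/web-scraper", base_input, error, attempt_number=1
    )
    print(reason)                           # "Retry 1: increased request timeout"
    print(new_input["requestTimeoutSecs"])  # 60 -- doubled on the first retry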
src/llmscraper/mcp/__init__.py
1"""2MCP (Model Context Protocol) package for Apify integration.3"""4
5from .client import MCPClient, MCPTool, MCPCallResult6
7__all__ = ['MCPClient', 'MCPTool', 'MCPCallResult']
src/llmscraper/mcp/client.py
1"""2MCP (Model Context Protocol) Client for Apify integration.3"""4
5import asyncio6import httpx7import logging8from typing import Dict, List, Any, Optional, Union9from dataclasses import dataclass10import json11
12
13@dataclass14class MCPTool:15 """Represents an MCP tool."""16 name: str17 description: str18 input_schema: Dict[str, Any]19
20
21@dataclass22class MCPCallResult:23 """Result from an MCP tool call."""24 content: Union[str, List[Dict[str, Any]]]25 is_error: bool = False26 error_message: Optional[str] = None27
28
29class MCPClient:30 """Client for connecting to Apify MCP Server."""31 32 def __init__(self, server_url: str, apify_token: str, timeout: int = 300):33 """34 Initialize MCP client.35 36 Args:37 server_url: URL of the MCP server (e.g., https://mcp.apify.com)38 apify_token: Apify API token for authentication39 timeout: Default timeout for requests40 """41 # For now, let's use fallback mode since MCP server endpoints are unclear42 self.server_url = server_url.rstrip('/')43 self.apify_token = apify_token44 self.timeout = timeout45 self.available_tools: List[MCPTool] = []46 self.logger = logging.getLogger(__name__)47 self.use_fallback_only = True # Force fallback mode until MCP endpoints are clarified48 49 self.headers = {50 'Authorization': f'Bearer {apify_token}',51 'Content-Type': 'application/json',52 'User-Agent': 'LLMScraper/1.0.0'53 }54 55 async def connect(self) -> bool:56 """57 Connect to MCP server and fetch available tools.58 59 Returns:60 True if connection successful, False otherwise61 """62 try:63 self.logger.info(f"Initializing Apify client with fallback mode")64 65 # For now, skip MCP server connection and use direct Apify API66 if self.use_fallback_only:67 await self._setup_fallback_tools()68 self.logger.info(f"Connected in fallback mode. Available tools: {len(self.available_tools)}")69 return True70 71 # Original MCP connection logic (currently disabled)72 self.logger.info(f"Connecting to MCP server: {self.server_url}")73 await self._list_tools()74 self.logger.info(f"Successfully connected to MCP server. Available tools: {len(self.available_tools)}")75 return True76 77 except Exception as e:78 self.logger.error(f"Failed to connect to MCP server: {str(e)}")79 return False80 81 async def _list_tools(self) -> List[MCPTool]:82 """Fetch available tools from MCP server using proper MCP protocol."""83 try:84 async with httpx.AsyncClient(timeout=30) as client:85 # Use proper MCP protocol format86 payload = {87 "jsonrpc": "2.0",88 "id": 1,89 "method": "tools/list",90 "params": {}91 }92 93 # Try the streamable endpoint first, then SSE fallback94 endpoints_to_try = [95 self.server_url, # https://mcp.apify.com96 f"{self.server_url}/sse" # https://mcp.apify.com/sse97 ]98 99 for endpoint in endpoints_to_try:100 try:101 self.logger.debug(f"Trying MCP endpoint: {endpoint}")102 response = await client.post(103 endpoint, 104 headers=self.headers, 105 json=payload106 )107 108 if response.status_code == 200:109 data = response.json()110 111 # Handle MCP protocol response112 if "result" in data and "tools" in data["result"]:113 tools = data["result"]["tools"]114 115 self.available_tools = [116 MCPTool(117 name=tool['name'],118 description=tool.get('description', ''),119 input_schema=tool.get('inputSchema', {})120 )121 for tool in tools122 ]123 124 self.logger.info(f"Found {len(self.available_tools)} tools from MCP server")125 return self.available_tools126 127 # Handle error response128 elif "error" in data:129 error = data["error"]130 self.logger.warning(f"MCP error listing tools: {error.get('message', 'Unknown error')}")131 continue132 133 except httpx.HTTPStatusError as e:134 self.logger.debug(f"Endpoint {endpoint} returned {e.response.status_code}")135 continue136 except Exception as e:137 self.logger.debug(f"Endpoint {endpoint} failed: {str(e)}")138 continue139 140 # If MCP endpoints fail, fallback to direct Apify actor search141 self.logger.warning("MCP endpoints failed, using fallback actor discovery")142 await self._setup_fallback_tools()143 return self.available_tools144 145 except 
Exception as e:146 self.logger.error(f"Error listing tools: {str(e)}")147 # Setup minimal fallback tools for basic functionality148 await self._setup_fallback_tools()149 return self.available_tools150 151 async def _setup_fallback_tools(self):152 """Setup fallback tools when MCP server is not available."""153 self.logger.info("Setting up fallback tools for direct Apify API usage")154 155 # Basic tools that we can simulate without MCP server156 fallback_tools = [157 MCPTool(158 name="search-actors",159 description="Search for Apify actors",160 input_schema={161 "type": "object",162 "properties": {163 "search": {"type": "string", "description": "Search query"},164 "limit": {"type": "integer", "description": "Max results", "default": 10}165 },166 "required": ["search"]167 }168 ),169 MCPTool(170 name="get-actor-details", 171 description="Get details about a specific actor",172 input_schema={173 "type": "object",174 "properties": {175 "actor": {"type": "string", "description": "Actor ID or name"}176 },177 "required": ["actor"]178 }179 ),180 MCPTool(181 name="run-actor",182 description="Run an Apify actor directly",183 input_schema={184 "type": "object", 185 "properties": {186 "actor_id": {"type": "string", "description": "Actor ID"},187 "input": {"type": "object", "description": "Actor input"}188 },189 "required": ["actor_id", "input"]190 }191 )192 ]193 194 self.available_tools = fallback_tools195 196 async def call_tool(self, tool_name: str, arguments: Dict[str, Any], timeout: Optional[int] = None) -> MCPCallResult:197 """198 Call an MCP tool.199 200 Args:201 tool_name: Name of the tool to call202 arguments: Arguments to pass to the tool203 timeout: Optional timeout override204 205 Returns:206 MCPCallResult with the response207 """208 call_timeout = timeout or self.timeout209 210 try:211 self.logger.info(f"Calling tool: {tool_name} with args: {arguments}")212 213 # Check if we need to use fallback mode214 if not any(tool.name == tool_name for tool in self.available_tools):215 return MCPCallResult(216 content=f"Tool {tool_name} not available",217 is_error=True,218 error_message=f"Tool {tool_name} not found in available tools"219 )220 221 # Use fallback mode by default for now222 if self.use_fallback_only or tool_name in ["search-actors", "get-actor-details", "run-actor"]:223 return await self._fallback_tool_call(tool_name, arguments)224 225 # Try MCP call (currently disabled)226 result = await self._try_mcp_call(tool_name, arguments, call_timeout)227 if result and not result.is_error:228 return result229 230 # Fallback to direct API calls231 return await self._fallback_tool_call(tool_name, arguments)232 233 except Exception as e:234 self.logger.error(f"Error calling tool {tool_name}: {str(e)}")235 return MCPCallResult(236 content=f"Tool execution error: {str(e)}",237 is_error=True,238 error_message=str(e)239 )240 241 async def _try_mcp_call(self, tool_name: str, arguments: Dict[str, Any], timeout: int) -> Optional[MCPCallResult]:242 """Try to make an MCP tool call using proper MCP protocol."""243 try:244 async with httpx.AsyncClient(timeout=timeout) as client:245 # Use proper MCP protocol format246 payload = {247 "jsonrpc": "2.0",248 "id": 1,249 "method": "tools/call",250 "params": {251 "name": tool_name,252 "arguments": arguments253 }254 }255 256 # Try the streamable endpoint first, then SSE fallback257 endpoints_to_try = [258 self.server_url, # https://mcp.apify.com259 f"{self.server_url}/sse" # https://mcp.apify.com/sse260 ]261 262 for endpoint in endpoints_to_try:263 try:264 
self.logger.debug(f"Trying MCP endpoint: {endpoint}")265 266 response = await client.post(267 endpoint,268 headers=self.headers,269 json=payload270 )271 272 if response.status_code == 200:273 data = response.json()274 275 # Handle MCP protocol response276 if "result" in data:277 result = data["result"]278 content = result.get("content", [])279 280 # Extract text content from MCP response281 if isinstance(content, list) and content:282 text_content = ""283 for item in content:284 if isinstance(item, dict) and item.get("type") == "text":285 text_content += item.get("text", "")286 287 return MCPCallResult(288 content=text_content if text_content else content,289 is_error=False290 )291 else:292 return MCPCallResult(293 content=content,294 is_error=False295 )296 297 # Handle error response298 elif "error" in data:299 error = data["error"]300 return MCPCallResult(301 content=f"MCP error: {error.get('message', 'Unknown error')}",302 is_error=True,303 error_message=error.get('message', 'Unknown error')304 )305 306 # Fallback for non-standard response307 return MCPCallResult(308 content=data,309 is_error=False310 )311 312 except Exception as e:313 self.logger.debug(f"MCP endpoint {endpoint} failed: {str(e)}")314 continue315 316 return None317 318 except Exception as e:319 self.logger.debug(f"MCP call attempt failed: {str(e)}")320 return None321 322 async def _fallback_tool_call(self, tool_name: str, arguments: Dict[str, Any]) -> MCPCallResult:323 """Handle tool calls using direct Apify API when MCP is unavailable."""324 from apify_client import ApifyClient325 326 try:327 client = ApifyClient(self.apify_token)328 329 if tool_name == "search-actors":330 search_query = arguments.get("search", "")331 limit = arguments.get("limit", 10)332 333 try:334 # Use Apify Store API to search for actors335 async with httpx.AsyncClient() as http_client:336 store_url = "https://api.apify.com/v2/store"337 params = {338 "limit": limit,339 "search": search_query340 }341 headers = {"Authorization": f"Bearer {self.apify_token}"}342 343 response = await http_client.get(store_url, params=params, headers=headers)344 345 if response.status_code == 200:346 data = response.json()347 actors = data.get("data", {}).get("items", [])348 349 # Format results similar to MCP response350 result_content = [351 {352 "name": actor.get("name", ""),353 "title": actor.get("title", ""), 354 "description": actor.get("description", ""),355 "username": actor.get("username", ""),356 "stats": {357 "users": actor.get("stats", {}).get("totalUsers", 0),358 "runs": actor.get("stats", {}).get("totalRuns", 0)359 }360 }361 for actor in actors362 ]363 364 return MCPCallResult(content=result_content)365 else:366 self.logger.warning(f"Store API returned {response.status_code}")367 368 except Exception as e:369 self.logger.warning(f"Store API failed: {str(e)}")370 371 # Fallback to hardcoded popular actors if API fails372 result_content = [373 {374 "name": "apify/web-scraper",375 "title": "Web Scraper",376 "description": "Crawls arbitrary websites using the Chrome browser and extracts data from pages using a provided JavaScript code.",377 "username": "apify",378 "stats": {"users": 1000, "runs": 50000}379 },380 {381 "name": "apify/cheerio-scraper", 382 "title": "Cheerio Scraper",383 "description": "Crawls websites using the Cheerio library and extracts data from HTML documents.",384 "username": "apify",385 "stats": {"users": 800, "runs": 30000}386 },387 {388 "name": "apify/website-content-crawler",389 "title": "Website Content Crawler",390 "description": 
"Crawls websites and extracts text content, metadata, and other information.",391 "username": "apify", 392 "stats": {"users": 600, "runs": 20000}393 },394 {395 "name": "apify/puppeteer-scraper",396 "title": "Puppeteer Scraper", 397 "description": "Crawls websites using Puppeteer and extracts data from pages.",398 "username": "apify",399 "stats": {"users": 500, "runs": 15000}400 }401 ]402 403 # Filter based on search query404 if search_query:405 search_lower = search_query.lower()406 result_content = [407 actor for actor in result_content408 if (search_lower in actor["name"].lower() or 409 search_lower in actor["title"].lower() or410 search_lower in actor["description"].lower())411 ]412 413 return MCPCallResult(content=result_content[:limit])414 415 elif tool_name == "get-actor-details":416 actor_id = arguments.get("actor")417 418 try:419 actor_info = client.actor(actor_id).get()420 421 # Convert datetime objects to strings for JSON serialization422 if actor_info:423 actor_info = self._serialize_datetime_fields(actor_info)424 425 return MCPCallResult(content=actor_info)426 except Exception as e:427 return MCPCallResult(428 content=f"Actor {actor_id} not found",429 is_error=True,430 error_message=str(e)431 )432 433 elif tool_name == "run-actor":434 actor_id = arguments.get("actor_id")435 actor_input = arguments.get("input", {})436 437 try:438 self.logger.info(f"Running actor {actor_id} with input: {actor_input}")439 run = client.actor(actor_id).call(run_input=actor_input)440 441 if run and run.get('status') == 'SUCCEEDED':442 # Get results from dataset or key-value store443 dataset_id = run.get('defaultDatasetId')444 if dataset_id:445 items = list(client.dataset(dataset_id).iterate_items())446 return MCPCallResult(content=items)447 448 return MCPCallResult(content={"status": "completed", "run": run})449 else:450 return MCPCallResult(451 content=f"Actor run failed: {run.get('status') if run else 'Unknown error'}",452 is_error=True,453 error_message="Actor execution failed"454 )455 456 except Exception as e:457 return MCPCallResult(458 content=f"Error running actor: {str(e)}",459 is_error=True,460 error_message=str(e)461 )462 463 return MCPCallResult(464 content=f"Fallback not implemented for tool: {tool_name}",465 is_error=True,466 error_message="Fallback not available"467 )468 469 except Exception as e:470 return MCPCallResult(471 content=f"Fallback execution error: {str(e)}",472 is_error=True,473 error_message=str(e)474 )475 476 def get_available_tools(self) -> List[MCPTool]:477 """Get list of available tools."""478 return self.available_tools.copy()479 480 def format_tools_for_claude(self) -> List[Dict[str, Any]]:481 """Format tools for Claude API."""482 return [483 {484 "name": tool.name,485 "description": tool.description,486 "input_schema": tool.input_schema487 }488 for tool in self.available_tools489 ]490 491 def _serialize_datetime_fields(self, obj):492 """Recursively convert datetime objects to ISO format strings."""493 import datetime494 495 if isinstance(obj, dict):496 return {key: self._serialize_datetime_fields(value) for key, value in obj.items()}497 elif isinstance(obj, list):498 return [self._serialize_datetime_fields(item) for item in obj]499 elif isinstance(obj, datetime.datetime):500 return obj.isoformat()501 elif isinstance(obj, datetime.date):502 return obj.isoformat()503 else:504 return obj
src/llmscraper/scraping/__init__.py
1"""2Scraping module for ScraperCodeGenerator.3"""4
5from .apify_runner import ApifyRunner6from .multi_actor_scraper import MultiActorScraper7from .actor_multi_scraper import ActorMultiScraper8
9__all__ = ["ApifyRunner", "MultiActorScraper", "ActorMultiScraper"]
src/llmscraper/scraping/actor_multi_scraper.py
1"""2Apify Actor-specific scraping module for running other actors from within an Apify actor.3"""4
5import asyncio6import logging7from typing import Dict, Any, List, Optional, Tuple8from apify import Actor9
10
11class ActorMultiScraper:12 """Handles running multiple Apify actors from within an Apify actor context."""13 14 def __init__(self):15 """Initialize the actor scraper."""16 self.logger = logging.getLogger(__name__)17 18 async def scrape_with_multiple_actors(self, target_url: str) -> Dict[str, Optional[str]]:19 """20 Run multiple actors in parallel to scrape the target URL and return HTML content.21 22 Args:23 target_url: The URL to scrape24 25 Returns:26 Dictionary mapping actor names to their HTML content (or None if failed)27 """28 Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors called for {target_url}")29 30 # Define actor configurations31 actor_configs = self._get_actor_configs(target_url)32 Actor.log.info(f"DEBUG: Will run {len(actor_configs)} actors in parallel: {list(actor_configs.keys())}")33 34 # Create tasks for parallel execution35 tasks = []36 actor_names = []37 38 for actor_name, config in actor_configs.items():39 Actor.log.info(f"DEBUG: Creating task for {actor_name}...")40 task = self._run_single_actor_with_name(actor_name, config)41 tasks.append(task)42 actor_names.append(actor_name)43 44 # Run all actors in parallel45 Actor.log.info("DEBUG: Starting all actors in parallel...")46 results_list = await asyncio.gather(*tasks, return_exceptions=True)47 48 # Process results49 results = {}50 for i, (actor_name, result) in enumerate(zip(actor_names, results_list)):51 if isinstance(result, Exception):52 Actor.log.error(f"DEBUG: {actor_name} failed: {str(result)}")53 results[actor_name] = None54 else:55 results[actor_name] = result56 if result:57 Actor.log.info(f"DEBUG: {actor_name} succeeded: {len(result):,} characters")58 else:59 Actor.log.warning(f"DEBUG: {actor_name} returned no content")60 61 Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors completed. 
Results: {list(results.keys())}")62 Actor.log.info(f"DEBUG: Results with content: {[name for name, content in results.items() if content]}")63 return results64 65 async def _run_single_actor_with_name(self, actor_name: str, config: Dict[str, Any]) -> Optional[str]:66 """67 Run a single actor and return its HTML content.68 69 Args:70 actor_name: Name of the actor for logging71 config: Actor configuration72 73 Returns:74 HTML content or None if failed75 """76 try:77 Actor.log.info(f"DEBUG: Starting {actor_name}...")78 return await self._run_single_actor(actor_name, config)79 except Exception as e:80 Actor.log.error(f"DEBUG: {actor_name} failed: {str(e)}")81 return None82 83 def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:84 """Get configurations for all actors to run."""85 return {86 "cheerio-scraper": {87 "actor_id": "apify/cheerio-scraper",88 "input": {89 "startUrls": [{"url": target_url}],90 "maxRequestRetries": 3,91 "requestTimeoutSecs": 30,92 "maxPagesPerCrawl": 1,93 "pageFunction": """94 async function pageFunction(context) {95 const { request, log, $ } = context;96 try {97 const title = $('title').text() || '';98 const html = $('html').html() || '';99 return {100 url: request.url,101 title: title,102 html: html103 };104 } catch (error) {105 log.error('Error in pageFunction:', error);106 return {107 url: request.url,108 title: '',109 html: ''110 };111 }112 }113 """,114 "proxyConfiguration": {"useApifyProxy": True}115 }116 },117 "web-scraper": {118 "actor_id": "apify/web-scraper",119 "input": {120 "startUrls": [{"url": target_url}],121 "maxRequestRetries": 3,122 "requestTimeoutSecs": 30,123 "maxPagesPerCrawl": 1,124 "pageFunction": """125 async function pageFunction(context) {126 const { request, log, page } = context;127 try {128 const title = await page.title();129 const html = await page.content();130 return { url: request.url, title, html };131 } catch (error) {132 log.error('Error in pageFunction:', error);133 return { url: request.url, title: '', html: '' };134 }135 }136 """,137 "proxyConfiguration": {"useApifyProxy": True}138 }139 },140 "website-content-crawler": {141 "actor_id": "apify/website-content-crawler",142 "input": {143 "startUrls": [{"url": target_url}],144 "maxRequestsPerCrawl": 1,145 "maxCrawlDepth": 0,146 "htmlTransformer": "readableText",147 "readableTextCharThreshold": 100,148 "removeCookieWarnings": True,149 "clickElementsCssSelector": "",150 "proxyConfiguration": {"useApifyProxy": True}151 }152 }153 }154 155 async def _run_single_actor(self, actor_name: str, config: Dict[str, Any]) -> Optional[str]:156 """157 Run a single actor and extract HTML content.158 159 Args:160 actor_name: Name of the actor (for logging)161 config: Actor configuration including actor_id and input162 163 Returns:164 HTML content as string, or None if failed165 """166 try:167 actor_id = config["actor_id"]168 actor_input = config["input"]169 170 Actor.log.info(f"DEBUG: Calling actor {actor_id}")171 172 # Call the actor using Apify SDK - use the exact same pattern as working code173 run = await Actor.call(174 actor_id=actor_id,175 run_input=actor_input176 )177 178 if not run:179 Actor.log.error(f"DEBUG: Actor {actor_name} failed to start - run is None")180 return None181 182 Actor.log.info(f"DEBUG: Actor {actor_name} run created with ID: {run.id}")183 Actor.log.info(f"DEBUG: Default dataset ID: {run.default_dataset_id}")184 185 # Use the exact same pattern as your working code186 if run.default_dataset_id:187 try:188 Actor.log.info(f"DEBUG: Getting dataset items for 
{actor_name}...")189 items = (await Actor.apify_client.dataset(run.default_dataset_id).list_items()).items190 191 if items:192 Actor.log.info(f"DEBUG: Found {len(items)} items in dataset for {actor_name}")193 194 for i, item in enumerate(items):195 Actor.log.info(f"DEBUG: Dataset item {i} keys: {list(item.keys()) if isinstance(item, dict) else type(item)}")196 197 # Look for HTML content in the item198 html_content = self._extract_html_from_item(item, actor_name)199 if html_content:200 Actor.log.info(f"DEBUG: Found HTML content in dataset item {i} for {actor_name}: {len(html_content)} characters")201 return html_content202 else:203 Actor.log.info(f"DEBUG: No dataset items found for {actor_name}")204 205 except Exception as e:206 Actor.log.error(f"DEBUG: Dataset retrieval failed for {actor_name}: {e}")207 import traceback208 Actor.log.error(f"DEBUG: Dataset traceback: {traceback.format_exc()}")209 210 # Fallback: Try key-value store (simplified)211 if run.default_key_value_store_id:212 try:213 Actor.log.info(f"DEBUG: Trying key-value store as fallback for {actor_name}...")214 kvs_client = Actor.apify_client.key_value_store(run.default_key_value_store_id)215 216 # Try common keys that might contain HTML217 common_keys = ['OUTPUT', 'RESULTS', 'DATA']218 for key_name in common_keys:219 try:220 record = await kvs_client.get_record(key_name)221 if record:222 Actor.log.info(f"DEBUG: Found record for key {key_name}")223 html_content = self._extract_html_from_record(record, actor_name)224 if html_content:225 Actor.log.info(f"DEBUG: Found HTML content in key {key_name} for {actor_name}")226 return html_content227 except Exception:228 pass # Key doesn't exist, continue229 230 except Exception as e:231 Actor.log.error(f"DEBUG: Key-value store retrieval failed for {actor_name}: {e}")232 233 Actor.log.warning(f"DEBUG: No HTML content found for {actor_name}")234 return None235 236 except Exception as e:237 Actor.log.error(f"DEBUG: Error running {actor_name}: {str(e)}")238 import traceback239 Actor.log.error(f"DEBUG: Full traceback: {traceback.format_exc()}")240 return None241 242 def _extract_html_from_item(self, item: Dict[str, Any], actor_name: str) -> Optional[str]:243 """Extract HTML content from a dataset item."""244 Actor.log.info(f"DEBUG: Extracting HTML from item for {actor_name}, item keys: {list(item.keys()) if isinstance(item, dict) else 'not a dict'}")245 246 # Look for HTML in common fields247 html_fields = ['html', 'content', 'body', 'pageContent', 'text', 'data']248 249 for field in html_fields:250 if field in item and item[field]:251 content = item[field]252 Actor.log.info(f"DEBUG: Found content in field '{field}': {type(content)}, length: {len(content) if isinstance(content, str) else 'N/A'}")253 254 if isinstance(content, str) and len(content) > 100:255 # Check if it looks like HTML256 if '<' in content and '>' in content:257 Actor.log.info(f"DEBUG: Found HTML content in field '{field}' for {actor_name}")258 return content259 elif actor_name == "website-content-crawler":260 # For website-content-crawler, text content is also acceptable261 Actor.log.info(f"DEBUG: Found text content in field '{field}' for {actor_name}")262 html_content = f"<html><body><div>{content}</div></body></html>"263 return html_content264 265 # For website-content-crawler, look for any text-like content266 if actor_name == "website-content-crawler":267 for key, value in item.items():268 if isinstance(value, str) and len(value) > 50:269 Actor.log.info(f"DEBUG: Using text content from field '{key}' for 
website-content-crawler")270 html_content = f"<html><body><div>{value}</div></body></html>"271 return html_content272 273 Actor.log.info(f"DEBUG: No HTML content found in item for {actor_name}")274 return None275 276 def _extract_html_from_record(self, record: Any, actor_name: str) -> Optional[str]:277 """Extract HTML content from a key-value store record."""278 try:279 # The record might be the content directly or wrapped in a dict280 content = record281 282 if hasattr(record, 'value'):283 content = record.value284 elif isinstance(record, dict) and 'value' in record:285 content = record['value']286 287 # If content is a string, check if it's HTML288 if isinstance(content, str):289 if len(content) > 100 and ('<' in content or actor_name == "website-content-crawler"):290 return content291 292 # If content is a dict, look for HTML fields293 elif isinstance(content, dict):294 html_content = self._extract_html_from_item(content, actor_name)295 if html_content:296 return html_content297 298 return None299 300 except Exception as e:301 Actor.log.debug(f"DEBUG: Error extracting HTML from record for {actor_name}: {e}")302 return None
src/llmscraper/scraping/apify_runner.py
1"""2Apify integration for running actors and retrieving results.3"""4
5import logging6from typing import Optional, Dict, Any, List, Union7
8from apify_client import ApifyClient9
10
11class ApifyRunner:12 """Handles running Apify actors and retrieving results."""13 14 def __init__(self, api_token: str):15 """Initialize with API token."""16 if not api_token or not api_token.strip():17 raise ValueError("API token cannot be empty")18 19 self.client = ApifyClient(api_token)20 self.logger = logging.getLogger(__name__)21 22 def run_actor(self, actor_id: str, actor_input: dict, 23 retrieve_from: str = "auto") -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:24 """25 Run an Apify actor and retrieve results.26 27 Args:28 actor_id: The ID of the Apify actor29 actor_input: Input configuration for the actor30 retrieve_from: "auto", "dataset", "key-value-store", or "both"31 32 Returns:33 Retrieved data or None if failed34 """35 if not actor_id or not actor_id.strip():36 raise ValueError("actor_id cannot be empty")37 38 if retrieve_from not in ["auto", "dataset", "key-value-store", "both"]:39 raise ValueError("Invalid retrieve_from option")40 41 # Determine storage type42 if retrieve_from == "auto":43 retrieve_from = "key-value-store" if "website-content-crawler" in actor_id else "dataset"44 45 try:46 self.logger.info(f"Starting Apify actor: {actor_id}")47 48 # Start the actor run49 run = self.client.actor(actor_id).call(run_input=actor_input)50 51 if not run or run.get('status') != 'SUCCEEDED':52 self.logger.error(f"Actor run failed: {run.get('status') if run else 'No run created'}")53 return None54 55 run_id = run.get('id')56 self.logger.info(f"Actor run {run_id} completed successfully")57 58 # Retrieve results based on type59 if retrieve_from == "dataset":60 return self._get_dataset_items(run_id)61 elif retrieve_from == "key-value-store":62 return self._get_key_value_store_items(run_id)63 elif retrieve_from == "both":64 return {65 "dataset": self._get_dataset_items(run_id),66 "key_value_store": self._get_key_value_store_items(run_id)67 }68 69 except Exception as e:70 self.logger.error(f"Error running actor {actor_id}: {str(e)}")71 return None72 73 def _get_dataset_items(self, run_id: str) -> List[Dict[str, Any]]:74 """Get items from the dataset of a run."""75 try:76 dataset_id = self.client.run(run_id).get().get('defaultDatasetId')77 if not dataset_id:78 self.logger.warning(f"No dataset found for run {run_id}")79 return []80 81 dataset_items = list(self.client.dataset(dataset_id).iterate_items())82 self.logger.info(f"Retrieved {len(dataset_items)} items from dataset")83 return dataset_items84 85 except Exception as e:86 self.logger.error(f"Error retrieving dataset items: {str(e)}")87 return []88 89 def _get_key_value_store_items(self, run_id: str) -> Dict[str, Any]:90 """Get items from the key-value store of a run."""91 try:92 kvs_id = self.client.run(run_id).get().get('defaultKeyValueStoreId')93 if not kvs_id:94 self.logger.warning(f"No key-value store found for run {run_id}")95 return {}96 97 kvs = self.client.key_value_store(kvs_id)98 keys = list(kvs.list_keys())99 100 items = {}101 for key_info in keys:102 # Handle case where key_info might be a string or dict103 if isinstance(key_info, dict):104 key_name = key_info.get('key')105 else:106 key_name = str(key_info)107 108 if key_name:109 try:110 value = kvs.get_record(key_name)111 if value:112 # Handle case where value might be a string or dict113 if isinstance(value, dict):114 items[key_name] = value.get('value', value)115 else:116 items[key_name] = value117 except Exception as e:118 self.logger.warning(f"Failed to retrieve key '{key_name}': {str(e)}")119 120 self.logger.info(f"Retrieved {len(items)} items from 
key-value store")121 return items122 123 except Exception as e:124 self.logger.error(f"Error retrieving key-value store items: {str(e)}")125 return {}126
127
128# Legacy functions for backward compatibility129def run_apify_actor(actor_id: str, actor_input: dict, *, api_token: str) -> Optional[List[Dict[str, Any]]]:130 """Legacy function - use ApifyRunner class instead."""131 runner = ApifyRunner(api_token)132 result = runner.run_actor(actor_id, actor_input, "dataset")133 return result if isinstance(result, list) else None134
135
136def run_apify_actor_with_flexible_retrieval(137 actor_id: str, actor_input: dict, *, api_token: str, retrieve_from: str = "auto"138) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:139 """Legacy function - use ApifyRunner class instead."""140 runner = ApifyRunner(api_token)141 return runner.run_actor(actor_id, actor_input, retrieve_from)
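A minimal sketch of running one actor through ApifyRunner (illustrative; the token and page function are placeholders):

runner = ApifyRunner(api_token="<YOUR_APIFY_TOKEN>")
items = runner.run_actor(
    "apify/cheerio-scraper",
    {
        "startUrls": [{"url": "https://example.com"}],
        "maxPagesPerCrawl": 1,
        "pageFunction": "async function pageFunction({ request, $ }) { return { url: request.url, title: $('title').text() }; }",
    },
    retrieve_from="dataset",
)
for item in items or []:
    print(item.get("url"), item.get("title"))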
src/llmscraper/scraping/multi_actor_scraper.py
1"""2Multi-actor scraping functionality.3"""4
5import logging6from typing import Dict, Any7from concurrent.futures import ThreadPoolExecutor, as_completed8
9from .apify_runner import ApifyRunner10from ..models import ActorConfig11
12
13class MultiActorScraper:14 """Scrapes websites using multiple Apify actors simultaneously."""15 16 def __init__(self, api_token: str):17 """Initialize with Apify API token."""18 self.api_token = api_token19 self.runner = ApifyRunner(api_token)20 self.logger = logging.getLogger(__name__)21 22 def scrape_with_multiple_actors(self, target_url: str, 23 actor_configs: Dict[str, ActorConfig] = None) -> Dict[str, str]:24 """25 Scrape a URL with multiple actors and return HTML content.26 27 Args:28 target_url: URL to scrape29 actor_configs: Dictionary of actor configurations to use30 31 Returns:32 Dict mapping actor names to HTML content33 """34 if actor_configs is None:35 # Use default configurations for backward compatibility36 actor_configs = self._get_default_actor_configs(target_url)37 38 # Filter to only enabled actors39 enabled_configs = {name: config for name, config in actor_configs.items() if config.enabled}40 41 if not enabled_configs:42 self.logger.warning("No enabled actors found")43 return {}44 45 results = {}46 47 # Use ThreadPoolExecutor for concurrent execution48 with ThreadPoolExecutor(max_workers=len(enabled_configs)) as executor:49 future_to_actor = {50 executor.submit(self._run_single_actor, name, config): name51 for name, config in enabled_configs.items()52 }53 54 for future in as_completed(future_to_actor):55 actor_name = future_to_actor[future]56 try:57 name, html_content = future.result()58 results[name] = html_content59 except Exception as e:60 self.logger.error(f"Actor {actor_name} failed: {str(e)}")61 results[actor_name] = None62 63 return results64 65 def _get_default_actor_configs(self, target_url: str) -> Dict[str, ActorConfig]:66 """Get default actor configurations for backward compatibility."""67 from ..models import get_default_actor_configs68 69 configs = get_default_actor_configs()70 # Add target URL to all configs71 for config in configs.values():72 config.input['startUrls'] = [{"url": target_url}]73 74 return configs75 76 def _run_single_actor(self, actor_name: str, config) -> tuple[str, str]:77 """78 Run a single actor and extract HTML content.79 80 Args:81 actor_name: Name of the actor82 config: Actor configuration (can be ActorConfig or dict for backward compatibility)83 84 Returns:85 Tuple of (actor_name, html_content)86 """87 try:88 self.logger.info(f"Starting {actor_name}...")89 90 # Handle both ActorConfig and dict formats91 if hasattr(config, 'actor_id'):92 actor_id = config.actor_id93 actor_input = config.input94 else:95 actor_id = config["actor_id"]96 actor_input = config["input"]97 98 result = self.runner.run_actor(99 actor_id,100 actor_input,101 "auto"102 )103 104 if not result:105 self.logger.warning(f"{actor_name} returned no results")106 return actor_name, None107 108 # Extract HTML based on result type109 html_content = self._extract_html_from_result(result, actor_name)110 111 if html_content:112 self.logger.info(f"{actor_name} completed successfully: {len(html_content):,} chars")113 else:114 self.logger.warning(f"{actor_name} returned no HTML content")115 116 return actor_name, html_content117 118 except Exception as e:119 self.logger.error(f"Error running {actor_name}: {str(e)}")120 return actor_name, None121 122 def _extract_html_from_result(self, result: Any, actor_name: str) -> str:123 """Extract HTML content from actor result."""124 try:125 if isinstance(result, list) and result:126 # Dataset result127 item = result[0]128 return item.get('html') or item.get('content', '')129 elif isinstance(result, dict):130 # Key-value store result131 
if 'OUTPUT' in result:132 output = result['OUTPUT']133 if isinstance(output, dict):134 return output.get('html') or output.get('content', '')135 elif isinstance(output, str):136 return output137 138 self.logger.warning(f"Unexpected result format from {actor_name}")139 return None140 141 except Exception as e:142 self.logger.error(f"Error extracting HTML from {actor_name}: {str(e)}")143 return None
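A short sketch of this thread-pool variant, which runs anywhere a plain API token is available, unlike the in-Actor version above (illustrative; the token is a placeholder):

scraper = MultiActorScraper(api_token="<YOUR_APIFY_TOKEN>")
html_by_actor = scraper.scrape_with_multiple_actors("https://example.com")

best_name, best_html = max(
    ((name, html) for name, html in html_by_actor.items() if html),
    key=lambda pair: len(pair[1]),
    default=(None, None),
)
print(f"Largest HTML payload came from {best_name}")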
src/llmscraper/utils/__init__.py
1"""2Utilities module for ScraperCodeGenerator.3"""4
5from .html_utils import is_html, prune_html, extract_text_content, validate_html_structure6from .config import get_api_key, validate_required_keys, setup_logging7
8__all__ = [9 "is_html",10 "prune_html", 11 "extract_text_content",12 "validate_html_structure",13 "get_api_key",14 "validate_required_keys",15 "setup_logging"16]
src/llmscraper/utils/config.py
"""
Configuration and environment utilities.
"""

import os
from typing import Optional


def get_api_key(key_name: str, provided_key: Optional[str] = None) -> Optional[str]:
    """
    Get API key from provided value or environment variable.

    Args:
        key_name: Name of the environment variable
        provided_key: Explicitly provided key (takes precedence)

    Returns:
        API key or None if not found
    """
    if provided_key and provided_key.strip():
        return provided_key.strip()

    return os.getenv(key_name)


def validate_required_keys(**keys) -> dict[str, str]:
    """
    Validate that all required API keys are present.

    Args:
        **keys: Key-value pairs of key names and values

    Returns:
        Dict of validated keys

    Raises:
        ValueError: If any required key is missing
    """
    validated = {}
    missing = []

    for key_name, key_value in keys.items():
        if not key_value or not key_value.strip():
            missing.append(key_name)
        else:
            validated[key_name] = key_value.strip()

    if missing:
        raise ValueError(f"Missing required API keys: {', '.join(missing)}")

    return validated


def setup_logging(level: str = "INFO") -> None:
    """
    Setup logging configuration.

    Args:
        level: Logging level (DEBUG, INFO, WARNING, ERROR)
    """
    import logging

    logging.basicConfig(
        level=getattr(logging, level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler()
        ]
    )
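A short sketch of how these three helpers compose, assuming the keys live in the usual environment variables; the keyword names passed to validate_required_keys are illustrative only.

# Sketch: configure logging, resolve keys from the environment, fail fast if any are missing.
import os

setup_logging("DEBUG")

claude_key = get_api_key("ANTHROPIC_API_KEY")   # falls back to the env var when no explicit key is given
apify_token = get_api_key("APIFY_TOKEN")

# Raises ValueError naming whichever keys are missing or blank.
keys = validate_required_keys(claude_api_key=claude_key, apify_token=apify_token)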
src/llmscraper/utils/config_parser.py
"""
Configuration parser for the ScraperCodeGenerator pipeline.
"""

import json
import logging
from typing import Dict, Any, Optional, Union

from ..models import (
    PipelineConfig, ActorConfig, HTMLPruningConfig, ClaudeModel,
    get_default_actor_configs
)


class ConfigurationParser:
    """Parses and validates configuration from input data."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def parse_from_input(self, input_data: Dict[str, Any]) -> PipelineConfig:
        """
        Parse configuration from input data.

        Args:
            input_data: Raw input data from Actor or CLI

        Returns:
            Parsed and validated PipelineConfig
        """
        config = PipelineConfig()

        # Parse core settings
        config.for_actor = input_data.get('forActor', False)
        config.test_script = input_data.get('testScript', False)
        config.output_script_path = input_data.get('outputScriptPath')

        # Parse Claude settings
        claude_model_str = input_data.get('claudeModel', 'claude-3-5-sonnet-20241022')
        config.claude_model = self._parse_claude_model(claude_model_str)
        config.claude_api_key = input_data.get('claudeApiKey')

        # Parse HTML pruning settings
        config.html_pruning = self._parse_html_pruning_config(input_data)

        # Parse actor configurations
        config.actors = self._parse_actor_configs(
            input_data.get('actors', []), input_data.get('targetUrl', '')
        )

        # Parse execution settings
        config.max_retries = input_data.get('maxRetries', 3)
        config.timeout_seconds = input_data.get('timeout', 60)
        config.concurrent_actors = input_data.get('concurrentActors', True)

        return config

    def _parse_claude_model(self, model_str: str) -> ClaudeModel:
        """Parse Claude model from string."""
        model_mapping = {
            # Claude 4 models
            'claude-sonnet-4-20250514': ClaudeModel.CLAUDE_4_SONNET,
            'claude-opus-4-20250514': ClaudeModel.CLAUDE_4_OPUS,
            'claude-sonnet-4-0': ClaudeModel.CLAUDE_4_SONNET,
            'claude-opus-4-0': ClaudeModel.CLAUDE_4_OPUS,

            # Claude 3.7 models
            'claude-3-7-sonnet-20250219': ClaudeModel.CLAUDE_3_7_SONNET,
            'claude-3-7-sonnet-latest': ClaudeModel.CLAUDE_3_7_SONNET,

            # Claude 3.5 models
            'claude-3-5-sonnet-20241022': ClaudeModel.CLAUDE_3_5_SONNET,
            'claude-3-5-sonnet-latest': ClaudeModel.CLAUDE_3_5_SONNET,
            'claude-3-5-haiku-20241022': ClaudeModel.CLAUDE_3_5_HAIKU,

            # Claude 3 models
            'claude-3-sonnet-20240229': ClaudeModel.CLAUDE_3_SONNET,
            'claude-3-haiku-20240307': ClaudeModel.CLAUDE_3_HAIKU,

            # Aliases
            'claude-4': ClaudeModel.CLAUDE_4_SONNET,
            'claude-4-sonnet': ClaudeModel.CLAUDE_4_SONNET,
            'claude-4-opus': ClaudeModel.CLAUDE_4_OPUS,
            'sonnet-4': ClaudeModel.CLAUDE_4_SONNET,
            'opus-4': ClaudeModel.CLAUDE_4_OPUS,
            'sonnet-3.7': ClaudeModel.CLAUDE_3_7_SONNET,
            'sonnet-3.5': ClaudeModel.CLAUDE_3_5_SONNET,
            'haiku-3.5': ClaudeModel.CLAUDE_3_5_HAIKU,
            'haiku': ClaudeModel.CLAUDE_3_HAIKU,
            'sonnet': ClaudeModel.CLAUDE_3_SONNET,
        }

        return model_mapping.get(model_str.lower(), ClaudeModel.CLAUDE_4_SONNET)

    def _parse_html_pruning_config(self, input_data: Dict[str, Any]) -> HTMLPruningConfig:
        """Parse HTML pruning configuration from flat input structure."""
        # Convert percentage from 0-100 to 0.0-1.0 if needed
        prune_percentage = input_data.get('htmlPrunePercentage', 80)
        if prune_percentage > 1:
            prune_percentage = prune_percentage / 100.0

        return HTMLPruningConfig(
            enabled=input_data.get('htmlPruningEnabled', True),
            max_list_items=input_data.get('htmlMaxListItems', 5),
            max_text_length=input_data.get('htmlMaxTextLength', 500),
            prune_before_evaluation=input_data.get('htmlPruneBeforeEvaluation', True),
            prune_percentage=prune_percentage
        )

    def _parse_actor_configs(self, actors_data: Any, target_url: str) -> Dict[str, ActorConfig]:
        """Parse actor configurations with improved validation."""
        # Start with default configurations
        default_configs = get_default_actor_configs()

        # Handle both array and object formats
        if isinstance(actors_data, list):
            # New array format: [{"name": "actor-name", "enabled": true, "input": {...}}]
            return self._parse_actor_configs_from_array(actors_data, target_url, default_configs)
        elif isinstance(actors_data, dict):
            # Legacy object format: {"actor-name": true, "other-actor": {"enabled": true, "input": {...}}}
            return self._parse_actor_configs_from_object(actors_data, target_url, default_configs)
        else:
            # No actor configuration provided, use defaults
            for config in default_configs.values():
                config.input['startUrls'] = [{"url": target_url}]
            return default_configs

    def _parse_actor_configs_from_array(self, actors_data: list, target_url: str, default_configs: Dict[str, ActorConfig]) -> Dict[str, ActorConfig]:
        """Parse actor configurations from array format."""
        parsed_configs = {}

        for actor_item in actors_data:
            if not isinstance(actor_item, dict):
                self.logger.warning(f"Invalid actor configuration format: {actor_item}")
                continue

            actor_name = actor_item.get('name')
            if not actor_name:
                self.logger.warning(f"Actor configuration missing 'name' field: {actor_item}")
                continue

            try:
                # Check if this is a known actor
                if actor_name in default_configs:
                    config = default_configs[actor_name]
                    config.enabled = actor_item.get('enabled', True)
                    # Merge custom input with defaults
                    if 'input' in actor_item:
                        config.input.update(actor_item['input'])
                else:
                    # Custom actor
                    config = ActorConfig(
                        actor_id=actor_item.get('actorId', actor_name),
                        name=actor_name,
                        description=actor_item.get('description', ''),
                        enabled=actor_item.get('enabled', True),
                        input=actor_item.get('input', {})
                    )

                # Ensure startUrls is set
                if 'startUrls' not in config.input:
                    config.input['startUrls'] = [{"url": target_url}]

                parsed_configs[actor_name] = config

            except Exception as e:
                self.logger.error(f"Error parsing configuration for actor '{actor_name}': {e}")
                continue

        # If no valid configs, use defaults
        if not parsed_configs:
            for config in default_configs.values():
                config.input['startUrls'] = [{"url": target_url}]
            return default_configs

        return parsed_configs

    def _parse_actor_configs_from_object(self, actors_data: dict, target_url: str, default_configs: Dict[str, ActorConfig]) -> Dict[str, ActorConfig]:
        """Parse actor configurations from legacy object format."""
        parsed_configs = {}

        for actor_name, actor_data in actors_data.items():
            try:
                if isinstance(actor_data, dict):
                    # Full configuration object
                    if 'actorId' in actor_data:
                        # Custom actor with explicit ID
                        config = ActorConfig(
                            actor_id=actor_data.get('actorId'),
                            name=actor_data.get('name', actor_name),
                            description=actor_data.get('description', ''),
                            enabled=actor_data.get('enabled', True),
                            input=actor_data.get('input', {})
                        )
                    else:
                        # Partial configuration for known actor
                        if actor_name in default_configs:
                            config = default_configs[actor_name]
                            config.enabled = actor_data.get('enabled', True)
                            # Merge custom input with defaults
                            if 'input' in actor_data:
                                config.input.update(actor_data['input'])
                        else:
                            self.logger.warning(f"Unknown actor '{actor_name}' with partial config, skipping")
                            continue

                    # Ensure startUrls is set
                    if 'startUrls' not in config.input:
                        config.input['startUrls'] = [{"url": target_url}]

                    # Validate actor ID
                    if not config.actor_id:
                        self.logger.error(f"Actor '{actor_name}' missing actor_id")
                        continue

                    parsed_configs[actor_name] = config

                elif isinstance(actor_data, bool):
                    # Simple boolean enable/disable
                    if actor_name in default_configs:
                        config = default_configs[actor_name]
                        config.enabled = actor_data
                        config.input['startUrls'] = [{"url": target_url}]
                        parsed_configs[actor_name] = config
                    else:
                        self.logger.warning(f"Unknown actor '{actor_name}' with boolean config, skipping")

                elif isinstance(actor_data, str):
                    # Just actor ID provided
                    config = ActorConfig(
                        actor_id=actor_data,
                        name=actor_name,
                        enabled=True,
                        input={'startUrls': [{"url": target_url}]}
                    )
                    parsed_configs[actor_name] = config

                else:
                    self.logger.warning(f"Invalid configuration format for actor '{actor_name}': {type(actor_data)}")

            except Exception as e:
                self.logger.error(f"Error parsing configuration for actor '{actor_name}': {e}")
                continue

        # Ensure at least one actor is enabled
        if not any(config.enabled for config in parsed_configs.values()):
            self.logger.warning("No actors enabled, falling back to defaults")
            for config in default_configs.values():
                config.input['startUrls'] = [{"url": target_url}]
            return default_configs

        return parsed_configs

    def generate_example_config(self) -> Dict[str, Any]:
        """Generate an example configuration for documentation."""
        return {
            "targetUrl": "https://example.com",
            "userGoal": "Extract product information",
            "claudeApiKey": "sk-ant-api03-...",

            # Core settings
            "forActor": False,
            "testScript": True,
            "outputScriptPath": "generated_scraper.py",

            # Claude model selection
            "claudeModel": "claude-sonnet-4-20250514",  # or "claude-4", "sonnet-4", "opus-4", "sonnet-3.7", "sonnet-3.5", "haiku"

            # HTML pruning settings
            "htmlPruningEnabled": True,
            "htmlMaxListItems": 5,
            "htmlMaxTextLength": 500,
            "htmlPrunePercentage": 80,
            "htmlPruneBeforeEvaluation": True,

            # Actor configurations (new array format)
            "actors": [
                {
                    "name": "cheerio-scraper",
                    "enabled": True,
                    "input": {
                        "maxRequestRetries": 3,
                        "requestTimeoutSecs": 30,
                        "maxPagesPerCrawl": 1,
                        "pageFunction": """
                            async function pageFunction(context) {
                                const { request, log, $ } = context;
                                try {
                                    const title = $('title').text() || '';
                                    const html = $('html').html() || '';
                                    return {
                                        url: request.url,
                                        title: title,
                                        html: html
                                    };
                                } catch (error) {
                                    log.error('Error in pageFunction:', error);
                                    return {
                                        url: request.url,
                                        title: '',
                                        html: ''
                                    };
                                }
                            }
                        """,
                        "proxyConfiguration": {"useApifyProxy": True}
                    }
                },
                {
                    "name": "web-scraper",
                    "enabled": True,
                    "input": {
                        "maxRequestRetries": 3,
                        "requestTimeoutSecs": 30,
                        "maxPagesPerCrawl": 1,
                        "pageFunction": """
                            async function pageFunction(context) {
                                const { request, log, page } = context;
                                try {
                                    const title = await page.title();
                                    const html = await page.content();
                                    return {
                                        url: request.url,
                                        title: title,
                                        html: html
                                    };
                                } catch (error) {
                                    log.error('Error in pageFunction:', error);
                                    return {
                                        url: request.url,
                                        title: '',
                                        html: ''
                                    };
                                }
                            }
                        """,
                        "proxyConfiguration": {"useApifyProxy": True}
                    }
                },
                {
                    "name": "website-content-crawler",
                    "enabled": False,
                    "input": {
                        "maxCrawlPages": 1,
                        "crawler": "playwright",
                        "proxyConfiguration": {"useApifyProxy": True}
                    }
                },
                {
                    "name": "custom-scraper",
                    "actorId": "your-username/custom-scraper",
                    "description": "My custom scraping actor",
                    "enabled": True,
                    "input": {
                        "maxRequestRetries": 5,
                        "requestTimeoutSecs": 60,
                        "customParam": "value"
                    }
                },
                {
                    "name": "playwright-scraper",
                    "enabled": True,
                    "input": {
                        "maxRequestRetries": 2,
                        "requestTimeoutSecs": 45,
                        "maxPagesPerCrawl": 1,
                        "pageFunction": """
                            async function pageFunction(context) {
                                const { request, log, page } = context;
                                try {
                                    const title = await page.title();
                                    const html = await page.content();
                                    return {
                                        url: request.url,
                                        title: title,
                                        html: html
                                    };
                                } catch (error) {
                                    log.error('Error in pageFunction:', error);
                                    return {
                                        url: request.url,
                                        title: '',
                                        html: ''
                                    };
                                }
                            }
                        """,
                        "proxyConfiguration": {"useApifyProxy": True}
                    }
                }
            ],

            # Execution settings
            "maxRetries": 3,
            "timeout": 60,
            "concurrentActors": True
        }

    def validate_config(self, config: PipelineConfig) -> bool:
        """
        Validate configuration and log any issues.

        Args:
            config: Configuration to validate

        Returns:
            True if configuration is valid
        """
        is_valid = True

        # Check if at least one actor is enabled
        enabled_actors = config.get_enabled_actors()
        if not enabled_actors:
            self.logger.error("No actors are enabled in configuration")
            is_valid = False

        # Check Claude API key
        if not config.claude_api_key:
            self.logger.error("Claude API key is required")
            is_valid = False

        # Validate HTML pruning settings
        prune_percentage = config.html_pruning.prune_percentage
        if prune_percentage < 0 or prune_percentage > 1:
            self.logger.error("HTML pruning percentage must be between 0 and 1")
            is_valid = False

        if config.html_pruning.max_list_items < 1:
            self.logger.error("Max list items must be at least 1")
            is_valid = False

        if config.html_pruning.max_text_length < 1:
            self.logger.error("Max text length must be at least 1")
            is_valid = False

        # Validate actor configurations
        for actor_name, actor_config in enabled_actors.items():
            if not actor_config.actor_id:
                self.logger.error(f"Actor '{actor_name}' missing actor_id")
                is_valid = False

            # Validate actor_id format
            if actor_config.actor_id and '/' not in actor_config.actor_id:
                self.logger.warning(f"Actor '{actor_name}' has unusual actor_id format: {actor_config.actor_id}")

            # Validate required input fields
            if not actor_config.input.get('startUrls'):
                self.logger.error(f"Actor '{actor_name}' missing startUrls in input")
                is_valid = False

        return is_valid
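For reference, a condensed sketch of driving this parser with a flat input payload similar to generate_example_config() above; the literal values (URL, API key, actor list) are placeholders and not endorsed defaults.

# Sketch: flat Actor/CLI input -> validated PipelineConfig.
parser = ConfigurationParser()

input_data = {
    "targetUrl": "https://example.com",
    "claudeApiKey": "sk-ant-api03-...",   # placeholder
    "claudeModel": "sonnet-3.5",          # alias resolved by _parse_claude_model
    "htmlPrunePercentage": 80,            # accepted as 0-100 or 0.0-1.0
    "actors": [
        {"name": "cheerio-scraper", "enabled": True}
    ],
}

config = parser.parse_from_input(input_data)
if not parser.validate_config(config):
    raise ValueError("Pipeline configuration failed validation")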
src/llmscraper/utils/html_utils.py
"""
HTML utility functions for processing web content.
"""

from typing import Optional
from bs4 import BeautifulSoup, Comment, NavigableString
import re


def is_html(text_content: str) -> bool:
    """
    Check if a string is likely HTML content.

    Args:
        text_content: The text content to check

    Returns:
        True if the content appears to be HTML
    """
    if not text_content or not isinstance(text_content, str):
        return False

    content_lower = text_content.lower()
    return '<html>' in content_lower and '<body>' in content_lower


def prune_html(html_content: str, max_list_items: int = 5, max_text_length: int = 500,
               prune_percentage: float = 0.8) -> str:
    """
    Clean and shorten HTML content to reduce token count while preserving structure.

    Args:
        html_content: The raw HTML content to process
        max_list_items: Maximum number of list items to keep
        max_text_length: Maximum length of text content in any tag
        prune_percentage: Percentage of content to keep (0.0-1.0)

    Returns:
        The cleaned and shortened HTML
    """
    if not html_content or not isinstance(html_content, str):
        return ""

    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove unwanted tags entirely
        unwanted_tags = ['script', 'style', 'svg', 'noscript', 'meta', 'link']
        for tag_name in unwanted_tags:
            for tag in soup.find_all(tag_name):
                tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Remove unwanted attributes from all tags
        allowed_attributes = {'id', 'class', 'href', 'src', 'alt', 'title'}
        for tag in soup.find_all(True):
            if hasattr(tag, 'attrs'):
                tag.attrs = {key: value for key, value in tag.attrs.items()
                             if key in allowed_attributes}

        # Truncate lists and tables
        list_and_table_tags = ['ul', 'ol', 'table', 'tbody', 'thead']
        for tag_name in list_and_table_tags:
            for tag in soup.find_all(tag_name):
                children = list(tag.children)
                # Filter out NavigableString objects (text nodes, whitespace)
                non_text_children = [child for child in children if not isinstance(child, NavigableString)]

                if len(non_text_children) > max_list_items:
                    # Keep only the first max_list_items children
                    for child in non_text_children[max_list_items:]:
                        child.decompose()

                    # Add a comment indicating truncation
                    if tag.name in ['ul', 'ol']:
                        truncation_notice = soup.new_tag("li")
                        truncation_notice.string = f"... ({len(non_text_children) - max_list_items} more items)"
                        tag.append(truncation_notice)
                    elif tag.name == 'table':
                        truncation_notice = soup.new_tag("tr")
                        td = soup.new_tag("td")
                        td.string = f"... ({len(non_text_children) - max_list_items} more rows)"
                        truncation_notice.append(td)
                        tag.append(truncation_notice)

        # Truncate long text content
        for element in soup.find_all(string=True):
            if isinstance(element, NavigableString) and not isinstance(element, Comment):
                text = str(element).strip()
                if len(text) > max_text_length:
                    element.replace_with(text[:max_text_length] + "...")

        # Apply percentage-based pruning if specified
        if prune_percentage < 1.0:
            # Calculate target length based on percentage
            target_length = int(len(str(soup)) * prune_percentage)
            current_html = str(soup)

            if len(current_html) > target_length:
                # Additional aggressive pruning to meet percentage target
                # Remove more list items
                for tag_name in ['ul', 'ol', 'table', 'tbody', 'thead']:
                    for tag in soup.find_all(tag_name):
                        children = list(tag.children)
                        non_text_children = [child for child in children if not isinstance(child, NavigableString)]

                        # Keep even fewer items if we need more aggressive pruning
                        aggressive_max = max(1, int(max_list_items * prune_percentage))
                        if len(non_text_children) > aggressive_max:
                            for child in non_text_children[aggressive_max:]:
                                child.decompose()

                # More aggressive text truncation
                aggressive_text_length = int(max_text_length * prune_percentage)
                for element in soup.find_all(string=True):
                    if isinstance(element, NavigableString) and not isinstance(element, Comment):
                        text = str(element).strip()
                        if len(text) > aggressive_text_length:
                            element.replace_with(text[:aggressive_text_length] + "...")

        # Return the cleaned HTML
        return str(soup)

    except Exception as e:
        # If parsing fails, return original content truncated
        return html_content[:max_text_length * 10] if len(html_content) > max_text_length * 10 else html_content


def extract_text_content(html_content: str) -> str:
    """
    Extract clean text content from HTML.

    Args:
        html_content: HTML content to extract text from

    Returns:
        Clean text content
    """
    if not html_content:
        return ""

    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    except Exception:
        return html_content


def validate_html_structure(html_content: str) -> bool:
    """
    Validate basic HTML structure.

    Args:
        html_content: HTML content to validate

    Returns:
        True if HTML has basic valid structure
    """
    if not html_content:
        return False

    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Check for basic HTML elements
        has_html_tag = soup.find('html') is not None
        has_body_tag = soup.find('body') is not None
        has_content = len(soup.get_text(strip=True)) > 0

        return has_html_tag or has_body_tag or has_content

    except Exception:
        return False
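A minimal sketch of the intended flow, assuming the HTML has already been fetched by one of the actors; the inline sample markup is synthetic and only demonstrates the list-truncation behaviour.

# Sketch: shrink fetched HTML before handing it to the LLM.
raw_html = (
    "<html><body><ul>"
    + "".join(f"<li>item {i}</li>" for i in range(50))
    + "</ul></body></html>"
)

if is_html(raw_html):
    pruned = prune_html(raw_html, max_list_items=5, max_text_length=500, prune_percentage=0.8)
    print(f"{len(raw_html)} -> {len(pruned)} chars")
    print(extract_text_content(pruned))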
src/llmscraper/utils/llm_config_parser.py
"""
Configuration parser for the LLM Scraper Actor.
"""
import os
import logging
from typing import Dict, Any, Optional

from ..llm_scraper.models import LLMScraperInput


class LLMScraperConfigParser:
    """Parses and validates configuration from input data and environment variables."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def parse_from_input(self, input_data: Dict[str, Any]) -> LLMScraperInput:
        """
        Parse configuration from input data with environment variable fallbacks.

        Args:
            input_data: Raw input data from Actor

        Returns:
            Parsed and validated LLMScraperInput
        """
        # Get required fields
        target_url = input_data.get('targetUrl')
        if not target_url:
            raise ValueError("targetUrl is required")

        extraction_goal = input_data.get('extractionGoal')
        if not extraction_goal:
            raise ValueError("extractionGoal is required")

        # Get API key from input or environment
        anthropic_api_key = (
            input_data.get('claudeApiKey') or
            os.getenv('ANTHROPIC_API_KEY') or
            os.getenv('CLAUDE_API_KEY')
        )

        if not anthropic_api_key:
            raise ValueError(
                "Claude API key is required. Provide it via 'claudeApiKey' input field "
                "or set ANTHROPIC_API_KEY environment variable."
            )

        # Parse optional configuration with defaults
        config = LLMScraperInput(
            target_url=target_url,
            extraction_goal=extraction_goal,
            max_actor_attempts=input_data.get('maxActorAttempts', 10),
            max_retries_per_actor=input_data.get('maxRetriesPerActor', 3),
            max_time_minutes=input_data.get('maxTimeMinutes', 30),
            anthropic_api_key=anthropic_api_key,
            mcp_url=input_data.get('mcpUrl', 'https://mcp.apify.com/sse?enableAddingActors=true'),
            model_name=input_data.get('modelName', 'claude-3-5-haiku-latest'),
            debug_mode=input_data.get('debugMode', False),
            prefer_specific_actors=input_data.get('preferSpecificActors', True),
            min_data_quality_score=input_data.get('minDataQualityScore', 0.7),
            enable_proxy=input_data.get('enableProxy', True)
        )

        self.logger.info(f"Parsed configuration for URL: {target_url}")
        if config.debug_mode:
            self.logger.info(f"Configuration: {config}")

        return config

    def validate_config(self, config: LLMScraperInput) -> bool:
        """
        Validate configuration and log any issues.

        Args:
            config: Configuration to validate

        Returns:
            True if configuration is valid
        """
        is_valid = True

        # Validate URL
        if not config.target_url.startswith(('http://', 'https://')):
            self.logger.error(f"Invalid target URL: {config.target_url}")
            is_valid = False

        # Validate API key format
        if not config.anthropic_api_key.startswith('sk-ant-'):
            self.logger.warning("API key format appears invalid (should start with 'sk-ant-')")

        # Validate numeric ranges
        if config.max_actor_attempts < 1 or config.max_actor_attempts > 50:
            self.logger.error(f"max_actor_attempts must be between 1 and 50, got: {config.max_actor_attempts}")
            is_valid = False

        if config.max_retries_per_actor < 1 or config.max_retries_per_actor > 10:
            self.logger.error(f"max_retries_per_actor must be between 1 and 10, got: {config.max_retries_per_actor}")
            is_valid = False

        if config.max_time_minutes < 1 or config.max_time_minutes > 240:
            self.logger.error(f"max_time_minutes must be between 1 and 240, got: {config.max_time_minutes}")
            is_valid = False

        if config.min_data_quality_score < 0.0 or config.min_data_quality_score > 1.0:
            self.logger.error(f"min_data_quality_score must be between 0.0 and 1.0, got: {config.min_data_quality_score}")
            is_valid = False

        # Validate model name
        valid_models = [
            'claude-3-5-haiku-latest',
            'claude-3-5-sonnet-latest',
            'claude-3-opus-latest',
            'claude-3-haiku-20240307',
            'claude-3-sonnet-20240229',
            'claude-3-5-sonnet-20241022'
        ]
        if config.model_name not in valid_models:
            self.logger.warning(f"Unknown model name: {config.model_name}. Valid options: {valid_models}")

        return is_valid

    def generate_example_config(self) -> Dict[str, Any]:
        """Generate an example configuration for documentation."""
        return {
            "targetUrl": "https://books.toscrape.com/",
            "extractionGoal": "Extract book information including title, price, rating, and availability",
            "claudeApiKey": "sk-ant-api03-...",
            "maxActorAttempts": 5,
            "maxRetriesPerActor": 3,
            "maxTimeMinutes": 20,
            "mcpUrl": "https://mcp.apify.com/sse?enableAddingActors=true",
            "modelName": "claude-3-5-haiku-latest",
            "debugMode": False,
            "preferSpecificActors": True,
            "minDataQualityScore": 0.8,
            "enableProxy": True
        }
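A small sketch tying the parser together with its own example payload; the final print is illustrative and not part of the Actor flow, and a real run would supply an actual Claude API key rather than the placeholder inside the example config.

# Sketch: parse and validate input using the example payload above.
parser = LLMScraperConfigParser()

example_input = parser.generate_example_config()   # contains a placeholder API key
config = parser.parse_from_input(example_input)    # raises ValueError if required fields are missing

if parser.validate_config(config):
    print(f"Ready to scrape {config.target_url} with {config.model_name}")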