
ScraperCodeGenerator

An intelligent web scraping tool that automatically generates custom scraping code for any website.

Pricing: Pay per usage
Last modified: 2 months ago
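The Actor takes targetUrl, userGoal, and claudeApiKey in its input, pushes structured results to the default dataset, and saves the generated scraper code under the GENERATED_SCRIPT.py key in the run's key-value store (see main.py below). As a minimal sketch of calling it from Python with the apify-client package — the actor ID and environment variable names here are placeholders, not part of the project:

# Sketch: run the Actor via apify-client and fetch its outputs.
# "your-username/scraper-code-generator" and the env var names are placeholders.
import os

from apify_client import ApifyClient

client = ApifyClient(os.environ["APIFY_TOKEN"])

run = client.actor("your-username/scraper-code-generator").call(
    run_input={
        "targetUrl": "https://example.com/products",      # page to scrape
        "userGoal": "Extract product names and prices",   # what the generated script should do
        "claudeApiKey": os.environ["ANTHROPIC_API_KEY"],  # key used for code generation
    }
)

# The generated scraper code is stored under the GENERATED_SCRIPT.py key.
record = client.key_value_store(run["defaultKeyValueStoreId"]).get_record("GENERATED_SCRIPT.py")
print(record["value"])

# Structured results (url, title, data, quality_scores, ...) land in the default dataset.
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)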
# --- General ---
.DS_Store
.env
.env.*

# --- Logs ---
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*

# --- IDEs ---
.vscode/*
!.vscode/extensions.json
.idea/
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

# --- Python ---
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
.venv/
venv/
ENV/
env/
.env
.env.*
.pytest_cache/
.coverage
htmlcov/
.tox/
.cache
.mypy_cache/
.dmypy.json
dmypy.json

# Project specific
scraped_results/*.html

# --- Node.js / Frontend ---
frontend/node_modules/
frontend/dist/
frontend/dist-ssr/
frontend/.pnp
frontend/.pnp.js
frontend/.npm
node_modules
dist
dist-ssr
*.local

# Added by Apify CLI
storage
.venv

# --- Apify ---
storage/
apify_storage/

# --- Local test files ---
input.json
test_*
3.10
# Use the official Apify Python base image
FROM apify/actor-python:3.11

# Copy requirements and install dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy the source code
COPY . ./

# Set the entrypoint
CMD ["python", "main.py"]
1"""2Apify Actor entry point for LLMScraper.3"""4
5import asyncio6import os7import sys8
9# Add src to path10sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))11
12from apify import Actor13from llmscraper import run_intelligent_scraper14from llmscraper.utils.config_parser import ConfigurationParser15from llmscraper.models import PipelineConfig16
17
18async def main():19 """Main actor function."""20 async with Actor:21 Actor.log.info("DEBUG: Inside Actor context")22 Actor.log.info("🚀 LLMScraper Actor starting...")23 24 # Get input25 actor_input = await Actor.get_input() or {}26 27 # Parse configuration from input28 config_parser = ConfigurationParser()29 config = config_parser.parse_from_input(actor_input)30 31 # Validate configuration32 if not config_parser.validate_config(config):33 raise ValueError("Invalid configuration provided")34 35 # Extract core parameters36 target_url = actor_input.get('targetUrl')37 user_goal = actor_input.get('userGoal')38 claude_api_key = actor_input.get('claudeApiKey') 39 test_script = config.test_script40 41 # Use actor token for Apify operations42 apify_token = Actor.config.token43 44 if not target_url:45 raise ValueError("targetUrl is required")46 47 if not user_goal:48 raise ValueError("userGoal is required")49 50 if not claude_api_key:51 raise ValueError("claudeApiKey is required")52 53 Actor.log.info(f"Target URL: {target_url}")54 Actor.log.info(f"User Goal: {user_goal}")55 Actor.log.info(f"Configuration: {len(config.get_enabled_actors())} actors enabled")56 Actor.log.info(f"Claude Model: {config.claude_model.value}")57 Actor.log.info(f"HTML Pruning: {config.html_pruning.enabled} (percentage: {config.html_pruning.prune_percentage})")58 59 Actor.log.info("DEBUG: About to call run_intelligent_scraper")60 61 try:62 # Run the intelligent scraper63 Actor.log.info("DEBUG: About to call run_intelligent_scraper with parameters:")64 Actor.log.info(f" - target_url: {target_url}")65 Actor.log.info(f" - user_goal: {user_goal}")66 Actor.log.info(f" - has claude_api_key: {bool(claude_api_key)}")67 Actor.log.info(f" - has apify_token: {bool(apify_token)}")68 Actor.log.info(f" - test_script: {test_script}")69 70 results = await run_intelligent_scraper(71 target_url=target_url,72 user_goal=user_goal,73 claude_api_key=claude_api_key,74 apify_token=apify_token,75 for_actor=config.for_actor, # Use the parsed configuration76 actor_logger=Actor.log, # Pass Actor logger77 test_script=test_script, # Use the user's preference78 config=config # Pass the parsed configuration79 )80 81 Actor.log.info("DEBUG: run_intelligent_scraper completed")82 Actor.log.info(f"DEBUG: Results type: {type(results)}")83 Actor.log.info(f"DEBUG: Results success: {getattr(results, 'success', 'No success attr')}")84 85 if hasattr(results, 'error_message') and results.error_message:86 Actor.log.error(f"DEBUG: Results error: {results.error_message}")87 88 if hasattr(results, 'quality_scores') and results.quality_scores:89 Actor.log.info(f"DEBUG: Quality scores: {results.quality_scores}")90 91 if hasattr(results, 'best_actor') and results.best_actor:92 Actor.log.info(f"DEBUG: Best actor: {results.best_actor}")93 94 if hasattr(results, 'generated_script') and results.generated_script:95 Actor.log.info(f"DEBUG: Generated script length: {len(results.generated_script)} characters")96 97 if hasattr(results, 'extracted_data') and results.extracted_data:98 Actor.log.info(f"DEBUG: Test extraction successful: {len(results.extracted_data) if isinstance(results.extracted_data, list) else 1} items")99 100 if results.success:101 Actor.log.info(f"Scraping completed successfully!")102 103 # Save the generated script to key-value store (always done)104 if hasattr(results, 'generated_script') and results.generated_script:105 Actor.log.info("💾 Saving generated script to key-value store...")106 await Actor.set_value("GENERATED_SCRIPT.py", results.generated_script)107 
Actor.log.info("✅ Generated script saved to key-value store as 'GENERATED_SCRIPT'")108 109 # Show the generated script in actor mode110 Actor.log.info("\\n" + "="*60)111 Actor.log.info("📄 GENERATED PYTHON SCRIPT - can be found in Storage: Key-value store and Dataset")112 Actor.log.info("="*60)113 Actor.log.info(results.generated_script)114 Actor.log.info("="*60)115 116 # Log test results if testing was enabled117 if test_script and hasattr(results, 'extracted_data') and results.extracted_data:118 data_count = len(results.extracted_data) if isinstance(results.extracted_data, list) else 1119 Actor.log.info(f"🧪 Script testing was enabled and successful: extracted {data_count} items")120 elif test_script:121 Actor.log.warning("🧪 Script testing was enabled but no test data was extracted")122 else:123 Actor.log.info("⏭️ Script testing was disabled - script saved without testing")124 125 # Create output data126 output_data = {127 "url": target_url,128 "title": getattr(results, 'title', 'N/A'),129 "data": results.extracted_data if hasattr(results, 'extracted_data') else {},130 "generated_script": getattr(results, 'generated_script', ''),131 "best_actor": getattr(results, 'best_actor', ''),132 "quality_scores": getattr(results, 'quality_scores', {}),133 "script_tested": test_script,134 "timestamp": "2025-06-29T20:00:00Z", # You might want to use actual timestamp135 "success": True,136 "error": None137 }138 139 # Push results to default dataset140 await Actor.push_data([output_data])141 else:142 Actor.log.error(f"Scraping failed: {results.error_message}")143 144 # Still push error data for tracking145 error_data = {146 "url": target_url,147 "title": "N/A",148 "data": {},149 "timestamp": "2025-06-29T20:00:00Z",150 "success": False,151 "error": results.error_message152 }153 154 await Actor.push_data([error_data])155 156 Actor.log.info("✅ LLMScraper Actor finished successfully!")157 158 except Exception as e:159 Actor.log.error(f"❌ Error during scraping: {str(e)}")160 raise161
162
163if __name__ == "__main__":164 asyncio.run(main())
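For quick local debugging outside the Apify platform, a minimal sketch that calls the same pipeline directly — it assumes the src/ layout added to sys.path above, passes only the keyword arguments main.py derives from its required input (the for_actor/config arguments are left to llmscraper's defaults), and uses illustrative environment variable names:

# Local sketch: invoke the scraper pipeline without the Actor wrapper.
# Mirrors the call in main.py; env var names are illustrative placeholders.
import asyncio
import logging
import os
import sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from llmscraper import run_intelligent_scraper


async def local_run():
    logging.basicConfig(level=logging.INFO)

    results = await run_intelligent_scraper(
        target_url="https://example.com/products",
        user_goal="Extract product names and prices",
        claude_api_key=os.environ["ANTHROPIC_API_KEY"],
        apify_token=os.environ.get("APIFY_TOKEN"),
        actor_logger=logging.getLogger("llmscraper"),
        test_script=False,
    )

    if getattr(results, "success", False):
        print(results.generated_script)
    else:
        print("Failed:", getattr(results, "error_message", "unknown error"))


if __name__ == "__main__":
    asyncio.run(local_run())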
{ "name": "scraper-code-generator", "version": "0.1.0", "description": "Intelligent web scraping framework using AI-powered quality evaluation", "main": "main.py", "scripts": { "start": "python main.py" }, "keywords": [ "web-scraping", "artificial-intelligence", "ai-powered", "data-extraction", "apify", "claude-ai", "intelligent-scraping", "automated-scraping" ], "dependencies": {}, "author": "", "license": "MIT"}
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "llmscraper"
version = "0.1.0"
description = "Intelligent web scraping framework using AI-powered quality evaluation and multiple scraping strategies"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "anthropic>=0.54.0",
    "apify-client>=1.11.0",
    "beautifulsoup4>=4.12.0",
    "apify>=1.5.0",
]

[project.scripts]
llmscraper = "llmscraper.main:main"

[project.optional-dependencies]
dev = [
    "pytest>=7.0",
    "pytest-asyncio>=0.21.0",
    "black>=23.0",
    "isort>=5.12.0",
    "mypy>=1.5.0",
]

[tool.hatch.build.targets.wheel]
packages = ["src/llmscraper"]

[tool.hatch.build.targets.sdist]
include = [
    "/src",
    "/tests",
    "/examples.json",
    "/README.md",
]
apify>=1.5.0
apify-client>=1.5.0
anthropic>=0.7.0
beautifulsoup4>=4.12.0
version = 1
revision = 2
requires-python = ">=3.10"
[[package]]name = "annotated-types"version = "0.7.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },]
[[package]]name = "anthropic"version = "0.54.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "anyio" }, { name = "distro" }, { name = "httpx" }, { name = "jiter" }, { name = "pydantic" }, { name = "sniffio" }, { name = "typing-extensions" },]sdist = { url = "https://files.pythonhosted.org/packages/89/28/80cb9bb6e7ce77d404145b51da4257455805c17f0a6be528ff3286e3882f/anthropic-0.54.0.tar.gz", hash = "sha256:5e6f997d97ce8e70eac603c3ec2e7f23addeff953fbbb76b19430562bb6ba815", size = 312376, upload-time = "2025-06-11T02:46:27.642Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/de/b9/6ffb48e82c5e97b03cecee872d134a6b6666c2767b2d32ed709f3a60a8fe/anthropic-0.54.0-py3-none-any.whl", hash = "sha256:c1062a0a905daeec17ca9c06c401e4b3f24cb0495841d29d752568a1d4018d56", size = 288774, upload-time = "2025-06-11T02:46:25.578Z" },]
[[package]]name = "anyio"version = "4.9.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, { name = "idna" }, { name = "sniffio" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" },]sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949, upload-time = "2025-03-17T00:02:54.77Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c", size = 100916, upload-time = "2025-03-17T00:02:52.713Z" },]
[[package]]name = "apify-client"version = "1.11.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "apify-shared" }, { name = "colorama" }, { name = "httpx" }, { name = "more-itertools" },]sdist = { url = "https://files.pythonhosted.org/packages/49/44/b7cae857f2129d4093bc5a0a2267fcbba7905207a0b7cc424dc3c7c90291/apify_client-1.11.0.tar.gz", hash = "sha256:c2e151754c35be9bc7c1028bf7cb127aeb1ffa2fbd1ec1ad7e97b901deb32e08", size = 346095, upload-time = "2025-06-13T11:46:39.129Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/8f/24/d3273bfe5b4a96fd60c8d554edbab99274fae8cb2347b96f2e3fa0bc4d5b/apify_client-1.11.0-py3-none-any.whl", hash = "sha256:9d691960bdbeee17624a2a82aafc4f0bfba9b48820a48f559b7eba76bf01cb3c", size = 82550, upload-time = "2025-06-13T11:46:37.483Z" },]
[[package]]name = "apify-shared"version = "1.4.1"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/b2/a6/c8e2fa0b3bdc479d3ecde778e2381af199f910cf7c8baa3c207bcfe26e47/apify_shared-1.4.1.tar.gz", hash = "sha256:16e617c840fd27bf38d980f079c0b867c7378f68c7006b3d5a7d530d43930507", size = 13871, upload-time = "2025-04-28T12:20:01.113Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/1d/f3/3446c8a7986fdc087024d4e174e4b3f587097a9b28f6f8e8c788199225b2/apify_shared-1.4.1-py3-none-any.whl", hash = "sha256:abac5712b6e8eb96693204cbb2702905e1971d9084b1716e7337852b5005290e", size = 12706, upload-time = "2025-04-28T12:19:59.792Z" },]
[[package]]name = "certifi"version = "2025.6.15"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload-time = "2025-06-15T02:45:51.329Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload-time = "2025-06-15T02:45:49.977Z" },]
[[package]]name = "colorama"version = "0.4.6"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },]
[[package]]name = "distro"version = "1.9.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },]
[[package]]name = "exceptiongroup"version = "1.3.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "typing-extensions", marker = "python_full_version < '3.13'" },]sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" },]
[[package]]name = "h11"version = "0.16.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },]
[[package]]name = "httpcore"version = "1.0.9"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "certifi" }, { name = "h11" },]sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },]
[[package]]name = "httpx"version = "0.28.1"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "anyio" }, { name = "certifi" }, { name = "httpcore" }, { name = "idna" },]sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },]
[[package]]name = "idna"version = "3.10"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" },]
[[package]]name = "jiter"version = "0.10.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/ee/9d/ae7ddb4b8ab3fb1b51faf4deb36cb48a4fbbd7cb36bad6a5fca4741306f7/jiter-0.10.0.tar.gz", hash = "sha256:07a7142c38aacc85194391108dc91b5b57093c978a9932bd86a36862759d9500", size = 162759, upload-time = "2025-05-18T19:04:59.73Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/be/7e/4011b5c77bec97cb2b572f566220364e3e21b51c48c5bd9c4a9c26b41b67/jiter-0.10.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:cd2fb72b02478f06a900a5782de2ef47e0396b3e1f7d5aba30daeb1fce66f303", size = 317215, upload-time = "2025-05-18T19:03:04.303Z" }, { url = "https://files.pythonhosted.org/packages/8a/4f/144c1b57c39692efc7ea7d8e247acf28e47d0912800b34d0ad815f6b2824/jiter-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:32bb468e3af278f095d3fa5b90314728a6916d89ba3d0ffb726dd9bf7367285e", size = 322814, upload-time = "2025-05-18T19:03:06.433Z" }, { url = "https://files.pythonhosted.org/packages/63/1f/db977336d332a9406c0b1f0b82be6f71f72526a806cbb2281baf201d38e3/jiter-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa8b3e0068c26ddedc7abc6fac37da2d0af16b921e288a5a613f4b86f050354f", size = 345237, upload-time = "2025-05-18T19:03:07.833Z" }, { url = "https://files.pythonhosted.org/packages/d7/1c/aa30a4a775e8a672ad7f21532bdbfb269f0706b39c6ff14e1f86bdd9e5ff/jiter-0.10.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:286299b74cc49e25cd42eea19b72aa82c515d2f2ee12d11392c56d8701f52224", size = 370999, upload-time = "2025-05-18T19:03:09.338Z" }, { url = "https://files.pythonhosted.org/packages/35/df/f8257abc4207830cb18880781b5f5b716bad5b2a22fb4330cfd357407c5b/jiter-0.10.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ed5649ceeaeffc28d87fb012d25a4cd356dcd53eff5acff1f0466b831dda2a7", size = 491109, upload-time = "2025-05-18T19:03:11.13Z" }, { url = "https://files.pythonhosted.org/packages/06/76/9e1516fd7b4278aa13a2cc7f159e56befbea9aa65c71586305e7afa8b0b3/jiter-0.10.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2ab0051160cb758a70716448908ef14ad476c3774bd03ddce075f3c1f90a3d6", size = 388608, upload-time = "2025-05-18T19:03:12.911Z" }, { url = "https://files.pythonhosted.org/packages/6d/64/67750672b4354ca20ca18d3d1ccf2c62a072e8a2d452ac3cf8ced73571ef/jiter-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03997d2f37f6b67d2f5c475da4412be584e1cec273c1cfc03d642c46db43f8cf", size = 352454, upload-time = "2025-05-18T19:03:14.741Z" }, { url = "https://files.pythonhosted.org/packages/96/4d/5c4e36d48f169a54b53a305114be3efa2bbffd33b648cd1478a688f639c1/jiter-0.10.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c404a99352d839fed80d6afd6c1d66071f3bacaaa5c4268983fc10f769112e90", size = 391833, upload-time = "2025-05-18T19:03:16.426Z" }, { url = "https://files.pythonhosted.org/packages/0b/de/ce4a6166a78810bd83763d2fa13f85f73cbd3743a325469a4a9289af6dae/jiter-0.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:66e989410b6666d3ddb27a74c7e50d0829704ede652fd4c858e91f8d64b403d0", size = 523646, upload-time = "2025-05-18T19:03:17.704Z" }, { url = "https://files.pythonhosted.org/packages/a2/a6/3bc9acce53466972964cf4ad85efecb94f9244539ab6da1107f7aed82934/jiter-0.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b532d3af9ef4f6374609a3bcb5e05a1951d3bf6190dc6b176fdb277c9bbf15ee", size = 514735, 
upload-time = "2025-05-18T19:03:19.44Z" }, { url = "https://files.pythonhosted.org/packages/b4/d8/243c2ab8426a2a4dea85ba2a2ba43df379ccece2145320dfd4799b9633c5/jiter-0.10.0-cp310-cp310-win32.whl", hash = "sha256:da9be20b333970e28b72edc4dff63d4fec3398e05770fb3205f7fb460eb48dd4", size = 210747, upload-time = "2025-05-18T19:03:21.184Z" }, { url = "https://files.pythonhosted.org/packages/37/7a/8021bd615ef7788b98fc76ff533eaac846322c170e93cbffa01979197a45/jiter-0.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:f59e533afed0c5b0ac3eba20d2548c4a550336d8282ee69eb07b37ea526ee4e5", size = 207484, upload-time = "2025-05-18T19:03:23.046Z" }, { url = "https://files.pythonhosted.org/packages/1b/dd/6cefc6bd68b1c3c979cecfa7029ab582b57690a31cd2f346c4d0ce7951b6/jiter-0.10.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:3bebe0c558e19902c96e99217e0b8e8b17d570906e72ed8a87170bc290b1e978", size = 317473, upload-time = "2025-05-18T19:03:25.942Z" }, { url = "https://files.pythonhosted.org/packages/be/cf/fc33f5159ce132be1d8dd57251a1ec7a631c7df4bd11e1cd198308c6ae32/jiter-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:558cc7e44fd8e507a236bee6a02fa17199ba752874400a0ca6cd6e2196cdb7dc", size = 321971, upload-time = "2025-05-18T19:03:27.255Z" }, { url = "https://files.pythonhosted.org/packages/68/a4/da3f150cf1d51f6c472616fb7650429c7ce053e0c962b41b68557fdf6379/jiter-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d613e4b379a07d7c8453c5712ce7014e86c6ac93d990a0b8e7377e18505e98d", size = 345574, upload-time = "2025-05-18T19:03:28.63Z" }, { url = "https://files.pythonhosted.org/packages/84/34/6e8d412e60ff06b186040e77da5f83bc158e9735759fcae65b37d681f28b/jiter-0.10.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f62cf8ba0618eda841b9bf61797f21c5ebd15a7a1e19daab76e4e4b498d515b2", size = 371028, upload-time = "2025-05-18T19:03:30.292Z" }, { url = "https://files.pythonhosted.org/packages/fb/d9/9ee86173aae4576c35a2f50ae930d2ccb4c4c236f6cb9353267aa1d626b7/jiter-0.10.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:919d139cdfa8ae8945112398511cb7fca58a77382617d279556b344867a37e61", size = 491083, upload-time = "2025-05-18T19:03:31.654Z" }, { url = "https://files.pythonhosted.org/packages/d9/2c/f955de55e74771493ac9e188b0f731524c6a995dffdcb8c255b89c6fb74b/jiter-0.10.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ddbc6ae311175a3b03bd8994881bc4635c923754932918e18da841632349db", size = 388821, upload-time = "2025-05-18T19:03:33.184Z" }, { url = "https://files.pythonhosted.org/packages/81/5a/0e73541b6edd3f4aada586c24e50626c7815c561a7ba337d6a7eb0a915b4/jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c440ea003ad10927a30521a9062ce10b5479592e8a70da27f21eeb457b4a9c5", size = 352174, upload-time = "2025-05-18T19:03:34.965Z" }, { url = "https://files.pythonhosted.org/packages/1c/c0/61eeec33b8c75b31cae42be14d44f9e6fe3ac15a4e58010256ac3abf3638/jiter-0.10.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dc347c87944983481e138dea467c0551080c86b9d21de6ea9306efb12ca8f606", size = 391869, upload-time = "2025-05-18T19:03:36.436Z" }, { url = "https://files.pythonhosted.org/packages/41/22/5beb5ee4ad4ef7d86f5ea5b4509f680a20706c4a7659e74344777efb7739/jiter-0.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:13252b58c1f4d8c5b63ab103c03d909e8e1e7842d302473f482915d95fefd605", size = 523741, upload-time = "2025-05-18T19:03:38.168Z" }, { url = 
"https://files.pythonhosted.org/packages/ea/10/768e8818538e5817c637b0df52e54366ec4cebc3346108a4457ea7a98f32/jiter-0.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d1bbf3c465de4a24ab12fb7766a0003f6f9bce48b8b6a886158c4d569452dc5", size = 514527, upload-time = "2025-05-18T19:03:39.577Z" }, { url = "https://files.pythonhosted.org/packages/73/6d/29b7c2dc76ce93cbedabfd842fc9096d01a0550c52692dfc33d3cc889815/jiter-0.10.0-cp311-cp311-win32.whl", hash = "sha256:db16e4848b7e826edca4ccdd5b145939758dadf0dc06e7007ad0e9cfb5928ae7", size = 210765, upload-time = "2025-05-18T19:03:41.271Z" }, { url = "https://files.pythonhosted.org/packages/c2/c9/d394706deb4c660137caf13e33d05a031d734eb99c051142e039d8ceb794/jiter-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c9c1d5f10e18909e993f9641f12fe1c77b3e9b533ee94ffa970acc14ded3812", size = 209234, upload-time = "2025-05-18T19:03:42.918Z" }, { url = "https://files.pythonhosted.org/packages/6d/b5/348b3313c58f5fbfb2194eb4d07e46a35748ba6e5b3b3046143f3040bafa/jiter-0.10.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1e274728e4a5345a6dde2d343c8da018b9d4bd4350f5a472fa91f66fda44911b", size = 312262, upload-time = "2025-05-18T19:03:44.637Z" }, { url = "https://files.pythonhosted.org/packages/9c/4a/6a2397096162b21645162825f058d1709a02965606e537e3304b02742e9b/jiter-0.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7202ae396446c988cb2a5feb33a543ab2165b786ac97f53b59aafb803fef0744", size = 320124, upload-time = "2025-05-18T19:03:46.341Z" }, { url = "https://files.pythonhosted.org/packages/2a/85/1ce02cade7516b726dd88f59a4ee46914bf79d1676d1228ef2002ed2f1c9/jiter-0.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23ba7722d6748b6920ed02a8f1726fb4b33e0fd2f3f621816a8b486c66410ab2", size = 345330, upload-time = "2025-05-18T19:03:47.596Z" }, { url = "https://files.pythonhosted.org/packages/75/d0/bb6b4f209a77190ce10ea8d7e50bf3725fc16d3372d0a9f11985a2b23eff/jiter-0.10.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:371eab43c0a288537d30e1f0b193bc4eca90439fc08a022dd83e5e07500ed026", size = 369670, upload-time = "2025-05-18T19:03:49.334Z" }, { url = "https://files.pythonhosted.org/packages/a0/f5/a61787da9b8847a601e6827fbc42ecb12be2c925ced3252c8ffcb56afcaf/jiter-0.10.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c675736059020365cebc845a820214765162728b51ab1e03a1b7b3abb70f74c", size = 489057, upload-time = "2025-05-18T19:03:50.66Z" }, { url = "https://files.pythonhosted.org/packages/12/e4/6f906272810a7b21406c760a53aadbe52e99ee070fc5c0cb191e316de30b/jiter-0.10.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c5867d40ab716e4684858e4887489685968a47e3ba222e44cde6e4a2154f959", size = 389372, upload-time = "2025-05-18T19:03:51.98Z" }, { url = "https://files.pythonhosted.org/packages/e2/ba/77013b0b8ba904bf3762f11e0129b8928bff7f978a81838dfcc958ad5728/jiter-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395bb9a26111b60141757d874d27fdea01b17e8fac958b91c20128ba8f4acc8a", size = 352038, upload-time = "2025-05-18T19:03:53.703Z" }, { url = "https://files.pythonhosted.org/packages/67/27/c62568e3ccb03368dbcc44a1ef3a423cb86778a4389e995125d3d1aaa0a4/jiter-0.10.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6842184aed5cdb07e0c7e20e5bdcfafe33515ee1741a6835353bb45fe5d1bd95", size = 391538, upload-time = "2025-05-18T19:03:55.046Z" }, { url = 
"https://files.pythonhosted.org/packages/c0/72/0d6b7e31fc17a8fdce76164884edef0698ba556b8eb0af9546ae1a06b91d/jiter-0.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:62755d1bcea9876770d4df713d82606c8c1a3dca88ff39046b85a048566d56ea", size = 523557, upload-time = "2025-05-18T19:03:56.386Z" }, { url = "https://files.pythonhosted.org/packages/2f/09/bc1661fbbcbeb6244bd2904ff3a06f340aa77a2b94e5a7373fd165960ea3/jiter-0.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:533efbce2cacec78d5ba73a41756beff8431dfa1694b6346ce7af3a12c42202b", size = 514202, upload-time = "2025-05-18T19:03:57.675Z" }, { url = "https://files.pythonhosted.org/packages/1b/84/5a5d5400e9d4d54b8004c9673bbe4403928a00d28529ff35b19e9d176b19/jiter-0.10.0-cp312-cp312-win32.whl", hash = "sha256:8be921f0cadd245e981b964dfbcd6fd4bc4e254cdc069490416dd7a2632ecc01", size = 211781, upload-time = "2025-05-18T19:03:59.025Z" }, { url = "https://files.pythonhosted.org/packages/9b/52/7ec47455e26f2d6e5f2ea4951a0652c06e5b995c291f723973ae9e724a65/jiter-0.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:a7c7d785ae9dda68c2678532a5a1581347e9c15362ae9f6e68f3fdbfb64f2e49", size = 206176, upload-time = "2025-05-18T19:04:00.305Z" }, { url = "https://files.pythonhosted.org/packages/2e/b0/279597e7a270e8d22623fea6c5d4eeac328e7d95c236ed51a2b884c54f70/jiter-0.10.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e0588107ec8e11b6f5ef0e0d656fb2803ac6cf94a96b2b9fc675c0e3ab5e8644", size = 311617, upload-time = "2025-05-18T19:04:02.078Z" }, { url = "https://files.pythonhosted.org/packages/91/e3/0916334936f356d605f54cc164af4060e3e7094364add445a3bc79335d46/jiter-0.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cafc4628b616dc32530c20ee53d71589816cf385dd9449633e910d596b1f5c8a", size = 318947, upload-time = "2025-05-18T19:04:03.347Z" }, { url = "https://files.pythonhosted.org/packages/6a/8e/fd94e8c02d0e94539b7d669a7ebbd2776e51f329bb2c84d4385e8063a2ad/jiter-0.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:520ef6d981172693786a49ff5b09eda72a42e539f14788124a07530f785c3ad6", size = 344618, upload-time = "2025-05-18T19:04:04.709Z" }, { url = "https://files.pythonhosted.org/packages/6f/b0/f9f0a2ec42c6e9c2e61c327824687f1e2415b767e1089c1d9135f43816bd/jiter-0.10.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:554dedfd05937f8fc45d17ebdf298fe7e0c77458232bcb73d9fbbf4c6455f5b3", size = 368829, upload-time = "2025-05-18T19:04:06.912Z" }, { url = "https://files.pythonhosted.org/packages/e8/57/5bbcd5331910595ad53b9fd0c610392ac68692176f05ae48d6ce5c852967/jiter-0.10.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5bc299da7789deacf95f64052d97f75c16d4fc8c4c214a22bf8d859a4288a1c2", size = 491034, upload-time = "2025-05-18T19:04:08.222Z" }, { url = "https://files.pythonhosted.org/packages/9b/be/c393df00e6e6e9e623a73551774449f2f23b6ec6a502a3297aeeece2c65a/jiter-0.10.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5161e201172de298a8a1baad95eb85db4fb90e902353b1f6a41d64ea64644e25", size = 388529, upload-time = "2025-05-18T19:04:09.566Z" }, { url = "https://files.pythonhosted.org/packages/42/3e/df2235c54d365434c7f150b986a6e35f41ebdc2f95acea3036d99613025d/jiter-0.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e2227db6ba93cb3e2bf67c87e594adde0609f146344e8207e8730364db27041", size = 350671, upload-time = "2025-05-18T19:04:10.98Z" }, { url = 
"https://files.pythonhosted.org/packages/c6/77/71b0b24cbcc28f55ab4dbfe029f9a5b73aeadaba677843fc6dc9ed2b1d0a/jiter-0.10.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:15acb267ea5e2c64515574b06a8bf393fbfee6a50eb1673614aa45f4613c0cca", size = 390864, upload-time = "2025-05-18T19:04:12.722Z" }, { url = "https://files.pythonhosted.org/packages/6a/d3/ef774b6969b9b6178e1d1e7a89a3bd37d241f3d3ec5f8deb37bbd203714a/jiter-0.10.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:901b92f2e2947dc6dfcb52fd624453862e16665ea909a08398dde19c0731b7f4", size = 522989, upload-time = "2025-05-18T19:04:14.261Z" }, { url = "https://files.pythonhosted.org/packages/0c/41/9becdb1d8dd5d854142f45a9d71949ed7e87a8e312b0bede2de849388cb9/jiter-0.10.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d0cb9a125d5a3ec971a094a845eadde2db0de85b33c9f13eb94a0c63d463879e", size = 513495, upload-time = "2025-05-18T19:04:15.603Z" }, { url = "https://files.pythonhosted.org/packages/9c/36/3468e5a18238bdedae7c4d19461265b5e9b8e288d3f86cd89d00cbb48686/jiter-0.10.0-cp313-cp313-win32.whl", hash = "sha256:48a403277ad1ee208fb930bdf91745e4d2d6e47253eedc96e2559d1e6527006d", size = 211289, upload-time = "2025-05-18T19:04:17.541Z" }, { url = "https://files.pythonhosted.org/packages/7e/07/1c96b623128bcb913706e294adb5f768fb7baf8db5e1338ce7b4ee8c78ef/jiter-0.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:75f9eb72ecb640619c29bf714e78c9c46c9c4eaafd644bf78577ede459f330d4", size = 205074, upload-time = "2025-05-18T19:04:19.21Z" }, { url = "https://files.pythonhosted.org/packages/54/46/caa2c1342655f57d8f0f2519774c6d67132205909c65e9aa8255e1d7b4f4/jiter-0.10.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:28ed2a4c05a1f32ef0e1d24c2611330219fed727dae01789f4a335617634b1ca", size = 318225, upload-time = "2025-05-18T19:04:20.583Z" }, { url = "https://files.pythonhosted.org/packages/43/84/c7d44c75767e18946219ba2d703a5a32ab37b0bc21886a97bc6062e4da42/jiter-0.10.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a4c418b1ec86a195f1ca69da8b23e8926c752b685af665ce30777233dfe070", size = 350235, upload-time = "2025-05-18T19:04:22.363Z" }, { url = "https://files.pythonhosted.org/packages/01/16/f5a0135ccd968b480daad0e6ab34b0c7c5ba3bc447e5088152696140dcb3/jiter-0.10.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d7bfed2fe1fe0e4dda6ef682cee888ba444b21e7a6553e03252e4feb6cf0adca", size = 207278, upload-time = "2025-05-18T19:04:23.627Z" }, { url = "https://files.pythonhosted.org/packages/1c/9b/1d646da42c3de6c2188fdaa15bce8ecb22b635904fc68be025e21249ba44/jiter-0.10.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:5e9251a5e83fab8d87799d3e1a46cb4b7f2919b895c6f4483629ed2446f66522", size = 310866, upload-time = "2025-05-18T19:04:24.891Z" }, { url = "https://files.pythonhosted.org/packages/ad/0e/26538b158e8a7c7987e94e7aeb2999e2e82b1f9d2e1f6e9874ddf71ebda0/jiter-0.10.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:023aa0204126fe5b87ccbcd75c8a0d0261b9abdbbf46d55e7ae9f8e22424eeb8", size = 318772, upload-time = "2025-05-18T19:04:26.161Z" }, { url = "https://files.pythonhosted.org/packages/7b/fb/d302893151caa1c2636d6574d213e4b34e31fd077af6050a9c5cbb42f6fb/jiter-0.10.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c189c4f1779c05f75fc17c0c1267594ed918996a231593a21a5ca5438445216", size = 344534, upload-time = "2025-05-18T19:04:27.495Z" }, { url = 
"https://files.pythonhosted.org/packages/01/d8/5780b64a149d74e347c5128d82176eb1e3241b1391ac07935693466d6219/jiter-0.10.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:15720084d90d1098ca0229352607cd68256c76991f6b374af96f36920eae13c4", size = 369087, upload-time = "2025-05-18T19:04:28.896Z" }, { url = "https://files.pythonhosted.org/packages/e8/5b/f235a1437445160e777544f3ade57544daf96ba7e96c1a5b24a6f7ac7004/jiter-0.10.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e4f2fb68e5f1cfee30e2b2a09549a00683e0fde4c6a2ab88c94072fc33cb7426", size = 490694, upload-time = "2025-05-18T19:04:30.183Z" }, { url = "https://files.pythonhosted.org/packages/85/a9/9c3d4617caa2ff89cf61b41e83820c27ebb3f7b5fae8a72901e8cd6ff9be/jiter-0.10.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce541693355fc6da424c08b7edf39a2895f58d6ea17d92cc2b168d20907dee12", size = 388992, upload-time = "2025-05-18T19:04:32.028Z" }, { url = "https://files.pythonhosted.org/packages/68/b1/344fd14049ba5c94526540af7eb661871f9c54d5f5601ff41a959b9a0bbd/jiter-0.10.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31c50c40272e189d50006ad5c73883caabb73d4e9748a688b216e85a9a9ca3b9", size = 351723, upload-time = "2025-05-18T19:04:33.467Z" }, { url = "https://files.pythonhosted.org/packages/41/89/4c0e345041186f82a31aee7b9d4219a910df672b9fef26f129f0cda07a29/jiter-0.10.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fa3402a2ff9815960e0372a47b75c76979d74402448509ccd49a275fa983ef8a", size = 392215, upload-time = "2025-05-18T19:04:34.827Z" }, { url = "https://files.pythonhosted.org/packages/55/58/ee607863e18d3f895feb802154a2177d7e823a7103f000df182e0f718b38/jiter-0.10.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:1956f934dca32d7bb647ea21d06d93ca40868b505c228556d3373cbd255ce853", size = 522762, upload-time = "2025-05-18T19:04:36.19Z" }, { url = "https://files.pythonhosted.org/packages/15/d0/9123fb41825490d16929e73c212de9a42913d68324a8ce3c8476cae7ac9d/jiter-0.10.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:fcedb049bdfc555e261d6f65a6abe1d5ad68825b7202ccb9692636c70fcced86", size = 513427, upload-time = "2025-05-18T19:04:37.544Z" }, { url = "https://files.pythonhosted.org/packages/d8/b3/2bd02071c5a2430d0b70403a34411fc519c2f227da7b03da9ba6a956f931/jiter-0.10.0-cp314-cp314-win32.whl", hash = "sha256:ac509f7eccca54b2a29daeb516fb95b6f0bd0d0d8084efaf8ed5dfc7b9f0b357", size = 210127, upload-time = "2025-05-18T19:04:38.837Z" }, { url = "https://files.pythonhosted.org/packages/03/0c/5fe86614ea050c3ecd728ab4035534387cd41e7c1855ef6c031f1ca93e3f/jiter-0.10.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5ed975b83a2b8639356151cef5c0d597c68376fc4922b45d0eb384ac058cfa00", size = 318527, upload-time = "2025-05-18T19:04:40.612Z" }, { url = "https://files.pythonhosted.org/packages/b3/4a/4175a563579e884192ba6e81725fc0448b042024419be8d83aa8a80a3f44/jiter-0.10.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa96f2abba33dc77f79b4cf791840230375f9534e5fac927ccceb58c5e604a5", size = 354213, upload-time = "2025-05-18T19:04:41.894Z" },]
[[package]]name = "llmscraper"version = "0.1.0"source = { virtual = "." }dependencies = [ { name = "anthropic" }, { name = "apify-client" },]
[package.metadata]requires-dist = [ { name = "anthropic", specifier = ">=0.54.0" }, { name = "apify-client", specifier = ">=1.11.0" },]
[[package]]name = "more-itertools"version = "10.7.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/ce/a0/834b0cebabbfc7e311f30b46c8188790a37f89fc8d756660346fe5abfd09/more_itertools-10.7.0.tar.gz", hash = "sha256:9fddd5403be01a94b204faadcff459ec3568cf110265d3c54323e1e866ad29d3", size = 127671, upload-time = "2025-04-22T14:17:41.838Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/2b/9f/7ba6f94fc1e9ac3d2b853fdff3035fb2fa5afbed898c4a72b8a020610594/more_itertools-10.7.0-py3-none-any.whl", hash = "sha256:d43980384673cb07d2f7d2d918c616b30c659c089ee23953f601d6609c67510e", size = 65278, upload-time = "2025-04-22T14:17:40.49Z" },]
[[package]]name = "pydantic"version = "2.11.7"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "annotated-types" }, { name = "pydantic-core" }, { name = "typing-extensions" }, { name = "typing-inspection" },]sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" },]
[[package]]name = "pydantic-core"version = "2.33.2"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "typing-extensions" },]sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/e5/92/b31726561b5dae176c2d2c2dc43a9c5bfba5d32f96f8b4c0a600dd492447/pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8", size = 2028817, upload-time = "2025-04-23T18:30:43.919Z" }, { url = "https://files.pythonhosted.org/packages/a3/44/3f0b95fafdaca04a483c4e685fe437c6891001bf3ce8b2fded82b9ea3aa1/pydantic_core-2.33.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d", size = 1861357, upload-time = "2025-04-23T18:30:46.372Z" }, { url = "https://files.pythonhosted.org/packages/30/97/e8f13b55766234caae05372826e8e4b3b96e7b248be3157f53237682e43c/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d", size = 1898011, upload-time = "2025-04-23T18:30:47.591Z" }, { url = "https://files.pythonhosted.org/packages/9b/a3/99c48cf7bafc991cc3ee66fd544c0aae8dc907b752f1dad2d79b1b5a471f/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572", size = 1982730, upload-time = "2025-04-23T18:30:49.328Z" }, { url = "https://files.pythonhosted.org/packages/de/8e/a5b882ec4307010a840fb8b58bd9bf65d1840c92eae7534c7441709bf54b/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02", size = 2136178, upload-time = "2025-04-23T18:30:50.907Z" }, { url = "https://files.pythonhosted.org/packages/e4/bb/71e35fc3ed05af6834e890edb75968e2802fe98778971ab5cba20a162315/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b", size = 2736462, upload-time = "2025-04-23T18:30:52.083Z" }, { url = "https://files.pythonhosted.org/packages/31/0d/c8f7593e6bc7066289bbc366f2235701dcbebcd1ff0ef8e64f6f239fb47d/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2", size = 2005652, upload-time = "2025-04-23T18:30:53.389Z" }, { url = "https://files.pythonhosted.org/packages/d2/7a/996d8bd75f3eda405e3dd219ff5ff0a283cd8e34add39d8ef9157e722867/pydantic_core-2.33.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a", size = 2113306, upload-time = "2025-04-23T18:30:54.661Z" }, { url = "https://files.pythonhosted.org/packages/ff/84/daf2a6fb2db40ffda6578a7e8c5a6e9c8affb251a05c233ae37098118788/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac", size = 2073720, upload-time = "2025-04-23T18:30:56.11Z" }, { url = 
"https://files.pythonhosted.org/packages/77/fb/2258da019f4825128445ae79456a5499c032b55849dbd5bed78c95ccf163/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a", size = 2244915, upload-time = "2025-04-23T18:30:57.501Z" }, { url = "https://files.pythonhosted.org/packages/d8/7a/925ff73756031289468326e355b6fa8316960d0d65f8b5d6b3a3e7866de7/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b", size = 2241884, upload-time = "2025-04-23T18:30:58.867Z" }, { url = "https://files.pythonhosted.org/packages/0b/b0/249ee6d2646f1cdadcb813805fe76265745c4010cf20a8eba7b0e639d9b2/pydantic_core-2.33.2-cp310-cp310-win32.whl", hash = "sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22", size = 1910496, upload-time = "2025-04-23T18:31:00.078Z" }, { url = "https://files.pythonhosted.org/packages/66/ff/172ba8f12a42d4b552917aa65d1f2328990d3ccfc01d5b7c943ec084299f/pydantic_core-2.33.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640", size = 1955019, upload-time = "2025-04-23T18:31:01.335Z" }, { url = "https://files.pythonhosted.org/packages/3f/8d/71db63483d518cbbf290261a1fc2839d17ff89fce7089e08cad07ccfce67/pydantic_core-2.33.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7", size = 2028584, upload-time = "2025-04-23T18:31:03.106Z" }, { url = "https://files.pythonhosted.org/packages/24/2f/3cfa7244ae292dd850989f328722d2aef313f74ffc471184dc509e1e4e5a/pydantic_core-2.33.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246", size = 1855071, upload-time = "2025-04-23T18:31:04.621Z" }, { url = "https://files.pythonhosted.org/packages/b3/d3/4ae42d33f5e3f50dd467761304be2fa0a9417fbf09735bc2cce003480f2a/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f", size = 1897823, upload-time = "2025-04-23T18:31:06.377Z" }, { url = "https://files.pythonhosted.org/packages/f4/f3/aa5976e8352b7695ff808599794b1fba2a9ae2ee954a3426855935799488/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc", size = 1983792, upload-time = "2025-04-23T18:31:07.93Z" }, { url = "https://files.pythonhosted.org/packages/d5/7a/cda9b5a23c552037717f2b2a5257e9b2bfe45e687386df9591eff7b46d28/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de", size = 2136338, upload-time = "2025-04-23T18:31:09.283Z" }, { url = "https://files.pythonhosted.org/packages/2b/9f/b8f9ec8dd1417eb9da784e91e1667d58a2a4a7b7b34cf4af765ef663a7e5/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a", size = 2730998, upload-time = "2025-04-23T18:31:11.7Z" }, { url = "https://files.pythonhosted.org/packages/47/bc/cd720e078576bdb8255d5032c5d63ee5c0bf4b7173dd955185a1d658c456/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef", size = 2003200, upload-time = 
"2025-04-23T18:31:13.536Z" }, { url = "https://files.pythonhosted.org/packages/ca/22/3602b895ee2cd29d11a2b349372446ae9727c32e78a94b3d588a40fdf187/pydantic_core-2.33.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e", size = 2113890, upload-time = "2025-04-23T18:31:15.011Z" }, { url = "https://files.pythonhosted.org/packages/ff/e6/e3c5908c03cf00d629eb38393a98fccc38ee0ce8ecce32f69fc7d7b558a7/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d", size = 2073359, upload-time = "2025-04-23T18:31:16.393Z" }, { url = "https://files.pythonhosted.org/packages/12/e7/6a36a07c59ebefc8777d1ffdaf5ae71b06b21952582e4b07eba88a421c79/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30", size = 2245883, upload-time = "2025-04-23T18:31:17.892Z" }, { url = "https://files.pythonhosted.org/packages/16/3f/59b3187aaa6cc0c1e6616e8045b284de2b6a87b027cce2ffcea073adf1d2/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf", size = 2241074, upload-time = "2025-04-23T18:31:19.205Z" }, { url = "https://files.pythonhosted.org/packages/e0/ed/55532bb88f674d5d8f67ab121a2a13c385df382de2a1677f30ad385f7438/pydantic_core-2.33.2-cp311-cp311-win32.whl", hash = "sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51", size = 1910538, upload-time = "2025-04-23T18:31:20.541Z" }, { url = "https://files.pythonhosted.org/packages/fe/1b/25b7cccd4519c0b23c2dd636ad39d381abf113085ce4f7bec2b0dc755eb1/pydantic_core-2.33.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab", size = 1952909, upload-time = "2025-04-23T18:31:22.371Z" }, { url = "https://files.pythonhosted.org/packages/49/a9/d809358e49126438055884c4366a1f6227f0f84f635a9014e2deb9b9de54/pydantic_core-2.33.2-cp311-cp311-win_arm64.whl", hash = "sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65", size = 1897786, upload-time = "2025-04-23T18:31:24.161Z" }, { url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" }, { url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" }, { url = "https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" }, { url = "https://files.pythonhosted.org/packages/ec/6b/1ec2c03837ac00886ba8160ce041ce4e325b41d06a034adbef11339ae422/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011", size = 1964199, upload-time = "2025-04-23T18:31:31.025Z" }, { url 
= "https://files.pythonhosted.org/packages/2d/1d/6bf34d6adb9debd9136bd197ca72642203ce9aaaa85cfcbfcf20f9696e83/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f", size = 2120296, upload-time = "2025-04-23T18:31:32.514Z" }, { url = "https://files.pythonhosted.org/packages/e0/94/2bd0aaf5a591e974b32a9f7123f16637776c304471a0ab33cf263cf5591a/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88", size = 2676109, upload-time = "2025-04-23T18:31:33.958Z" }, { url = "https://files.pythonhosted.org/packages/f9/41/4b043778cf9c4285d59742281a769eac371b9e47e35f98ad321349cc5d61/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1", size = 2002028, upload-time = "2025-04-23T18:31:39.095Z" }, { url = "https://files.pythonhosted.org/packages/cb/d5/7bb781bf2748ce3d03af04d5c969fa1308880e1dca35a9bd94e1a96a922e/pydantic_core-2.33.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b", size = 2100044, upload-time = "2025-04-23T18:31:41.034Z" }, { url = "https://files.pythonhosted.org/packages/fe/36/def5e53e1eb0ad896785702a5bbfd25eed546cdcf4087ad285021a90ed53/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1", size = 2058881, upload-time = "2025-04-23T18:31:42.757Z" }, { url = "https://files.pythonhosted.org/packages/01/6c/57f8d70b2ee57fc3dc8b9610315949837fa8c11d86927b9bb044f8705419/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6", size = 2227034, upload-time = "2025-04-23T18:31:44.304Z" }, { url = "https://files.pythonhosted.org/packages/27/b9/9c17f0396a82b3d5cbea4c24d742083422639e7bb1d5bf600e12cb176a13/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea", size = 2234187, upload-time = "2025-04-23T18:31:45.891Z" }, { url = "https://files.pythonhosted.org/packages/b0/6a/adf5734ffd52bf86d865093ad70b2ce543415e0e356f6cacabbc0d9ad910/pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290", size = 1892628, upload-time = "2025-04-23T18:31:47.819Z" }, { url = "https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2", size = 1955866, upload-time = "2025-04-23T18:31:49.635Z" }, { url = "https://files.pythonhosted.org/packages/0d/24/8b11e8b3e2be9dd82df4b11408a67c61bb4dc4f8e11b5b0fc888b38118b5/pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab", size = 1888894, upload-time = "2025-04-23T18:31:51.609Z" }, { url = "https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" }, { url = 
"https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" }, { url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" }, { url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" }, { url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" }, { url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" }, { url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" }, { url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" }, { url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" }, { url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" }, { url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" }, { url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, 
upload-time = "2025-04-23T18:32:14.034Z" }, { url = "https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" }, { url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" }, { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" }, { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" }, { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, { url = "https://files.pythonhosted.org/packages/30/68/373d55e58b7e83ce371691f6eaa7175e3a24b956c44628eb25d7da007917/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa", size = 2023982, upload-time = "2025-04-23T18:32:53.14Z" }, { url = "https://files.pythonhosted.org/packages/a4/16/145f54ac08c96a63d8ed6442f9dec17b2773d19920b627b18d4f10a061ea/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29", size = 1858412, upload-time = "2025-04-23T18:32:55.52Z" }, { url = "https://files.pythonhosted.org/packages/41/b1/c6dc6c3e2de4516c0bb2c46f6a373b91b5660312342a0cf5826e38ad82fa/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d", size = 1892749, upload-time = "2025-04-23T18:32:57.546Z" }, { url = "https://files.pythonhosted.org/packages/12/73/8cd57e20afba760b21b742106f9dbdfa6697f1570b189c7457a1af4cd8a0/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e", size = 2067527, upload-time = "2025-04-23T18:32:59.771Z" }, { url = "https://files.pythonhosted.org/packages/e3/d5/0bb5d988cc019b3cba4a78f2d4b3854427fc47ee8ec8e9eaabf787da239c/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c", size = 2108225, upload-time = "2025-04-23T18:33:04.51Z" }, { url = "https://files.pythonhosted.org/packages/f1/c5/00c02d1571913d496aabf146106ad8239dc132485ee22efe08085084ff7c/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = 
"sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec", size = 2069490, upload-time = "2025-04-23T18:33:06.391Z" }, { url = "https://files.pythonhosted.org/packages/22/a8/dccc38768274d3ed3a59b5d06f59ccb845778687652daa71df0cab4040d7/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052", size = 2237525, upload-time = "2025-04-23T18:33:08.44Z" }, { url = "https://files.pythonhosted.org/packages/d4/e7/4f98c0b125dda7cf7ccd14ba936218397b44f50a56dd8c16a3091df116c3/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c", size = 2238446, upload-time = "2025-04-23T18:33:10.313Z" }, { url = "https://files.pythonhosted.org/packages/ce/91/2ec36480fdb0b783cd9ef6795753c1dea13882f2e68e73bce76ae8c21e6a/pydantic_core-2.33.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808", size = 2066678, upload-time = "2025-04-23T18:33:12.224Z" }, { url = "https://files.pythonhosted.org/packages/7b/27/d4ae6487d73948d6f20dddcd94be4ea43e74349b56eba82e9bdee2d7494c/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8", size = 2025200, upload-time = "2025-04-23T18:33:14.199Z" }, { url = "https://files.pythonhosted.org/packages/f1/b8/b3cb95375f05d33801024079b9392a5ab45267a63400bf1866e7ce0f0de4/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593", size = 1859123, upload-time = "2025-04-23T18:33:16.555Z" }, { url = "https://files.pythonhosted.org/packages/05/bc/0d0b5adeda59a261cd30a1235a445bf55c7e46ae44aea28f7bd6ed46e091/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612", size = 1892852, upload-time = "2025-04-23T18:33:18.513Z" }, { url = "https://files.pythonhosted.org/packages/3e/11/d37bdebbda2e449cb3f519f6ce950927b56d62f0b84fd9cb9e372a26a3d5/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7", size = 2067484, upload-time = "2025-04-23T18:33:20.475Z" }, { url = "https://files.pythonhosted.org/packages/8c/55/1f95f0a05ce72ecb02a8a8a1c3be0579bbc29b1d5ab68f1378b7bebc5057/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e", size = 2108896, upload-time = "2025-04-23T18:33:22.501Z" }, { url = "https://files.pythonhosted.org/packages/53/89/2b2de6c81fa131f423246a9109d7b2a375e83968ad0800d6e57d0574629b/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8", size = 2069475, upload-time = "2025-04-23T18:33:24.528Z" }, { url = "https://files.pythonhosted.org/packages/b8/e9/1f7efbe20d0b2b10f6718944b5d8ece9152390904f29a78e68d4e7961159/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf", size = 2239013, upload-time = "2025-04-23T18:33:26.621Z" }, { url = 
"https://files.pythonhosted.org/packages/3c/b2/5309c905a93811524a49b4e031e9851a6b00ff0fb668794472ea7746b448/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb", size = 2238715, upload-time = "2025-04-23T18:33:28.656Z" }, { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" },]
[[package]]name = "sniffio"version = "1.3.1"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },]
[[package]]name = "typing-extensions"version = "4.14.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/d1/bc/51647cd02527e87d05cb083ccc402f93e441606ff1f01739a62c8ad09ba5/typing_extensions-4.14.0.tar.gz", hash = "sha256:8676b788e32f02ab42d9e7c61324048ae4c6d844a399eebace3d4979d75ceef4", size = 107423, upload-time = "2025-06-02T14:52:11.399Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/69/e0/552843e0d356fbb5256d21449fa957fa4eff3bbc135a74a691ee70c7c5da/typing_extensions-4.14.0-py3-none-any.whl", hash = "sha256:a1514509136dd0b477638fc68d6a91497af5076466ad0fa6c338e44e359944af", size = 43839, upload-time = "2025-06-02T14:52:10.026Z" },]
[[package]]name = "typing-inspection"version = "0.4.1"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "typing-extensions" },]sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" },]
# 🧠 AI-Powered Web Scraper & Code Generator

**Stop writing scraping code manually!** This intelligent actor doesn't just scrape websites - it **automatically generates custom Python scraping code** tailored to your specific needs.

You get both the *extracted data* AND the *code* to replicate it anytime.

## 🚀 What This Actor Does

The actor will automatically:

- **Test multiple scraping methods**: Runs multiple scraping strategies (Cheerio, Web Scraper, Website Content Crawler, Playwright, etc.) **in parallel** for faster results
- **Evaluate which works best using AI**: Claude AI analyzes each result and selects the best extraction
- **Extract your requested data**: Automatically structures the extracted data based on your requirements
- **🔥 Generate custom Python code that scrapes YOUR website**: Creates personalized Python scraping code that you can run independently
- **Provide the code as a downloadable script you can run anywhere**: Complete, ready-to-use BeautifulSoup script saved to the key-value store

## ✨ Key Benefits

- **No Technical Knowledge Required**: Just describe what data you want in plain English
- **Resilient Scraping**: Multiple strategies ensure success even if one method fails
- **AI-Powered**: Uses Claude AI to understand content context and select optimal results
- **🎯 Custom Code Generation**: Get personalized Python code that scrapes YOUR specific website
- **Production Ready**: Generated code is clean, documented, and ready to run independently
- **Reusable**: Use the generated code in your own projects, scripts, or applications

## 📊 Output Data

The actor saves comprehensive results to your default dataset AND saves the generated script to the key-value store.
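If you prefer to read the results programmatically rather than through the Apify Console, here is a minimal sketch using the official `apify-client` Python package (the token and run ID are placeholders you would substitute with your own values):

```python
from apify_client import ApifyClient

client = ApifyClient("YOUR_APIFY_TOKEN")  # placeholder: your Apify API token
run = client.run("YOUR_RUN_ID").get()     # placeholder: the ID of a finished run of this actor

# Each record in the default dataset follows the view defined in .actor/actor.json
# (url, title, data, script_tested, timestamp, success, error).
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item.get("url"), item.get("success"), item.get("data"))
```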
> **💡 How to Access**: After the actor finishes, go to the "Key-value store" tab in your run details and download the `GENERATED_SCRIPT` file. Rename it to have the extension: **.py**.

### 🎯 What You Get

- **Extracted Data**: The actual data from the website, structured according to your goal
- **🔥 Generated Python Code**: Ready-to-use BeautifulSoup script that you can run on your own computer
- **💾 Separate Script File**: The Python code is also saved as a downloadable file in the key-value store
- **Quality Scores**: Performance ratings for each scraping method (0-10 scale)
- **Best Method**: Which scraping approach worked best for your website

> **💡 Pro Tip**: The generated Python code is completely standalone - you can copy it, modify it, and use it in your own projects without needing this actor again!
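You can also pull the script straight from the run's key-value store instead of downloading it by hand. A minimal sketch with `apify-client` (the record key is an assumption; check the "Key-value store" tab of your run if it differs):

```python
from apify_client import ApifyClient

client = ApifyClient("YOUR_APIFY_TOKEN")  # placeholder: your Apify API token
run = client.run("YOUR_RUN_ID").get()     # placeholder: a finished run of this actor

store = client.key_value_store(run["defaultKeyValueStoreId"])
record = store.get_record("GENERATED_SCRIPT.py")  # assumption: the key used for the saved script
if record:
    # Write the script to a local .py file, ready to run on your own machine
    with open("generated_scraper.py", "w", encoding="utf-8") as f:
        f.write(record["value"])
```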
## 🎯 Usage Examples

### E-commerce Product Scraping

```json
{
  "targetUrl": "https://books.toscrape.com/",
  "userGoal": "Get me a list of all the books on the first page. For each book, I want its title, price, star rating, and whether it is in stock.",
  "claudeApiKey": "sk-ant-..."
}
```

### News Website Scraping

```json
{
  "targetUrl": "https://www.theverge.com/",
  "userGoal": "I want to scrape the main articles from The Verge homepage. For each article, get me the headline, the author's name, and the link to the full article.",
  "claudeApiKey": "sk-ant-..."
}
```

### Job Listings Scraping

```json
{
  "targetUrl": "https://www.python.org/jobs/",
  "userGoal": "List all the jobs posted. For each job, I want the job title, the company name, the location, and the date it was posted.",
  "claudeApiKey": "sk-ant-..."
}
```

### Quote Collection

```json
{
  "targetUrl": "https://quotes.toscrape.com/",
  "userGoal": "I want a list of all quotes on this page. For each one, get the quote text itself, the name of the author, and a list of the tags associated with it.",
  "claudeApiKey": "sk-ant-..."
}
```

### Business Directory Scraping

```json
{
  "targetUrl": "https://directory.com/restaurants",
  "userGoal": "Get restaurant names, addresses, phone numbers, and ratings",
  "claudeApiKey": "sk-ant-..."
}
```

## 🔧 How to Use

1. **Enter Target URL**: Paste the website URL you want to scrape
2. **Describe Your Goal**: Be specific about what data you need (e.g., "product names and prices" not just "products")
3. **Add Claude API Key**: Your Anthropic API key for AI analysis
4. **Configure Advanced Settings** (optional): Customize Claude model, HTML processing, and actor selection
5. **Run the Actor**: Click "Start" and watch the magic happen!
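The same inputs work when starting the actor from code. A minimal sketch with `apify-client` (the actor ID below is a placeholder; use the ID shown on this actor's Apify Store page):

```python
from apify_client import ApifyClient

client = ApifyClient("YOUR_APIFY_TOKEN")  # placeholder: your Apify API token

run_input = {
    "targetUrl": "https://books.toscrape.com/",
    "userGoal": "Get me a list of all the books on the first page. For each book, I want its title, price, star rating, and whether it is in stock.",
    "claudeApiKey": "sk-ant-...",
}

# Placeholder actor ID: substitute the username/actor-name shown in the Apify Console.
run = client.actor("YOUR_USERNAME/ScraperCodeGenerator").call(run_input=run_input)
print("Run finished with status:", run.get("status"))
```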
## ⚙️ Advanced Configuration

### 🤖 Claude Model Selection

Choose the AI model that best fits your needs:

- **Claude 4 Sonnet** (Default): Latest and most capable model
- **Claude 4 Opus**: Maximum quality for the most complex tasks
- **Claude 3.7 Sonnet**: Enhanced capabilities over 3.5
- **Claude 3.5 Sonnet**: Reliable and well-tested
- **Claude 3.5 Haiku**: Fastest and most cost-effective
- **Claude 3 Sonnet**: Good balance for most tasks
- **Claude 3 Haiku**: Basic tasks with minimal cost

### 🔧 HTML Processing Settings

Fine-tune how HTML content is processed:
- **Enable HTML Pruning**: Reduces processing time by removing unnecessary content
- **Max List Items**: Controls how many items to keep in lists/tables (1-20)
- **Max Text Length**: Maximum text length in any element (50-2000 chars)
- **Prune Percentage**: How much content to keep (0-100%)
### 🎯 Actor Selection

Choose which scraping methods to use:

- **Cheerio Scraper**: Fast jQuery-like scraping (enabled by default)
- **Web Scraper**: Versatile with JavaScript support (enabled by default)
- **Website Content Crawler**: Advanced Playwright crawler (enabled by default)
- **Playwright Scraper**: Modern browser automation (disabled by default)
- **Puppeteer Scraper**: Chrome-based scraping (disabled by default)

> **💡 Pro Tip**: Enable 2-3 actors for the best balance of speed and reliability. More actors = better chances of success but slower execution.

### 🚀 Performance Settings

- **Concurrent Actors**: Run multiple actors simultaneously for faster results
- **Test Generated Script**: Validate the generated code before saving
### Common Use Cases

- **Market Research**: Track competitor pricing and products + get code to monitor them daily
- **Content Aggregation**: Collect news articles or blog posts + get code to update your database
- **Lead Generation**: Extract business contact information + get code to scrape new listings
- **Data Analysis**: Gather data for research projects + get code to repeat the process
- **Price Monitoring**: Track product prices over time + get code to check prices automatically

## 🔍 Troubleshooting

### "No content found" errors

- Try different goal descriptions
- Some websites block automated scraping
- Check if the URL is accessible

### Poor quality scores

- Be more specific in your goal description
- The website might have a complex structure
- Try simpler pages first

### 🔑 Getting Your Claude API Key

1. Go to [Anthropic Console](https://console.anthropic.com/)
2. Sign up or log in
3. Navigate to the API Keys section
4. Create a new API key
5. Copy and paste it into the "Claude API Key" field

### Claude API errors

- Verify your API key is correct
- Check your Claude API usage limits
- Ensure you have sufficient API credits
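If you want to rule out key problems before launching a run, a quick smoke test with the `anthropic` Python package is enough (a sketch; the model name is one of the options from this actor's input schema):

```python
import anthropic

client = anthropic.Anthropic(api_key="sk-ant-...")  # paste the same key you give the actor

# A tiny request is enough to confirm the key works and credits are available.
response = client.messages.create(
    model="claude-3-5-haiku-20241022",
    max_tokens=10,
    messages=[{"role": "user", "content": "ping"}],
)
print(response.content[0].text)
```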
## 📋 Input Parameters

| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| **Target URL** | String | Yes | The website URL you want to scrape |
| **User Goal** | String | Yes | Describe what data you want (e.g., "Extract all product names, prices, and ratings") |
| **Claude API Key** | String | Yes | Your Anthropic Claude API key ([Get one here](https://console.anthropic.com/)) |
| **Test Generated Script** | Boolean | No | Whether to test the generated script (default: true) |
| **Claude Model** | String | No | AI model to use (default: Claude 4 Sonnet) |
| **Max Retries** | Number | No | Maximum retry attempts (default: 3) |
| **Timeout** | Number | No | Timeout per attempt in seconds (default: 60) |
| **HTML Pruning Enabled** | Boolean | No | Enable HTML content processing (default: true) |
| **HTML Max List Items** | Number | No | Maximum items in lists to keep (1-20, default: 3) |
| **HTML Max Text Length** | Number | No | Maximum text length in elements (50-2000, default: 200) |
| **HTML Prune Before Evaluation** | Boolean | No | Apply pruning before AI evaluation (default: true) |
| **HTML Prune Percentage** | Number | No | Percentage of content to keep (0-100, default: 80) |
| **Actors** | Array | No | Detailed actor configurations with custom inputs |
| **Concurrent Actors** | Boolean | No | Run actors simultaneously (default: true) |

### Advanced Configuration Examples
#### Custom Claude Model

```json
{
  "targetUrl": "https://example.com",
  "userGoal": "Extract product data",
  "claudeApiKey": "sk-ant-...",
  "claudeModel": "claude-sonnet-4-20250514"
}
```

#### Custom HTML Processing

```json
{
  "targetUrl": "https://example.com",
  "userGoal": "Extract product data",
  "claudeApiKey": "sk-ant-...",
  "htmlPruningEnabled": true,
  "htmlMaxListItems": 10,
  "htmlMaxTextLength": 1000,
  "htmlPrunePercentage": 90
}
```

#### Custom Actor Selection

```json
{
  "targetUrl": "https://example.com",
  "userGoal": "Extract product data",
  "claudeApiKey": "sk-ant-...",
  "actors": [
    {
      "name": "cheerio-scraper",
      "enabled": true,
      "input": {
        "maxRequestRetries": 5,
        "requestTimeoutSecs": 60,
        "maxPagesPerCrawl": 1,
        "proxyConfiguration": {"useApifyProxy": true}
      }
    },
    {
      "name": "web-scraper",
      "enabled": false,
      "input": {}
    },
    {
      "name": "playwright-scraper",
      "enabled": true,
      "input": {
        "maxRequestRetries": 3,
        "requestTimeoutSecs": 90,
        "maxPagesPerCrawl": 1
      }
    }
  ],
  "concurrentActors": true
}
```

#### Full Configuration Example

```json
{
  "targetUrl": "https://books.toscrape.com/",
  "userGoal": "Get me a list of all the books on the first page. For each book, I want its title, price, star rating, and whether it is in stock.",
  "claudeApiKey": "sk-ant-...",
  "claudeModel": "claude-sonnet-4-20250514",
  "testScript": true,
  "maxRetries": 3,
  "timeout": 60,
  "htmlPruningEnabled": true,
  "htmlMaxListItems": 5,
  "htmlMaxTextLength": 500,
  "htmlPruneBeforeEvaluation": true,
  "htmlPrunePercentage": 80,
  "concurrentActors": true,
  "actors": [
    {
      "name": "cheerio-scraper",
      "enabled": true,
      "input": {
        "maxRequestRetries": 3,
        "requestTimeoutSecs": 30,
        "maxPagesPerCrawl": 1,
        "proxyConfiguration": {"useApifyProxy": true}
      }
    },
    {
      "name": "web-scraper",
      "enabled": true,
      "input": {
        "maxRequestRetries": 3,
        "requestTimeoutSecs": 30,
        "maxPagesPerCrawl": 1,
        "proxyConfiguration": {"useApifyProxy": true}
      }
    },
    {
      "name": "playwright-scraper",
      "enabled": true,
      "input": {
        "maxRequestRetries": 2,
        "requestTimeoutSecs": 45,
        "maxPagesPerCrawl": 1
      }
    }
  ]
}
```
{ "actorSpecification": 1, "name": "ScraperCodeGenerator", "title": "🧠 AI-Powered Web Scraper & Code Generator", "description": "Intelligent web scraping tool that generates custom Python code. Configure AI models, HTML processing, and multiple scraping strategies for optimal results.", "version": "1.1", "buildTag": "latest", "environmentVariables": { "CLAUDE_API_KEY": "@claudeApiKey" }, "dockerfile": "./Dockerfile", "input": "./input_schema.json", "storages": { "dataset": { "actorSpecification": 1, "views": { "scraped_data": { "title": "Scraped Data", "transformation": {}, "display": { "component": "table", "properties": { "url": { "label": "Source URL", "format": "link" }, "title": { "label": "Page Title", "format": "text" }, "data": { "label": "Extracted Data", "format": "object" }, "script_tested": { "label": "Script Tested", "format": "boolean" }, "timestamp": { "label": "Scraped At", "format": "datetime" }, "success": { "label": "Success", "format": "boolean" }, "error": { "label": "Error Message", "format": "text" } } } } } } }}
{ "title": "Advanced Scraper Configuration", "description": "Configure AI models, HTML processing, actor selection, and other advanced settings for optimal scraping results", "type": "object", "schemaVersion": 1, "properties": { "targetUrl": { "title": "Target URL", "description": "The URL of the website you want to scrape", "type": "string", "editor": "textfield", "prefill": "https://books.toscrape.com/" }, "userGoal": { "title": "Scraping Goal", "description": "Describe what data you want to extract from the website", "type": "string", "editor": "textarea", "prefill": "Get me a list of all the books on the first page. For each book, I want its title, price, star rating, and whether it is in stock." }, "claudeApiKey": { "title": "Claude API Key", "description": "Your Anthropic Claude API key for AI-powered code generation", "type": "string", "editor": "textfield", "isSecret": true }, "maxRetries": { "title": "Max Retries", "description": "Maximum number of retry attempts for scraping", "type": "integer", "editor": "number", "minimum": 1, "maximum": 10, "default": 3 }, "timeout": { "title": "Timeout (seconds)", "description": "Timeout for each scraping attempt in seconds", "type": "integer", "editor": "number", "minimum": 10, "maximum": 300, "default": 60, "unit": "seconds" }, "testScript": { "title": "Test Generated Script", "description": "Whether to test the generated scraping script before saving it", "type": "boolean", "default": true, "editor": "checkbox" }, "claudeModel": { "title": "Claude Model", "description": "Choose which Claude model to use for AI analysis", "type": "string", "editor": "select", "default": "claude-sonnet-4-20250514", "enum": [ "claude-sonnet-4-20250514", "claude-opus-4-20250514", "claude-3-7-sonnet-20250219", "claude-3-5-sonnet-20241022", "claude-3-5-haiku-20241022", "claude-3-sonnet-20240229", "claude-3-haiku-20240307" ], "enumTitles": [ "Claude 4 Sonnet (Latest & Best)", "Claude 4 Opus (Maximum Quality)", "Claude 3.7 Sonnet (Enhanced)", "Claude 3.5 Sonnet (Reliable)", "Claude 3.5 Haiku (Fast & Cheap)", "Claude 3 Sonnet (Balanced)", "Claude 3 Haiku (Fastest)" ] }, "htmlPruningEnabled": { "title": "Enable HTML Pruning", "description": "Enable HTML content processing before analysis", "type": "boolean", "default": true, "editor": "checkbox", "sectionCaption": "HTML Processing Settings", "sectionDescription": "Configure how HTML content is processed before analysis" }, "htmlMaxListItems": { "title": "Max List Items", "description": "Maximum number of items to keep in lists when pruning HTML", "type": "integer", "editor": "number", "minimum": 1, "maximum": 20, "default": 3 }, "htmlMaxTextLength": { "title": "Max Text Length", "description": "Maximum length of text content to keep when pruning HTML", "type": "integer", "editor": "number", "minimum": 50, "maximum": 2000, "default": 200, "unit": "characters" }, "htmlPruneBeforeEvaluation": { "title": "Prune Before Evaluation", "description": "Apply HTML pruning before quality evaluation", "type": "boolean", "default": true, "editor": "checkbox" }, "htmlPrunePercentage": { "title": "Prune Percentage", "description": "Percentage of HTML content to prune (0-100)", "type": "integer", "editor": "number", "minimum": 0, "maximum": 100, "default": 80, "unit": "%" }, "actors": { "title": "Scraping Actors Configuration", "description": "Select and configure which Apify actors to use for scraping", "type": "array", "editor": "json", "sectionCaption": "Scrapers Configuration", "sectionDescription": "Configure which scraping 
actors to use and their settings", "prefill": [ { "name": "cheerio-scraper", "enabled": true, "input": { "maxRequestRetries": 3, "requestTimeoutSecs": 30, "maxPagesPerCrawl": 1, "pageFunction": "async function pageFunction(context) {\n const { request, log, $ } = context;\n try {\n const title = $('title').text() || '';\n const html = $('html').html() || '';\n return {\n url: request.url,\n title: title,\n html: html\n };\n } catch (error) {\n log.error('Error in pageFunction:', error);\n return {\n url: request.url,\n title: '',\n html: ''\n };\n }\n}", "proxyConfiguration": {"useApifyProxy": true} } }, { "name": "web-scraper", "enabled": false, "input": { "maxRequestRetries": 3, "requestTimeoutSecs": 30, "maxPagesPerCrawl": 1, "pageFunction": "async function pageFunction(context) {\n const { request, log, page } = context;\n try {\n const title = await page.title();\n const html = await page.content();\n return {\n url: request.url,\n title: title,\n html: html\n };\n } catch (error) {\n log.error('Error in pageFunction:', error);\n return {\n url: request.url,\n title: '',\n html: ''\n };\n }\n}", "proxyConfiguration": {"useApifyProxy": true} } }, { "name": "website-content-crawler", "enabled": true, "input": { "maxCrawlPages": 1, "crawler": "playwright", "proxyConfiguration": {"useApifyProxy": true} } }, { "name": "playwright-scraper", "enabled": false, "input": { "maxRequestRetries": 2, "requestTimeoutSecs": 45, "maxPagesPerCrawl": 1, "pageFunction": "async function pageFunction(context) {\n const { request, log, page } = context;\n try {\n const title = await page.title();\n const html = await page.content();\n return {\n url: request.url,\n title: title,\n html: html\n };\n } catch (error) {\n log.error('Error in pageFunction:', error);\n return {\n url: request.url,\n title: '',\n html: ''\n };\n }\n}", "proxyConfiguration": {"useApifyProxy": true} } } ] }, "concurrentActors": { "title": "Concurrent Actors", "description": "Run multiple actors simultaneously for faster results", "type": "boolean", "default": true, "editor": "checkbox" }, "forActor": { "title": "Generate for Apify Actor", "description": "Choose the output format for the generated script", "type": "boolean", "default": true, "editor": "checkbox", "sectionCaption": "Output Settings", "sectionDescription": "Configure how the generated script should be formatted" } }, "required": ["targetUrl", "userGoal", "claudeApiKey"]}
1"""2Apify Actor Runner Module3
4This module provides functionality to run Apify actors and retrieve their results5using the official Apify Python client library.6"""7
8import logging9from typing import Optional, Dict, Any, List, Union10
11from apify_client import ApifyClient12
13
14def run_apify_actor(actor_id: str, actor_input: dict, *, api_token: str) -> Optional[List[Dict[str, Any]]]:15 """16 Run an Apify actor and retrieve its dataset results.17 18 This function is maintained for backward compatibility. For more flexible19 data retrieval (including key-value stores), use run_apify_actor_with_flexible_retrieval.20 21 Args:22 actor_id: The ID or name of the Apify actor to run (e.g., 'apify/web-scraper')23 actor_input: Dictionary containing the input configuration for the actor24 api_token: Apify API token for authentication (keyword-only argument)25 26 Returns:27 List of dictionaries containing the dataset items if successful, None otherwise.28 Returns an empty list if the run succeeds but produces no data.29 30 Raises:31 ValueError: If actor_id or api_token is empty32 33 Example:34 >>> input_data = {"startUrls": [{"url": "https://example.com"}]}35 >>> results = run_apify_actor("apify/web-scraper", input_data, api_token="your_token")36 >>> if results is not None:37 ... print(f"Scraped {len(results)} items")38 """39 result = run_apify_actor_with_flexible_retrieval(40 actor_id=actor_id,41 actor_input=actor_input,42 api_token=api_token,43 retrieve_from="dataset"44 )45 46 # Ensure we return a list for backward compatibility47 if isinstance(result, list):48 return result49 else:50 return None51
52
53def run_apify_actor_with_flexible_retrieval(54 actor_id: str, 55 actor_input: dict, 56 *, 57 api_token: str,58 retrieve_from: str = "auto"59) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:60 """61 Run an Apify actor and retrieve its results from dataset or key-value store.62 63 This function starts an Apify actor with the provided input, waits for the run64 to complete, and returns data from either the dataset or key-value store based65 on the actor type or explicit configuration.66 67 Args:68 actor_id: The ID or name of the Apify actor to run (e.g., 'apify/web-scraper')69 actor_input: Dictionary containing the input configuration for the actor70 api_token: Apify API token for authentication (keyword-only argument)71 retrieve_from: Where to retrieve data from. Options:72 - "auto": Automatically detect based on actor_id73 - "dataset": Retrieve from dataset only74 - "key-value-store": Retrieve from key-value store only75 - "both": Retrieve from both and return combined results76 77 Returns:78 - If retrieve_from is "dataset": List of dictionaries from dataset79 - If retrieve_from is "key-value-store": Dictionary with key-value store items80 - If retrieve_from is "both": Dictionary with 'dataset' and 'key_value_store' keys81 - If retrieve_from is "auto": Appropriate format based on actor type82 Returns None if the run fails.83 84 Raises:85 ValueError: If actor_id, api_token is empty, or retrieve_from is invalid86 87 Example:88 >>> input_data = {"startUrls": [{"url": "https://example.com"}]}89 >>> # Auto-detect storage type90 >>> results = run_apify_actor_with_flexible_retrieval(91 ... "apify/website-content-crawler", 92 ... input_data, 93 ... api_token="your_token"94 ... )95 >>> # Explicitly use key-value store96 >>> results = run_apify_actor_with_flexible_retrieval(97 ... "apify/web-scraper", 98 ... input_data, 99 ... api_token="your_token",100 ... retrieve_from="key-value-store"101 ... )102 """103 # Input validation104 if not actor_id or not actor_id.strip():105 raise ValueError("actor_id cannot be empty")106 if not api_token or not api_token.strip():107 raise ValueError("api_token cannot be empty")108 if retrieve_from not in ["auto", "dataset", "key-value-store", "both"]:109 raise ValueError("retrieve_from must be 'auto', 'dataset', 'key-value-store', or 'both'")110 111 # Configure logging112 logger = logging.getLogger(__name__)113 114 # Determine storage type based on actor if auto mode115 if retrieve_from == "auto":116 if "website-content-crawler" in actor_id:117 retrieve_from = "key-value-store"118 else:119 retrieve_from = "dataset"120 121 try:122 # Initialize the Apify client123 client = ApifyClient(api_token)124 125 logger.info(f"Starting Apify actor: {actor_id}")126 127 # Start the actor run128 run = client.actor(actor_id).call(run_input=actor_input)129 130 # Check if the run was created successfully131 if not run:132 logger.error(f"Failed to start actor run for {actor_id}")133 return None134 135 run_id = run.get('id')136 status = run.get('status')137 138 logger.info(f"Actor run started with ID: {run_id}, Status: {status}")139 140 # Check the final status of the run141 if status != 'SUCCEEDED':142 logger.warning(143 f"Actor run {run_id} did not succeed. "144 f"Final status: {status}. 
"145 f"Exit code: {run.get('exitCode', 'N/A')}"146 )147 return None148 149 logger.info(f"Actor run {run_id} completed successfully")150 151 # Retrieve data based on the specified method152 if retrieve_from == "dataset":153 dataset_client = client.dataset(run.get('defaultDatasetId'))154 dataset_items = list(dataset_client.iterate_items())155 logger.info(f"Retrieved {len(dataset_items)} items from dataset")156 return dataset_items157 158 elif retrieve_from == "key-value-store":159 kv_store_client = client.key_value_store(run.get('defaultKeyValueStoreId'))160 161 # List all keys in the key-value store162 keys_response = kv_store_client.list_keys()163 keys = [item['key'] for item in keys_response.get('items', [])]164 165 if not keys:166 logger.warning("No keys found in key-value store")167 return {}168 169 # Retrieve all key-value pairs170 kv_items = {}171 for key in keys:172 try:173 value = kv_store_client.get_record(key)174 if value:175 kv_items[key] = value.get('value')176 except Exception as e:177 logger.warning(f"Failed to retrieve key '{key}': {str(e)}")178 179 logger.info(f"Retrieved {len(kv_items)} items from key-value store")180 return kv_items181 182 elif retrieve_from == "both":183 # Retrieve from both sources184 results = {"dataset": [], "key_value_store": {}}185 186 # Get dataset items187 try:188 dataset_client = client.dataset(run.get('defaultDatasetId'))189 dataset_items = list(dataset_client.iterate_items())190 results["dataset"] = dataset_items191 logger.info(f"Retrieved {len(dataset_items)} items from dataset")192 except Exception as e:193 logger.warning(f"Failed to retrieve dataset items: {str(e)}")194 195 # Get key-value store items196 try:197 kv_store_client = client.key_value_store(run.get('defaultKeyValueStoreId'))198 keys_response = kv_store_client.list_keys()199 keys = [item['key'] for item in keys_response.get('items', [])]200 201 kv_items = {}202 for key in keys:203 try:204 value = kv_store_client.get_record(key)205 if value:206 kv_items[key] = value.get('value')207 except Exception as e:208 logger.warning(f"Failed to retrieve key '{key}': {str(e)}")209 210 results["key_value_store"] = kv_items211 logger.info(f"Retrieved {len(kv_items)} items from key-value store")212 except Exception as e:213 logger.warning(f"Failed to retrieve key-value store items: {str(e)}")214 215 return results216 217 except Exception as e:218 logger.error(f"Error running Apify actor {actor_id}: {str(e)}", exc_info=True)219 return None
1"""2Multi-Actor Website Scraper3
4This module provides functionality to scrape websites using multiple Apify actors5simultaneously and return a dictionary of actor-scraped HTML pairs.6"""7
8import asyncio9import logging10from typing import Dict, List, Optional, Any11from concurrent.futures import ThreadPoolExecutor, as_completed12import json13
14from .apify_runner import run_apify_actor_with_flexible_retrieval, run_apify_actor15
16
17class MultiActorScraper:18 """19 A class to scrape websites using multiple Apify actors simultaneously.20 """21 22 def __init__(self, api_token: str):23 """24 Initialize the MultiActorScraper.25 26 Args:27 api_token: Apify API token for authentication28 """29 self.api_token = api_token30 self.logger = logging.getLogger(__name__)31 32 # Configure logging33 logging.basicConfig(34 level=logging.INFO,35 format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'36 )37 38 def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:39 """40 Get the configuration for each actor with the target URL.41 42 Args:43 target_url: The URL to scrape44 45 Returns:46 Dictionary mapping actor names to their configurations47 """48 return {49 "cheerio-scraper": {50 "actor_id": "apify/cheerio-scraper",51 "input": {52 "debugLog": False,53 "forceResponseEncoding": False,54 "ignoreSslErrors": False,55 "keepUrlFragments": False,56 "pageFunction": """async function pageFunction(context) {57 const { $, request, log } = context;58
59 const url = request.url;60 const html = $.html(); // Get the full HTML of the page61
62 log.info('Page scraped', { url });63
64 return {65 url,66 html67 };68}""",69 "postNavigationHooks": """// We need to return array of (possibly async) functions here.70// The functions accept a single argument: the "crawlingContext" object.71[72 async (crawlingContext) => {73 // ...74 },75]""",76 "preNavigationHooks": """// We need to return array of (possibly async) functions here.77// The functions accept two arguments: the "crawlingContext" object78// and "requestAsBrowserOptions" which are passed to the `requestAsBrowser()`79// function the crawler calls to navigate..80[81 async (crawlingContext, requestAsBrowserOptions) => {82 // ...83 }84]""",85 "proxyConfiguration": {86 "useApifyProxy": True87 },88 "respectRobotsTxtFile": False,89 "startUrls": [90 {91 "url": target_url,92 "method": "GET"93 }94 ]95 }96 },97 98 "website-content-crawler": {99 "actor_id": "apify/website-content-crawler",100 "input": {101 "aggressivePrune": False,102 "clickElementsCssSelector": "[aria-expanded=\"false\"]",103 "clientSideMinChangePercentage": 15,104 "crawlerType": "playwright:adaptive",105 "debugLog": False,106 "debugMode": False,107 "dynamicContentWaitSecs": 15,108 "expandIframes": True,109 "ignoreCanonicalUrl": False,110 "keepUrlFragments": False,111 "maxCrawlDepth": 0,112 "proxyConfiguration": {113 "useApifyProxy": True,114 "apifyProxyGroups": [115 "RESIDENTIAL"116 ]117 },118 "readableTextCharThreshold": 100,119 "removeCookieWarnings": True,120 "renderingTypeDetectionPercentage": 10,121 "respectRobotsTxtFile": False,122 "saveFiles": False,123 "saveHtml": False,124 "saveHtmlAsFile": True,125 "saveMarkdown": False,126 "saveScreenshots": False,127 "startUrls": [128 {129 "url": target_url,130 "method": "GET"131 }132 ],133 "useSitemaps": False134 }135 },136 137 "web-scraper": {138 "actor_id": "apify/web-scraper",139 "input": {140 "breakpointLocation": "NONE",141 "browserLog": False,142 "closeCookieModals": False,143 "debugLog": False,144 "downloadCss": True,145 "downloadMedia": True,146 "headless": True,147 "ignoreCorsAndCsp": False,148 "ignoreSslErrors": False,149 "injectJQuery": True,150 "keepUrlFragments": False,151 "pageFunction": """// The function accepts a single argument: the "context" object.152// For a complete list of its properties and functions,153// see https://apify.com/apify/web-scraper#page-function 154async function pageFunction(context) {155 // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!156 // debugger; 157
158 // jQuery is handy for finding DOM elements and extracting data from them.159 // To use it, make sure to enable the "Inject jQuery" option.160 const $ = context.jQuery;161 const url = context.request.url;162 const html = $('html').html(); // Get the full HTML of the page163
164 // Print some information to Actor log165 context.log.info(`URL: ${url}, HTML length: ${html ? html.length : 0}`);166
167 // Return an object with the data extracted from the page.168 // It will be stored to the resulting dataset.169 return {170 url,171 html172 };173}""",174 "postNavigationHooks": """// We need to return array of (possibly async) functions here.175// The functions accept a single argument: the "crawlingContext" object.176[177 async (crawlingContext) => {178 // ...179 },180]""",181 "preNavigationHooks": """// We need to return array of (possibly async) functions here.182// The functions accept two arguments: the "crawlingContext" object183// and "gotoOptions".184[185 async (crawlingContext, gotoOptions) => {186 // ...187 },188]""",189 "proxyConfiguration": {190 "useApifyProxy": True191 },192 "respectRobotsTxtFile": False,193 "runMode": "PRODUCTION",194 "startUrls": [195 {196 "url": target_url,197 "method": "GET"198 }199 ],200 "useChrome": False,201 "waitUntil": [202 "networkidle2"203 ]204 }205 }206 }207 208 def _run_single_actor(self, actor_name: str, actor_config: Dict[str, Any]) -> tuple[str, Optional[str]]:209 """210 Run a single actor and extract HTML content.211 212 Args:213 actor_name: Name of the actor214 actor_config: Configuration for the actor215 216 Returns:217 Tuple of (actor_name, html_content or None)218 """219 try:220 self.logger.info(f"Starting {actor_name}...")221 222 # Determine retrieval method based on actor type223 if "website-content-crawler" in actor_config["actor_id"]:224 retrieve_from = "key-value-store"225 else:226 retrieve_from = "dataset"227 228 results = run_apify_actor_with_flexible_retrieval(229 actor_config["actor_id"],230 actor_config["input"],231 api_token=self.api_token,232 retrieve_from=retrieve_from233 )234 235 if not results:236 self.logger.warning(f"{actor_name} returned no results")237 return actor_name, None238 239 # Extract HTML from results240 html_content = None241 242 # Handle different result formats243 if isinstance(results, list):244 # Dataset results (list format)245 for item in results:246 if isinstance(item, dict):247 # For cheerio-scraper and web-scraper248 if 'html' in item:249 html_content = item['html']250 break251 # For other actors that might use different keys252 elif 'text' in item:253 html_content = item['text']254 break255 elif 'content' in item:256 html_content = item['content']257 break258 # If it's a string (sometimes the whole result is HTML)259 elif isinstance(item, str):260 html_content = item261 break262 263 elif isinstance(results, dict):264 # Key-value store results (dict format) - for website-content-crawler265 # Look for HTML files in the key-value store266 for key, value in results.items():267 if key.endswith('.html') or 'html' in key.lower():268 if isinstance(value, str):269 html_content = value270 break271 elif isinstance(value, dict) and 'value' in value:272 html_content = value['value']273 break274 275 # If no HTML file found, look for other common keys276 if not html_content:277 for key, value in results.items():278 if isinstance(value, str) and len(value) > 100: # Likely HTML content279 html_content = value280 break281 elif isinstance(value, dict):282 # Check if the value dict contains HTML283 if 'html' in value:284 html_content = value['html']285 break286 elif 'content' in value:287 html_content = value['content']288 break289 290 if html_content:291 self.logger.info(f"{actor_name} completed successfully - HTML length: {len(html_content)}")292 else:293 self.logger.warning(f"{actor_name} completed but no HTML content found in results")294 # Log the structure of the first result for debugging295 if 
isinstance(results, list) and results:296 self.logger.debug(f"{actor_name} result structure: {list(results[0].keys()) if isinstance(results[0], dict) else type(results[0])}")297 elif isinstance(results, dict):298 self.logger.debug(f"{actor_name} key-value store keys: {list(results.keys())}")299 300 return actor_name, html_content301 302 except Exception as e:303 self.logger.error(f"Error running {actor_name}: {str(e)}", exc_info=True)304 return actor_name, None305 306 def scrape_with_multiple_actors(self, target_url: str, max_workers: int = 4) -> Dict[str, Optional[str]]:307 """308 Scrape a website using multiple Apify actors simultaneously.309 310 Args:311 target_url: The URL to scrape312 max_workers: Maximum number of concurrent workers313 314 Returns:315 Dictionary mapping actor names to their scraped HTML content316 """317 self.logger.info(f"Starting multi-actor scraping for URL: {target_url}")318 319 actor_configs = self._get_actor_configs(target_url)320 results = {}321 322 with ThreadPoolExecutor(max_workers=max_workers) as executor:323 # Submit all actor runs324 future_to_actor = {325 executor.submit(self._run_single_actor, actor_name, config): actor_name326 for actor_name, config in actor_configs.items()327 }328 329 # Collect results as they complete330 for future in as_completed(future_to_actor):331 actor_name = future_to_actor[future]332 try:333 actor_name_result, html_content = future.result()334 results[actor_name_result] = html_content335 336 except Exception as e:337 self.logger.error(f"Error getting result for {actor_name}: {str(e)}")338 results[actor_name] = None339 340 # Log summary341 successful_actors = [name for name, content in results.items() if content is not None]342 failed_actors = [name for name, content in results.items() if content is None]343 344 self.logger.info(f"Scraping completed!")345 self.logger.info(f"Successful actors: {successful_actors}")346 if failed_actors:347 self.logger.warning(f"Failed actors: {failed_actors}")348 349 return results350 351 def save_results_to_files(self, results: Dict[str, Optional[str]], output_dir: str = "scraped_results") -> None:352 """353 Save the scraped HTML results to separate files.354 355 Args:356 results: Dictionary of actor-HTML pairs357 output_dir: Directory to save the files358 """359 import os360 361 if not os.path.exists(output_dir):362 os.makedirs(output_dir)363 364 for actor_name, html_content in results.items():365 if html_content:366 filename = f"{actor_name.replace('/', '_')}_result.html"367 filepath = os.path.join(output_dir, filename)368 369 with open(filepath, 'w', encoding='utf-8') as f:370 f.write(html_content)371 372 self.logger.info(f"Saved {actor_name} result to {filepath}")373 else:374 self.logger.warning(f"No content to save for {actor_name}")375 376 def scrape_with_single_actor_flexible(target_url: str, actor_id: str, api_token: str, 377 custom_input: Optional[Dict[str, Any]] = None) -> Optional[str]:378 """379 Convenience function to scrape a website using a single Apify actor with flexible retrieval.380 381 This function automatically detects whether to use dataset or key-value store based on the actor type.382 383 Args:384 target_url: The URL to scrape385 actor_id: The Apify actor ID (e.g., 'apify/website-content-crawler')386 api_token: Apify API token387 custom_input: Optional custom input configuration for the actor388 389 Returns:390 HTML content as string if successful, None otherwise391 392 Example:393 >>> # For website-content-crawler (uses key-value store)394 >>> html = 
scrape_with_single_actor_flexible(395 ... "https://example.com", 396 ... "apify/website-content-crawler", 397 ... api_token398 ... )399 >>> # For web-scraper (uses dataset)400 >>> html = scrape_with_single_actor_flexible(401 ... "https://example.com", 402 ... "apify/web-scraper", 403 ... api_token404 ... )405 """406 logger = logging.getLogger(__name__)407 408 try:409 # Use default input if none provided410 if custom_input is None:411 if "website-content-crawler" in actor_id:412 actor_input = {413 "aggressivePrune": False,414 "clickElementsCssSelector": "[aria-expanded=\"false\"]",415 "clientSideMinChangePercentage": 15,416 "crawlerType": "playwright:adaptive",417 "debugLog": False,418 "debugMode": False,419 "dynamicContentWaitSecs": 15,420 "expandIframes": True,421 "ignoreCanonicalUrl": False,422 "keepUrlFragments": False,423 "proxyConfiguration": {424 "useApifyProxy": True,425 "apifyProxyGroups": ["RESIDENTIAL"]426 },427 "readableTextCharThreshold": 100,428 "removeCookieWarnings": True,429 "renderingTypeDetectionPercentage": 10,430 "respectRobotsTxtFile": False,431 "saveFiles": False,432 "saveHtml": False,433 "saveHtmlAsFile": True,434 "saveMarkdown": False,435 "saveScreenshots": False,436 "startUrls": [{"url": target_url, "method": "GET"}],437 "useSitemaps": False438 }439 else:440 # Default input for other actors441 actor_input = {442 "startUrls": [{"url": target_url, "method": "GET"}],443 "proxyConfiguration": {"useApifyProxy": True}444 }445 else:446 actor_input = custom_input447 448 # Determine retrieval method based on actor type449 if "website-content-crawler" in actor_id:450 retrieve_from = "key-value-store"451 else:452 retrieve_from = "dataset"453 454 # Run the actor with flexible retrieval455 results = run_apify_actor_with_flexible_retrieval(456 actor_id=actor_id,457 actor_input=actor_input,458 api_token=api_token,459 retrieve_from=retrieve_from460 )461 462 if not results:463 logger.warning(f"No results returned from {actor_id}")464 return None465 466 # Extract HTML content467 html_content = None468 469 if isinstance(results, list):470 # Dataset results471 for item in results:472 if isinstance(item, dict) and 'html' in item:473 html_content = item['html']474 break475 elif isinstance(results, dict):476 # Key-value store results477 for key, value in results.items():478 if key.endswith('.html') or 'html' in key.lower():479 if isinstance(value, str):480 html_content = value481 break482 elif isinstance(value, dict) and 'value' in value:483 html_content = value['value']484 break485 486 if html_content:487 logger.info(f"Successfully scraped {len(html_content)} characters from {target_url}")488 else:489 logger.warning(f"No HTML content found in results from {actor_id}")490 logger.debug(f"Result structure: {type(results)} with keys: {list(results.keys()) if isinstance(results, dict) else 'N/A'}")491 492 return html_content493 494 except Exception as e:495 logger.error(f"Error scraping with {actor_id}: {str(e)}", exc_info=True)496 return None497
498
499def scrape_website_with_multiple_actors(target_url: str, api_token: str) -> Dict[str, Optional[str]]:500 """501 Convenience function to scrape a website using multiple Apify actors.502 503 Args:504 target_url: The URL to scrape505 api_token: Apify API token506 507 Returns:508 Dictionary mapping actor names to their scraped HTML content509 """510 scraper = MultiActorScraper(api_token)511 return scraper.scrape_with_multiple_actors(target_url)512
513
514if __name__ == "__main__":515 # Example usage516 import os517 518 # Get API token from environment variable519 api_token = os.getenv("APIFY_TOKEN")520 if not api_token:521 print("Please set the APIFY_TOKEN environment variable")522 exit(1)523 524 # Example URL to scrape525 target_url = "https://fbref.com/en/players/3d1f29d9/matchlogs/2021-2022/Jordyn-Huitema-Match-Logs"526 527 # Create scraper and run528 scraper = MultiActorScraper(api_token)529 results = scraper.scrape_with_multiple_actors(target_url)530 531 # Print results summary532 print("\n=== SCRAPING RESULTS ===")533 for actor_name, html_content in results.items():534 if html_content:535 print(f"{actor_name}: SUCCESS (HTML length: {len(html_content)})")536 else:537 print(f"{actor_name}: FAILED")538 539 # Optionally save results to files540 scraper.save_results_to_files(results)
1"""2HTML Quality Evaluator Module3
4This module provides functionality to evaluate the quality of HTML documents 5fetched by automated browsers. It determines if the HTML is suitable for data 6extraction or if it represents a failure page (e.g., CAPTCHA, block page, etc.).7
8Uses Claude AI to perform intelligent analysis based on the user's extraction goals.9"""10
11import json12import logging13import re14from typing import Dict, Any, Optional15from dataclasses import dataclass16
17import anthropic18
19
20@dataclass21class EvaluationResult:22 """Result of HTML quality evaluation."""23 score: int # 1-10 scale24 reasoning: str25
26
27@dataclass28class PreEvaluationResult:29 """Result of pre-evaluation checks before sending to Claude."""30 is_valid_html: bool31 score: Optional[int] = None # If we can determine score without Claude32 reasoning: Optional[str] = None33 should_continue_to_claude: bool = True34
35
36class HTMLQualityEvaluator:37 """38 Evaluates HTML quality for web scraping using Claude AI.39 40 This class provides methods to analyze HTML content and determine41 if it's suitable for data extraction based on the user's goals.42 """43 44 def __init__(self, claude_api_key: str):45 """46 Initialize the HTML Quality Evaluator.47 48 Args:49 claude_api_key: Anthropic API key for Claude access50 """51 if not claude_api_key or not claude_api_key.strip():52 raise ValueError("claude_api_key cannot be empty")53 54 self.client = anthropic.Anthropic(api_key=claude_api_key)55 self.logger = logging.getLogger(__name__)56 57 def _pre_evaluate_html(self, html_content: str) -> PreEvaluationResult:58 """59 Perform basic validation checks on HTML content before sending to Claude.60 61 This method checks for obvious issues that don't require AI analysis:62 - Whether content is actually HTML63 - Empty or whitespace-only content64 65 Args:66 html_content: The HTML content to pre-evaluate67 68 Returns:69 PreEvaluationResult indicating if validation passed and whether to continue70 """71 if not html_content or not html_content.strip():72 return PreEvaluationResult(73 is_valid_html=False,74 score=1,75 reasoning="Content is empty or contains only whitespace",76 should_continue_to_claude=False77 )78 79 content = html_content.strip()80 content_lower = content.lower()81 82 # Check if content looks like HTML at all83 has_opening_closing_tags = '<' in content and '>' in content84 85 if not has_opening_closing_tags:86 return PreEvaluationResult(87 is_valid_html=False,88 score=1,89 reasoning="Content does not contain HTML tags",90 should_continue_to_claude=False91 )92 93 # Check for obvious non-HTML content94 if content.startswith('{') and content.endswith('}'):95 # Looks like JSON96 return PreEvaluationResult(97 is_valid_html=False,98 score=1,99 reasoning="Content appears to be JSON, not HTML",100 should_continue_to_claude=False101 )102 103 if content.startswith('<?xml'):104 # XML but not HTML105 return PreEvaluationResult(106 is_valid_html=False,107 score=1,108 reasoning="Content appears to be XML, not HTML",109 should_continue_to_claude=False110 )111 112 # If it looks like valid HTML, let Claude evaluate it113 return PreEvaluationResult(114 is_valid_html=True,115 should_continue_to_claude=True116 )117 118 def _create_evaluation_prompt(self, user_goal: str, html_content: str) -> str:119 """120 Create the prompt for Claude to evaluate HTML quality.121 122 Args:123 user_goal: The user's data extraction goal124 html_content: The HTML content to evaluate125 126 Returns:127 Formatted prompt for Claude128 """129 return f"""You are a QA automation engineer specializing in web scraping. Your task is to evaluate the quality of an HTML document that was fetched by an automated browser. The goal is to determine if the HTML is suitable for data extraction or if it looks like a failure page (e.g., a CAPTCHA, block page, or empty loading page).130
131You will be given the user's original data extraction goal and the full HTML content from a scraping attempt.132
133REQUIREMENTS:134- Analysis: Analyze the provided HTML for common indicators of success or failure.135- Success Indicators: Meaningful content in the <body>, presence of <h1>-<h6> tags, lists (<ul>, <ol>), tables (<table>), structured <div>s with descriptive class names.136- Failure Indicators: Common keywords like "CAPTCHA", "bot detection", "enable JavaScript", "access denied", "rate limit". An empty or near-empty <body> tag. A structure that looks like a loading spinner.137- Output Format: Your response MUST be a single, well-formed JSON object with the following two keys:138 "score": An integer from 1 (unusable) to 10 (perfectly rendered and content-rich).139 "reasoning": A brief, one-sentence explanation for your score.140- Contextual Awareness: Use the "User's Goal" to inform your analysis. If the user wants "product listings" and you see a grid of items, that's a high score. If you see a login form, that's a low score.141
142[INPUT 1] User's Goal:143{user_goal}144
145[INPUT 2] HTML Content from an Actor Run:146{html_content}147
148Please analyze the HTML and provide your evaluation as a JSON object."""149
150 def _parse_claude_response(self, response_text: str) -> Optional[EvaluationResult]:151 """152 Parse Claude's response and extract the evaluation result.153 154 Args:155 response_text: Raw response from Claude156 157 Returns:158 EvaluationResult if parsing successful, None otherwise159 """160 try:161 # Try to find JSON in the response162 json_match = re.search(r'\{[^}]*"score"[^}]*\}', response_text, re.DOTALL)163 if not json_match:164 self.logger.error("No JSON object found in Claude's response")165 return None166 167 json_str = json_match.group(0)168 result_dict = json.loads(json_str)169 170 # Validate required fields171 if 'score' not in result_dict or 'reasoning' not in result_dict:172 self.logger.error("Missing required fields in Claude's response")173 return None174 175 score = int(result_dict['score'])176 if not (1 <= score <= 10):177 self.logger.error(f"Score {score} is outside valid range 1-10")178 return None179 180 return EvaluationResult(181 score=score,182 reasoning=result_dict['reasoning']183 )184 185 except (json.JSONDecodeError, ValueError, KeyError) as e:186 self.logger.error(f"Failed to parse Claude's response: {e}")187 return None188 189 def _preprocess_html(self, html_content: str) -> str:190 """191 Preprocess HTML content for analysis.192 193 Args:194 html_content: Raw HTML content195 196 Returns:197 Preprocessed HTML content198 """199 # Truncate very long HTML to avoid token limits200 max_length = 100000 # Adjust based on Claude's token limits201 if len(html_content) > max_length:202 self.logger.warning(f"HTML content truncated from {len(html_content)} to {max_length} characters")203 html_content = html_content[:max_length] + "\n<!-- Content truncated for analysis -->"204 205 return html_content206 207 def get_pre_evaluation_info(self, html_content: str) -> PreEvaluationResult:208 """209 Get detailed pre-evaluation information without performing the full evaluation.210 211 This method allows users to see what the pre-evaluation checks detected212 without calling Claude if they just want to understand the basic validation.213 214 Args:215 html_content: The HTML content to pre-evaluate216 217 Returns:218 PreEvaluationResult with detailed validation information219 """220 return self._pre_evaluate_html(html_content)221 222 def evaluate_html_quality(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:223 """224 Evaluate the quality of HTML content for data extraction.225 226 This is the main function that analyzes HTML content to determine if it's 227 suitable for data extraction based on the user's goals.228 229 The evaluation process consists of two stages:230 1. Pre-evaluation: Basic validation checks for HTML structure, error pages, 231 and obvious content issues that don't require AI analysis232 2. Claude AI analysis: Detailed content evaluation using AI if pre-evaluation passes233 234 Args:235 user_goal: The user's data extraction goal/objective236 html_content: The HTML content to evaluate237 238 Returns:239 EvaluationResult containing score (1-10) and reasoning, or None if evaluation failed240 241 Example:242 >>> evaluator = HTMLQualityEvaluator("your-claude-api-key")243 >>> result = evaluator.evaluate_html_quality(244 ... "I want to get a list of all articles on the homepage",245 ... "<html><body><article>...</article></body></html>"246 ... )247 >>> if result:248 ... 
print(f"Score: {result.score}, Reasoning: {result.reasoning}")249 """250 if not user_goal or not user_goal.strip():251 raise ValueError("user_goal cannot be empty")252 253 # Perform pre-evaluation checks on the HTML content254 self.logger.info("Starting pre-evaluation checks on HTML content")255 pre_eval_result = self._pre_evaluate_html(html_content)256 257 if not pre_eval_result.should_continue_to_claude:258 self.logger.info(f"Pre-evaluation completed without Claude - Score: {pre_eval_result.score}")259 return EvaluationResult(260 score=pre_eval_result.score,261 reasoning=pre_eval_result.reasoning262 )263 264 # Log pre-evaluation results265 if not pre_eval_result.is_valid_html:266 self.logger.warning("Pre-evaluation detected non-HTML content, but continuing to Claude")267 else:268 self.logger.info("Pre-evaluation passed, proceeding to Claude analysis")269 270 try:271 # Preprocess HTML content272 processed_html = self._preprocess_html(html_content)273 274 # Create the evaluation prompt275 prompt = self._create_evaluation_prompt(user_goal, processed_html)276 277 self.logger.info("Sending HTML evaluation request to Claude")278 279 # Send request to Claude280 response = self.client.messages.create(281 model="claude-3-5-sonnet-20241022", # TODO use 4282 max_tokens=1000,283 messages=[284 {285 "role": "user",286 "content": prompt287 }288 ]289 )290 291 # Extract and parse the response292 response_text = response.content[0].text293 result = self._parse_claude_response(response_text)294 295 if result:296 self.logger.info(f"HTML evaluation completed - Score: {result.score}")297 else:298 self.logger.error("Failed to parse evaluation result from Claude")299 300 return result301 302 except anthropic.APIError as e:303 self.logger.error(f"Claude API error: {e}")304 return None305 except Exception as e:306 self.logger.error(f"Unexpected error during HTML evaluation: {e}", exc_info=True)307 return None
[ { "name": "Use Case 1", "page_url": "https://books.toscrape.com/", "goal": "Get me a list of all the books on the first page. For each book, I want its title, price, star rating, and whether it is in stock." }, { "name": "Use Case 2", "page_url": "https://www.theverge.com/", "goal": "I want to scrape the main articles from The Verge homepage. For each article, get me the headline, the author's name, and the link to the full article." }, { "name": "Use Case 3", "page_url": "https://en.wikipedia.org/wiki/Python_(programming_language)", "goal": "Get me information about the Python programming language from its Wikipedia page. I need the 'First appeared' date, the 'Stable release' version, and the official website from the infobox on the right." }, { "name": "Use Case 4", "page_url": "https://www.python.org/jobs/", "goal": "List all the jobs posted. For each job, I want the job title, the company name, the location, and the date it was posted." }, { "name": "Use Case 5", "page_url": "https://quotes.toscrape.com/", "goal": "I want a list of all quotes on this page. For each one, get the quote text itself, the name of the author, and a list of the tags associated with it." }]
1"""2ScraperCodeGenerator - Intelligent Web Scraping with AI3
4A smart web scraping framework that uses multiple scraping strategies5and AI-powered quality evaluation to extract data from websites.6"""7
8from .pipeline import IntelligentScraperPipeline, run_intelligent_scraper9from .models import ScrapingResult, GoalExtractionResult, PipelineConfig, ClaudeModel10from .utils.config_parser import ConfigurationParser11from .scraping.actor_multi_scraper import ActorMultiScraper12
13__version__ = "0.1.0"14__all__ = [15 "IntelligentScraperPipeline",16 "run_intelligent_scraper",17 "ScrapingResult",18 "GoalExtractionResult",19 "PipelineConfig",20 "ClaudeModel",21 "ConfigurationParser",22 "ActorMultiScraper"23]
1"""2Data models for the ScraperCodeGenerator pipeline.3"""4
5from dataclasses import dataclass, field6from typing import Dict, Any, Optional, List7from enum import Enum8
9
10class ClaudeModel(Enum):11 """Available Claude model versions."""12 # Claude 4 models (latest)13 CLAUDE_4_SONNET = "claude-sonnet-4-20250514"14 CLAUDE_4_OPUS = "claude-opus-4-20250514"15 16 # Claude 3.7 models17 CLAUDE_3_7_SONNET = "claude-3-7-sonnet-20250219"18 19 # Claude 3.5 models20 CLAUDE_3_5_SONNET = "claude-3-5-sonnet-20241022"21 CLAUDE_3_5_HAIKU = "claude-3-5-haiku-20241022"22 23 # Claude 3 models24 CLAUDE_3_SONNET = "claude-3-sonnet-20240229"25 CLAUDE_3_HAIKU = "claude-3-haiku-20240307"26
27
28@dataclass29class ActorConfig:30 """Configuration for an individual Apify actor."""31 actor_id: str32 enabled: bool = True33 input: Dict[str, Any] = field(default_factory=dict)34 name: Optional[str] = None35 description: Optional[str] = None36
37
38@dataclass39class HTMLPruningConfig:40 """Configuration for HTML pruning behavior."""41 enabled: bool = True42 max_list_items: int = 543 max_text_length: int = 50044 prune_before_evaluation: bool = True45 prune_percentage: float = 0.8 # Keep 80% of content, remove 20%46
47
48@dataclass49class PipelineConfig:50 """Complete pipeline configuration."""51 # Core settings52 for_actor: bool = False53 test_script: bool = False54 output_script_path: Optional[str] = None55 56 # Claude settings57 claude_model: ClaudeModel = ClaudeModel.CLAUDE_4_SONNET58 claude_api_key: Optional[str] = None59 60 # HTML processing settings61 html_pruning: HTMLPruningConfig = field(default_factory=HTMLPruningConfig)62 63 # Actor configurations64 actors: Dict[str, ActorConfig] = field(default_factory=dict)65 66 # Execution settings67 max_retries: int = 368 timeout_seconds: int = 6069 concurrent_actors: bool = True70 71 def get_enabled_actors(self) -> Dict[str, ActorConfig]:72 """Get only enabled actors."""73 return {name: config for name, config in self.actors.items() if config.enabled}74
75
76@dataclass77class ScrapingResult:78 """Result of the complete scraping pipeline."""79 success: bool80 generated_script: Optional[str] = None81 best_actor: Optional[str] = None82 schema: Optional[Dict[str, Any]] = None83 error_message: Optional[str] = None84 quality_scores: Optional[Dict[str, int]] = None85 extracted_data: Optional[List[Dict[str, Any]]] = None86
87
88@dataclass89class EvaluationResult:90 """Result of HTML quality evaluation."""91 score: int # 1-10 scale92 reasoning: str93
94
95@dataclass96class PreEvaluationResult:97 """Result of pre-evaluation checks before sending to Claude."""98 is_valid_html: bool99 score: Optional[int] = None # If we can determine score without Claude100 reasoning: Optional[str] = None101 should_continue_to_claude: bool = True102
103
104@dataclass105class GoalExtractionResult:106 """Result of extracting goal from natural language prompt."""107 goal: str108 url: str109 success: bool110 error_message: Optional[str] = None111
112
113def get_default_actor_configs() -> Dict[str, ActorConfig]:114 """Get default actor configurations with common Apify actors."""115 return {116 "cheerio-scraper": ActorConfig(117 actor_id="apify/cheerio-scraper",118 name="Cheerio Scraper",119 description="Fast jQuery-like server-side scraping",120 enabled=True,121 input={122 "maxRequestRetries": 3,123 "requestTimeoutSecs": 30,124 "maxRequestsPerCrawl": 1,125 "pseudoUrls": [],126 "linkSelector": "",127 "pageFunction": """128 async function pageFunction(context) {129 const { request, log, skipLinks, $ } = context;130 return {131 url: request.url,132 title: $('title').text(),133 html: $('html').html()134 };135 }136 """,137 "proxyConfiguration": {"useApifyProxy": True}138 }139 ),140 "web-scraper": ActorConfig(141 actor_id="apify/web-scraper",142 name="Web Scraper",143 description="Versatile web scraper with JavaScript support",144 enabled=True,145 input={146 "maxRequestRetries": 3,147 "requestTimeoutSecs": 30,148 "maxPagesPerCrawl": 1,149 "pageFunction": """150 async function pageFunction(context) {151 const { request, log, skipLinks, $ } = context;152 return {153 url: request.url,154 title: $('title').text(),155 html: $('html').html()156 };157 }158 """,159 "proxyConfiguration": {"useApifyProxy": True}160 }161 ),162 "website-content-crawler": ActorConfig(163 actor_id="apify/website-content-crawler",164 name="Website Content Crawler",165 description="Advanced crawler with Playwright support",166 enabled=True,167 input={168 "maxCrawlPages": 1,169 "crawler": "playwright",170 "proxyConfiguration": {"useApifyProxy": True}171 }172 ),173 "playwright-scraper": ActorConfig(174 actor_id="apify/playwright-scraper",175 name="Playwright Scraper",176 description="Modern browser automation with Playwright",177 enabled=False,178 input={179 "maxRequestRetries": 3,180 "requestTimeoutSecs": 30,181 "maxPagesPerCrawl": 1,182 "pageFunction": """183 async function pageFunction(context) {184 const { request, log, page } = context;185 const title = await page.title();186 const html = await page.content();187 return {188 url: request.url,189 title: title,190 html: html191 };192 }193 """,194 "proxyConfiguration": {"useApifyProxy": True}195 }196 ),197 "puppeteer-scraper": ActorConfig(198 actor_id="apify/puppeteer-scraper",199 name="Puppeteer Scraper",200 description="Chrome-based scraping with Puppeteer",201 enabled=False,202 input={203 "maxRequestRetries": 3,204 "requestTimeoutSecs": 30,205 "maxPagesPerCrawl": 1,206 "pageFunction": """207 async function pageFunction(context) {208 const { request, log, page } = context;209 const title = await page.title();210 const html = await page.content();211 return {212 url: request.url,213 title: title,214 html: html215 };216 }217 """,218 "proxyConfiguration": {"useApifyProxy": True}219 }220 ),221 "jsdom-scraper": ActorConfig(222 actor_id="apify/jsdom-scraper",223 name="JSDOM Scraper",224 description="Lightweight JavaScript DOM scraping",225 enabled=False,226 input={227 "maxRequestRetries": 3,228 "requestTimeoutSecs": 30,229 "maxPagesPerCrawl": 1,230 "pageFunction": """231 async function pageFunction(context) {232 const { request, log, window } = context;233 const $ = window.$;234 return {235 url: request.url,236 title: $('title').text(),237 html: $('html').html()238 };239 }240 """,241 "proxyConfiguration": {"useApifyProxy": True}242 }243 )244 }
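To make the configuration surface above concrete, here is a minimal sketch of assembling a `PipelineConfig` from the default actor set, enabling only the Cheerio scraper and relaxing the pruning. The specific values are illustrative assumptions, not project defaults.

```python
# Illustrative sketch only: the values below are assumptions, not project defaults.
from llmscraper.models import (
    ClaudeModel,
    HTMLPruningConfig,
    PipelineConfig,
    get_default_actor_configs,
)

actors = get_default_actor_configs()
for name, actor in actors.items():
    actor.enabled = (name == "cheerio-scraper")  # keep a single actor enabled

config = PipelineConfig(
    claude_model=ClaudeModel.CLAUDE_3_5_SONNET,
    html_pruning=HTMLPruningConfig(prune_percentage=0.6),  # keep ~60% of content
    actors=actors,
)

print(list(config.get_enabled_actors()))  # ['cheerio-scraper']
```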
1"""2Main pipeline for intelligent web scraping.3"""4
5import logging6from typing import Optional7
8from .models import ScrapingResult, PipelineConfig9from .scraping import MultiActorScraper10from .scraping.actor_multi_scraper import ActorMultiScraper11from .evaluation import HTMLQualityEvaluator12from .generation import ScriptGenerator, ScriptExecutor13from .utils import prune_html, validate_required_keys, get_api_key14
15
16class IntelligentScraperPipeline:17 """Main pipeline class that orchestrates the intelligent web scraping process."""18 19 def __init__(self, apify_token: str, claude_api_key: str, actor_logger=None, config: Optional[PipelineConfig] = None):20 """21 Initialize the pipeline with required API tokens.22 23 Args:24 apify_token: Apify API token for web scraping25 claude_api_key: Anthropic Claude API key for AI analysis26 actor_logger: Optional Actor logger for actor mode27 config: Optional pipeline configuration28 """29 # Validate API keys30 validated_keys = validate_required_keys(31 apify_token=apify_token,32 claude_api_key=claude_api_key33 )34 35 self.apify_token = validated_keys['apify_token']36 self.claude_api_key = validated_keys['claude_api_key']37 self.config = config or PipelineConfig()38 39 # Initialize components with configuration40 self.multi_scraper = MultiActorScraper(self.apify_token)41 self.actor_scraper = ActorMultiScraper() # For actor-to-actor communication42 self.quality_evaluator = HTMLQualityEvaluator(self.claude_api_key, self.config.claude_model)43 self.script_generator = ScriptGenerator(self.claude_api_key, self.config.claude_model)44 self.script_executor = ScriptExecutor()45 46 # Setup logging - use Actor logger if provided, otherwise standard logging47 self.logger = actor_logger if actor_logger else logging.getLogger(__name__)48 self.is_actor_mode = actor_logger is not None49 50 async def run_complete_pipeline(self, target_url: str, user_goal: str, 51 output_script_path: Optional[str] = None,52 prune_before_evaluation: bool = True,53 test_script: bool = False,54 for_actor: bool = False) -> ScrapingResult:55 """56 Run the complete intelligent scraping pipeline.57 58 Args:59 target_url: The URL to scrape60 user_goal: Natural language description of what to extract61 output_script_path: Path where to save the generated script (None for actor mode)62 prune_before_evaluation: If True, prune HTML before quality evaluation63 test_script: If True, test the generated script before finalizing64 for_actor: If True, generate script for Apify actor format65 66 Returns:67 ScrapingResult containing the outcome and generated artifacts68 """69 self.logger.info(f"PIPELINE: Starting intelligent scraping pipeline for: {target_url}")70 self.logger.info(f"PIPELINE: User goal: {user_goal}")71 self.logger.info(f"PIPELINE: Actor mode: {for_actor}")72 73 try:74 # Step 1: Run multiple actors to scrape the website75 self.logger.info("PIPELINE: Step 1: Running multi-actor scraping...")76 77 # Use actor-aware scraper if running inside an Apify actor78 if for_actor:79 self.logger.info("PIPELINE: Using actor-to-actor communication...")80 scraping_results = await self.actor_scraper.scrape_with_multiple_actors(target_url)81 else:82 self.logger.info("PIPELINE: Using client-based scraping...")83 # Use configured actors instead of hardcoded ones84 enabled_actors = self.config.get_enabled_actors()85 if enabled_actors:86 scraping_results = self.multi_scraper.scrape_with_multiple_actors(target_url, enabled_actors)87 else:88 scraping_results = self.multi_scraper.scrape_with_multiple_actors(target_url)89 90 if not any(content for content in scraping_results.values() if content):91 return ScrapingResult(92 success=False,93 error_message="All scraping actors failed to retrieve content"94 )95 96 # Step 2: Evaluate quality of each result97 self.logger.info("PIPELINE: Step 2: Evaluating HTML quality for each actor...")98 quality_scores, best_actor, best_html = self._evaluate_html_quality(99 scraping_results, 
user_goal, prune_before_evaluation100 )101 102 if not best_html:103 return ScrapingResult(104 success=False,105 error_message="No actor produced quality HTML content",106 quality_scores=quality_scores107 )108 109 self.logger.info(f"PIPELINE: Best actor selected: {best_actor} with score {quality_scores[best_actor]}/10")110 111 # Step 3: Prune the best HTML to reduce token count112 self.logger.info("PIPELINE: Step 3: Pruning HTML content...")113 114 # Use configuration for pruning settings115 if self.config.html_pruning.enabled:116 pruned_html = prune_html(117 best_html, 118 max_list_items=self.config.html_pruning.max_list_items, 119 max_text_length=self.config.html_pruning.max_text_length,120 prune_percentage=self.config.html_pruning.prune_percentage121 )122 else:123 pruned_html = best_html124 125 original_length = len(best_html)126 pruned_length = len(pruned_html)127 reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0128 129 self.logger.info(f"PIPELINE: HTML pruned: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")130 131 # Step 4: Generate Python scraping script132 self.logger.info("PIPELINE: Step 4: Generating Python scraping script...")133 generated_script = self.script_generator.generate_scraping_script(134 target_url, best_actor, pruned_html, user_goal, for_actor135 )136 137 if not generated_script:138 return ScrapingResult(139 success=False,140 error_message="Failed to generate scraping script",141 best_actor=best_actor,142 quality_scores=quality_scores143 )144 145 # Step 5: Test the script if requested146 extracted_data = None147 if test_script:148 self.logger.info("PIPELINE: Step 5: Testing generated script...")149 test_result = self.script_executor.test_script(generated_script, best_html)150 151 if test_result["success"]:152 self.logger.info(f"PIPELINE: ✅ Script test passed! 
Extracted {test_result.get('item_count', 0)} items")153 extracted_data = test_result["data"]154 else:155 self.logger.warning(f"PIPELINE: ⚠️ Script test failed: {test_result['error']}")156 # Continue anyway, but log the issue157 158 # Step 6: Save the generated script (only if not actor mode)159 if output_script_path and not for_actor:160 self.logger.info(f"PIPELINE: Step 6: Saving generated script to {output_script_path}")161 with open(output_script_path, 'w', encoding='utf-8') as f:162 f.write(generated_script)163 164 self.logger.info("PIPELINE: ✅ Pipeline completed successfully!")165 166 return ScrapingResult(167 success=True,168 generated_script=generated_script,169 best_actor=best_actor,170 quality_scores=quality_scores,171 extracted_data=extracted_data172 )173 174 except Exception as e:175 self.logger.error(f"PIPELINE: Pipeline failed with error: {str(e)}")176 return ScrapingResult(177 success=False,178 error_message=f"Pipeline error: {str(e)}"179 )180 181 def _evaluate_html_quality(self, scraping_results: dict, user_goal: str, 182 prune_before_evaluation: bool) -> tuple[dict, str, str]:183 """Evaluate HTML quality for each scraping result."""184 quality_scores = {}185 best_actor = None186 best_html = None187 best_score = 0188 189 for actor_name, html_content in scraping_results.items():190 if html_content:191 self.logger.info(f"PIPELINE: Evaluating {actor_name}...")192 193 # Optionally prune HTML before evaluation194 evaluation_html = html_content195 if prune_before_evaluation:196 original_length = len(html_content)197 # Use more aggressive pruning for evaluation198 evaluation_html = prune_html(199 html_content, 200 max_list_items=3, 201 max_text_length=100,202 prune_percentage=0.5 # More aggressive for evaluation203 )204 pruned_length = len(evaluation_html)205 reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0206 self.logger.info(f"PIPELINE: {actor_name} HTML pruned for evaluation: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")207 208 evaluation = self.quality_evaluator.evaluate_html_quality(user_goal, evaluation_html)209 210 if evaluation:211 quality_scores[actor_name] = evaluation.score212 self.logger.info(f"PIPELINE: {actor_name} quality score: {evaluation.score}/10 - {evaluation.reasoning}")213 214 if evaluation.score > best_score:215 best_score = evaluation.score216 best_actor = actor_name217 best_html = html_content # Keep original HTML, not pruned version218 else:219 quality_scores[actor_name] = 0220 self.logger.warning(f"PIPELINE: Failed to evaluate {actor_name}")221 else:222 quality_scores[actor_name] = 0223 self.logger.warning(f"PIPELINE: {actor_name} returned no content")224 225 return quality_scores, best_actor, best_html226
227
228async def run_intelligent_scraper(target_url: str, user_goal: str, 229 apify_token: Optional[str] = None,230 claude_api_key: Optional[str] = None,231 output_path: Optional[str] = "generated_scraper.py",232 prune_before_evaluation: bool = True,233 test_script: bool = False,234 for_actor: bool = False,235 actor_logger=None,236 config: Optional[PipelineConfig] = None) -> ScrapingResult:237 """238 Convenience function to run the complete intelligent scraping pipeline.239 240 Args:241 target_url: URL to scrape242 user_goal: Natural language description of extraction goal243 apify_token: Apify API token (uses APIFY_TOKEN env var if not provided)244 claude_api_key: Claude API key (uses CLAUDE_API_KEY env var if not provided)245 output_path: Path to save the generated script (None for actor mode)246 prune_before_evaluation: If True, prune HTML before quality evaluation247 test_script: If True, test the generated script before finalizing248 for_actor: If True, generate script for Apify actor format249 actor_logger: Optional Actor logger for actor mode250 config: Optional pipeline configuration251 252 Returns:253 ScrapingResult with the outcome254 """255 # Get tokens from environment if not provided256 if not apify_token:257 apify_token = get_api_key("APIFY_TOKEN")258 if not claude_api_key:259 claude_api_key = get_api_key("CLAUDE_API_KEY")260 261 if not apify_token:262 return ScrapingResult(263 success=False,264 error_message="APIFY_TOKEN not provided and not found in environment variables"265 )266 267 if not claude_api_key:268 return ScrapingResult(269 success=False,270 error_message="CLAUDE_API_KEY not provided and not found in environment variables"271 )272 273 # Create and run pipeline274 pipeline = IntelligentScraperPipeline(apify_token, claude_api_key, actor_logger, config)275 return await pipeline.run_complete_pipeline(276 target_url, user_goal, output_path, prune_before_evaluation, test_script, for_actor277 )
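For local runs, this convenience wrapper is the intended entry point; it falls back to the `APIFY_TOKEN` and `CLAUDE_API_KEY` environment variables when tokens are not passed in. A minimal sketch, assuming both variables are set and reusing the first goal from the use-case file above:

```python
import asyncio

from llmscraper import run_intelligent_scraper

async def demo():
    # Tokens are read from APIFY_TOKEN / CLAUDE_API_KEY if not passed explicitly.
    result = await run_intelligent_scraper(
        target_url="https://books.toscrape.com/",
        user_goal="Get me a list of all the books on the first page. "
                  "For each book, I want its title, price, star rating, and whether it is in stock.",
        output_path="generated_scraper.py",
        test_script=True,
    )
    if result.success:
        print(f"Best actor: {result.best_actor}, quality scores: {result.quality_scores}")
    else:
        print(f"Pipeline failed: {result.error_message}")

asyncio.run(demo())
```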
generated_scraper
1"""2Evaluation module for ScraperCodeGenerator.3"""4
5from .html_quality_evaluator import HTMLQualityEvaluator6
7__all__ = ["HTMLQualityEvaluator"]
1"""2HTML quality evaluation using Claude AI.3"""4
5import json6import logging7import re8from typing import Optional9
10import anthropic11
12from ..models import EvaluationResult, PreEvaluationResult, ClaudeModel13
14
15class HTMLQualityEvaluator:16 """Evaluates HTML quality for web scraping using Claude AI."""17 18 def __init__(self, claude_api_key: str, claude_model: ClaudeModel = ClaudeModel.CLAUDE_3_5_SONNET):19 """Initialize with Claude API key and model."""20 if not claude_api_key or not claude_api_key.strip():21 raise ValueError("Claude API key cannot be empty")22 23 self.client = anthropic.Anthropic(api_key=claude_api_key)24 self.claude_model = claude_model25 self.logger = logging.getLogger(__name__)26 27 def evaluate_html_quality(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:28 """29 Evaluate HTML quality for data extraction.30 31 Args:32 user_goal: User's extraction goal33 html_content: HTML content to evaluate34 35 Returns:36 EvaluationResult or None if evaluation fails37 """38 try:39 # Pre-evaluation checks40 pre_eval = self._pre_evaluate_html(html_content)41 if not pre_eval.should_continue_to_claude:42 if pre_eval.score is not None:43 return EvaluationResult(score=pre_eval.score, reasoning=pre_eval.reasoning)44 return None45 46 # Claude evaluation47 return self._evaluate_with_claude(user_goal, html_content)48 49 except Exception as e:50 self.logger.error(f"Error evaluating HTML quality: {str(e)}")51 return None52 53 def _pre_evaluate_html(self, html_content: str) -> PreEvaluationResult:54 """Perform basic HTML validation checks."""55 if not html_content or not html_content.strip():56 return PreEvaluationResult(57 is_valid_html=False,58 score=1,59 reasoning="Empty or whitespace-only HTML content",60 should_continue_to_claude=False61 )62 63 # Check for common failure indicators64 content_lower = html_content.lower()65 66 # Bot detection/blocking indicators67 blocking_indicators = [68 'please verify you are a human',69 'access denied',70 'blocked',71 'captcha',72 'cloudflare',73 'ddos protection',74 'security check',75 'bot detected'76 ]77 78 for indicator in blocking_indicators:79 if indicator in content_lower:80 return PreEvaluationResult(81 is_valid_html=False,82 score=1,83 reasoning=f"HTML appears to be blocked/bot-detected (found: '{indicator}')",84 should_continue_to_claude=False85 )86 87 # Check for minimal HTML structure88 if not re.search(r'<html|<body|<div|<p|<span', content_lower):89 return PreEvaluationResult(90 is_valid_html=False,91 score=2,92 reasoning="HTML lacks basic structural elements",93 should_continue_to_claude=False94 )95 96 return PreEvaluationResult(97 is_valid_html=True,98 should_continue_to_claude=True99 )100 101 def _evaluate_with_claude(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:102 """Evaluate HTML using Claude AI."""103 try:104 prompt = self._create_evaluation_prompt(user_goal, html_content)105 106 response = self.client.messages.create(107 model=self.claude_model.value,108 max_tokens=500,109 messages=[{"role": "user", "content": prompt}]110 )111 112 content = response.content[0].text113 return self._parse_evaluation_response(content)114 115 except Exception as e:116 self.logger.error(f"Error in Claude evaluation: {str(e)}")117 return None118 119 def _create_evaluation_prompt(self, user_goal: str, html_content: str) -> str:120 """Create the evaluation prompt for Claude."""121 return f"""You are an expert web scraper evaluator. Analyze the provided HTML and determine how suitable it is for extracting the requested data.122
123USER EXTRACTION GOAL:124{user_goal}125
126HTML CONTENT TO EVALUATE:127{html_content}128
129Evaluate the HTML on a scale of 1-10 based on:1301. Presence of the target data elements1312. HTML structure quality and accessibility1323. Whether the page loaded correctly (not blocked, error page, etc.)1334. How easy it would be to extract the requested data134
135Return your evaluation in this EXACT JSON format:136{{137 "score": [1-10 integer],138 "reasoning": "[brief explanation of the score]"139}}140
141Only return the JSON, no other text.142"""143 144 def _parse_evaluation_response(self, response: str) -> Optional[EvaluationResult]:145 """Parse Claude's evaluation response."""146 try:147 # Extract JSON from response148 json_match = re.search(r'\{.*\}', response, re.DOTALL)149 if not json_match:150 raise ValueError("No JSON found in response")151 152 data = json.loads(json_match.group())153 154 score = data.get('score')155 reasoning = data.get('reasoning', '')156 157 if not isinstance(score, int) or score < 1 or score > 10:158 raise ValueError(f"Invalid score: {score}")159 160 return EvaluationResult(score=score, reasoning=reasoning)161 162 except Exception as e:163 self.logger.error(f"Error parsing evaluation response: {str(e)}")164 return None
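The evaluator can also be exercised on its own; the pre-evaluation step scores obviously unusable pages (empty, blocked, or structureless HTML) without ever calling Claude. A small sketch, with a placeholder API key:

```python
from llmscraper.evaluation import HTMLQualityEvaluator
from llmscraper.models import ClaudeModel

evaluator = HTMLQualityEvaluator("sk-ant-...", ClaudeModel.CLAUDE_3_5_SONNET)  # placeholder key

# "Access denied" trips the blocking-indicator check, so pre-evaluation returns
# score 1 on its own; no Claude request is made.
result = evaluator.evaluate_html_quality(
    "List all articles on the homepage",
    "<html><body>Access denied</body></html>",
)
if result:
    print(result.score, "-", result.reasoning)
```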
1"""2Generation module for ScraperCodeGenerator.3"""4
5from .script_generator import ScriptGenerator6from .script_executor import ScriptExecutor7
8__all__ = ["ScriptGenerator", "ScriptExecutor"]
1"""2Script execution and testing functionality.3"""4
5import subprocess6import tempfile7import os8import json9import logging10from typing import Dict, Any, Optional11import ast12import traceback13
14
15class ScriptExecutor:16 """Executes and tests generated scraping scripts."""17 18 def __init__(self):19 """Initialize the script executor."""20 self.logger = logging.getLogger(__name__)21 22 def test_script(self, script_content: str, html_content: str) -> Dict[str, Any]:23 """24 Test a scraping script against sample HTML content.25 26 Args:27 script_content: The Python script to test28 html_content: Sample HTML to test against29 30 Returns:31 Dict with test results including success, data, and errors32 """33 try:34 # Extract the extract_data function from the script35 extract_function = self._extract_function_from_script(script_content, 'extract_data')36 37 if not extract_function:38 return {39 "success": False,40 "error": "Could not find extract_data function in script",41 "data": None42 }43 44 # Create a safe execution environment45 safe_globals = {46 '__builtins__': {47 'len': len,48 'str': str,49 'int': int,50 'float': float,51 'bool': bool,52 'list': list,53 'dict': dict,54 'range': range,55 'enumerate': enumerate,56 'zip': zip,57 'isinstance': isinstance,58 'hasattr': hasattr,59 'getattr': getattr,60 'print': print,61 '__import__': __import__,62 }63 }64 65 # Import necessary modules into the environment66 exec("from bs4 import BeautifulSoup", safe_globals)67 exec("import re", safe_globals)68 exec("import json", safe_globals)69 70 # Execute the function definition71 exec(extract_function, safe_globals)72 73 # Call the function with the HTML content74 extracted_data = safe_globals['extract_data'](html_content)75 76 return {77 "success": True,78 "data": extracted_data,79 "error": None,80 "data_type": type(extracted_data).__name__,81 "item_count": len(extracted_data) if isinstance(extracted_data, (list, dict)) else 182 }83 84 except Exception as e:85 self.logger.error(f"Error testing script: {str(e)}")86 return {87 "success": False,88 "error": str(e),89 "data": None,90 "traceback": traceback.format_exc()91 }92 93 def _extract_function_from_script(self, script_content: str, function_name: str) -> Optional[str]:94 """Extract a specific function from a script."""95 try:96 # Parse the script into an AST97 tree = ast.parse(script_content)98 99 # Find the function definition100 for node in ast.walk(tree):101 if isinstance(node, ast.FunctionDef) and node.name == function_name:102 # Get the source code of the function103 lines = script_content.split('\n')104 start_line = node.lineno - 1105 106 # Find the end of the function107 end_line = start_line + 1108 while end_line < len(lines):109 line = lines[end_line]110 # Check if this line starts a new function or class111 if line.strip() and not line.startswith(' ') and not line.startswith('\t'):112 break113 end_line += 1114 115 return '\n'.join(lines[start_line:end_line])116 117 return None118 119 except Exception as e:120 self.logger.error(f"Error extracting function: {str(e)}")121 return None122 123 def validate_script_syntax(self, script_content: str) -> Dict[str, Any]:124 """125 Validate the syntax of a Python script.126 127 Args:128 script_content: The Python script to validate129 130 Returns:131 Dict with validation results132 """133 try:134 # Try to parse the script135 ast.parse(script_content)136 137 return {138 "valid": True,139 "error": None140 }141 142 except SyntaxError as e:143 return {144 "valid": False,145 "error": f"Syntax error: {str(e)}",146 "line": e.lineno,147 "offset": e.offset148 }149 except Exception as e:150 return {151 "valid": False,152 "error": f"Parse error: {str(e)}"153 }154 155 def run_script_in_sandbox(self, 
script_content: str, timeout: int = 60) -> Dict[str, Any]:156 """157 Run a complete script in a sandboxed environment.158 159 Args:160 script_content: The complete Python script161 timeout: Maximum execution time in seconds162 163 Returns:164 Dict with execution results165 """166 try:167 # Create a temporary file168 with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as temp_file:169 temp_file.write(script_content)170 temp_file_path = temp_file.name171 172 try:173 # Run the script174 result = subprocess.run(175 ['python', temp_file_path],176 capture_output=True,177 text=True,178 timeout=timeout,179 cwd=os.path.dirname(temp_file_path)180 )181 182 return {183 "success": result.returncode == 0,184 "stdout": result.stdout,185 "stderr": result.stderr,186 "return_code": result.returncode187 }188 189 finally:190 # Clean up the temporary file191 os.unlink(temp_file_path)192 193 except subprocess.TimeoutExpired:194 return {195 "success": False,196 "stdout": "",197 "stderr": f"Script execution timed out after {timeout} seconds",198 "return_code": -1199 }200 except Exception as e:201 return {202 "success": False,203 "stdout": "",204 "stderr": str(e),205 "return_code": -1206 }
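Because every generated script is expected to expose an `extract_data(html_content)` function, the executor can verify that contract offline against captured HTML. A self-contained sketch with a hand-written stand-in for a generated script:

```python
from llmscraper.generation import ScriptExecutor

# Stand-in for a generated script; only the extract_data contract matters here.
script = '''
def extract_data(html_content):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    return [{"title": h.get_text(strip=True)} for h in soup.find_all("h1")]
'''

result = ScriptExecutor().test_script(script, "<html><body><h1>Hello</h1></body></html>")
print(result["success"], result.get("item_count"), result["data"])
# Expected: True 1 [{'title': 'Hello'}]
```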
1"""2Code generation functionality for creating scraping scripts.3"""4
5import logging6from typing import Optional7import re8
9import anthropic10from ..models import ClaudeModel11
12
13class ScriptGenerator:14 """Generates Python scraping scripts using Claude AI."""15 16 def __init__(self, claude_api_key: str, claude_model: ClaudeModel = ClaudeModel.CLAUDE_3_5_SONNET):17 """Initialize with Claude API key and model."""18 if not claude_api_key or not claude_api_key.strip():19 raise ValueError("Claude API key cannot be empty")20 21 self.client = anthropic.Anthropic(api_key=claude_api_key)22 self.claude_model = claude_model23 self.logger = logging.getLogger(__name__)24 25 def generate_scraping_script(self, target_url: str, best_actor: str, 26 pruned_html: str, user_goal: str, 27 for_actor: bool = False) -> Optional[str]:28 """29 Generate a complete Python scraping script.30 31 Args:32 target_url: The target URL to scrape33 best_actor: Name of the best performing actor34 pruned_html: Sample HTML content for reference35 user_goal: User's extraction goal36 for_actor: If True, generate for Apify actor (key-value store output)37 38 Returns:39 Complete Python script as string, or None if generation fails40 """41 try:42 # Generate the HTML parsing code from Claude43 parsing_code = self._generate_html_parsing_code(pruned_html, user_goal)44 45 if not parsing_code:46 self.logger.error("Failed to generate HTML parsing code")47 return None48 49 # Create the complete script50 if for_actor:51 return self._create_actor_script(target_url, best_actor, parsing_code, user_goal)52 else:53 return self._create_standalone_script(target_url, best_actor, parsing_code, user_goal)54 55 except Exception as e:56 self.logger.error(f"Error generating script: {str(e)}")57 return None58 59 def _generate_html_parsing_code(self, pruned_html: str, user_goal: str) -> Optional[str]:60 """Generate HTML parsing/extraction code using Claude."""61 try:62 prompt = f"""Generate Python code that parses HTML content and extracts data based on the user's goal. You should generate ONLY the extraction logic, not a complete script.63
64## USER GOAL:65{user_goal}66
67## SAMPLE HTML (for reference):68{pruned_html}69
70## REQUIREMENTS:711. Create a function called `extract_data(html_content)` that takes HTML string as input722. Use BeautifulSoup to parse the HTML733. Extract the data according to the user's goal using CSS selectors, attributes, text content, etc.744. Return the extracted data as a Python dictionary or list of dictionaries755. Handle missing or malformed data gracefully766. Include appropriate error handling77
78## EXAMPLE OUTPUT FORMAT:79```python80def extract_data(html_content):81 from bs4 import BeautifulSoup82 83 soup = BeautifulSoup(html_content, 'html.parser')84 results = []85 86 # Your extraction logic here87 # Use soup.find(), soup.find_all(), CSS selectors, etc.88 89 return results90```91
92Generate ONLY the `extract_data` function and any helper functions needed. Do not include imports outside the function, full scripts, or other boilerplate code."""93
94 self.logger.info("Requesting HTML parsing code generation from Claude...")95 96 response = self.client.messages.create(97 model=self.claude_model.value,98 max_tokens=2000,99 messages=[{"role": "user", "content": prompt}]100 )101 102 parsing_code = response.content[0].text103 104 # Extract Python code from response if wrapped in code blocks105 if "```python" in parsing_code:106 code_match = re.search(r'```python\n(.*?)\n```', parsing_code, re.DOTALL)107 if code_match:108 parsing_code = code_match.group(1)109 110 return parsing_code111 112 except Exception as e:113 self.logger.error(f"Error generating HTML parsing code: {str(e)}")114 return None115 116 def _create_standalone_script(self, target_url: str, best_actor: str, 117 parsing_code: str, user_goal: str) -> str:118 """Create a standalone Python script."""119 return f'''#!/usr/bin/env python3120"""121Generated Web Scraper122Target: {target_url}123Goal: {user_goal}124Best Actor: {best_actor}125Generated by: ScraperCodeGenerator126
127This script is completely standalone and does not require the original ScraperCodeGenerator project.128"""129
130import os131import json132import logging133from typing import Dict, Any, List, Optional134
135# Check and import required libraries136try:137 import requests138except ImportError:139 raise ImportError("requests not installed. Please install using: pip install requests")140
141try:142 from bs4 import BeautifulSoup143except ImportError:144 raise ImportError("beautifulsoup4 not installed. Please install using: pip install beautifulsoup4")145
146try:147 from apify_client import ApifyClient148except ImportError:149 raise ImportError("apify-client not installed. Please install using: pip install apify-client")150
151
152{parsing_code}153
154
155def run_actor_scraping(target_url: str, apify_token: str) -> Optional[str]:156 """157 Run the best performing actor to get HTML content.158 159 Args:160 target_url: URL to scrape161 apify_token: Apify API token162 163 Returns:164 HTML content or None if failed165 """166 client = ApifyClient(apify_token)167 168 # Actor configuration for {best_actor}169 actor_input = {{170 "startUrls": [{{"url": target_url}}],171 "maxRequestRetries": 3,172 "requestTimeoutSecs": 30,173 "maxPagesPerCrawl": 1,174 }}175 176 # Add actor-specific configuration177 if "{best_actor}" == "cheerio-scraper":178 actor_input.update(\{{179 "pageFunction": \'\'\'180 async function pageFunction(context) {{181 const {{ request, log, $ }} = context;182 try {{183 const title = $('title').text() || '';184 const html = $('html').html() || '';185 return {{186 url: request.url,187 title: title,188 html: html189 }};190 }} catch (error) {{191 log.error('Error in pageFunction:', error);192 return {{193 url: request.url,194 title: '',195 html: ''196 }};197 }}198 }}199 \'\'\',200 "proxyConfiguration": {{"useApifyProxy": True}}201 }})202 actor_id = "apify/cheerio-scraper"203 elif "{best_actor}" == "web-scraper":204 actor_input.update({{205 "pageFunction": \'\'\'206 async function pageFunction(context) {{207 const {{ request, log, page }} = context;208 try {{209 const title = await page.title();210 const html = await page.content();211 return {{212 url: request.url,213 title: title,214 html: html215 }};216 }} catch (error) {{217 log.error('Error in pageFunction:', error);218 return {{219 url: request.url,220 title: '',221 html: ''222 }};223 }}224 }}225 \'\'\',226 "proxyConfiguration": {{"useApifyProxy": True}}227 }})228 actor_id = "apify/web-scraper"229 elif "{best_actor}" == "website-content-crawler":230 actor_input = {{231 "startUrls": [{{"url": target_url}}],232 "maxCrawlPages": 1,233 "crawler": "playwright",234 "proxyConfiguration": {{"useApifyProxy": True}}235 }}236 actor_id = "apify/website-content-crawler"237 else:238 # Fallback to simple requests if actor not recognized239 logging.warning(f"Unknown actor '{best_actor}', falling back to requests")240 try:241 response = requests.get(target_url, timeout=30)242 response.raise_for_status()243 return response.text244 except Exception as e:245 logging.error(f"Failed to fetch with requests: {{e}}")246 return None247 248 try:249 # Run the actor250 logging.info(f"Running {{actor_id}} actor...")251 run = client.actor(actor_id).call(run_input=actor_input)252 253 # Get the dataset items254 dataset_client = client.dataset(run["defaultDatasetId"])255 items = list(dataset_client.iterate_items())256 257 if not items:258 logging.warning("No items returned from actor")259 return None260 261 # Extract HTML content262 item = items[0]263 html_content = item.get('html') or item.get('text') or item.get('markdown', '')264 265 if not html_content:266 logging.warning("No HTML content found in actor result")267 return None268 269 return html_content270 271 except Exception as e:272 logging.error(f"Error running actor: {{e}}")273 return None274
275
276def main():277 """Main function to run the scraper."""278 # Setup logging279 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')280 logger = logging.getLogger(__name__)281 282 # Configuration283 target_url = "{target_url}"284 apify_token = os.getenv("APIFY_TOKEN")285 286 if not apify_token:287 logger.error("APIFY_TOKEN environment variable not set")288 logger.info("Please set your Apify API token: export APIFY_TOKEN='your_token_here'")289 logger.info("Get your token at: https://console.apify.com/")290 return291 292 try:293 logger.info(f"🚀 Starting scraper for: {{target_url}}")294 logger.info(f"📝 Goal: {user_goal}")295 logger.info(f"🏆 Using best actor: {best_actor}")296 297 # Get HTML content using the best performing actor298 html_content = run_actor_scraping(target_url, apify_token)299 300 if not html_content:301 logger.error("Failed to get HTML content")302 return303 304 logger.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")305 306 # Extract data using the generated parsing code307 logger.info("🔍 Extracting data from HTML...")308 extracted_data = extract_data(html_content)309 310 if not extracted_data:311 logger.warning("No data was extracted from the HTML")312 return313 314 # Prepare final results315 results = {{316 "target_url": target_url,317 "extraction_goal": "{user_goal}",318 "actor_used": "{best_actor}",319 "data": extracted_data,320 "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1321 }}322 323 # Output results324 print("\\n" + "="*60)325 print("📊 EXTRACTION RESULTS")326 print("="*60)327 print(json.dumps(results, indent=2, ensure_ascii=False))328 329 # Save to file330 output_file = "extracted_data.json"331 with open(output_file, 'w', encoding='utf-8') as f:332 json.dump(results, f, indent=2, ensure_ascii=False)333 334 logger.info(f"💾 Results saved to {{output_file}}")335 logger.info(f"🎉 Successfully extracted {{results['total_items']}} items!")336 337 except Exception as e:338 logger.error(f"❌ Scraping failed: {{e}}")339 import traceback340 traceback.print_exc()341
342
343if __name__ == "__main__":344 main()345'''346 347 def _create_actor_script(self, target_url: str, best_actor: str, 348 parsing_code: str, user_goal: str) -> str:349 """Create a script for Apify actor."""350 return f'''"""351Apify Actor Script352Target: {target_url}353Goal: {user_goal}354Best Actor: {best_actor}355Generated by: ScraperCodeGenerator356
357This script is completely standalone and does not require the original ScraperCodeGenerator project.358"""359
360import json361from typing import Optional362
363# Check and import required libraries364try:365 from apify import Actor366except ImportError:367 raise ImportError("apify not installed. Please install using: pip install apify")368
369try:370 from bs4 import BeautifulSoup371except ImportError:372 raise ImportError("beautifulsoup4 not installed. Please install using: pip install beautifulsoup4")373
374try:375 from apify_client import ApifyClient376except ImportError:377 raise ImportError("apify-client not installed. Please install using: pip install apify-client")378
379
380{parsing_code}381
382
383async def run_actor_scraping(target_url: str, apify_token: str) -> Optional[str]:384 """385 Run the best performing actor to get HTML content.386 387 Args:388 target_url: URL to scrape389 apify_token: Apify API token390 391 Returns:392 HTML content or None if failed393 """394 client = ApifyClient(apify_token)395 396 # Actor configuration for {best_actor}397 actor_input = {{398 "startUrls": [{{"url": target_url}}],399 "maxRequestRetries": 3,400 "requestTimeoutSecs": 30,401 "maxPagesPerCrawl": 1,402 }}403 404 # Add actor-specific configuration405 if "{best_actor}" == "cheerio-scraper":406 actor_input.update({{407 "pageFunction": \'\'\'408 async function pageFunction(context) {{409 const {{ request, log, $ }} = context;410 try {{411 const title = $('title').text() || '';412 const html = $('html').html() || '';413 return {{414 url: request.url,415 title: title,416 html: html417 }};418 }} catch (error) {{419 log.error('Error in pageFunction:', error);420 return {{421 url: request.url,422 title: '',423 html: ''424 }};425 }}426 }}427 \'\'\',428 "proxyConfiguration": {{"useApifyProxy": True}}429 }})430 actor_id = "apify/cheerio-scraper"431 elif "{best_actor}" == "web-scraper":432 actor_input.update({{433 "pageFunction": \'\'\'434 async function pageFunction(context) {{435 const {{ request, log, page }} = context;436 try {{437 const title = await page.title();438 const html = await page.content();439 return {{440 url: request.url,441 title: title,442 html: html443 }};444 }} catch (error) {{445 log.error('Error in pageFunction:', error);446 return {{447 url: request.url,448 title: '',449 html: ''450 }};451 }}452 }}453 \'\'\',454 "proxyConfiguration": {{"useApifyProxy": True}}455 }})456 actor_id = "apify/web-scraper"457 elif "{best_actor}" == "website-content-crawler":458 actor_input = {{459 "startUrls": [{{"url": target_url}}],460 "maxCrawlPages": 1,461 "crawler": "playwright",462 "proxyConfiguration": {{"useApifyProxy": True}}463 }}464 actor_id = "apify/website-content-crawler"465 else:466 Actor.log.error(f"Unknown actor: {best_actor}")467 return None468 469 try:470 # Run the actor471 Actor.log.info(f"Running {{actor_id}} actor...")472 run = client.actor(actor_id).call(run_input=actor_input)473 474 # Get the dataset items475 dataset_client = client.dataset(run["defaultDatasetId"])476 items = list(dataset_client.iterate_items())477 478 if not items:479 Actor.log.warning("No items returned from actor")480 return None481 482 # Extract HTML content483 item = items[0]484 html_content = item.get('html') or item.get('text') or item.get('markdown', '')485 486 if not html_content:487 Actor.log.warning("No HTML content found in actor result")488 return None489 490 return html_content491 492 except Exception as e:493 Actor.log.error(f"Error running actor: {{e}}")494 return None495
496
497async def main():498 """Main actor function."""499 async with Actor:500 # Get input501 actor_input = await Actor.get_input() or {{}}502 target_url = actor_input.get('targetUrl', '{target_url}')503 user_goal = actor_input.get('userGoal', '{user_goal}')504 apify_token = actor_input.get('apifyToken') or Actor.config.token505 506 Actor.log.info(f"🚀 Starting scraper for: {{target_url}}")507 Actor.log.info(f"📝 Goal: {{user_goal}}")508 Actor.log.info(f"🏆 Using best actor: {best_actor}")509 510 try:511 # Get HTML content using the best performing actor512 html_content = await run_actor_scraping(target_url, apify_token)513 514 if not html_content:515 await Actor.fail(f"Failed to get HTML content from {best_actor} actor")516 return517 518 Actor.log.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")519 520 # Extract data using the generated parsing code521 Actor.log.info("🔍 Extracting data from HTML...")522 extracted_data = extract_data(html_content)523 524 if not extracted_data:525 Actor.log.warning("No data was extracted from the HTML")526 extracted_data = []527 528 # Prepare final results529 results = {{530 "target_url": target_url,531 "extraction_goal": user_goal,532 "actor_used": "{best_actor}",533 "data": extracted_data,534 "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1535 }}536 537 # Save to key-value store538 await Actor.set_value('OUTPUT', results)539 540 Actor.log.info(f"🎉 Successfully extracted {{results['total_items']}} items!")541 Actor.log.info("💾 Results saved to key-value store as 'OUTPUT'")542 543 except Exception as e:544 Actor.log.error(f"❌ Scraping failed: {{e}}")545 await Actor.fail(str(e))546
547
548if __name__ == "__main__":549 import asyncio550 asyncio.run(main())551'''
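Script generation itself only needs the target URL, the name of the winning actor, a pruned HTML sample, and the user goal; `for_actor` switches between the standalone and Apify Actor templates shown above. A hedged sketch (the API key and HTML sample are placeholders):

```python
from llmscraper.generation import ScriptGenerator

generator = ScriptGenerator("sk-ant-...")  # placeholder Anthropic API key

script = generator.generate_scraping_script(
    target_url="https://quotes.toscrape.com/",
    best_actor="cheerio-scraper",
    pruned_html="<html>...pruned sample from the best actor run...</html>",
    user_goal="Get the quote text, author name, and tags for every quote on the page.",
    for_actor=False,  # True would emit the Apify Actor variant instead
)

if script:
    with open("generated_scraper.py", "w", encoding="utf-8") as f:
        f.write(script)
```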
1"""2Scraping module for ScraperCodeGenerator.3"""4
5from .apify_runner import ApifyRunner6from .multi_actor_scraper import MultiActorScraper7from .actor_multi_scraper import ActorMultiScraper8
9__all__ = ["ApifyRunner", "MultiActorScraper", "ActorMultiScraper"]
1"""2Apify Actor-specific scraping module for running other actors from within an Apify actor.3"""4
5import asyncio6import logging7from typing import Dict, Any, List, Optional, Tuple8from apify import Actor9
10
11class ActorMultiScraper:12 """Handles running multiple Apify actors from within an Apify actor context."""13 14 def __init__(self):15 """Initialize the actor scraper."""16 self.logger = logging.getLogger(__name__)17 18 async def scrape_with_multiple_actors(self, target_url: str) -> Dict[str, Optional[str]]:19 """20 Run multiple actors in parallel to scrape the target URL and return HTML content.21 22 Args:23 target_url: The URL to scrape24 25 Returns:26 Dictionary mapping actor names to their HTML content (or None if failed)27 """28 Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors called for {target_url}")29 30 # Define actor configurations31 actor_configs = self._get_actor_configs(target_url)32 Actor.log.info(f"DEBUG: Will run {len(actor_configs)} actors in parallel: {list(actor_configs.keys())}")33 34 # Create tasks for parallel execution35 tasks = []36 actor_names = []37 38 for actor_name, config in actor_configs.items():39 Actor.log.info(f"DEBUG: Creating task for {actor_name}...")40 task = self._run_single_actor_with_name(actor_name, config)41 tasks.append(task)42 actor_names.append(actor_name)43 44 # Run all actors in parallel45 Actor.log.info("DEBUG: Starting all actors in parallel...")46 results_list = await asyncio.gather(*tasks, return_exceptions=True)47 48 # Process results49 results = {}50 for i, (actor_name, result) in enumerate(zip(actor_names, results_list)):51 if isinstance(result, Exception):52 Actor.log.error(f"DEBUG: {actor_name} failed: {str(result)}")53 results[actor_name] = None54 else:55 results[actor_name] = result56 if result:57 Actor.log.info(f"DEBUG: {actor_name} succeeded: {len(result):,} characters")58 else:59 Actor.log.warning(f"DEBUG: {actor_name} returned no content")60 61 Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors completed. 
Results: {list(results.keys())}")62 Actor.log.info(f"DEBUG: Results with content: {[name for name, content in results.items() if content]}")63 return results64 65 async def _run_single_actor_with_name(self, actor_name: str, config: Dict[str, Any]) -> Optional[str]:66 """67 Run a single actor and return its HTML content.68 69 Args:70 actor_name: Name of the actor for logging71 config: Actor configuration72 73 Returns:74 HTML content or None if failed75 """76 try:77 Actor.log.info(f"DEBUG: Starting {actor_name}...")78 return await self._run_single_actor(actor_name, config)79 except Exception as e:80 Actor.log.error(f"DEBUG: {actor_name} failed: {str(e)}")81 return None82 83 def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:84 """Get configurations for all actors to run."""85 return {86 "cheerio-scraper": {87 "actor_id": "apify/cheerio-scraper",88 "input": {89 "startUrls": [{"url": target_url}],90 "maxRequestRetries": 3,91 "requestTimeoutSecs": 30,92 "maxPagesPerCrawl": 1,93 "pageFunction": """94 async function pageFunction(context) {95 const { request, log, $ } = context;96 try {97 const title = $('title').text() || '';98 const html = $('html').html() || '';99 return {100 url: request.url,101 title: title,102 html: html103 };104 } catch (error) {105 log.error('Error in pageFunction:', error);106 return {107 url: request.url,108 title: '',109 html: ''110 };111 }112 }113 """,114 "proxyConfiguration": {"useApifyProxy": True}115 }116 },117 "web-scraper": {118 "actor_id": "apify/web-scraper",119 "input": {120 "startUrls": [{"url": target_url}],121 "maxRequestRetries": 3,122 "requestTimeoutSecs": 30,123 "maxPagesPerCrawl": 1,124 "pageFunction": """125 async function pageFunction(context) {126 const { request, log, page } = context;127 try {128 const title = await page.title();129 const html = await page.content();130 return { url: request.url, title, html };131 } catch (error) {132 log.error('Error in pageFunction:', error);133 return { url: request.url, title: '', html: '' };134 }135 }136 """,137 "proxyConfiguration": {"useApifyProxy": True}138 }139 },140 "website-content-crawler": {141 "actor_id": "apify/website-content-crawler",142 "input": {143 "startUrls": [{"url": target_url}],144 "maxRequestsPerCrawl": 1,145 "maxCrawlDepth": 0,146 "htmlTransformer": "readableText",147 "readableTextCharThreshold": 100,148 "removeCookieWarnings": True,149 "clickElementsCssSelector": "",150 "proxyConfiguration": {"useApifyProxy": True}151 }152 }153 }154 155 async def _run_single_actor(self, actor_name: str, config: Dict[str, Any]) -> Optional[str]:156 """157 Run a single actor and extract HTML content.158 159 Args:160 actor_name: Name of the actor (for logging)161 config: Actor configuration including actor_id and input162 163 Returns:164 HTML content as string, or None if failed165 """166 try:167 actor_id = config["actor_id"]168 actor_input = config["input"]169 170 Actor.log.info(f"DEBUG: Calling actor {actor_id}")171 172 # Call the actor using Apify SDK - use the exact same pattern as working code173 run = await Actor.call(174 actor_id=actor_id,175 run_input=actor_input176 )177 178 if not run:179 Actor.log.error(f"DEBUG: Actor {actor_name} failed to start - run is None")180 return None181 182 Actor.log.info(f"DEBUG: Actor {actor_name} run created with ID: {run.id}")183 Actor.log.info(f"DEBUG: Default dataset ID: {run.default_dataset_id}")184 185 # Use the exact same pattern as your working code186 if run.default_dataset_id:187 try:188 Actor.log.info(f"DEBUG: Getting dataset items for 
{actor_name}...")189 items = (await Actor.apify_client.dataset(run.default_dataset_id).list_items()).items190 191 if items:192 Actor.log.info(f"DEBUG: Found {len(items)} items in dataset for {actor_name}")193 194 for i, item in enumerate(items):195 Actor.log.info(f"DEBUG: Dataset item {i} keys: {list(item.keys()) if isinstance(item, dict) else type(item)}")196 197 # Look for HTML content in the item198 html_content = self._extract_html_from_item(item, actor_name)199 if html_content:200 Actor.log.info(f"DEBUG: Found HTML content in dataset item {i} for {actor_name}: {len(html_content)} characters")201 return html_content202 else:203 Actor.log.info(f"DEBUG: No dataset items found for {actor_name}")204 205 except Exception as e:206 Actor.log.error(f"DEBUG: Dataset retrieval failed for {actor_name}: {e}")207 import traceback208 Actor.log.error(f"DEBUG: Dataset traceback: {traceback.format_exc()}")209 210 # Fallback: Try key-value store (simplified)211 if run.default_key_value_store_id:212 try:213 Actor.log.info(f"DEBUG: Trying key-value store as fallback for {actor_name}...")214 kvs_client = Actor.apify_client.key_value_store(run.default_key_value_store_id)215 216 # Try common keys that might contain HTML217 common_keys = ['OUTPUT', 'RESULTS', 'DATA']218 for key_name in common_keys:219 try:220 record = await kvs_client.get_record(key_name)221 if record:222 Actor.log.info(f"DEBUG: Found record for key {key_name}")223 html_content = self._extract_html_from_record(record, actor_name)224 if html_content:225 Actor.log.info(f"DEBUG: Found HTML content in key {key_name} for {actor_name}")226 return html_content227 except Exception:228 pass # Key doesn't exist, continue229 230 except Exception as e:231 Actor.log.error(f"DEBUG: Key-value store retrieval failed for {actor_name}: {e}")232 233 Actor.log.warning(f"DEBUG: No HTML content found for {actor_name}")234 return None235 236 except Exception as e:237 Actor.log.error(f"DEBUG: Error running {actor_name}: {str(e)}")238 import traceback239 Actor.log.error(f"DEBUG: Full traceback: {traceback.format_exc()}")240 return None241 242 def _extract_html_from_item(self, item: Dict[str, Any], actor_name: str) -> Optional[str]:243 """Extract HTML content from a dataset item."""244 Actor.log.info(f"DEBUG: Extracting HTML from item for {actor_name}, item keys: {list(item.keys()) if isinstance(item, dict) else 'not a dict'}")245 246 # Look for HTML in common fields247 html_fields = ['html', 'content', 'body', 'pageContent', 'text', 'data']248 249 for field in html_fields:250 if field in item and item[field]:251 content = item[field]252 Actor.log.info(f"DEBUG: Found content in field '{field}': {type(content)}, length: {len(content) if isinstance(content, str) else 'N/A'}")253 254 if isinstance(content, str) and len(content) > 100:255 # Check if it looks like HTML256 if '<' in content and '>' in content:257 Actor.log.info(f"DEBUG: Found HTML content in field '{field}' for {actor_name}")258 return content259 elif actor_name == "website-content-crawler":260 # For website-content-crawler, text content is also acceptable261 Actor.log.info(f"DEBUG: Found text content in field '{field}' for {actor_name}")262 html_content = f"<html><body><div>{content}</div></body></html>"263 return html_content264 265 # For website-content-crawler, look for any text-like content266 if actor_name == "website-content-crawler":267 for key, value in item.items():268 if isinstance(value, str) and len(value) > 50:269 Actor.log.info(f"DEBUG: Using text content from field '{key}' for 
website-content-crawler")270 html_content = f"<html><body><div>{value}</div></body></html>"271 return html_content272 273 Actor.log.info(f"DEBUG: No HTML content found in item for {actor_name}")274 return None275 276 def _extract_html_from_record(self, record: Any, actor_name: str) -> Optional[str]:277 """Extract HTML content from a key-value store record."""278 try:279 # The record might be the content directly or wrapped in a dict280 content = record281 282 if hasattr(record, 'value'):283 content = record.value284 elif isinstance(record, dict) and 'value' in record:285 content = record['value']286 287 # If content is a string, check if it's HTML288 if isinstance(content, str):289 if len(content) > 100 and ('<' in content or actor_name == "website-content-crawler"):290 return content291 292 # If content is a dict, look for HTML fields293 elif isinstance(content, dict):294 html_content = self._extract_html_from_item(content, actor_name)295 if html_content:296 return html_content297 298 return None299 300 except Exception as e:301 Actor.log.debug(f"DEBUG: Error extracting HTML from record for {actor_name}: {e}")302 return None
1"""2Apify integration for running actors and retrieving results.3"""4
5import logging6from typing import Optional, Dict, Any, List, Union7
8from apify_client import ApifyClient9
10
11class ApifyRunner:12 """Handles running Apify actors and retrieving results."""13 14 def __init__(self, api_token: str):15 """Initialize with API token."""16 if not api_token or not api_token.strip():17 raise ValueError("API token cannot be empty")18 19 self.client = ApifyClient(api_token)20 self.logger = logging.getLogger(__name__)21 22 def run_actor(self, actor_id: str, actor_input: dict, 23 retrieve_from: str = "auto") -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:24 """25 Run an Apify actor and retrieve results.26 27 Args:28 actor_id: The ID of the Apify actor29 actor_input: Input configuration for the actor30 retrieve_from: "auto", "dataset", "key-value-store", or "both"31 32 Returns:33 Retrieved data or None if failed34 """35 if not actor_id or not actor_id.strip():36 raise ValueError("actor_id cannot be empty")37 38 if retrieve_from not in ["auto", "dataset", "key-value-store", "both"]:39 raise ValueError("Invalid retrieve_from option")40 41 # Determine storage type42 if retrieve_from == "auto":43 retrieve_from = "key-value-store" if "website-content-crawler" in actor_id else "dataset"44 45 try:46 self.logger.info(f"Starting Apify actor: {actor_id}")47 48 # Start the actor run49 run = self.client.actor(actor_id).call(run_input=actor_input)50 51 if not run or run.get('status') != 'SUCCEEDED':52 self.logger.error(f"Actor run failed: {run.get('status') if run else 'No run created'}")53 return None54 55 run_id = run.get('id')56 self.logger.info(f"Actor run {run_id} completed successfully")57 58 # Retrieve results based on type59 if retrieve_from == "dataset":60 return self._get_dataset_items(run_id)61 elif retrieve_from == "key-value-store":62 return self._get_key_value_store_items(run_id)63 elif retrieve_from == "both":64 return {65 "dataset": self._get_dataset_items(run_id),66 "key_value_store": self._get_key_value_store_items(run_id)67 }68 69 except Exception as e:70 self.logger.error(f"Error running actor {actor_id}: {str(e)}")71 return None72 73 def _get_dataset_items(self, run_id: str) -> List[Dict[str, Any]]:74 """Get items from the dataset of a run."""75 try:76 dataset_id = self.client.run(run_id).get().get('defaultDatasetId')77 if not dataset_id:78 self.logger.warning(f"No dataset found for run {run_id}")79 return []80 81 dataset_items = list(self.client.dataset(dataset_id).iterate_items())82 self.logger.info(f"Retrieved {len(dataset_items)} items from dataset")83 return dataset_items84 85 except Exception as e:86 self.logger.error(f"Error retrieving dataset items: {str(e)}")87 return []88 89 def _get_key_value_store_items(self, run_id: str) -> Dict[str, Any]:90 """Get items from the key-value store of a run."""91 try:92 kvs_id = self.client.run(run_id).get().get('defaultKeyValueStoreId')93 if not kvs_id:94 self.logger.warning(f"No key-value store found for run {run_id}")95 return {}96 97 kvs = self.client.key_value_store(kvs_id)98 keys = list(kvs.list_keys())99 100 items = {}101 for key_info in keys:102 # Handle case where key_info might be a string or dict103 if isinstance(key_info, dict):104 key_name = key_info.get('key')105 else:106 key_name = str(key_info)107 108 if key_name:109 try:110 value = kvs.get_record(key_name)111 if value:112 # Handle case where value might be a string or dict113 if isinstance(value, dict):114 items[key_name] = value.get('value', value)115 else:116 items[key_name] = value117 except Exception as e:118 self.logger.warning(f"Failed to retrieve key '{key_name}': {str(e)}")119 120 self.logger.info(f"Retrieved {len(items)} items from 
key-value store")121 return items122 123 except Exception as e:124 self.logger.error(f"Error retrieving key-value store items: {str(e)}")125 return {}126
127
128# Legacy functions for backward compatibility129def run_apify_actor(actor_id: str, actor_input: dict, *, api_token: str) -> Optional[List[Dict[str, Any]]]:130 """Legacy function - use ApifyRunner class instead."""131 runner = ApifyRunner(api_token)132 result = runner.run_actor(actor_id, actor_input, "dataset")133 return result if isinstance(result, list) else None134
135
136def run_apify_actor_with_flexible_retrieval(137 actor_id: str, actor_input: dict, *, api_token: str, retrieve_from: str = "auto"138) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:139 """Legacy function - use ApifyRunner class instead."""140 runner = ApifyRunner(api_token)141 return runner.run_actor(actor_id, actor_input, retrieve_from)
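A minimal usage sketch of ApifyRunner, assuming the class above: the token, actor ID, input, and the import path (the subpackage name is a guess) are placeholders, not values taken from this project.

# Hypothetical usage: run one actor and inspect its dataset output.
from llmscraper.scraping.apify_runner import ApifyRunner  # assumed module path

runner = ApifyRunner(api_token="apify_api_...")            # placeholder token
items = runner.run_actor(
    "apify/cheerio-scraper",                               # example public actor ID
    {"startUrls": [{"url": "https://example.com"}]},
    retrieve_from="dataset",
)
print(f"Retrieved {len(items)} items" if items else "Run failed or returned nothing")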
1"""2Multi-actor scraping functionality.3"""4
5import logging6from typing import Dict, Any7from concurrent.futures import ThreadPoolExecutor, as_completed8
9from .apify_runner import ApifyRunner10from ..models import ActorConfig11
12
13class MultiActorScraper:14 """Scrapes websites using multiple Apify actors simultaneously."""15 16 def __init__(self, api_token: str):17 """Initialize with Apify API token."""18 self.api_token = api_token19 self.runner = ApifyRunner(api_token)20 self.logger = logging.getLogger(__name__)21 22 def scrape_with_multiple_actors(self, target_url: str, 23 actor_configs: Dict[str, ActorConfig] = None) -> Dict[str, str]:24 """25 Scrape a URL with multiple actors and return HTML content.26 27 Args:28 target_url: URL to scrape29 actor_configs: Dictionary of actor configurations to use30 31 Returns:32 Dict mapping actor names to HTML content33 """34 if actor_configs is None:35 # Use default configurations for backward compatibility36 actor_configs = self._get_default_actor_configs(target_url)37 38 # Filter to only enabled actors39 enabled_configs = {name: config for name, config in actor_configs.items() if config.enabled}40 41 if not enabled_configs:42 self.logger.warning("No enabled actors found")43 return {}44 45 results = {}46 47 # Use ThreadPoolExecutor for concurrent execution48 with ThreadPoolExecutor(max_workers=len(enabled_configs)) as executor:49 future_to_actor = {50 executor.submit(self._run_single_actor, name, config): name51 for name, config in enabled_configs.items()52 }53 54 for future in as_completed(future_to_actor):55 actor_name = future_to_actor[future]56 try:57 name, html_content = future.result()58 results[name] = html_content59 except Exception as e:60 self.logger.error(f"Actor {actor_name} failed: {str(e)}")61 results[actor_name] = None62 63 return results64 65 def _get_default_actor_configs(self, target_url: str) -> Dict[str, ActorConfig]:66 """Get default actor configurations for backward compatibility."""67 from ..models import get_default_actor_configs68 69 configs = get_default_actor_configs()70 # Add target URL to all configs71 for config in configs.values():72 config.input['startUrls'] = [{"url": target_url}]73 74 return configs75 76 def _run_single_actor(self, actor_name: str, config) -> tuple[str, str]:77 """78 Run a single actor and extract HTML content.79 80 Args:81 actor_name: Name of the actor82 config: Actor configuration (can be ActorConfig or dict for backward compatibility)83 84 Returns:85 Tuple of (actor_name, html_content)86 """87 try:88 self.logger.info(f"Starting {actor_name}...")89 90 # Handle both ActorConfig and dict formats91 if hasattr(config, 'actor_id'):92 actor_id = config.actor_id93 actor_input = config.input94 else:95 actor_id = config["actor_id"]96 actor_input = config["input"]97 98 result = self.runner.run_actor(99 actor_id,100 actor_input,101 "auto"102 )103 104 if not result:105 self.logger.warning(f"{actor_name} returned no results")106 return actor_name, None107 108 # Extract HTML based on result type109 html_content = self._extract_html_from_result(result, actor_name)110 111 if html_content:112 self.logger.info(f"{actor_name} completed successfully: {len(html_content):,} chars")113 else:114 self.logger.warning(f"{actor_name} returned no HTML content")115 116 return actor_name, html_content117 118 except Exception as e:119 self.logger.error(f"Error running {actor_name}: {str(e)}")120 return actor_name, None121 122 def _extract_html_from_result(self, result: Any, actor_name: str) -> str:123 """Extract HTML content from actor result."""124 try:125 if isinstance(result, list) and result:126 # Dataset result127 item = result[0]128 return item.get('html') or item.get('content', '')129 elif isinstance(result, dict):130 # Key-value store result131 
if 'OUTPUT' in result:132 output = result['OUTPUT']133 if isinstance(output, dict):134 return output.get('html') or output.get('content', '')135 elif isinstance(output, str):136 return output137 138 self.logger.warning(f"Unexpected result format from {actor_name}")139 return None140 141 except Exception as e:142 self.logger.error(f"Error extracting HTML from {actor_name}: {str(e)}")143 return None
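A minimal sketch of driving MultiActorScraper with its default actor set. The token, URL, and import path (the subpackage name is a guess) are placeholders.

# Hypothetical usage: run all default actors against one URL and report which returned HTML.
from llmscraper.scraping.multi_actor_scraper import MultiActorScraper  # assumed module path

scraper = MultiActorScraper(api_token="apify_api_...")   # placeholder token
html_by_actor = scraper.scrape_with_multiple_actors("https://example.com")
for actor_name, html in html_by_actor.items():
    status = f"{len(html):,} chars" if html else "no HTML"
    print(f"{actor_name}: {status}")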
1"""2Utilities module for ScraperCodeGenerator.3"""4
5from .html_utils import is_html, prune_html, extract_text_content, validate_html_structure6from .config import get_api_key, validate_required_keys, setup_logging7
8__all__ = [9 "is_html",10 "prune_html", 11 "extract_text_content",12 "validate_html_structure",13 "get_api_key",14 "validate_required_keys",15 "setup_logging"16]
1"""2Configuration and environment utilities.3"""4
5import os6from typing import Optional7
8
9def get_api_key(key_name: str, provided_key: Optional[str] = None) -> Optional[str]:10 """11 Get API key from provided value or environment variable.12 13 Args:14 key_name: Name of the environment variable15 provided_key: Explicitly provided key (takes precedence)16 17 Returns:18 API key or None if not found19 """20 if provided_key and provided_key.strip():21 return provided_key.strip()22 23 return os.getenv(key_name)24
25
26def validate_required_keys(**keys) -> dict[str, str]:27 """28 Validate that all required API keys are present.29 30 Args:31 **keys: Key-value pairs of key names and values32 33 Returns:34 Dict of validated keys35 36 Raises:37 ValueError: If any required key is missing38 """39 validated = {}40 missing = []41 42 for key_name, key_value in keys.items():43 if not key_value or not key_value.strip():44 missing.append(key_name)45 else:46 validated[key_name] = key_value.strip()47 48 if missing:49 raise ValueError(f"Missing required API keys: {', '.join(missing)}")50 51 return validated52
53
54def setup_logging(level: str = "INFO") -> None:55 """56 Setup logging configuration.57 58 Args:59 level: Logging level (DEBUG, INFO, WARNING, ERROR)60 """61 import logging62 63 logging.basicConfig(64 level=getattr(logging, level.upper()),65 format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',66 handlers=[67 logging.StreamHandler()68 ]69 )
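A minimal sketch of the three helpers above; the environment variable name and key values are illustrative, and the import path follows the package layout suggested by the utils __init__.

import os
from llmscraper.utils import get_api_key, validate_required_keys, setup_logging  # assumed import path

setup_logging("DEBUG")
os.environ["CLAUDE_API_KEY"] = "sk-ant-..."            # placeholder value
claude_key = get_api_key("CLAUDE_API_KEY")             # no explicit key given, falls back to the env var
keys = validate_required_keys(claude_api_key=claude_key, apify_token="apify_api_...")
print(sorted(keys))  # ['apify_token', 'claude_api_key']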
1"""2Configuration parser for the ScraperCodeGenerator pipeline.3"""4
5import json6import logging7from typing import Dict, Any, Optional, Union8
9from ..models import (10 PipelineConfig, ActorConfig, HTMLPruningConfig, ClaudeModel, 11 get_default_actor_configs12)13
14
15class ConfigurationParser:16 """Parses and validates configuration from input data."""17 18 def __init__(self):19 self.logger = logging.getLogger(__name__)20 21 def parse_from_input(self, input_data: Dict[str, Any]) -> PipelineConfig:22 """23 Parse configuration from input data.24 25 Args:26 input_data: Raw input data from Actor or CLI27 28 Returns:29 Parsed and validated PipelineConfig30 """31 config = PipelineConfig()32 33 # Parse core settings34 config.for_actor = input_data.get('forActor', False)35 config.test_script = input_data.get('testScript', False)36 config.output_script_path = input_data.get('outputScriptPath')37 38 # Parse Claude settings39 claude_model_str = input_data.get('claudeModel', 'claude-3-5-sonnet-20241022')40 config.claude_model = self._parse_claude_model(claude_model_str)41 config.claude_api_key = input_data.get('claudeApiKey')42 43 # Parse HTML pruning settings44 config.html_pruning = self._parse_html_pruning_config(input_data)45 46 # Parse actor configurations47 config.actors = self._parse_actor_configs(48 input_data.get('actors', []), input_data.get('targetUrl', '')49 )50 51 # Parse execution settings52 config.max_retries = input_data.get('maxRetries', 3)53 config.timeout_seconds = input_data.get('timeout', 60)54 config.concurrent_actors = input_data.get('concurrentActors', True)55 56 return config57 58 def _parse_claude_model(self, model_str: str) -> ClaudeModel:59 """Parse Claude model from string."""60 model_mapping = {61 # Claude 4 models62 'claude-sonnet-4-20250514': ClaudeModel.CLAUDE_4_SONNET,63 'claude-opus-4-20250514': ClaudeModel.CLAUDE_4_OPUS,64 'claude-sonnet-4-0': ClaudeModel.CLAUDE_4_SONNET,65 'claude-opus-4-0': ClaudeModel.CLAUDE_4_OPUS,66 67 # Claude 3.7 models68 'claude-3-7-sonnet-20250219': ClaudeModel.CLAUDE_3_7_SONNET,69 'claude-3-7-sonnet-latest': ClaudeModel.CLAUDE_3_7_SONNET,70 71 # Claude 3.5 models72 'claude-3-5-sonnet-20241022': ClaudeModel.CLAUDE_3_5_SONNET,73 'claude-3-5-sonnet-latest': ClaudeModel.CLAUDE_3_5_SONNET,74 'claude-3-5-haiku-20241022': ClaudeModel.CLAUDE_3_5_HAIKU,75 76 # Claude 3 models77 'claude-3-sonnet-20240229': ClaudeModel.CLAUDE_3_SONNET,78 'claude-3-haiku-20240307': ClaudeModel.CLAUDE_3_HAIKU,79 80 # Aliases81 'claude-4': ClaudeModel.CLAUDE_4_SONNET,82 'claude-4-sonnet': ClaudeModel.CLAUDE_4_SONNET,83 'claude-4-opus': ClaudeModel.CLAUDE_4_OPUS,84 'sonnet-4': ClaudeModel.CLAUDE_4_SONNET,85 'opus-4': ClaudeModel.CLAUDE_4_OPUS,86 'sonnet-3.7': ClaudeModel.CLAUDE_3_7_SONNET,87 'sonnet-3.5': ClaudeModel.CLAUDE_3_5_SONNET,88 'haiku-3.5': ClaudeModel.CLAUDE_3_5_HAIKU,89 'haiku': ClaudeModel.CLAUDE_3_HAIKU,90 'sonnet': ClaudeModel.CLAUDE_3_SONNET,91 }92 93 return model_mapping.get(model_str.lower(), ClaudeModel.CLAUDE_4_SONNET)94 95 def _parse_html_pruning_config(self, input_data: Dict[str, Any]) -> HTMLPruningConfig:96 """Parse HTML pruning configuration from flat input structure."""97 # Convert percentage from 0-100 to 0.0-1.0 if needed98 prune_percentage = input_data.get('htmlPrunePercentage', 80)99 if prune_percentage > 1:100 prune_percentage = prune_percentage / 100.0101 102 return HTMLPruningConfig(103 enabled=input_data.get('htmlPruningEnabled', True),104 max_list_items=input_data.get('htmlMaxListItems', 5),105 max_text_length=input_data.get('htmlMaxTextLength', 500),106 prune_before_evaluation=input_data.get('htmlPruneBeforeEvaluation', True),107 prune_percentage=prune_percentage108 )109 110 def _parse_actor_configs(self, actors_data: Any, target_url: str) -> Dict[str, ActorConfig]:111 """Parse actor 
configurations with improved validation."""112 # Start with default configurations113 default_configs = get_default_actor_configs()114 115 # Handle both array and object formats116 if isinstance(actors_data, list):117 # New array format: [{"name": "actor-name", "enabled": true, "input": {...}}]118 return self._parse_actor_configs_from_array(actors_data, target_url, default_configs)119 elif isinstance(actors_data, dict):120 # Legacy object format: {"actor-name": true, "other-actor": {"enabled": true, "input": {...}}}121 return self._parse_actor_configs_from_object(actors_data, target_url, default_configs)122 else:123 # No actor configuration provided, use defaults124 for config in default_configs.values():125 config.input['startUrls'] = [{"url": target_url}]126 return default_configs127 128 def _parse_actor_configs_from_array(self, actors_data: list, target_url: str, default_configs: Dict[str, ActorConfig]) -> Dict[str, ActorConfig]:129 """Parse actor configurations from array format."""130 parsed_configs = {}131 132 for actor_item in actors_data:133 if not isinstance(actor_item, dict):134 self.logger.warning(f"Invalid actor configuration format: {actor_item}")135 continue136 137 actor_name = actor_item.get('name')138 if not actor_name:139 self.logger.warning(f"Actor configuration missing 'name' field: {actor_item}")140 continue141 142 try:143 # Check if this is a known actor144 if actor_name in default_configs:145 config = default_configs[actor_name]146 config.enabled = actor_item.get('enabled', True)147 # Merge custom input with defaults148 if 'input' in actor_item:149 config.input.update(actor_item['input'])150 else:151 # Custom actor152 config = ActorConfig(153 actor_id=actor_item.get('actorId', actor_name),154 name=actor_name,155 description=actor_item.get('description', ''),156 enabled=actor_item.get('enabled', True),157 input=actor_item.get('input', {})158 )159 160 # Ensure startUrls is set161 if 'startUrls' not in config.input:162 config.input['startUrls'] = [{"url": target_url}]163 164 parsed_configs[actor_name] = config165 166 except Exception as e:167 self.logger.error(f"Error parsing configuration for actor '{actor_name}': {e}")168 continue169 170 # If no valid configs, use defaults171 if not parsed_configs:172 for config in default_configs.values():173 config.input['startUrls'] = [{"url": target_url}]174 return default_configs175 176 return parsed_configs177 178 def _parse_actor_configs_from_object(self, actors_data: dict, target_url: str, default_configs: Dict[str, ActorConfig]) -> Dict[str, ActorConfig]:179 """Parse actor configurations from legacy object format."""180 parsed_configs = {}181 182 for actor_name, actor_data in actors_data.items():183 try:184 if isinstance(actor_data, dict):185 # Full configuration object186 if 'actorId' in actor_data:187 # Custom actor with explicit ID188 config = ActorConfig(189 actor_id=actor_data.get('actorId'),190 name=actor_data.get('name', actor_name),191 description=actor_data.get('description', ''),192 enabled=actor_data.get('enabled', True),193 input=actor_data.get('input', {})194 )195 else:196 # Partial configuration for known actor197 if actor_name in default_configs:198 config = default_configs[actor_name]199 config.enabled = actor_data.get('enabled', True)200 # Merge custom input with defaults201 if 'input' in actor_data:202 config.input.update(actor_data['input'])203 else:204 self.logger.warning(f"Unknown actor '{actor_name}' with partial config, skipping")205 continue206 207 # Ensure startUrls is set208 if 'startUrls' not in 
config.input:209 config.input['startUrls'] = [{"url": target_url}]210 211 # Validate actor ID212 if not config.actor_id:213 self.logger.error(f"Actor '{actor_name}' missing actor_id")214 continue215 216 parsed_configs[actor_name] = config217 218 elif isinstance(actor_data, bool):219 # Simple boolean enable/disable220 if actor_name in default_configs:221 config = default_configs[actor_name]222 config.enabled = actor_data223 config.input['startUrls'] = [{"url": target_url}]224 parsed_configs[actor_name] = config225 else:226 self.logger.warning(f"Unknown actor '{actor_name}' with boolean config, skipping")227 228 elif isinstance(actor_data, str):229 # Just actor ID provided230 config = ActorConfig(231 actor_id=actor_data,232 name=actor_name,233 enabled=True,234 input={'startUrls': [{"url": target_url}]}235 )236 parsed_configs[actor_name] = config237 238 else:239 self.logger.warning(f"Invalid configuration format for actor '{actor_name}': {type(actor_data)}")240 241 except Exception as e:242 self.logger.error(f"Error parsing configuration for actor '{actor_name}': {e}")243 continue244 245 # Ensure at least one actor is enabled246 if not any(config.enabled for config in parsed_configs.values()):247 self.logger.warning("No actors enabled, falling back to defaults")248 for config in default_configs.values():249 config.input['startUrls'] = [{"url": target_url}]250 return default_configs251 252 return parsed_configs253 254 def generate_example_config(self) -> Dict[str, Any]:255 """Generate an example configuration for documentation."""256 return {257 "targetUrl": "https://example.com",258 "userGoal": "Extract product information",259 "claudeApiKey": "sk-ant-api03-...",260 261 # Core settings262 "forActor": False,263 "testScript": True,264 "outputScriptPath": "generated_scraper.py",265 266 # Claude model selection267 "claudeModel": "claude-sonnet-4-20250514", # or "claude-4", "sonnet-4", "opus-4", "sonnet-3.7", "sonnet-3.5", "haiku"268 269 # HTML pruning settings270 "htmlPruningEnabled": True,271 "htmlMaxListItems": 5,272 "htmlMaxTextLength": 500,273 "htmlPrunePercentage": 80,274 "htmlPruneBeforeEvaluation": True,275 276 # Actor configurations (new array format)277 "actors": [278 {279 "name": "cheerio-scraper",280 "enabled": True,281 "input": {282 "maxRequestRetries": 3,283 "requestTimeoutSecs": 30,284 "maxPagesPerCrawl": 1,285 "pageFunction": """286 async function pageFunction(context) {287 const { request, log, $ } = context;288 try {289 const title = $('title').text() || '';290 const html = $('html').html() || '';291 return {292 url: request.url,293 title: title,294 html: html295 };296 } catch (error) {297 log.error('Error in pageFunction:', error);298 return {299 url: request.url,300 title: '',301 html: ''302 };303 }304 }305 """,306 "proxyConfiguration": {"useApifyProxy": True}307 }308 },309 {310 "name": "web-scraper",311 "enabled": True,312 "input": {313 "maxRequestRetries": 3,314 "requestTimeoutSecs": 30,315 "maxPagesPerCrawl": 1,316 "pageFunction": """317 async function pageFunction(context) {318 const { request, log, page } = context;319 try {320 const title = await page.title();321 const html = await page.content();322 return {323 url: request.url,324 title: title,325 html: html326 };327 } catch (error) {328 log.error('Error in pageFunction:', error);329 return {330 url: request.url,331 title: '',332 html: ''333 };334 }335 }336 """,337 "proxyConfiguration": {"useApifyProxy": True}338 }339 },340 {341 "name": "website-content-crawler",342 "enabled": False,343 "input": {344 "maxCrawlPages": 
1,345 "crawler": "playwright",346 "proxyConfiguration": {"useApifyProxy": True}347 }348 },349 {350 "name": "custom-scraper",351 "actorId": "your-username/custom-scraper",352 "description": "My custom scraping actor",353 "enabled": True,354 "input": {355 "maxRequestRetries": 5,356 "requestTimeoutSecs": 60,357 "customParam": "value"358 }359 },360 {361 "name": "playwright-scraper",362 "enabled": True,363 "input": {364 "maxRequestRetries": 2,365 "requestTimeoutSecs": 45,366 "maxPagesPerCrawl": 1,367 "pageFunction": """368 async function pageFunction(context) {369 const { request, log, page } = context;370 try {371 const title = await page.title();372 const html = await page.content();373 return {374 url: request.url,375 title: title,376 html: html377 };378 } catch (error) {379 log.error('Error in pageFunction:', error);380 return {381 url: request.url,382 title: '',383 html: ''384 };385 }386 }387 """,388 "proxyConfiguration": {"useApifyProxy": True}389 }390 }391 ],392 393 # Execution settings394 "maxRetries": 3,395 "timeout": 60,396 "concurrentActors": True397 }398 399 def validate_config(self, config: PipelineConfig) -> bool:400 """401 Validate configuration and log any issues.402 403 Args:404 config: Configuration to validate405 406 Returns:407 True if configuration is valid408 """409 is_valid = True410 411 # Check if at least one actor is enabled412 enabled_actors = config.get_enabled_actors()413 if not enabled_actors:414 self.logger.error("No actors are enabled in configuration")415 is_valid = False416 417 # Check Claude API key418 if not config.claude_api_key:419 self.logger.error("Claude API key is required")420 is_valid = False421 422 # Validate HTML pruning settings423 prune_percentage = config.html_pruning.prune_percentage424 if prune_percentage < 0 or prune_percentage > 1:425 self.logger.error("HTML pruning percentage must be between 0 and 1")426 is_valid = False427 428 if config.html_pruning.max_list_items < 1:429 self.logger.error("Max list items must be at least 1")430 is_valid = False431 432 if config.html_pruning.max_text_length < 1:433 self.logger.error("Max text length must be at least 1")434 is_valid = False435 436 # Validate actor configurations437 for actor_name, actor_config in enabled_actors.items():438 if not actor_config.actor_id:439 self.logger.error(f"Actor '{actor_name}' missing actor_id")440 is_valid = False441 442 # Validate actor_id format443 if actor_config.actor_id and '/' not in actor_config.actor_id:444 self.logger.warning(f"Actor '{actor_name}' has unusual actor_id format: {actor_config.actor_id}")445 446 # Validate required input fields447 if not actor_config.input.get('startUrls'):448 self.logger.error(f"Actor '{actor_name}' missing startUrls in input")449 is_valid = False450 451 return is_valid
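A minimal sketch of driving ConfigurationParser with a flat input dict, assuming the parser above; the key values are placeholders and the printed attributes follow the PipelineConfig fields used in this file.

from llmscraper.utils.config_parser import ConfigurationParser  # same path as used in main.py

parser = ConfigurationParser()
config = parser.parse_from_input({
    "targetUrl": "https://example.com",
    "userGoal": "Extract product information",
    "claudeApiKey": "sk-ant-api03-...",          # placeholder key
    "claudeModel": "sonnet-3.5",                 # alias resolved by _parse_claude_model
    "htmlPrunePercentage": 80,                   # converted to 0.8 internally
    "actors": [{"name": "cheerio-scraper", "enabled": True}],
})
print(config.claude_model, config.html_pruning.prune_percentage)
print(parser.validate_config(config))            # logs any issues and returns a bool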
1"""2HTML utility functions for processing web content.3"""4
5from typing import Optional6from bs4 import BeautifulSoup, Comment, NavigableString7import re8
9
10def is_html(text_content: str) -> bool:11 """12 Check if a string is likely HTML content.13 14 Args:15 text_content: The text content to check16 17 Returns:18 True if the content appears to be HTML19 """20 if not text_content or not isinstance(text_content, str):21 return False22 23 content_lower = text_content.lower()24 return '<html>' in content_lower and '<body>' in content_lower25
26
27def prune_html(html_content: str, max_list_items: int = 5, max_text_length: int = 500, 28 prune_percentage: float = 0.8) -> str:29 """30 Clean and shorten HTML content to reduce token count while preserving structure.31 32 Args:33 html_content: The raw HTML content to process34 max_list_items: Maximum number of list items to keep35 max_text_length: Maximum length of text content in any tag36 prune_percentage: Percentage of content to keep (0.0-1.0)37 38 Returns:39 The cleaned and shortened HTML40 """41 if not html_content or not isinstance(html_content, str):42 return ""43 44 try:45 soup = BeautifulSoup(html_content, 'html.parser')46 47 # Remove unwanted tags entirely48 unwanted_tags = ['script', 'style', 'svg', 'noscript', 'meta', 'link']49 for tag_name in unwanted_tags:50 for tag in soup.find_all(tag_name):51 tag.decompose()52 53 # Remove HTML comments54 for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):55 comment.extract()56 57 # Remove unwanted attributes from all tags58 allowed_attributes = {'id', 'class', 'href', 'src', 'alt', 'title'}59 for tag in soup.find_all(True):60 if hasattr(tag, 'attrs'):61 tag.attrs = {key: value for key, value in tag.attrs.items() 62 if key in allowed_attributes}63 64 # Truncate lists and tables65 list_and_table_tags = ['ul', 'ol', 'table', 'tbody', 'thead']66 for tag_name in list_and_table_tags:67 for tag in soup.find_all(tag_name):68 children = list(tag.children)69 # Filter out NavigableString objects (text nodes, whitespace)70 non_text_children = [child for child in children if not isinstance(child, NavigableString)]71 72 if len(non_text_children) > max_list_items:73 # Keep only the first max_list_items children74 for child in non_text_children[max_list_items:]:75 child.decompose()76 77 # Add a comment indicating truncation78 if tag.name in ['ul', 'ol']:79 truncation_notice = soup.new_tag("li")80 truncation_notice.string = f"... ({len(non_text_children) - max_list_items} more items)"81 tag.append(truncation_notice)82 elif tag.name == 'table':83 truncation_notice = soup.new_tag("tr")84 td = soup.new_tag("td")85 td.string = f"... 
({len(non_text_children) - max_list_items} more rows)"86 truncation_notice.append(td)87 tag.append(truncation_notice)88 89 # Truncate long text content90 for element in soup.find_all(string=True):91 if isinstance(element, NavigableString) and not isinstance(element, Comment):92 text = str(element).strip()93 if len(text) > max_text_length:94 element.replace_with(text[:max_text_length] + "...")95 96 # Apply percentage-based pruning if specified97 if prune_percentage < 1.0:98 # Calculate target length based on percentage99 target_length = int(len(str(soup)) * prune_percentage)100 current_html = str(soup)101 102 if len(current_html) > target_length:103 # Additional aggressive pruning to meet percentage target104 # Remove more list items105 for tag_name in ['ul', 'ol', 'table', 'tbody', 'thead']:106 for tag in soup.find_all(tag_name):107 children = list(tag.children)108 non_text_children = [child for child in children if not isinstance(child, NavigableString)]109 110 # Keep even fewer items if we need more aggressive pruning111 aggressive_max = max(1, int(max_list_items * prune_percentage))112 if len(non_text_children) > aggressive_max:113 for child in non_text_children[aggressive_max:]:114 child.decompose()115 116 # More aggressive text truncation117 aggressive_text_length = int(max_text_length * prune_percentage)118 for element in soup.find_all(string=True):119 if isinstance(element, NavigableString) and not isinstance(element, Comment):120 text = str(element).strip()121 if len(text) > aggressive_text_length:122 element.replace_with(text[:aggressive_text_length] + "...")123 124 # Return the cleaned HTML125 return str(soup)126 127 except Exception as e:128 # If parsing fails, return original content truncated129 return html_content[:max_text_length * 10] if len(html_content) > max_text_length * 10 else html_content130
131
132def extract_text_content(html_content: str) -> str:133 """134 Extract clean text content from HTML.135 136 Args:137 html_content: HTML content to extract text from138 139 Returns:140 Clean text content141 """142 if not html_content:143 return ""144 145 try:146 soup = BeautifulSoup(html_content, 'html.parser')147 return soup.get_text(separator=' ', strip=True)148 except Exception:149 return html_content150
151
152def validate_html_structure(html_content: str) -> bool:153 """154 Validate basic HTML structure.155 156 Args:157 html_content: HTML content to validate158 159 Returns:160 True if HTML has basic valid structure161 """162 if not html_content:163 return False164 165 try:166 soup = BeautifulSoup(html_content, 'html.parser')167 168 # Check for basic HTML elements169 has_html_tag = soup.find('html') is not None170 has_body_tag = soup.find('body') is not None171 has_content = len(soup.get_text(strip=True)) > 0172 173 return has_html_tag or has_body_tag or has_content174 175 except Exception:176 return False
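A quick sketch of the HTML helpers above on a toy document; the document and sizes are illustrative, and the import path follows the utils package exports.

from llmscraper.utils import is_html, prune_html, extract_text_content, validate_html_structure  # assumed path

doc = "<html><body><ul>" + "".join(f"<li>item {i}</li>" for i in range(20)) + "</ul></body></html>"
print(is_html(doc))                          # True: contains both <html> and <body>
pruned = prune_html(doc, max_list_items=3)   # long list truncated, "... more items" notice appended
print(len(pruned) < len(doc))                # True: pruning shrank the markup
print(extract_text_content(pruned)[:40])     # plain-text view of the pruned document
print(validate_html_structure(pruned))       # True: still has an <html> tag and content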