Example Website Screenshot Crawler avatar

Example Website Screenshot Crawler

Try for free

No credit card required

Go to Store
Example Website Screenshot Crawler

Example Website Screenshot Crawler

dz_omar/example-website-screenshot-crawler
Try for free

No credit card required

Automated website screenshot crawler using Pyppeteer and Apify. This open-source actor captures screenshots from specified URLs, uploads them to the Apify Key-Value Store, and provides easy access to the results, making it ideal for monitoring website changes and archiving web content.

.actor/Dockerfile

1# Use the official Python 3.10 slim base image (not an Apify base image)
2FROM python:3.10-slim
3
4# Set environment variables
5ENV PIP_NO_CACHE_DIR=1
6ENV PLAYWRIGHT_BROWSERS_PATH=/usr/local/share/playwright/
7
8# Install system dependencies
9RUN apt-get update && apt-get install -y \
10    curl \
11    unzip \
12    wget \
13    xvfb \
14    libnss3 \
15    libatk-bridge2.0-0 \
16    libgtk-3-0 \
17    libgbm-dev \
18    && apt-get clean \
19    && rm -rf /var/lib/apt/lists/*
20
21# Install Python dependencies
22COPY requirements.txt .
23RUN pip install --no-cache-dir -r requirements.txt
24
25# Install Playwright (optional if using Pyppeteer)
26RUN pip install playwright && playwright install --with-deps
27
28# Create a directory for the actor's files
29RUN mkdir -p /usr/src/app
30WORKDIR /usr/src/app
31
32# Copy all files from the current directory into the container
33COPY . .
34
35
36# Command to run the application
37CMD ["python", "-m", "src"]

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "website-screenshot-crawler",
4    "title": "Automated Website Screenshot Crawler",
5    "description": "This actor takes screenshots of specified websites, uploads them to Apify Key-Value Store, and provides URLs for easy access. It is useful for monitoring website changes, capturing visuals for reports, or web archiving.",
6    "version": "1.0",
7    "meta": {
8        "templateId": "puppeteer_crawler"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile",
12    "buildTag": "latest",
13    "minMemoryMbytes": 4096,
14    "readme": "./README.md",
15    "storages": {
16        "dataset": {
17            "actorSpecification": 1,
18            "title": "Results",
19            "views": {
20                "results": {
21                    "title": "results to scan",
22                    "transformation": {
23                        "fields": ["screenshot_url", "linkUrl"]
24                    },
25                    "display": {
26                        "component": "table",
27                        "properties": {
28                            "linkUrl": {
29                                "label": "linkUrl",
30                                "format": "link"
31                            },
32                            "screenshot_url": {
33                                "label": "screenshot_url",
34                                "format": "image"
35                            }
36                        }
37                    }
38                }
39            }
40        }
41    }
42}

.actor/input_schema.json

1{
2  "$schema": "http://json-schema.org/draft-07/schema#",
3  "title": "Input Schema",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "fullPage": {
8      "title": "Full Page Screenshot",
9      "type": "boolean",
10      "description": "Indicates whether the screenshot should capture the entire height of the page.",
11      "default": false
12    },
13    "link_urls": {
14      "title": "Link URLs",
15      "type": "array",
16      "description": "A list of URLs to capture screenshots from.",
17      "default": ["https://apify.com"],
18      "editor": "stringList"
19    },
20    "window_Width": {
21      "title": "Browser Window Width",
22      "type": "integer",
23      "description": "The width of the browser window in pixels.",
24      "default": 1920,
25      "unit": "px"   
26    },
27    "window_Height": {
28      "title": "Browser Window Height",
29      "type": "integer",
30      "description": "The height of the browser window in pixels.",
31      "default": 1080,
32      "unit": "px"
33    },
34    "Sleep": {
35      "type": "integer",
36      "title": "Sleep Duration",
37      "description": "The duration (in seconds) to wait after loading a page before taking a screenshot.",
38      "default": 10,
39      "editor": "number"
40    },
41    "waitUntil": {
42      "title": "Navigation Wait Condition",
43      "type": "string",
44      "description": "Specify when the navigation should be considered finished. Options are 'load' (the load event has fired), 'domcontentloaded' (the DOM has been loaded), 'networkidle0' (no network connections remain), or 'networkidle2' (only minimal network connections remain).",
45      "editor": "select",
46      "default": "networkidle0",
47      "enum": ["load", "domcontentloaded", "networkidle0", "networkidle2"],
48      "enumTitles": [
49        "Load (all resources loaded)", 
50        "DOM Content Loaded (HTML parsed)", 
51        "Network Idle (no network connections)", 
52        "Network Idle (minimal network connections)"
53      ]
54    },
55    "cookies": {
56      "sectionCaption": "Cookies",
57      "sectionDescription": "You can use cookie editors such as [Cookie Editor](https://cookie-editor.com/) or [Copy Cookies](https://chromewebstore.google.com/detail/copy-cookies/jcbpglbplpblnagieibnemmkiamekcdg) to format cookies.",
58      "title": "Cookies",
59      "type": "array",
60      "description": "Cookies to be used for the browsing session, formatted as JSON objects.",
61      "editor": "json"
62    },
63    "scrollToBottom": {
64      "sectionCaption": "Scrolling Option",
65      "title": "Enable Scrolling to Bottom",
66      "type": "boolean",
67      "description": "Determines whether the page should be scrolled to the bottom before taking a screenshot.",
68      "default": false
69    },
70    "distance": {
71      "title": "Scrolling Distance",
72      "type": "integer",
73      "description": "The distance (in pixels) to scroll down the page during each scroll action. This controls how much content is revealed with each step.",
74      "default": 100,  
75      "maximum": 1000, 
76      "unit": "px"
77    },
78    "delay": {
79      "title": "Scrolling Delay",
80      "type": "integer",
81      "description": "The delay (in milliseconds) to wait after each scroll action. This can be adjusted based on how quickly content loads after scrolling.",
82      "default": 100, 
83      "maximum": 5000, 
84      "unit": "ms"
85    }
86  },
87  "required": ["link_urls"],
88  "additionalProperties": true
89}

src/__main__.py

1import asyncio
2
3from .main import main
4
5# Execute the Actor entrypoint.
6asyncio.run(main())

src/main.py

1import os
2import random
3import string
4import asyncio
5import aiofiles
6from apify import Actor
7from pyppeteer import launch
8
class MyActor:
    """Screenshot crawler: visits each input URL with Pyppeteer and stores one PNG per page.

    For every URL the actor opens the page, optionally scrolls to the bottom,
    waits a configurable time, takes a screenshot, uploads it to the default
    Apify Key-Value Store and pushes ``{'screenshot_url', 'linkUrl'}`` to the
    default dataset.
    """

    # Fallbacks applied when an input field is missing or null. The values
    # mirror the defaults declared in .actor/input_schema.json, except for
    # 'link_urls', which falls back to an empty list so nothing is crawled
    # unless the caller asked for it.
    _DEFAULTS = {
        'link_urls': [],
        'Sleep': 10,
        'waitUntil': 'networkidle0',
        'cookies': [],
        'fullPage': False,
        'window_Width': 1920,
        'window_Height': 1080,
        'scrollToBottom': False,
        'distance': 100,
        'delay': 100,
    }

    def __init__(self, actor_input):
        """Read the actor configuration from ``actor_input``.

        Args:
            actor_input: Mapping of input fields as returned by
                ``Actor.get_input()``. Missing or ``None`` fields fall back to
                ``_DEFAULTS``; explicit falsy values such as ``0`` or
                ``False`` are preserved.
        """
        self.link_urls = self._option(actor_input, 'link_urls')
        self.sleep_duration = self._option(actor_input, 'Sleep')  # seconds
        self.waitUntil = self._option(actor_input, 'waitUntil')
        self.cookies = self._option(actor_input, 'cookies')
        self.fullPage = self._option(actor_input, 'fullPage')
        self.window_Width = self._option(actor_input, 'window_Width')  # pixels
        self.window_Height = self._option(actor_input, 'window_Height')  # pixels

        self.scrollToBottom = self._option(actor_input, 'scrollToBottom')
        # Distance in pixels scrolled by each scroll action.
        self.distance = self._option(actor_input, 'distance')
        # Delay in milliseconds to wait after each scroll action.
        self.delay = self._option(actor_input, 'delay')

    @classmethod
    def _option(cls, actor_input, key):
        """Return ``actor_input[key]``, or the default when it is missing or None."""
        value = actor_input.get(key)
        if value is None:
            value = cls._DEFAULTS[key]
        # Copy lists so instances never share (or mutate) the class-level defaults.
        return list(value) if isinstance(value, list) else value

    async def configure_browser(self):
        """Launch and return a headless Pyppeteer browser."""
        browser = await launch({
            'headless': True,  # Set to False if you want to see the browser
            'args': [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                # NOTE(review): this disables image loading, so screenshots will
                # not contain images - confirm this is intended.
                '--blink-settings=imagesEnabled=false',
            ]
        })
        return browser

    async def scroll_to_bottom(self, page):
        """Scroll ``page`` down in steps of ``self.distance`` px until the bottom is reached.

        ``self.distance`` and ``self.delay`` are passed to the page function as
        arguments instead of being interpolated into the JavaScript source.
        The loop also stops when scrolling makes no progress for several
        consecutive steps, so a non-positive distance or a page whose
        ``scrollTop`` can never satisfy the exit condition cannot hang the run.
        """
        await page.evaluate(
            """
            async (distance, delay) => {
                const scroller = document.scrollingElement;
                const maxStalledSteps = 10;
                let stalledSteps = 0;
                while (scroller.scrollTop + window.innerHeight < scroller.scrollHeight) {
                    const before = scroller.scrollTop;
                    scroller.scrollBy(0, distance);
                    await new Promise(resolve => setTimeout(resolve, delay));
                    if (scroller.scrollTop === before) {
                        stalledSteps += 1;
                        if (stalledSteps >= maxStalledSteps) {
                            break;
                        }
                    } else {
                        stalledSteps = 0;
                    }
                }
            }
            """,
            self.distance,
            self.delay,
        )

    async def process_link(self, page, link_url):
        """Process one URL: open it, optionally scroll, take a screenshot and upload it.

        Any exception is caught and printed so that one failing URL does not
        abort the remaining ones.

        Args:
            page: Pyppeteer page reused for every URL.
            link_url: URL to capture.
        """
        try:
            # Set the cookies (if any) before navigating to the page
            if self.cookies:
                await page.setCookie(*self.cookies)

            # Set the viewport size to the specified width and height
            await page.setViewport({'width': self.window_Width, 'height': self.window_Height})

            # Open the page and wait for the configured navigation condition
            await page.goto(link_url, {'waitUntil': self.waitUntil})

            # Conditionally scroll to the bottom of the page
            if self.scrollToBottom:
                await self.scroll_to_bottom(page)

            # Give the page extra time to settle before the screenshot
            await asyncio.sleep(self.sleep_duration)

            # Generate a random screenshot name
            screenshot_name = ''.join(random.choices(string.ascii_lowercase + string.digits, k=16)) + '.png'
            screenshot_path = f'Image_Files/{screenshot_name}'

            # Capture the viewport, or the whole page when fullPage is enabled
            await page.screenshot({'path': screenshot_path, 'fullPage': self.fullPage})

            # Upload the screenshot to Apify's Key-Value Store
            async with aiofiles.open(screenshot_path, 'rb') as f:
                screenshot_content = await f.read()

            store_id = Actor.config.default_key_value_store_id
            # Store the record as a PNG so the record URL is served as an image.
            await Actor.set_value(screenshot_name, screenshot_content, content_type='image/png')

            screenshot_url = f'https://api.apify.com/v2/key-value-stores/{store_id}/records/{screenshot_name}'
            await Actor.push_data({'screenshot_url': screenshot_url, 'linkUrl': link_url})

            print(f'Screenshot for {link_url} saved.')

        except Exception as e:
            print(f"Error processing {link_url}: {str(e)}")

    async def run(self):
        """Main execution logic: launch the browser and process every URL in order."""
        os.makedirs('Image_Files', exist_ok=True)

        # Configure and launch the browser
        browser = await self.configure_browser()
        try:
            page = await browser.newPage()

            for link_url in self.link_urls:
                await self.process_link(page, link_url)
        finally:
            # Always release the browser process, even if something above raised.
            await browser.close()
106
async def main():
    """Actor entry point: read the input inside the Actor context and run the crawler."""
    async with Actor:
        raw_input = await Actor.get_input()
        # An empty/absent input is treated as an empty configuration mapping.
        settings = raw_input or {}
        crawler = MyActor(settings)
        await crawler.run()
112
113if __name__ == "__main__":
114    asyncio.run(main())

requirements.txt

1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify==2.0.0
5pyppeteer
6aiofiles
7requests
8tenacity
Developer
Maintained by Community

Actor Metrics

  • 3 monthly users

  • 2 stars

  • 85% runs succeeded

  • Created in Oct 2024

  • Modified 3 months ago

Categories