# Use the official Apify Python + Playwright image as base
FROM apify/actor-python-playwright:3.13

# Copy only requirements.txt first to install dependencies in a separate layer
COPY requirements.txt ./

# Install Python packages and log versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Install Playwright dependencies and browsers
RUN playwright install-deps && \
    playwright install

# Copy all source files into the container
COPY . ./

# Precompile Python code for early error detection (optional but recommended)
RUN python3 -m compileall -q .

# Set the default command to run your main script
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "county-taxdeed-scraper",
    "title": "Duval & Clay County Tax Deed Scraper",
    "description": "Scrapes tax deed sale data from Duval and Clay County Clerk websites using Playwright and Apify SDK.",
    "version": "0.0",
    "buildTag": "latest",
    "meta": {
        "templateId": "python-playwright"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Duval & Clay County Tax Deed Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "start_urls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start scraping from (Duval/Clay Clerk sites)",
            "prefill": [
                { "url": "https://taxdeed.duvalclerk.com/" },
                { "url": "https://landmark.clayclerk.com/TaxDeed/" }
            ],
            "editor": "requestListSources"
        },
        "max_depth": {
            "title": "Maximum depth",
            "type": "integer",
            "description": "Depth to which the scraper should follow links (not used in current logic but may be useful for future)",
            "default": 1
        }
    },
    "required": ["start_urls"]
}

src/init.py

src/main.py

1import asyncio
2
3from .main import main
4
5# Execute the Actor entry point.
6asyncio.run(main())

src/main.py

1"""Apify Actor for scraping Duval and Clay County Tax Deed sales data."""
2
3from __future__ import annotations
4
5import time
6from urllib.parse import urljoin
7
8from apify import Actor
9from playwright.async_api import async_playwright
10
11
12async def main() -> None:
13    """Main entry point for the Apify Actor."""
14
15    async with Actor:
16        Actor.log.info("Starting the Duval Tax Deed scraper...")
17
18        base_urls = [
19            "https://taxdeed.duvalclerk.com/",
20            "https://landmark.clayclerk.com/TaxDeed/"
21        ]
22
23        data_list = []
24
25        async with async_playwright() as playwright:
26            browser = await playwright.chromium.launch(
27                headless=Actor.config.headless,
28                args=["--disable-gpu"]
29            )
30            context = await browser.new_context()
31            page = await context.new_page()
32
33            for base_url in base_urls:
34                await page.goto(base_url)
35
36                # --- Step 1: Select Date Options and Search ---
37                await page.select_option("#SearchSaleDateFrom", index=3)
38                await page.select_option("#SearchSaleDateTo", index=0)
39                await page.click("#tabs-9 button")
40
41                # Filter for "SALE" status
42                status_input = page.locator("#gs_Status")
43                await status_input.click()
44                await status_input.fill("")
45                await status_input.type("SALE")
46                await page.wait_for_timeout(2000)
47
48                try:
49                    total_pages = int((await page.text_content("#sp_1_pager")).strip())
50                except Exception:
51                    total_pages = 1
52
53                for current_page in range(1, total_pages + 1):
54                    Actor.log.info(f"Processing page {current_page} of {total_pages}")
55                    rows = await page.locator("tr[role='row'][id]").all()
56                    filtered_row_ids = []
57
58                    for row in rows:
59                        row_id = await row.get_attribute("id")
60                        if row_id and row_id.isdigit():
61                            status = await row.locator("td[aria-describedby='TaxDeed_Status']").text_content()
62                            if status.strip() == "SALE":
63                                filtered_row_ids.append(row_id)
64
65                    for row_id in filtered_row_ids:
66                        if "clayclerk" in base_url:
67                            details_url = f"https://landmark.clayclerk.com/TaxDeed/Home/Details?id={row_id}"
68                        else:
69                            details_url = f"https://taxdeed.duvalclerk.com/Home/Details?id={row_id}"
70
71                        Actor.log.info(f"Visiting details URL: {details_url}")
72                        await page.goto(details_url)
73                        await page.wait_for_timeout(2000)
74
75                        detail_data = {}
76                        rows_detail = await page.locator("tr:has(td b)").all()
77
78                        for row in rows_detail:
79                            try:
80                                key = await row.locator("td:nth-child(1) b").text_content()
81                                value = await row.locator("td:nth-child(2)").text_content()
82                                key = key.strip()
83                                value = value.strip()
84
85                                if key in ["Property Address", "Parcel ID"]:
86                                    detail_data[key] = value
87                                elif key in ["Opening Bid", "Base Bid"]:
88                                    detail_data["Opening Bid"] = value
89                            except Exception:
90                                pass
91
92                        for field in ["Property Address", "Parcel ID", "Opening Bid"]:
93                            if field not in detail_data:
94                                detail_data[field] = "N/A"
95
96                        # Extracting County and State from the Property Address field
97                        address = detail_data.get("Property Address", "")
98                        if address:
99                            parts = address.split(",")
100                            if len(parts) >= 2:
101                                city_and_state = parts[-1].strip()
102                                state_parts = city_and_state.split()
103                                if len(state_parts) >= 2:
104                                    state = state_parts[-2].strip()
105                                else:
106                                    state = "N/A"
107                            else:
108                                state = "N/A"
109                        else:
110                            state = "N/A"
111
112                        if "clayclerk" in base_url:
113                            county = "Clay County"
114                        else:
115                            county = "Duval County"
116
117                        # Add county and state to the detail_data dictionary
118                        detail_data["County"] = county
119                        detail_data["State"] = state
120
121                        data_list.append(detail_data)
122
123                    # Reinitialize the search for next page
124                    await page.goto(base_url)
125                    await page.select_option("#SearchSaleDateFrom", index=3)
126                    await page.select_option("#SearchSaleDateTo", index=0)
127                    await page.click("#tabs-9 button")
128                    await status_input.click()
129                    await status_input.fill("")
130                    await status_input.type("SALE")
131                    await page.wait_for_timeout(2000)
132
133                    if current_page < total_pages:
134                        await page.click("#next_pager")
135                        await page.wait_for_timeout(3000)
136
137            await browser.close()
138
139        # Deduplicate based on 'Parcel ID' and 'Property Address'
140        seen = set()
141        deduped_data = []
142        for item in data_list:
143            key = (item.get("Parcel ID"), item.get("Property Address"))
144            if key not in seen:
145                seen.add(key)
146                deduped_data.append(item)
147
148        # Push data to Apify dataset
149        if deduped_data:
150            await Actor.push_data(deduped_data)
151            Actor.log.info(f"Successfully pushed {len(deduped_data)} unique items to the default dataset.")
152        else:
153            Actor.log.warning("No data extracted after deduplication.")

src/py.typed

.dockerignore

.git
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.gitignore

.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Zed editor
#  Ignores the folder created when setting Project Settings in the Zed editor. Can be commented out
#  to share Project Settings within a team
.zed

requirements.txt

1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify < 3.0
5playwright

Dallas County Tax Assessor API

adrian_horning/dallas-county-tax-assessor-api

API for the Dallas County Tax Assessor's site: https://www.dallasact.com/act_webdev/dallas/searchbyproperty.jsp

Adrian | The Web Scraping Guy

Landwatch.com | Search | Listing(s) | Agent(s) | Scraper

memo23/apify-landwatch-scraper

The LandWatch Scraper is a tool for extracting property data from LandWatch.com. Ideal for real estate investors, market analysts, and property researchers, it tracks property details, pricing, and listing information. It's perfect for anyone looking to gather comprehensive property data from a site

Muhamed Didovic

Landandfarm.com | Search | Detail(s) | $1 / 1K | Scraper

memo23/apify-landandfarm-scraper

Ideal for real estate investors, market analysts, and property researchers, it tracks property details, pricing, and listing information. With easy setup and multiple export formats (JSON, CSV), it's perfect for anyone looking to gather comprehensive property data from Land And Farm.

Muhamed Didovic

LiveAuctioneers Scraper

ivanvs/liveauctioneers-scraper

Scrape from LiveAuctioneers data about auctions for arts, antiques, coins, fashion, furniture and other collectibles. Download listings data in JSON, XML, Excel, and other versatile

Ivan Vasiljević

Auction.com Property Scraper

parseforge/auction-com-property-scraper

Extract property listings from auction.com. Provides comprehensive property data including financial details, auction information, and property specifications.

ParseForge

5.0

Loopnet Scraper

epctex/loopnet-scraper

Scrape loopnet.com for millions of properties or businesses. Extract real estate properties, descriptions, images, contact information, tax properties, zone information, and detailed information like size and price. You can specify search terms, filters, and much more. Extremely fast! No limits!

epctex

458

3.0

Auction.com Property Listing Scraper Pay Per Event

parseforge/auction-com-property-scraper-ppe

🏠 Get Auction.com property data instantly! Extract 30+ data points including pricing, broker contacts, and auction details. Save 6+ hours per week on manual research. Perfect for real estate investors and brokers. Starting at $1.50 for 100 properties.

ParseForge

5.0

Serp Events Scraper

payai/serp-events-scraper

🚀 Scrape professional networking events, conferences & business meetups from Google Events via SerpAPI. Get 400+ real events across 10 major US cities in minutes!

PayAI

LoopNet | Search | Property(ies) | Agent(s) | Bypasses Limits

memo23/apify-loopnet-search-cheerio

Unlock valuable commercial property data from LoopNet. Extract detailed listings and agents insights for smarter real estate decisions.

Muhamed Didovic

137

4.6

Facebook Events Lite

danek/facebook-page-events-lite

Scrap Facebook events fast and easy. This actor is a lightweight version of the Facebook Events actor. It is designed to be fast and efficient, so it can extract numerous events in a short time. It is also less likely to be blocked by Facebook.

Jan Danecki

Taxdeed_Scraper

Taxdeed_Scraper

.actor/Dockerfile

.actor/actor.json

.actor/input_schema.json

src/__init__.py

src/__main__.py

src/main.py

src/py.typed

.dockerignore

.gitignore

requirements.txt

You might also like

Dallas County Tax Assessor API

Landwatch.com | Search | Listing(s) | Agent(s) | Scraper

Landandfarm.com | Search | Detail(s) | $1 / 1K | Scraper

LiveAuctioneers Scraper

Auction.com Property Scraper

Loopnet Scraper

Auction.com Property Listing Scraper Pay Per Event

Serp Events Scraper

LoopNet | Search | Property(ies) | Agent(s) | Bypasses Limits

Facebook Events Lite

.actor/Dockerfile

.actor/actor.json

.actor/input_schema.json

src/__init__.py

src/__main__.py

src/main.py

src/py.typed

.dockerignore

.gitignore

requirements.txt

src/init.py

src/main.py

src/init.py

src/main.py