
Taxdeed_Scraper
Under maintenance
Pricing
Pay per usage
Go to Store

Taxdeed_Scraper
Under maintenance
Scrapes tax deed sale data from Duval and Clay County Clerk websites using Playwright and Apify SDK.
0.0 (0)
Pricing
Pay per usage
0
Monthly users
3
Runs succeeded
>99%
Last modified
14 days ago
.actor/Dockerfile
# Use the official Apify Python + Playwright image as base.
FROM apify/actor-python-playwright:3.13

# Copy only requirements.txt first so dependency installation is cached in its
# own layer and re-runs only when requirements change.
COPY requirements.txt ./

# Install Python packages and log tool/package versions for build debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Install OS-level dependencies and ONLY the Chromium browser — the scraper
# launches playwright.chromium exclusively, so skipping Firefox/WebKit keeps
# the image smaller and the build faster.
RUN playwright install --with-deps chromium

# Copy all remaining source files into the container.
COPY . ./

# Precompile Python code so syntax errors fail the build, not the run.
RUN python3 -m compileall -q .

# Default command: run the Actor package (src/__main__.py).
CMD ["python3", "-m", "src"]
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "county-taxdeed-scraper",
4 "title": "Duval & Clay County Tax Deed Scraper",
5 "description": "Scrapes tax deed sale data from Duval and Clay County Clerk websites using Playwright and Apify SDK.",
6 "version": "0.0",
7 "buildTag": "latest",
8 "meta": {
9 "templateId": "python-playwright"
10 },
11 "input": "./input_schema.json",
12 "dockerfile": "./Dockerfile"
13}
.actor/input_schema.json
1{
2 "title": "Duval & Clay County Tax Deed Scraper",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "start_urls": {
7 "title": "Start URLs",
8 "type": "array",
9 "description": "URLs to start scraping from (Duval/Clay Clerk sites)",
10 "prefill": [
11 { "url": "https://taxdeed.duvalclerk.com/" },
12 { "url": "https://landmark.clayclerk.com/TaxDeed/" }
13 ],
14 "editor": "requestListSources"
15 },
16 "max_depth": {
17 "title": "Maximum depth",
18 "type": "integer",
19 "description": "Depth to which the scraper should follow links (not used in current logic but may be useful for future)",
20 "default": 1
21 }
22 },
23 "required": ["start_urls"]
24}
src/__init__.py
src/__main__.py
"""Module entry point: `python -m src` executes this file."""

import asyncio

from .main import main

# Hand control to the asynchronous Actor entry point and block until it finishes.
asyncio.run(main())
src/main.py
1"""Apify Actor for scraping Duval and Clay County Tax Deed sales data."""
2
3from __future__ import annotations
4
5import time
6from urllib.parse import urljoin
7
8from apify import Actor
9from playwright.async_api import async_playwright
10
11
def extract_state(address: str) -> str:
    """Return the state abbreviation parsed from a US street address.

    Addresses on the clerk sites look like ``123 Main St, Jacksonville FL
    32202``: the state is the second-to-last whitespace token of the last
    comma-separated segment. Returns ``"N/A"`` when the address does not
    match that shape (empty, no comma, or too few tokens after the comma).
    """
    if not address:
        return "N/A"
    parts = address.split(",")
    if len(parts) < 2:
        return "N/A"
    tokens = parts[-1].strip().split()
    if len(tokens) < 2:
        return "N/A"
    return tokens[-2].strip()


async def _apply_sale_search(page, base_url: str) -> None:
    """Open the search page at *base_url* and filter the grid to SALE status."""
    await page.goto(base_url)
    # Pick the date range (4th "from" option, 1st "to" option) and search.
    await page.select_option("#SearchSaleDateFrom", index=3)
    await page.select_option("#SearchSaleDateTo", index=0)
    await page.click("#tabs-9 button")
    # Type "SALE" into the grid's status filter column.
    status_input = page.locator("#gs_Status")
    await status_input.click()
    await status_input.fill("")
    await status_input.type("SALE")
    await page.wait_for_timeout(2000)  # let the grid re-filter


async def _collect_sale_row_ids(page) -> list:
    """Return the numeric row ids of all visible grid rows with SALE status."""
    row_ids = []
    for row in await page.locator("tr[role='row'][id]").all():
        row_id = await row.get_attribute("id")
        if not (row_id and row_id.isdigit()):
            continue
        status = await row.locator("td[aria-describedby='TaxDeed_Status']").text_content()
        # text_content() may return None for an empty cell — guard before strip.
        if (status or "").strip() == "SALE":
            row_ids.append(row_id)
    return row_ids


async def _scrape_detail_page(page, details_url: str) -> dict:
    """Visit a record's details page and extract the fields of interest.

    Returns a dict with "Property Address", "Parcel ID" and "Opening Bid"
    keys, each defaulting to "N/A" when the page does not provide it.
    """
    await page.goto(details_url)
    await page.wait_for_timeout(2000)

    detail_data: dict = {}
    for row in await page.locator("tr:has(td b)").all():
        try:
            key = ((await row.locator("td:nth-child(1) b").text_content()) or "").strip()
            value = ((await row.locator("td:nth-child(2)").text_content()) or "").strip()
        except Exception:
            # Best-effort extraction: skip rows not matching the <b>key</b>/value layout.
            continue
        if key in ("Property Address", "Parcel ID"):
            detail_data[key] = value
        elif key in ("Opening Bid", "Base Bid"):
            # Both labels mean the same thing; normalize to "Opening Bid".
            detail_data["Opening Bid"] = value

    for field in ("Property Address", "Parcel ID", "Opening Bid"):
        detail_data.setdefault(field, "N/A")
    return detail_data


async def main() -> None:
    """Main entry point for the Apify Actor.

    Scrapes tax deed records with SALE status from the configured clerk
    sites and pushes one dataset item per record.
    """
    async with Actor:
        Actor.log.info("Starting the Duval Tax Deed scraper...")

        # Honor the Actor input's start_urls (declared in input_schema.json);
        # fall back to the two known county sites when no input is supplied.
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get("start_urls") or []
        base_urls = [
            item["url"]
            for item in start_urls
            if isinstance(item, dict) and item.get("url")
        ] or [
            "https://taxdeed.duvalclerk.com/",
            "https://landmark.clayclerk.com/TaxDeed/",
        ]

        data_list: list = []

        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(
                headless=Actor.config.headless,
                args=["--disable-gpu"],
            )
            try:
                context = await browser.new_context()
                page = await context.new_page()

                for base_url in base_urls:
                    await _apply_sale_search(page, base_url)

                    # The pager cell holds the total page count; default to a
                    # single page when it is missing or non-numeric.
                    try:
                        total_pages = int((await page.text_content("#sp_1_pager")).strip())
                    except Exception:
                        total_pages = 1

                    county = "Clay County" if "clayclerk" in base_url else "Duval County"

                    for current_page in range(1, total_pages + 1):
                        Actor.log.info(f"Processing page {current_page} of {total_pages}")

                        for row_id in await _collect_sale_row_ids(page):
                            # Details pages live under Home/Details relative to
                            # the site root; urljoin handles both county sites.
                            details_url = urljoin(base_url, f"Home/Details?id={row_id}")
                            Actor.log.info(f"Visiting details URL: {details_url}")

                            detail_data = await _scrape_detail_page(page, details_url)
                            detail_data["County"] = county
                            detail_data["State"] = extract_state(
                                detail_data.get("Property Address", "")
                            )
                            data_list.append(detail_data)

                        if current_page < total_pages:
                            # Visiting detail pages lost the grid state, so the
                            # search must be re-run — which resets the grid to
                            # page 1. Click "next" once per page already
                            # processed to actually reach page current_page + 1
                            # (a single click used to re-scrape page 2 forever).
                            await _apply_sale_search(page, base_url)
                            for _ in range(current_page):
                                await page.click("#next_pager")
                                await page.wait_for_timeout(3000)
            finally:
                # Always release the browser, even if a selector times out.
                await browser.close()

        # Push the collected records to the default Apify dataset.
        if data_list:
            await Actor.push_data(data_list)
            Actor.log.info(f"Successfully pushed {len(data_list)} items to the default dataset.")
        else:
            Actor.log.warning("No data extracted.")
src/py.typed
1
.dockerignore
1.git
2.mise.toml
3.nvim.lua
4storage
5
6# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
7
8# Byte-compiled / optimized / DLL files
9__pycache__/
10*.py[cod]
11*$py.class
12
13# C extensions
14*.so
15
16# Distribution / packaging
17.Python
18build/
19develop-eggs/
20dist/
21downloads/
22eggs/
23.eggs/
24lib/
25lib64/
26parts/
27sdist/
28var/
29wheels/
30share/python-wheels/
31*.egg-info/
32.installed.cfg
33*.egg
34MANIFEST
35
36# PyInstaller
37# Usually these files are written by a python script from a template
38# before PyInstaller builds the exe, so as to inject date/other infos into it.
39*.manifest
40*.spec
41
42# Installer logs
43pip-log.txt
44pip-delete-this-directory.txt
45
46# Unit test / coverage reports
47htmlcov/
48.tox/
49.nox/
50.coverage
51.coverage.*
52.cache
53nosetests.xml
54coverage.xml
55*.cover
56*.py,cover
57.hypothesis/
58.pytest_cache/
59cover/
60
61# Translations
62*.mo
63*.pot
64
65# Django stuff:
66*.log
67local_settings.py
68db.sqlite3
69db.sqlite3-journal
70
71# Flask stuff:
72instance/
73.webassets-cache
74
75# Scrapy stuff:
76.scrapy
77
78# Sphinx documentation
79docs/_build/
80
81# PyBuilder
82.pybuilder/
83target/
84
85# Jupyter Notebook
86.ipynb_checkpoints
87
88# IPython
89profile_default/
90ipython_config.py
91
92# pyenv
93# For a library or package, you might want to ignore these files since the code is
94# intended to run in multiple environments; otherwise, check them in:
95.python-version
96
97# pdm
98# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
99#pdm.lock
100# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
101# in version control.
102# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
103.pdm.toml
104.pdm-python
105.pdm-build/
106
107# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
108__pypackages__/
109
110# Celery stuff
111celerybeat-schedule
112celerybeat.pid
113
114# SageMath parsed files
115*.sage.py
116
117# Environments
118.env
119.venv
120env/
121venv/
122ENV/
123env.bak/
124venv.bak/
125
126# Spyder project settings
127.spyderproject
128.spyproject
129
130# Rope project settings
131.ropeproject
132
133# mkdocs documentation
134/site
135
136# mypy
137.mypy_cache/
138.dmypy.json
139dmypy.json
140
141# Pyre type checker
142.pyre/
143
144# pytype static type analyzer
145.pytype/
146
147# Cython debug symbols
148cython_debug/
149
150# PyCharm
151# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
152# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
153# and can be added to the global gitignore or merged into this file. For a more nuclear
154# option (not recommended) you can uncomment the following to ignore the entire idea folder.
155.idea/
.gitignore
1.mise.toml
2.nvim.lua
3storage
4
5# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
6
7# Byte-compiled / optimized / DLL files
8__pycache__/
9*.py[cod]
10*$py.class
11
12# C extensions
13*.so
14
15# Distribution / packaging
16.Python
17build/
18develop-eggs/
19dist/
20downloads/
21eggs/
22.eggs/
23lib/
24lib64/
25parts/
26sdist/
27var/
28wheels/
29share/python-wheels/
30*.egg-info/
31.installed.cfg
32*.egg
33MANIFEST
34
35# PyInstaller
36# Usually these files are written by a python script from a template
37# before PyInstaller builds the exe, so as to inject date/other infos into it.
38*.manifest
39*.spec
40
41# Installer logs
42pip-log.txt
43pip-delete-this-directory.txt
44
45# Unit test / coverage reports
46htmlcov/
47.tox/
48.nox/
49.coverage
50.coverage.*
51.cache
52nosetests.xml
53coverage.xml
54*.cover
55*.py,cover
56.hypothesis/
57.pytest_cache/
58cover/
59
60# Translations
61*.mo
62*.pot
63
64# Django stuff:
65*.log
66local_settings.py
67db.sqlite3
68db.sqlite3-journal
69
70# Flask stuff:
71instance/
72.webassets-cache
73
74# Scrapy stuff:
75.scrapy
76
77# Sphinx documentation
78docs/_build/
79
80# PyBuilder
81.pybuilder/
82target/
83
84# Jupyter Notebook
85.ipynb_checkpoints
86
87# IPython
88profile_default/
89ipython_config.py
90
91# pyenv
92# For a library or package, you might want to ignore these files since the code is
93# intended to run in multiple environments; otherwise, check them in:
94.python-version
95
96# pdm
97# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
98#pdm.lock
99# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
100# in version control.
101# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
102.pdm.toml
103.pdm-python
104.pdm-build/
105
106# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
107__pypackages__/
108
109# Celery stuff
110celerybeat-schedule
111celerybeat.pid
112
113# SageMath parsed files
114*.sage.py
115
116# Environments
117.env
118.venv
119env/
120venv/
121ENV/
122env.bak/
123venv.bak/
124
125# Spyder project settings
126.spyderproject
127.spyproject
128
129# Rope project settings
130.ropeproject
131
132# mkdocs documentation
133/site
134
135# mypy
136.mypy_cache/
137.dmypy.json
138dmypy.json
139
140# Pyre type checker
141.pyre/
142
143# pytype static type analyzer
144.pytype/
145
146# Cython debug symbols
147cython_debug/
148
149# PyCharm
150# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
151# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
152# and can be added to the global gitignore or merged into this file. For a more nuclear
153# option (not recommended) you can uncomment the following to ignore the entire idea folder.
154.idea/
155
156# Zed editor
157# Ignores the folder created when setting Project Settings in the Zed editor. Can be commented out
158# to share Project Settings within a team
159.zed
requirements.txt
1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify < 3.0
5playwright
Pricing
Pricing model
Pay per usage. This Actor is paid per platform usage: the Actor itself is free to use, and you only pay for the Apify platform resources it consumes.