1import json
2import os
3import time
4import urllib.request
5import urllib.parse
6import asyncio
7from typing import Optional
8
9from apify import Actor
10
11
12def _split_base_url(raw: str) -> tuple[str, str]:
13 """Split APIFY_YC_BASE_URL into (origin, path_prefix).
14 origin = scheme://host[:port]
15 path_prefix = normalized leading path, e.g. '' or '/api' or '/custom/yc'
16 """
17 parsed = urllib.parse.urlparse(raw)
18 if not parsed.scheme or not parsed.netloc:
19 raise RuntimeError(f"Invalid APIFY_YC_BASE_URL: {raw}")
20 origin = f"{parsed.scheme}://{parsed.netloc}"
21
22 prefix = parsed.path or ''
23 if prefix and not prefix.startswith('/'):
24 prefix = '/' + prefix
25 if prefix.endswith('/') and len(prefix) > 1:
26 prefix = prefix[:-1]
27 return origin, prefix
28
29
def http_request(base_url: str, method: str, path: str, headers: dict, data: Optional[dict] = None):
    """Send an HTTP request with an optional JSON body and decode a JSON reply.

    Args:
        base_url: URL the request path is resolved against; a trailing slash
            is optional.
        method: HTTP method, e.g. ``'GET'`` or ``'POST'``.
        path: Path relative to ``base_url``; leading slashes are ignored.
        headers: Request headers.  The mapping is copied and never modified.
        data: Optional payload.  When given it is serialized as UTF-8 JSON and
            ``Content-Type: application/json`` is added unless the caller
            already supplied a ``Content-Type`` header.

    Returns:
        A tuple ``(status, body)`` where ``status`` is the value of
        ``resp.getcode()`` and ``body`` is the decoded JSON document, or
        ``None`` when the response body is empty.

    Raises:
        urllib.error.URLError: Raised by ``urlopen`` on connection problems;
            its subclass ``HTTPError`` is raised for HTTP error statuses, so
            those are not returned as a status code.
        json.JSONDecodeError: If a non-empty response body is not valid JSON.
    """
    url = urllib.parse.urljoin(base_url.rstrip('/') + '/', path.lstrip('/'))

    # Work on a private copy: adding Content-Type must not leak into the
    # caller's dict, which may be reused for later requests without a body.
    request_headers = dict(headers or {})

    body_bytes = None
    if data is not None:
        body_bytes = json.dumps(data).encode('utf-8')
        request_headers.setdefault('Content-Type', 'application/json')

    req = urllib.request.Request(url=url, data=body_bytes, headers=request_headers, method=method)
    with urllib.request.urlopen(req, timeout=60) as resp:
        resp_body = resp.read().decode('utf-8')
        status = resp.getcode()
    return status, (json.loads(resp_body) if resp_body else None)
41
42
async def main() -> None:
    """Trigger a YC search job, poll it until it finishes, and store the outcome.

    Reads ``url`` from the Actor input and the ``APIFY_YC_BASE_URL``,
    ``APIFY_YC_KEY_ID`` and ``APIFY_YC_KEY_SECRET`` environment variables,
    POSTs the URL to ``<prefix>/yc/search`` and then polls
    ``<prefix>/yc/async/job/<job_id>`` every 15 seconds for up to 15 minutes.

    When the job reports ``completed`` or ``failed`` its payload is stored
    under the ``RESULT`` key; for ``completed`` the payload's ``companies``
    value is additionally passed to ``Actor.push_data``.  If the deadline
    passes first, a summary with the job id, the last seen status and
    ``message='timeout'`` is stored under ``RESULT`` instead.

    Raises:
        RuntimeError: If the input has no ``url``, a required environment
            variable is missing, the base URL is invalid, or the trigger
            response is not a 200 with a ``job_id``.
    """
    async with Actor:
        input_ = await Actor.get_input() or {}
        search_url: str = input_.get('url')
        if not search_url:
            raise RuntimeError('Input must contain "url"')

        base_url = os.getenv('APIFY_YC_BASE_URL')
        key_id = os.getenv('APIFY_YC_KEY_ID')
        key_secret = os.getenv('APIFY_YC_KEY_SECRET')
        if not (base_url and key_id and key_secret):
            raise RuntimeError('Missing APIFY_YC_BASE_URL, APIFY_YC_KEY_ID or APIFY_YC_KEY_SECRET environment variables')

        try:
            base_origin, path_prefix = _split_base_url(base_url)
        except Exception as e:
            # Keep the original error attached for debugging.
            raise RuntimeError(str(e)) from e

        auth_headers = {'Authorization': f'Bearer {key_id}:{key_secret}'}

        Actor.log.info('Triggering YC search job...')

        # http_request blocks (up to its 60 s timeout); run it in a worker
        # thread so the event loop stays responsive.  A fresh headers dict is
        # passed on every call so no call can affect the next one.
        status, payload = await asyncio.to_thread(
            http_request,
            base_origin, 'POST', path_prefix + '/yc/search',
            dict(auth_headers),
            {'url': search_url},
        )
        if status != 200 or not isinstance(payload, dict) or 'job_id' not in payload:
            raise RuntimeError(f'Failed to trigger search job: status={status}, payload={payload}')

        job_id = payload['job_id']
        Actor.log.info(f'Job queued: {job_id}')

        status_path = f'{path_prefix}/yc/async/job/{job_id}'
        interval = 15
        # Monotonic clock: the 15-minute budget must not be affected by
        # system clock adjustments.
        deadline = time.monotonic() + 15 * 60
        last_status = None
        while time.monotonic() < deadline:
            Actor.log.debug(f"GET path={status_path}")
            try:
                status_code, job_payload = await asyncio.to_thread(
                    http_request,
                    base_origin, 'GET', status_path,
                    dict(auth_headers),
                )
            except Exception as e:
                # Best effort: a failed poll is logged and retried until the
                # deadline instead of aborting the run.
                Actor.log.warning(f'Error polling job status: {e}')
                await asyncio.sleep(interval)
                continue

            if status_code != 200 or not isinstance(job_payload, dict) or not job_payload:
                Actor.log.warning(f'Unexpected job status response: {status_code} {job_payload}')
                await asyncio.sleep(interval)
                continue

            last_status = job_payload.get('status')
            Actor.log.info(f'Job status: {last_status}')
            if last_status in ('completed', 'failed'):
                await Actor.set_value('RESULT', job_payload)
                if last_status == 'failed':
                    Actor.log.error('Job failed.')
                else:
                    companies = job_payload.get("companies") or []
                    await Actor.push_data(companies)
                    Actor.log.info('Job completed successfully.')
                return

            await asyncio.sleep(interval)

        Actor.log.error('Job did not complete within timeout. Returning last known status.')
        await Actor.set_value('RESULT', {
            'job_id': job_id,
            'status': last_status or 'unknown',
            'message': 'timeout',
        })