AI Job Finder
stefanie-rink/ai-job-finder
Give a prompt or a CV and find jobs that match your requirements.
requirements.txt
apify>=2.0.0,<3.0.0  # Apify SDK, required for `from apify import Actor` in src/main.py
apify-client>=1.0.0,<2.0.0
openai>=1.0.0
anthropic>=0.5.0
google-generativeai>=0.3.0  # gemini-1.5-* models need a recent version of this SDK
pydantic>=2.0.0
test.py
#!/usr/bin/env python3
import asyncio
import json
import sys
import os
from typing import Dict, Any, Optional

# Import from our modules
from src.llm_providers.factory import create_llm_provider
from src.cv_processor import process_cv
from src.prompt_processor import process_prompt
from src.parameter_handler import apply_parameter_defaults

async def test_cv_processing():
    """Test CV processing with a local file"""
    # Check if file path was provided
    if len(sys.argv) < 2:
        print("Usage: python test.py path/to/cv.pdf [prompt]")
        sys.exit(1)

    # Get CV file path and optional prompt
    cv_path = sys.argv[1]
    prompt = sys.argv[2] if len(sys.argv) > 2 else None

    # Check if API key is set
    openai_key = os.environ.get("OPENAI_API_KEY")
    if not openai_key:
        print("ERROR: OPENAI_API_KEY environment variable not set.")
        print("Please set it with: export OPENAI_API_KEY=your-api-key")
        sys.exit(1)

    # Read CV file
    try:
        with open(cv_path, "rb") as f:
            cv_data = f.read()

        # Convert to base64 for testing
        import base64
        import mimetypes
        mime_type, _ = mimetypes.guess_type(cv_path)
        if not mime_type:
            mime_type = "application/octet-stream"

        cv_data_base64 = f"data:{mime_type};base64,{base64.b64encode(cv_data).decode('utf-8')}"
    except Exception as e:
        print(f"Error reading CV file: {str(e)}")
        sys.exit(1)

    # Create LLM provider
    provider = create_llm_provider("openai", openai_key)

    # Process CV
    print("Processing CV...")
    cv_parameters = await process_cv(cv_data_base64, provider, "openai")
    print(f"CV Parameters: {json.dumps(cv_parameters, indent=2)}")

    # Process prompt if provided
    prompt_parameters = {}
    if prompt:
        print("\nProcessing prompt...")
        prompt_parameters = await process_prompt(prompt, provider)
        print(f"Prompt Parameters: {json.dumps(prompt_parameters, indent=2)}")

    # Merge and apply defaults
    parameters = {**cv_parameters, **prompt_parameters}
    final_parameters = apply_parameter_defaults(parameters)

    print("\nFinal LinkedIn Search Parameters:")
    print(json.dumps(final_parameters, indent=2))

    # Note: This test doesn't actually call the LinkedIn scraper
    print("\nTest complete. To perform a real LinkedIn search, upload this Actor to Apify.")

if __name__ == "__main__":
    asyncio.run(test_cv_processing())
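For a quick local check (the file path and prompt below are only examples), the script is invoked as described in its usage message:

```
export OPENAI_API_KEY=sk-...   # placeholder key
python test.py ./my_cv.pdf "Remote senior Python roles at AI companies"
```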
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "ai-job-finder",
    "title": "AI Job Finder",
    "description": "An AI-powered tool that reads a CV and/or prompt to find relevant jobs on LinkedIn",
    "version": "0.1",
    "buildTag": "latest",
    "meta": {
        "templateId": "python-apify"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/Dockerfile
# First specify the base Docker image.
FROM apify/actor-python:3.12

# Copy requirements.txt into the Actor image
COPY requirements.txt ./

# Install the packages specified in requirements.txt
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Copy the remaining files and directories with the source code
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor
CMD ["python3", "-m", "src"]
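As a rough local sketch (not part of the repository), the image can be built from the repository root, since the build context must include requirements.txt and src/; actual runs are normally handled by the Apify platform or the Apify CLI:

```
# build the Actor image locally (the tag name is arbitrary)
docker build -f .actor/Dockerfile -t ai-job-finder .

# or, with the Apify CLI installed, run the Actor locally
apify run
```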
.actor/INPUT_SCHEMA.json
{
    "title": "AI Job Finder",
    "description": "An AI-powered tool that reads a CV and/or prompt to find relevant jobs on LinkedIn",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "cv": {
            "title": "CV/Resume",
            "type": "object",
            "description": "Upload your CV/resume (PDF, DOCX, TXT formats supported) as Base64 encoded string",
            "editor": "file",
            "nullable": true
        },
        "prompt": {
            "title": "Job Search Query",
            "type": "string",
            "description": "Describe the job you're looking for (e.g., 'Senior Python Developer in New York')",
            "editor": "textarea",
            "default": "I'm looking for remote senior software engineering roles in AI companies. I have 5 years of experience with Python and machine learning.",
            "nullable": true
        },
        "llm_settings": {
            "title": "LLM Provider Settings",
            "type": "object",
            "description": "Configure which LLM provider to use",
            "editor": "json",
            "default": {
                "provider": "gemini",
                "model": "gemini-1.5-pro"
            },
            "prefill": {
                "provider": "gemini",
                "model": "gemini-1.5-pro"
            }
        },
        "api_keys": {
            "title": "API Keys",
            "type": "object",
            "description": "API keys for LLM providers (optional - defaults to environment variables)",
            "editor": "json",
            "default": {},
            "prefill": {
                "openai": "",
                "claude": "",
                "gemini": ""
            }
        },
        "linkedin_search_params": {
            "title": "Additional LinkedIn Search Parameters",
            "type": "object",
            "description": "Override specific LinkedIn search parameters",
            "editor": "json",
            "nullable": true
        },
        "proxy": {
            "title": "Proxy Configuration",
            "type": "object",
            "description": "Configure Apify proxy for LinkedIn scraping",
            "editor": "proxy",
            "default": {
                "useApifyProxy": true,
                "apifyProxyGroups": ["RESIDENTIAL"]
            }
        }
    },
    "required": []
}
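For illustration, a minimal run input that conforms to this schema might look like the following (all values, including the key placeholder, are hypothetical):

```json
{
    "prompt": "Senior Python developer, remote, AI companies, 5 years of experience",
    "llm_settings": {
        "provider": "gemini",
        "model": "gemini-1.5-pro"
    },
    "api_keys": {
        "gemini": "<YOUR_GEMINI_API_KEY>"
    },
    "linkedin_search_params": {
        "rows": 25
    },
    "proxy": {
        "useApifyProxy": true,
        "apifyProxyGroups": ["RESIDENTIAL"]
    }
}
```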
src/cv_processor.py
import logging
import base64
import json
import re
from typing import Dict, Any, Optional

logger = logging.getLogger(__name__)

async def process_cv(cv_data: str, llm_provider, provider_name: str) -> Dict[str, Any]:
    """
    Process CV data using the appropriate LLM provider

    Args:
        cv_data: CV data (either base64 encoded file or plain text)
        llm_provider: The LLM provider instance to use
        provider_name: Name of the provider ('openai', 'claude', or 'gemini')

    Returns:
        Dictionary of extracted parameters for LinkedIn job search
    """
    try:
        logger.info(f"Processing CV with {provider_name} provider")

        # Process CV with the provider
        cv_parameters = await llm_provider.process_cv(cv_data)

        # Validate and clean the parameters
        cv_parameters = validate_cv_parameters(cv_parameters)

        logger.info(f"Successfully extracted parameters from CV: {json.dumps(cv_parameters, indent=2)}")
        return cv_parameters

    except Exception as e:
        logger.error(f"Error processing CV: {str(e)}")
        # Return empty parameters, which will use defaults later
        return {}

def validate_cv_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate and clean the parameters extracted from the CV

    Args:
        parameters: Raw parameters extracted by the LLM

    Returns:
        Cleaned and validated parameters
    """
    cleaned = {}

    # Clean and validate title
    if "title" in parameters and parameters["title"]:
        cleaned["title"] = str(parameters["title"]).strip()

    # Clean and validate location
    if "location" in parameters and parameters["location"]:
        cleaned["location"] = str(parameters["location"]).strip()

    # Clean and validate experienceLevel
    if "experienceLevel" in parameters and parameters["experienceLevel"]:
        exp_level = str(parameters["experienceLevel"]).strip()
        # Ensure it's a number from 1-5
        if exp_level in ["1", "2", "3", "4", "5"]:
            cleaned["experienceLevel"] = exp_level

    # Clean and validate workType
    if "workType" in parameters and parameters["workType"]:
        work_type = str(parameters["workType"]).strip()
        # Ensure it's a valid work type (1, 2, or 3)
        if work_type in ["1", "2", "3"]:
            cleaned["workType"] = work_type

    # Clean and validate contractType
    if "contractType" in parameters and parameters["contractType"]:
        contract_type = str(parameters["contractType"]).strip().upper()
        # Ensure it's a valid contract type (F, P, C, T, I, or V)
        if contract_type in ["F", "P", "C", "T", "I", "V"]:
            cleaned["contractType"] = contract_type

    # Clean and validate skills (might be used for custom filtering later)
    if "skills" in parameters and isinstance(parameters["skills"], list):
        cleaned["skills"] = [str(skill).strip() for skill in parameters["skills"] if skill]

    return cleaned
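As a quick illustration of the cleaning rules above (the values are hypothetical), a raw LLM response is reduced to only the valid fields:

```python
raw = {
    "title": " Senior Data Scientist ",
    "location": "Berlin, Germany",
    "experienceLevel": "7",            # outside 1-5, dropped
    "workType": "2",                   # 2 = Remote, kept
    "contractType": "f",               # normalized to "F"
    "skills": ["Python", "", "SQL"],   # empty entries dropped
}
validate_cv_parameters(raw)
# -> {"title": "Senior Data Scientist", "location": "Berlin, Germany",
#     "workType": "2", "contractType": "F", "skills": ["Python", "SQL"]}
```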
src/main.py
#!/usr/bin/env python3
from apify import Actor
import logging
import json
import base64
import re
import os
from typing import Dict, List, Any, Optional

# Import providers
from .llm_providers.factory import create_llm_provider
from .cv_processor import process_cv
from .prompt_processor import process_prompt
from .parameter_handler import apply_parameter_defaults

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

async def main():
    """Main entry point for the Actor"""
    # Initialize the Actor
    await Actor.init()

    # Get input from the actor
    actor_input = await Actor.get_input() or {}

    # Validate input - require at least CV or prompt
    cv_data = actor_input.get("cv")
    prompt = actor_input.get("prompt")

    if not cv_data and not prompt:
        raise ValueError("At least one of CV or prompt must be provided")

    # Get LLM settings
    llm_settings = actor_input.get("llm_settings", {"provider": "gemini", "model": "gemini-1.5-pro"})
    provider_name = llm_settings.get("provider", "gemini")

    # Get API key - first from input, then from environment variables
    api_keys = actor_input.get("api_keys", {})
    api_key = api_keys.get(provider_name)

    # If no API key in input, try to get from environment variables
    if not api_key:
        if provider_name == "openai":
            api_key = os.getenv("OPENAI_API_KEY")
        elif provider_name == "gemini":
            api_key = os.getenv("GEMINI_API_KEY")
        elif provider_name == "claude":
            api_key = os.getenv("CLAUDE_API_KEY")

    # If no API key was found, we can't proceed with LLM processing
    if not api_key:
        logger.warning(f"No API key provided for {provider_name}")
        await Actor.push_data([{
            "title": "LLM API KEY IS NEEDED",
            "description": f"Please provide an API key for {provider_name.upper()} to use this Actor",
            "instructions": f"Set the {provider_name.upper()}_API_KEY environment variable or provide it in the api_keys input parameter",
            "location": "N/A",
            "companyName": "AI Job Finder",
            "experienceLevel": "N/A",
            "workType": "N/A",
            "contractType": "N/A",
            "publishedAt": "N/A",
            "message": f"API key for {provider_name} is required to get real results"
        }])
        logger.info("Returned message indicating API key is needed")
        return

    # Create LLM provider for processing
    model = llm_settings.get("model")
    if provider_name == "gemini" and not model:
        model = "gemini-1.5-pro"

    logger.info(f"Using LLM provider: {provider_name} with model: {model}")
    llm_provider = create_llm_provider(provider_name, api_key, model)

    # Process parameters
    parameters = {}

    # Extract parameters from CV and/or prompt
    if cv_data:
        logger.info("Processing CV...")
        cv_parameters = await process_cv(cv_data, llm_provider, provider_name)
        parameters.update(cv_parameters)

    if prompt:
        logger.info("Processing prompt...")
        try:
            prompt_parameters = await process_prompt(prompt, llm_provider)
            # Prompt parameters override CV parameters
            parameters.update(prompt_parameters)
        except Exception as e:
            logger.error(f"Error processing prompt: {str(e)}")
            # Continue with default parameters

    # Apply any explicit parameters from input
    linkedin_params = actor_input.get("linkedin_search_params", {})
    if linkedin_params:
        parameters.update(linkedin_params)

    # Apply defaults for missing parameters
    parameters = apply_parameter_defaults(parameters)

    # Set proxy configuration
    if "proxy_configuration" in actor_input:
        parameters["proxy"] = actor_input["proxy_configuration"]
    elif "proxy" in actor_input:
        parameters["proxy"] = actor_input["proxy"]

    # Log the parameters we'll use
    logger.info(f"Using LinkedIn search parameters: {json.dumps(parameters, indent=2)}")

    # Call LinkedIn scraper
    logger.info("Calling LinkedIn scraper with parameters")
    try:
        jobs = await call_linkedin_scraper(parameters)

        # Save output
        await Actor.push_data(jobs)
        logger.info(f"Found {len(jobs)} matching jobs")
    except Exception as e:
        logger.error(f"Error calling LinkedIn scraper: {str(e)}")
        # Return a meaningful error to the user
        await Actor.push_data([{
            "title": "Error Connecting to LinkedIn Scraper",
            "description": f"An error occurred while trying to connect to the LinkedIn Jobs Scraper: {str(e)}",
            "error": True,
            "parameters": parameters
        }])

async def call_linkedin_scraper(parameters):
    """Call the LinkedIn scraper with the given parameters"""
    # Prepare the Actor input
    run_input = {
        "title": parameters.get("title", ""),
        "location": parameters.get("location", ""),
        "companyName": parameters.get("companyName", []),
        "companyId": parameters.get("companyId", []),
        "workType": parameters.get("workType", ""),
        "experienceLevel": parameters.get("experienceLevel", ""),
        "contractType": parameters.get("contractType", ""),
        "publishedAt": parameters.get("publishedAt", ""),
        "rows": parameters.get("rows", 10),
        "proxy": parameters.get("proxy", {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"]
        })
    }

    # Run the Actor and wait for it to finish using Actor.apify_client
    # This automatically handles the authentication - no need for explicit API key
    run = await Actor.apify_client.actor("BHzefUZlZRKWxkTck").call(run_input=run_input)

    # Fetch and return the Actor's output
    dataset_items = await Actor.apify_client.dataset(run["defaultDatasetId"]).list_items()
    return dataset_items.items
src/parameter_handler.py
import logging
from typing import Dict, Any

logger = logging.getLogger(__name__)

def apply_parameter_defaults(parameters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply default values for missing parameters

    Args:
        parameters: Current set of parameters

    Returns:
        Parameters with defaults applied
    """
    # Create a copy of the parameters to avoid modifying the original
    final_params = parameters.copy()

    # Check for title (required parameter)
    if "title" not in final_params or not final_params["title"]:
        final_params["title"] = "Software Engineer"  # Default job title
        logger.info("Using default job title: 'Software Engineer'")

    # Set default location if not provided
    if "location" not in final_params or not final_params["location"]:
        final_params["location"] = "United States"  # Country is required, default to United States
        logger.info("Using default location: United States")

    # Set default experience level if not provided
    if "experienceLevel" not in final_params or not final_params["experienceLevel"]:
        final_params["experienceLevel"] = "3"  # Associate
        logger.info("Using default experience level: 3 (Associate)")

    # Set default work type if not provided
    if "workType" not in final_params or not final_params["workType"]:
        final_params["workType"] = ""  # Empty string means any work type
        logger.info("Using default work type: any")

    # Set default contract type if not provided
    if "contractType" not in final_params or not final_params["contractType"]:
        final_params["contractType"] = "F"  # Full-time
        logger.info("Using default contract type: F (Full-Time)")

    # Set default published at if not provided
    if "publishedAt" not in final_params or not final_params["publishedAt"]:
        final_params["publishedAt"] = ""  # Empty string means any time
        logger.info("Using default time frame: any time")

    # Set default company name if not provided
    if "companyName" not in final_params or not final_params["companyName"]:
        final_params["companyName"] = []  # Empty list means any company
        logger.info("Using default company name: any company")

    # Set default company ID if not provided
    if "companyId" not in final_params or not final_params["companyId"]:
        final_params["companyId"] = []  # Empty list means any company ID
        logger.info("Using default company ID: any company ID")

    # Set default rows if not provided
    if "rows" not in final_params or not final_params["rows"]:
        final_params["rows"] = 10  # Default to 10 results
        logger.info("Using default rows: 10")

    # Ensure we have proper proxy configuration
    if "proxy" not in final_params or not final_params["proxy"]:
        final_params["proxy"] = {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"]
        }
        logger.info("Using default proxy configuration")

    return final_params
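Called with a minimal parameter set, the function fills in the defaults documented above; for instance (illustrative values):

```python
apply_parameter_defaults({"title": "Data Scientist", "location": "Berlin"})
# -> {
#      "title": "Data Scientist",
#      "location": "Berlin",
#      "experienceLevel": "3",
#      "workType": "",
#      "contractType": "F",
#      "publishedAt": "",
#      "companyName": [],
#      "companyId": [],
#      "rows": 10,
#      "proxy": {"useApifyProxy": True, "apifyProxyGroups": ["RESIDENTIAL"]}
#    }
```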
src/prompt_processor.py
import logging
import json
from typing import Dict, Any, Optional

logger = logging.getLogger(__name__)

async def process_prompt(prompt: str, llm_provider) -> Dict[str, Any]:
    """
    Process user prompt and extract job search parameters

    Args:
        prompt: User's job search query
        llm_provider: The LLM provider instance to use

    Returns:
        Dictionary of extracted parameters for LinkedIn job search
    """
    try:
        logger.info("Processing user prompt")

        # Process prompt with the provider
        prompt_parameters = await llm_provider.process_prompt(prompt)

        # Validate and clean the parameters
        prompt_parameters = validate_prompt_parameters(prompt_parameters)

        logger.info(f"Successfully extracted parameters from prompt: {json.dumps(prompt_parameters, indent=2)}")
        return prompt_parameters

    except Exception as e:
        logger.error(f"Error processing prompt: {str(e)}")
        # Return empty parameters, which will use defaults later
        return {}

def validate_prompt_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate and clean the parameters extracted from the prompt

    Args:
        parameters: Raw parameters extracted by the LLM

    Returns:
        Cleaned and validated parameters
    """
    cleaned = {}

    # Clean and validate title
    if "title" in parameters and parameters["title"]:
        cleaned["title"] = str(parameters["title"]).strip()

    # Clean and validate location
    if "location" in parameters and parameters["location"]:
        cleaned["location"] = str(parameters["location"]).strip()

    # Clean and validate experienceLevel
    if "experienceLevel" in parameters and parameters["experienceLevel"]:
        exp_level = str(parameters["experienceLevel"]).strip()
        # Ensure it's a number from 1-5
        if exp_level in ["1", "2", "3", "4", "5"]:
            cleaned["experienceLevel"] = exp_level

    # Clean and validate workType
    if "workType" in parameters and parameters["workType"]:
        work_type = str(parameters["workType"]).strip()
        # Ensure it's a valid work type (1, 2, or 3)
        if work_type in ["1", "2", "3"]:
            cleaned["workType"] = work_type

    # Clean and validate contractType
    if "contractType" in parameters and parameters["contractType"]:
        contract_type = str(parameters["contractType"]).strip().upper()
        # Ensure it's a valid contract type (F, P, C, T, I, or V)
        if contract_type in ["F", "P", "C", "T", "I", "V"]:
            cleaned["contractType"] = contract_type

    # Clean and validate publishedAt
    if "publishedAt" in parameters and parameters["publishedAt"]:
        published_at = str(parameters["publishedAt"]).strip()
        # Ensure it's a valid time frame
        if published_at in ["r86400", "r604800", "r2592000", ""]:
            cleaned["publishedAt"] = published_at

    # Clean and validate rows
    if "rows" in parameters and parameters["rows"]:
        try:
            rows = int(parameters["rows"])
            if rows > 0:
                cleaned["rows"] = rows
        except (ValueError, TypeError):
            pass

    # Clean and validate companyName
    if "companyName" in parameters and isinstance(parameters["companyName"], list):
        cleaned["companyName"] = [str(company).strip() for company in parameters["companyName"] if company]

    # Clean and validate companyId
    if "companyId" in parameters and isinstance(parameters["companyId"], list):
        cleaned["companyId"] = [str(company_id).strip() for company_id in parameters["companyId"] if company_id]

    return cleaned
src/__init__.py
# AI Job Finder package
src/__main__.py
import asyncio

from .main import main

# Execute the Actor entrypoint
asyncio.run(main())
example/advanced-reddit-scraper/.dockerignore
.git
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
example/advanced-reddit-scraper/.gitignore
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Added by Apify CLI
node_modules
example/advanced-reddit-scraper/README.md
# 🚀 Advanced Reddit Scraper for Apify Actors - Lightning Fast & SEO Optimized

Unlock the full potential of Reddit data with our Advanced Reddit Scraper designed for the Apify platform. This high-performance tool uses lightning-fast requests to extract extensive subreddit information, providing researchers, marketers, and data enthusiasts with unparalleled social media insights.

## 📊 Comprehensive Reddit Data Extraction

Our Reddit Scraper offers a robust set of features that allow you to collect detailed data from any subreddit using Apify's powerful actor architecture. Enjoy rapid scraping with optimal performance while taking advantage of customizable settings tailored for your data requirements.

### 🔥 Key SEO-Optimized Features

- **Full Subreddit Scraping**: Extract every detail from the target subreddits, capturing posts, comments, and metadata.
- **Customizable Data Fields**: Configure exactly what you're after, ensuring that you only get the data that matters.
- **Lightning Fast Performance**: Utilizes Python requests for rapid data retrieval, so you never miss a trending topic.
- **Scalable Data Collection**: Effortlessly scrape multiple subreddits simultaneously, ideal for large-scale data mining.
- **Real-Time Insights**: Obtain the most current Reddit information, perfect for real-time analytics and trend monitoring.
- **Easy Integration with Data Pipelines**: Seamlessly export data in various formats (JSON, CSV, etc.) for immediate analysis.

### 🌟 Use Cases for Maximum Impact

1. **Market Research & Trend Analysis**: Monitor public opinion and identify trending topics across subreddits.
2. **Content Creation & Optimization**: Discover viral posts and themes to inspire your content strategy.
3. **Sentiment Analysis**: Analyze user reactions and sentiments using detailed comment extraction.
4. **Competitive Intelligence**: Stay ahead by tracking competitor mentions and industry-specific discussions.
5. **Academic & Social Media Research**: Gather comprehensive data for scholarly studies and social behavior analysis.

### 🛠️ How It Works

1. **Input Parameters** (see the example input after this list):
   - **Queries**: Provide one or more subreddit URLs in the format `https://reddit.com/r/<subreddit>`.
   - **Post Sorting**: Choose how posts are sorted (e.g., `hot`, `new`, `top`, or `rising`).
   - **Top Period**: Specify the period for top posts (e.g., `day`, `week`, or `all`).
   - **Max Posts**: Set the maximum number of posts to scrape per subreddit.
   - **Comment Sorting**: Define the method to sort comments (e.g., `best`, `top`, `new`).
   - **Number of Comments**: Determine how many comments (and a few nested replies) to extract per post.
2. **Execution**: Our scraper efficiently navigates Reddit using HTTP requests, ensuring quick and reliable data extraction while strictly following Reddit's guidelines.
3. **Output**: Receive clean, structured data ready for analysis and integration into your existing workflows.
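
For example, a typical run input covering these parameters might look like the following (the values are illustrative; the field names match `.actor/input_schema.json`):

```json
{
    "queries": ["https://reddit.com/r/AskReddit"],
    "postSort": "top",
    "topPeriod": "week",
    "limit": 10,
    "commentSort": "top",
    "numComments": 5
}
```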

### 📈 Why Our Reddit Scraper Stands Out

- **Comprehensive Data Collection**: Capture every available piece of information from the subreddits you track.
- **High-Speed Requests**: Leveraging the fastest possible scraping techniques to give you immediate insights.
- **Customizable & Flexible**: Tailor the scraping process to meet diverse and specific data needs.
- **Enterprise-Grade Scalability**: Perfect for both small-scale projects and large-scale data operations.
- **Ethical & Compliant**: Adheres to Reddit's data usage policies, including respecting robots.txt and API guidelines.

### 🔗 Essential Resources

- [Apify Platform](https://apify.com)
- [Actor Documentation](https://docs.apify.com/actors)
- [API Reference](https://docs.apify.com/api/v2)

### 📞 Expert Support When You Need It

For further assistance or inquiries, feel free to reach out:
- 📧 Email: tnot2652@gmail.com

### 🚀 Ready to Upgrade Your Data Game?

Don't miss out on vital Reddit insights. Enhance your data strategy and make informed decisions with our Advanced Reddit Scraper. Start scraping smarter and faster on Apify today!
example/advanced-reddit-scraper/requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify == 2.2.1
beautifulsoup4[lxml]
httpx
types-beautifulsoup4
src/llm_providers/base_provider.py
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional

class LLMProvider(ABC):
    """Base abstract class for LLM providers"""

    def __init__(self, api_key: str, model: Optional[str] = None):
        """
        Initialize the LLM provider

        Args:
            api_key: API key for the provider
            model: Optional specific model to use
        """
        self.api_key = api_key
        self.model = model

    @abstractmethod
    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
        """
        Process CV data and extract job search parameters

        Args:
            cv_data: CV content (could be base64 encoded file or text)

        Returns:
            Dictionary of extracted parameters
        """
        pass

    @abstractmethod
    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
        """
        Process user prompt and extract job search parameters

        Args:
            prompt: User's job search query

        Returns:
            Dictionary of extracted parameters
        """
        pass

    @abstractmethod
    async def validate_api_key(self) -> bool:
        """
        Validate that the API key is correct

        Returns:
            True if valid, False otherwise
        """
        pass

    def supports_document_processing(self) -> bool:
        """
        Check if the provider and model support direct document processing

        Returns:
            True if document processing is supported, False otherwise
        """
        return False
src/llm_providers/claude_provider.py
import json
import logging
import re
import base64
from typing import Dict, Any, Optional, List

from anthropic import AsyncAnthropic
from src.llm_providers.base_provider import LLMProvider

logger = logging.getLogger(__name__)

class ClaudeProvider(LLMProvider):
    """Implementation of LLM provider for Anthropic Claude"""

    def __init__(self, api_key: str, model: Optional[str] = None):
        """Initialize the Claude provider"""
        super().__init__(api_key, model)
        self.client = AsyncAnthropic(api_key=api_key)
        self.model = model or "claude-3-opus-20240229"  # Default to most capable model

    def supports_document_processing(self) -> bool:
        """Check if this provider/model supports direct document processing"""
        return "claude-3" in self.model  # All Claude 3 models support document processing

    async def validate_api_key(self) -> bool:
        """Validate the API key by making a simple models call"""
        try:
            # There's no direct way to validate the key without making a message request
            # Use a minimal request to check if the key works
            await self.client.messages.create(
                model=self.model,
                max_tokens=10,
                messages=[{"role": "user", "content": "Hello"}]
            )
            return True
        except Exception as e:
            logger.error(f"Claude API key validation failed: {str(e)}")
            return False

    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
        """
        Process CV with Claude

        Args:
            cv_data: CV content (could be base64 encoded file or text)

        Returns:
            Dictionary of extracted parameters
        """
        if self.supports_document_processing() and cv_data.startswith("data:"):
            return await self._process_cv_with_document_api(cv_data)
        else:
            # Assume it's already text
            return await self._process_cv_text(cv_data)

    async def _process_cv_with_document_api(self, cv_data: str) -> Dict[str, Any]:
        """Process CV using Claude's document capabilities"""
        try:
            # Extract the mime type and base64 data
            mime_type, encoded_data = cv_data.split(';base64,', 1)
            mime_type = mime_type.replace('data:', '')

            response = await self.client.messages.create(
                model=self.model,
                max_tokens=4000,
                system="Extract job search parameters from this CV/resume.",
                messages=[
                    {"role": "user", "content": [
                        {"type": "text", "text": self._get_cv_prompt()},
                        {"type": "image", "source": {
                            "type": "base64",
                            "media_type": mime_type,
                            "data": encoded_data
                        }}
                    ]}
                ]
            )

            # Extract JSON from response
            content = response.content[0].text
            # Find JSON in the content (handle potential text wrapping)
            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
            if json_match:
                return json.loads(json_match.group(1))

            # If no JSON block, try to parse the entire content
            return json.loads(content)
        except Exception as e:
            logger.error(f"Claude document processing failed: {str(e)}")
            raise

    async def _process_cv_text(self, cv_text: str) -> Dict[str, Any]:
        """Process CV text with Claude"""
        try:
            response = await self.client.messages.create(
                model=self.model,
                max_tokens=4000,
                system="Extract job search parameters from this CV/resume.",
                messages=[
                    {"role": "user", "content": self._get_cv_prompt() + f"\n\nCV TEXT:\n{cv_text}"}
                ]
            )

            # Extract JSON from response
            content = response.content[0].text
            # Find JSON in the content (handle potential text wrapping)
            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
            if json_match:
                return json.loads(json_match.group(1))

            # If no JSON block, try to parse the entire content
            return json.loads(content)
        except Exception as e:
            logger.error(f"Claude text processing failed: {str(e)}")
            raise

    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
        """Process user prompt and extract job search parameters"""
        try:
            response = await self.client.messages.create(
                model=self.model,
                max_tokens=4000,
                system="Extract job search parameters from this query.",
                messages=[
                    {"role": "user", "content": self._get_prompt_extraction_prompt() + f"\n\nUSER QUERY:\n{prompt}"}
                ]
            )

            # Extract JSON from response
            content = response.content[0].text
            # Find JSON in the content (handle potential text wrapping)
            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
            if json_match:
                return json.loads(json_match.group(1))

            # If no JSON block, try to parse the entire content
            return json.loads(content)
        except Exception as e:
            logger.error(f"Claude prompt processing failed: {str(e)}")
            raise

    def _get_cv_prompt(self) -> str:
        """Get the prompt for CV analysis"""
        return """
        Extract the following job search parameters from this CV/resume.

        Return your response as valid JSON object inside ```json code blocks with the following structure:

        ```json
        {
            "title": "The most recent job title or professional role",
            "location": "Current or preferred location",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid
                Based on any workstyle preferences found in the CV",
            "contractType": "A single letter representing employment type preference:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "skills": ["list", "of", "key", "technical", "and", "soft", "skills"]
        }
        ```

        If a piece of information is not clearly stated in the CV, make a reasonable inference based on the available information. If inference is not possible, use null.

        IMPORTANT: Your output must be a valid JSON object wrapped in ```json code blocks.
        """

    def _get_prompt_extraction_prompt(self) -> str:
        """Get the prompt for extracting parameters from user query"""
        return """
        Extract LinkedIn job search parameters from this query.

        Return your response as valid JSON object inside ```json code blocks with the following structure:

        ```json
        {
            "title": "Job title or role to search for",
            "location": "Geographic location for job search",
            "companyName": ["array of specific companies mentioned"],
            "companyId": ["array of LinkedIn company IDs if mentioned"],
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "contractType": "A single letter representing employment type:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "publishedAt": "Time frame:
                r86400 = Last 24 hours
                r604800 = Last week
                r2592000 = Last month
                empty string = Any time",
            "rows": "Number of job listings to return (integer)"
        }
        ```

        For any parameters not explicitly mentioned in the query, use null.

        IMPORTANT: Your output must be a valid JSON object wrapped in ```json code blocks.
        """
src/llm_providers/factory.py
import logging
from typing import Optional, Any

logger = logging.getLogger(__name__)

def create_llm_provider(provider_name: str, api_key: str, model: Optional[str] = None) -> Any:
    """
    Create and return an instance of the specified LLM provider.

    Args:
        provider_name: Name of the LLM provider ('openai', 'claude', or 'gemini')
        api_key: API key for the provider
        model: Optional specific model to use

    Returns:
        An instance of the appropriate LLM provider

    Raises:
        ValueError: If the provider is not supported
    """
    if provider_name.lower() == "openai":
        from src.llm_providers.openai_provider import OpenAIProvider
        return OpenAIProvider(api_key, model)
    elif provider_name.lower() == "claude":
        from src.llm_providers.claude_provider import ClaudeProvider
        return ClaudeProvider(api_key, model)
    elif provider_name.lower() == "gemini":
        from src.llm_providers.gemini_provider import GeminiProvider
        return GeminiProvider(api_key, model)
    else:
        raise ValueError(f"Unsupported LLM provider: {provider_name}")
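A minimal usage sketch (the key and query values are placeholders; the call has to run inside an async context because the provider methods are coroutines):

```python
import asyncio
from src.llm_providers.factory import create_llm_provider

async def demo():
    # provider_name can be "openai", "claude", or "gemini"
    provider = create_llm_provider("claude", "sk-ant-placeholder", "claude-3-opus-20240229")
    return await provider.process_prompt("Remote data engineer roles in London, posted last week")

print(asyncio.run(demo()))
```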
src/llm_providers/gemini_provider.py
import json
import logging
import re
import base64
from typing import Dict, Any, Optional, List

import google.generativeai as genai
from src.llm_providers.base_provider import LLMProvider

logger = logging.getLogger(__name__)

class GeminiProvider(LLMProvider):
    """Implementation of LLM provider for Google Gemini"""

    def __init__(self, api_key: str, model: Optional[str] = None):
        """Initialize the Gemini provider"""
        super().__init__(api_key, model)
        genai.configure(api_key=api_key)
        self.model_name = model or "gemini-1.5-pro"
        self.model = genai.GenerativeModel(self.model_name)

    def supports_document_processing(self) -> bool:
        """Check if this provider/model supports direct document processing"""
        vision_capable_models = ["gemini-pro-vision", "gemini-1.5-pro", "gemini-1.5-flash"]
        return any(model_name in self.model_name for model_name in vision_capable_models)

    async def validate_api_key(self) -> bool:
        """Validate the API key by making a simple models call"""
        try:
            # Gemini doesn't have a dedicated validate endpoint, use a simple generation
            response = self.model.generate_content("Hello")
            return True
        except Exception as e:
            logger.error(f"Gemini API key validation failed: {str(e)}")
            return False

    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
        """
        Process CV with Gemini

        Args:
            cv_data: CV content (could be base64 encoded file or text)

        Returns:
            Dictionary of extracted parameters
        """
        if self.supports_document_processing() and cv_data.startswith("data:"):
            return await self._process_cv_with_vision(cv_data)
        else:
            # Assume it's already text
            return await self._process_cv_text(cv_data)

    async def _process_cv_with_vision(self, cv_data: str) -> Dict[str, Any]:
        """Process CV using Gemini's vision capabilities"""
        try:
            # Extract the mime type and base64 data
            mime_type, encoded_data = cv_data.split(';base64,', 1)
            mime_type = mime_type.replace('data:', '')

            # Create a content parts list with prompt and image
            parts = [
                self._get_cv_prompt(),
                {"mime_type": mime_type, "data": base64.b64decode(encoded_data)}
            ]

            response = self.model.generate_content(
                parts,
                generation_config={
                    "temperature": 0.1
                }
            )

            # Extract JSON from response
            content = response.text

            # Try to parse as JSON directly
            try:
                return json.loads(content)
            except json.JSONDecodeError:
                # If direct parsing fails, look for JSON in code blocks
                json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(1))

                # If still no match, try to find anything that looks like JSON
                json_pattern = r'{.*}'
                json_match = re.search(json_pattern, content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(0))

                logger.error(f"Could not parse Gemini response as JSON: {content}")
                raise ValueError("Failed to parse Gemini response as JSON")

        except Exception as e:
            logger.error(f"Gemini vision processing failed: {str(e)}")
            raise

    async def _process_cv_text(self, cv_text: str) -> Dict[str, Any]:
        """Process CV text with Gemini"""
        try:
            response = self.model.generate_content(
                self._get_cv_prompt() + f"\n\nCV TEXT:\n{cv_text}",
                generation_config={
                    "temperature": 0.1
                }
            )

            # Extract JSON from response
            content = response.text

            # Try to parse as JSON directly
            try:
                return json.loads(content)
            except json.JSONDecodeError:
                # If direct parsing fails, look for JSON in code blocks
                json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(1))

                # If still no match, try to find anything that looks like JSON
                json_pattern = r'{.*}'
                json_match = re.search(json_pattern, content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(0))

                logger.error(f"Could not parse Gemini response as JSON: {content}")
                raise ValueError("Failed to parse Gemini response as JSON")

        except Exception as e:
            logger.error(f"Gemini text processing failed: {str(e)}")
            raise

    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
        """Process user prompt and extract job search parameters"""
        try:
            response = self.model.generate_content(
                self._get_prompt_extraction_prompt() + f"\n\nUSER QUERY:\n{prompt}",
                generation_config={
                    "temperature": 0.1
                }
            )

            # Extract JSON from response
            content = response.text

            # Try to parse as JSON directly
            try:
                return json.loads(content)
            except json.JSONDecodeError:
                # If direct parsing fails, look for JSON in code blocks
                json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(1))

                # If still no match, try to find anything that looks like JSON
                json_pattern = r'{.*}'
                json_match = re.search(json_pattern, content, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group(0))

                logger.error(f"Could not parse Gemini response as JSON: {content}")
                raise ValueError("Failed to parse Gemini response as JSON")

        except Exception as e:
            logger.error(f"Gemini prompt processing failed: {str(e)}")
            raise

    def _get_cv_prompt(self) -> str:
        """Get the prompt for CV analysis"""
        return """
        Extract the following job search parameters from this CV/resume:

        Follow these steps:
        1. Identify the job title
        2. Determine the location
        3. Assess experience level (1-5)
        4. Identify work type preference (1-3)
        5. Determine contract type (FPCTIV)
        6. List key skills

        Return ONLY a JSON object with this format:
        {
            "title": "The most recent job title or professional role",
            "location": "Current or preferred location",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid
                Based on any workstyle preferences found in the CV",
            "contractType": "A single letter representing employment type preference:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "skills": ["list", "of", "key", "technical", "and", "soft", "skills"]
        }

        If a piece of information is not clearly stated in the CV, make a reasonable inference based on the available information. If inference is not possible, use null.

        IMPORTANT: Your output must be ONLY the JSON object with no additional text.
        """

    def _get_prompt_extraction_prompt(self) -> str:
        """Get the prompt for extracting parameters from user query"""
        return """
        Extract LinkedIn job search parameters from this query.

        Follow these steps:
        1. Identify job title or role
        2. Determine geographic location
        3. Note any specific companies mentioned
        4. Assess experience level (1-5)
        5. Identify work type (1-3)
        6. Determine contract type (FPCTIV)
        7. Identify time frame for job postings

        Return ONLY a JSON object with this format:
        {
            "title": "Job title or role to search for",
            "location": "Geographic location for job search",
            "companyName": ["array of specific companies mentioned"],
            "companyId": ["array of LinkedIn company IDs if mentioned"],
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "contractType": "A single letter representing employment type:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "publishedAt": "Time frame:
                r86400 = Last 24 hours
                r604800 = Last week
                r2592000 = Last month
                empty string = Any time",
            "rows": "Number of job listings to return (integer)"
        }

        For any parameters not explicitly mentioned in the query, use null.

        IMPORTANT: Your output must be ONLY the JSON object with no additional text.
        """
src/llm_providers/openai_provider.py
import json
import logging
import base64
import re
from typing import Dict, Any, Optional, List

from openai import AsyncOpenAI
from src.llm_providers.base_provider import LLMProvider

logger = logging.getLogger(__name__)

class OpenAIProvider(LLMProvider):
    """Implementation of LLM provider for OpenAI"""

    def __init__(self, api_key: str, model: Optional[str] = None):
        """Initialize the OpenAI provider"""
        super().__init__(api_key, model)
        self.client = AsyncOpenAI(api_key=api_key)
        self.model = model or "gpt-4o"  # Default to most capable model

    def supports_document_processing(self) -> bool:
        """Check if this provider/model supports direct document processing"""
        document_capable_models = ["gpt-4-vision", "gpt-4o"]
        return any(model_name in self.model for model_name in document_capable_models)

    async def validate_api_key(self) -> bool:
        """Validate the API key by making a simple models.list call"""
        try:
            await self.client.models.list()
            return True
        except Exception as e:
            logger.error(f"OpenAI API key validation failed: {str(e)}")
            return False

    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
        """
        Process CV with OpenAI

        Args:
            cv_data: CV content (could be base64 encoded file or text)

        Returns:
            Dictionary of extracted parameters
        """
        if self.supports_document_processing() and cv_data.startswith("data:"):
            return await self._process_cv_with_vision(cv_data)
        else:
            # Assume it's already text
            return await self._process_cv_text(cv_data)

    async def _process_cv_with_vision(self, cv_data: str) -> Dict[str, Any]:
        """Process CV using OpenAI's vision capabilities"""
        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "Extract job search parameters from this CV/resume."},
                    {"role": "user", "content": [
                        {"type": "text", "text": self._get_cv_prompt()},
                        {"type": "image_url", "image_url": {"url": cv_data}}
                    ]}
                ],
                response_format={"type": "json_object"}
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            logger.error(f"OpenAI vision processing failed: {str(e)}")
            raise

    async def _process_cv_text(self, cv_text: str) -> Dict[str, Any]:
        """Process CV text with OpenAI"""
        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "Extract job search parameters from this CV/resume."},
                    {"role": "user", "content": self._get_cv_prompt() + f"\n\nCV TEXT:\n{cv_text}"}
                ],
                response_format={"type": "json_object"}
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            logger.error(f"OpenAI text processing failed: {str(e)}")
            raise

    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
        """Process user prompt and extract job search parameters"""
        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "Extract job search parameters from this query."},
                    {"role": "user", "content": self._get_prompt_extraction_prompt() + f"\n\nUSER QUERY:\n{prompt}"}
                ],
                response_format={"type": "json_object"}
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            logger.error(f"OpenAI prompt processing failed: {str(e)}")
            raise

    def _get_cv_prompt(self) -> str:
        """Get the prompt for CV analysis"""
        return """
        Extract the following job search parameters from this CV/resume in JSON format:

        Required JSON format:
        {
            "title": "The most recent job title or professional role",
            "location": "Current or preferred location",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid
                Based on any workstyle preferences found in the CV",
            "contractType": "A single letter representing employment type preference:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "skills": ["list", "of", "key", "technical", "and", "soft", "skills"]
        }

        If a piece of information is not clearly stated in the CV, make a reasonable inference based on the available information. If inference is not possible, use null.
        """

    def _get_prompt_extraction_prompt(self) -> str:
        """Get the prompt for extracting parameters from user query"""
        return """
        Extract LinkedIn job search parameters from this query in JSON format:

        Required JSON format:
        {
            "title": "Job title or role to search for",
            "location": "Geographic location for job search",
            "companyName": ["array of specific companies mentioned"],
            "companyId": ["array of LinkedIn company IDs if mentioned"],
            "workType": "Either:
                1 = On-Site
                2 = Remote
                3 = Hybrid",
            "experienceLevel": "A numeric value from 1-5 where:
                1 = Internship
                2 = Entry Level
                3 = Associate
                4 = Mid-Senior Level
                5 = Director",
            "contractType": "A single letter representing employment type:
                F = Full-Time
                P = Part-Time
                C = Contract
                T = Temporary
                I = Internship
                V = Volunteer",
            "publishedAt": "Time frame:
                r86400 = Last 24 hours
                r604800 = Last week
                r2592000 = Last month
                empty string = Any time",
            "rows": "Number of job listings to return (integer)"
        }

        For any parameters not explicitly mentioned in the query, use null.
        """
src/llm_providers/__init__.py
# LLM Providers package
example/advanced-reddit-scraper/.actor/actor.json
{
    "actorSpecification": 1,
    "name": "reddit-subreddit-scraper",
    "title": "Reddit Subreddit Scraper",
    "description": "Scrapes Reddit subreddits.",
    "version": "0.0",
    "buildTag": "latest",
    "meta": {
        "templateId": "python-beautifulsoup"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
example/advanced-reddit-scraper/.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.12

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
example/advanced-reddit-scraper/.actor/input_schema.json
1{
2 "title": "Advanced Reddit Scraper",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "queries": {
7 "title": "Start URLs or subreddits",
8 "type": "array",
9 "description": "Subreddits to scrape in the format of https://reddit.com/r/<subreddit>",
10 "prefill": [
11 "https://reddit.com/r/AskReddit"
12 ],
13 "default": [
14 "https://reddit.com/r/AskReddit"
15 ],
16 "editor": "stringList"
17 },
18 "postSort": {
19 "title": "Sorting",
20 "type": "string",
21 "enum": [
22 "hot",
23 "new",
24 "top",
25 "rising"
26 ],
27 "description": "Sorting of posts in the subreddit (top, new, rising, hot). If a pre-sorted subreddit link is given (e.g. https://reddit.com/r/eli5/top/?t=day), this setting is ignored.",
28 "default": "top"
29 },
30 "topPeriod": {
31 "title": "Top posts period",
32 "type": "string",
33 "enum": [
34 "hour",
35 "day",
36 "week",
37 "month",
38 "year",
39 "all"
40 ],
41 "description": "Top posts period - (only works when sorting is top)",
42 "default": "week"
43 },
44 "limit": {
45 "title": "Max Posts",
46 "type": "integer",
47 "description": "Maximum number of posts to scrape per URL (default: 10)",
48 "default": 10
49 },
50
51 "commentSort": {
52 "title": "Comment sorting",
53 "description": "Sorting of comments in the post - (best, top, new, controversial, old, qa)",
54 "type": "string",
55 "enum": [
56 "best",
57 "top",
58 "new",
59 "controversial",
60 "old",
61 "qa"
62 ],
63 "default": "top"
64 },
65 "numComments": {
66 "title": "Number of comments to scrape",
67 "type": "integer",
68 "description": "Number of comments to scrape per post; a few replies to each comment are also returned",
69 "default": 0
70 }
71 },
72 "required": ["queries"]
73}
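As a quick, hypothetical illustration of the schema above, a minimal Actor input could look like the following (only "queries" is required; the other keys simply restate the schema defaults):

example_input = {
    "queries": ["https://reddit.com/r/AskReddit"],
    "postSort": "top",
    "topPeriod": "week",
    "limit": 10,
    "commentSort": "top",
    "numComments": 0,
}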
example/advanced-reddit-scraper/src/cookies.json
1[
2 {
3 "domain": ".reddit.com",
4 "hostOnly": false,
5 "httpOnly": false,
6 "name": "csrf_token",
7 "path": "/",
8 "sameSite": "strict",
9 "secure": true,
10 "session": true,
11 "storeId": "0",
12 "value": "eb3157bc50b012701ac2f7aab49fcc4c",
13 "id": 1
14 },
15 {
16 "domain": ".reddit.com",
17 "expirationDate": 1757533137,
18 "hostOnly": false,
19 "httpOnly": false,
20 "name": "csv",
21 "path": "/",
22 "sameSite": "no_restriction",
23 "secure": true,
24 "session": false,
25 "storeId": "0",
26 "value": "2",
27 "id": 2
28 },
29 {
30 "domain": ".reddit.com",
31 "expirationDate": 1757533137,
32 "hostOnly": false,
33 "httpOnly": false,
34 "name": "edgebucket",
35 "path": "/",
36 "sameSite": "unspecified",
37 "secure": true,
38 "session": false,
39 "storeId": "0",
40 "value": "lH2AY01VDhdJZrAOeK",
41 "id": 3
42 },
43 {
44 "domain": ".reddit.com",
45 "expirationDate": 1761371475,
46 "hostOnly": false,
47 "httpOnly": false,
48 "name": "loid",
49 "path": "/",
50 "sameSite": "no_restriction",
51 "secure": true,
52 "session": false,
53 "storeId": "0",
54 "value": "0000000000927nml2x.2.1606468634160.Z0FBQUFBQm1zN01jTVgwSUpobndnMVlHS2xfcGNXdk1SbXhpMjJtN0NNa2VCOFZBZ3Zlb3loSGFlZWtxWlNkdHk5cUxZVVZtNDdWQWl6M0xOdXhRc3FsWmVob0pfQXdjQjItZ1pkOHFmTWsxVVFQU194SjEwTi10MHI2ay1TU01EYjhDVjdpclUxVFg",
55 "id": 4
56 },
57 {
58 "domain": ".reddit.com",
59 "expirationDate": 1759110222,
60 "hostOnly": false,
61 "httpOnly": false,
62 "name": "pc",
63 "path": "/",
64 "sameSite": "unspecified",
65 "secure": true,
66 "session": false,
67 "storeId": "0",
68 "value": "81",
69 "id": 5
70 },
71 {
72 "domain": ".reddit.com",
73 "expirationDate": 1757612826,
74 "hostOnly": false,
75 "httpOnly": true,
76 "name": "reddit_session",
77 "path": "/",
78 "sameSite": "unspecified",
79 "secure": true,
80 "session": false,
81 "storeId": "0",
82 "value": "710093989689%2C2024-08-07T17%3A47%3A05%2C62b4116104fbf3597d47b0718c6986d009b6f8c6",
83 "id": 6
84 },
85 {
86 "domain": ".reddit.com",
87 "hostOnly": false,
88 "httpOnly": false,
89 "name": "session_tracker",
90 "path": "/",
91 "sameSite": "no_restriction",
92 "secure": true,
93 "session": true,
94 "storeId": "0",
95 "value": "mdmqaqfjmphfropmga.0.1727922697221.Z0FBQUFBQm1fZ0lKOVhiTHFwazVhYXBQa0FSS2VUTllqd2ljRmhuNFozRHVnZmkxU1JOcmZBd1dteXRPSmJxS0x3S2s0YVE2VEVRaGk1M0JMei1TV1Q2RGN4STZ4aHhCWnJhSEtsRDZsdEZveFVxeUhnVjNrSFNjOFpJRmM0bEREdVZfR2UyYTdZM2U",
96 "id": 7
97 },
98 {
99 "domain": ".reddit.com",
100 "expirationDate": 1759458549,
101 "hostOnly": false,
102 "httpOnly": false,
103 "name": "t2_927nml2x_recentclicks3",
104 "path": "/",
105 "sameSite": "strict",
106 "secure": false,
107 "session": false,
108 "storeId": "0",
109 "value": "t3_pgyvok%2Ct3_1fsuzj4%2Ct3_1fk6551%2Ct3_eokkto%2Ct3_14x7ys7%2Ct3_17wo9ms%2Ct3_dpcb2z%2Ct3_16fac9r%2Ct3_analu0%2Ct3_142jsph",
110 "id": 8
111 },
112 {
113 "domain": ".reddit.com",
114 "expirationDate": 1728008948.6718,
115 "hostOnly": false,
116 "httpOnly": true,
117 "name": "token_v2",
118 "path": "/",
119 "sameSite": "unspecified",
120 "secure": true,
121 "session": false,
122 "storeId": "0",
123 "value": "eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzI4MDA4OTQ4LjE0MDk0MywiaWF0IjoxNzI3OTIyNTQ4LjE0MDk0MiwianRpIjoiNE5wUE5zejMzWkhrWXI0cktxZU9hazJiY0tYMkRRIiwiY2lkIjoiMFItV0FNaHVvby1NeVEiLCJsaWQiOiJ0Ml85MjdubWwyeCIsImFpZCI6InQyXzkyN25tbDJ4IiwibGNhIjoxNjA2NDY4NjM0MTYwLCJzY3AiOiJlSnhra2RHT3REQUloZC1sMXo3Ql95cF9OaHRzY1lhc0xRYW9rM243RFZvY2s3MDdjTDRpSFA4bktJcUZMRTJ1QktHa0tXRUZXdE9VTmlMdjU4eTlPWkVGU3lGVFI4NDN5d29rYVVwUFVtTjVweWxSd1daa0xsZmFzVUtEQjZZcFZTNloyMEtQUzV2UTNJMUZ6MDZNcWx4V0h0VFlvM0pwYkdNSzJ4UGp6Y1pxUXlxdXk2bE1ZRmtvbjhXTGZ2eUctdFktZjdiZmhIWXdyS2dLRF9UT3VGeHdZX0hERkhiX25wcjBiRjJ3cUwzWGc5US0xLU4yN2JObW9kbTVfVnpQdnphU2NUbUc1aWZZdjd0LUNSMTQ1SG1aVVFjd1lnMF95ckFqNl9Ddk9vREtCUVdNSlloUEk1QXJsMl9fSmRpdVRmOGF0eWQtLUdiRVRXXzRyUm1vNXhMRW9VX2o2emNBQVBfX1hEX2U0dyIsInJjaWQiOiJBNjE2cG1hN0taX1R1SzRVOFJlQlJUaXVKV3VBZ3lUY2VRTUpyS01NRk93IiwiZmxvIjoyfQ.GX8N8AYcgK2DWqWPqiclkljcwEawb7GFRw6QMdL9C7lb5FS-_ofuZpR0bx77pgWjWJ9uOczItTUfZvjx9u4CgeS9dK3U8G1apuqUW9YWDrgxfQeFWNMPVd0IjDTEt6Sn8vrdWb5cjv_SsGzxHgtC2RjdDLQYfQu2ud-Qp_1sELlBDPHDfhgOPbuOpzuFz2NJ8ifj623r2a8XOgQi5UaAHEClgleVAdkN2bpMd1kUsYh0PmMZOpN2XqvgdwKJUuyce-9yAqhMLiIPneVJnaytpth0jeRkT5-Fyt-_CgsXYphTG9T9u8Q2Z5JwOrwiosBPEokbhjculNQ78QlUUlC7UA",
124 "id": 9
125 }
126 ]
example/advanced-reddit-scraper/src/main.py
1"""This module defines the main entry point for the Apify Actor.
2
3Feel free to modify this file to suit your specific needs.
4
5To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
6https://docs.apify.com/sdk/python
7"""
8
9import sys
10import os
11
12from httpx import AsyncClient
13
14from apify import Actor, Request
15from .redditor import Redditor
16
17
18async def main() -> None:
19 """Main entry point for the Apify Actor.
20
21 This coroutine is executed using `asyncio.run()`, so it must remain an asynchronous function for proper execution.
22 Asynchronous execution is required for communication with the Apify platform, and it also significantly
23 improves performance when scraping the web.
24 """
25 async with Actor:
26 # Retrieve the Actor input, and use default values if not provided.
27 actor_input = await Actor.get_input() or {}
28 queries = actor_input.get('queries', ['https://reddit.com/r/AskReddit'])
29 limit = actor_input.get('limit', 10)
30 num_comments = actor_input.get('numComments', 0)
31 sorting = actor_input.get('postSort', 'top')
32 comment_sort = actor_input.get('commentSort', 'top')
33 sorting_period = actor_input.get('topPeriod', 'week')
34 if sorting_period not in {'hour', 'day', 'week', 'month', 'year', 'all'}:
35 raise ValueError('topPeriod must be one of hour, day, week, month, year, all')
36 if sorting not in {'hot', 'new', 'top', 'rising'}:
37 raise ValueError('postSort must be one of hot, new, top, rising')
38 if comment_sort not in {'best', 'top', 'new', 'controversial', 'old', 'qa'}:
39 raise ValueError('commentSort must be one of best, top, new, controversial, old, qa')
40 reddit_scraper = Redditor(logger=Actor.log)
41
42 # Exit if no start URLs are provided.
43 if not queries:
44 Actor.log.info('No queries specified in Actor input, exiting...')
45 await Actor.exit()
46
47 # Open the default request queue for handling URLs to be processed.
48 request_queue = await Actor.open_request_queue()
49
50 # Enqueue the start URLs with an initial crawl depth of 0.
51 for query in queries:
52 url = reddit_scraper.subreddit_link_from_query(query, sorting=sorting, period=sorting_period)
53 Actor.log.info(f'Enqueuing {url} ...')
54 request = Request.from_url(url, user_data={'limit': limit, 'numComments': num_comments, 'query': query})
55 await request_queue.add_request(request)
56
57 # Process the URLs from the request queue.
58 while request := await request_queue.fetch_next_request():
59 url = request.url
60 query = request.user_data['query']
61 posts_limit = request.user_data['limit']
62 num_comments = request.user_data['numComments']
63 Actor.log.info(f'Scraping {request.url} ...')
64
65 try:
66 # Fetch the HTTP response from the specified URL using HTTPX.
67 async with AsyncClient() as client:
68 # response = await client.get(url, follow_redirects=True)
69 for post in reddit_scraper.get_all_posts(url, posts_limit=posts_limit, comments_limit=num_comments):
70 await Actor.push_data(post)
71
72 except Exception:
73 # The request is marked as handled in the finally block below, so it will not be retried.
74 Actor.log.exception(f'Failed to scrape {url}.')
75 if 'post' in locals():
76 # Log the post that was being processed when the failure occurred.
77 Actor.log.error(f'Data that failed to be pushed: {post}')
78 finally:
79 # Mark the request as handled to ensure it is not processed again.
80 await request_queue.mark_request_as_handled(request)
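Each item pushed with Actor.push_data in the loop above is a post dictionary built by Redditor.parse_posts (see redditor.py below), plus a "comments" list when numComments is greater than zero. A sketch of that shape, with placeholder values:

example_post = {
    "id": "t3_xxxxxxx",                 # placeholder ids and values
    "title": "Example post title",
    "author": "some_user",
    "subreddit": "r/AskReddit",
    "score": "123",
    "num_comments": "45",
    "created_timestamp": "2024-01-01T00:00:00+0000",
    "permalink": "/r/AskReddit/comments/xxxxxxx/example_post_title/",
    "content": "Post text, or a media URL for link/image posts",
    "comments": [],                     # only present when numComments > 0
}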
example/advanced-reddit-scraper/src/redditor.py
1import requests
2from bs4 import BeautifulSoup
3from typing import Dict, Any, List, Optional, Tuple
4import re
5import base64
6import urllib.parse
7from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
8from .session import cookies, headers
9
10
11import time
12import functools
13
14def log_execution_time(func):
15 @functools.wraps(func)
16 def wrapper(self, *args, **kwargs):
17 start_time = time.time()
18 result = func(self, *args, **kwargs)
19 end_time = time.time()
20 duration = end_time - start_time
21 self.logger.debug(f"{func.__name__} took {duration:.2f} seconds to execute")
22 return result
23 return wrapper
24
25class Redditor:
26 BASE_URL = "https://www.reddit.com"
27
28 def __init__(self, logger):
29 self.logger = logger
30 self.cookies = cookies
31 self.headers = headers
32 self.session = requests.Session()
33 self.session.cookies.update(cookies)
34 self.session.headers.update(headers)
35
36 @log_execution_time
37 def get_community_posts(self, url: str, after: Optional[str] = None) -> str:
38 try:
39 parsed_query = self.parse_url(url)
40 sort = parsed_query.get('sort', 'top')
41 url = f"{self.BASE_URL}/svc/shreddit/community-more-posts/{sort}/"
42 params = {
43 "after": after,
44 "t": parsed_query['time'] or 'day',
45 "name": parsed_query['sub'],
46 "navigationSessionId": "a10adc86-f1ec-4221-9179-d9613e4c7d05",
47 "feedLength": "28"
48 }
49
50 response = self.session.get(url, params=params)
51 response.raise_for_status()
52
53 return response.text
54 except requests.exceptions.RequestException as e:
55 self.logger.error(f"Error fetching community posts: {e}")
56 raise
57 except Exception as e:
58 self.logger.error(f"Unexpected error: {e}")
59 raise
60
61 @log_execution_time
62 def get_post_content(self, permalink: str) -> str:
63 """Get the content of a post using its permalink"""
64 try:
65 url = f"{self.BASE_URL}{permalink}"
66 response = self.session.get(url)
67 response.raise_for_status()
68 soup = BeautifulSoup(response.text, 'html.parser')
69
70 # Find the post content in the text-body slot
71 text_body = soup.find('div', {'slot': 'text-body'})
72 if text_body:
73 md_div = text_body.find('div', {'class': 'md'})
74 if md_div:
75 paragraphs = md_div.find_all('p')
76 return '\n'.join(p.get_text(strip=True) for p in paragraphs)
77
78 # If no text content, check for media content
79 shreddit_post = soup.find('shreddit-post')
80 if shreddit_post:
81 content_href = shreddit_post.get('content-href')
82 if content_href:
83 return content_href
84
85 return ''
86 except Exception as e:
87 self.logger.error(f"Error getting post content: {e}")
88 return ''
89
90 @log_execution_time
91 def parse_posts(self, html_content: str) -> List[Dict[str, Any]]:
92 try:
93 soup = BeautifulSoup(html_content, 'html.parser')
94 posts = []
95 for article in soup.find_all('article'):
96 shreddit_post = article.find('shreddit-post')
97 if shreddit_post:
98 permalink = shreddit_post.get('permalink')
99 post = {
100 "id": shreddit_post.get('id'),
101 "title": shreddit_post.get('post-title'),
102 "author": shreddit_post.get('author'),
103 "subreddit": shreddit_post.get('subreddit-prefixed-name'),
104 "score": shreddit_post.get('score'),
105 "num_comments": shreddit_post.get('comment-count'),
106 "created_timestamp": shreddit_post.get('created-timestamp'),
107 "permalink": permalink,
108 "content": self.get_post_content(permalink)
109 }
110 posts.append(post)
111 return posts
112 except Exception as e:
113 self.logger.error(f"Error parsing posts: {e}")
114 raise
115 @log_execution_time
116 def get_next_cursor(self, html_content: str) -> Optional[str]:
117 try:
118 soup = BeautifulSoup(html_content, 'html.parser')
119 load_after = soup.find('faceplate-partial', slot='load-after')
120
121 if load_after:
122 src = load_after.get('src', '')
123 match = re.search(r'after=([^&]+)', src)
124 if match:
125 encoded_cursor = match.group(1)
126 decoded_cursor = urllib.parse.unquote(encoded_cursor)
127 padding = '=' * ((4 - len(decoded_cursor) % 4) % 4)
128 padded_cursor = decoded_cursor + padding
129 return base64.b64decode(padded_cursor).decode('utf-8')
130
131 except Exception as e:
132 self.logger.error(f"Error retrieving next cursor: {e}")
133 return None
134 @log_execution_time
135 def get_all_posts(self, subreddit: str, posts_limit: int = 100, comments_limit: int = 0) -> List[Dict[str, Any]]:
136 all_posts = []
137 after = None
138
139 try:
140 while len(all_posts) < posts_limit:
141 self.logger.info(f"Fetching posts for subreddit {subreddit}...")
142 html_content = self.get_community_posts(subreddit, after)
143 new_posts = self.parse_posts(html_content)[:posts_limit - len(all_posts)]
144
145 if not new_posts:
146 break
147
148 for post in new_posts:
149 if comments_limit > 0:
150 post['comments'] = self.get_all_comments(post['subreddit'].split('/')[1], post['id'], comments_limit)
151
152 all_posts.extend(new_posts)
153 after = self.get_next_cursor(html_content)
154
155 if not after:
156 break
157
158 self.logger.info(f"Retrieved {len(all_posts[:posts_limit])} posts.")
159 return all_posts[:posts_limit]
160 except Exception as e:
161 self.logger.error(f"Error retrieving posts: {e}")
162 raise
163
164 @log_execution_time
165 def parse_url(self, url: str) -> Dict[str, str]:
166 result = {'sub': '', 'sort': 'none', 'time': None}
167
168 try:
169 subreddit_pattern = re.compile(r'(?:/r/|reddit\.com/r/|^)(\w+)')
170 sort_pattern = re.compile(r'/(hot|new|top|rising)')
171 time_pattern = re.compile(r'[?&]t=(hour|day|week|month|year|all)')
172
173 if not url.startswith('http'):
174 match = subreddit_pattern.search(url)
175 if match:
176 result['sub'] = match.group(1)
177 return result
178
179 path = urlparse(url).path
180 query_string = urlparse(url).query
181
182 sub_match = subreddit_pattern.search(path)
183 if sub_match:
184 result['sub'] = sub_match.group(1)
185
186 sort_match = sort_pattern.search(path)
187 if sort_match:
188 result['sort'] = sort_match.group(1)
189
190 time_match = time_pattern.search(query_string)
191 if time_match:
192 result['time'] = time_match.group(1)
193
194 return result
195 except Exception as e:
196 self.logger.error(f"Error parsing URL: {e}")
197 raise
198
199 @log_execution_time
200 def get_comments(self, subreddit: str, post_id: str, cursor: Optional[str] = None, sort: str = 'hot') -> Tuple[List[Dict[str, Any]], Optional[str]]:
201 try:
202 url = f"{self.BASE_URL}/svc/shreddit/more-comments/{subreddit}/t3_{post_id.split('_')[1]}"
203 params = {'sort': sort, 'top-level': '1'}
204 data = {}
205
206 if cursor:
207 params['cursor'] = cursor
208
209 response = self.session.post(url, params=params, data=data)
210 response.raise_for_status()
211
212 return self.parse_comments(response.text)
213 except requests.exceptions.RequestException as e:
214 self.logger.error(f"Error fetching comments: {e}")
215 raise
216 except Exception as e:
217 self.logger.error(f"Unexpected error: {e}")
218 raise
219
220 @log_execution_time
221 def parse_comments(self, html_content: str) -> Tuple[List[Dict[str, Any]], Optional[str]]:
222 try:
223 soup = BeautifulSoup(html_content, 'html.parser')
224 comments = []
225
226 for comment in soup.find_all('shreddit-comment'):
227 content_div = comment.find('div', {'class': 'md'})
228 # Extract clean comment text if content div exists
229 if content_div:
230 # Get all paragraphs from the content
231 paragraphs = content_div.find_all('p')
232 # Join paragraphs with newlines, strip whitespace
233 content = '\n'.join(p.get_text(strip=True) for p in paragraphs)
234 else:
235 content = ''
236 parsed_comment = {
237 "id": comment.get('thingid'),
238 "author": comment.get('author'),
239 "score": comment.get('score'),
240 "depth": comment.get('depth'),
241 "permalink": comment.get('permalink'),
242 "content": content.strip()
243 }
244 comments.append(parsed_comment)
245
246 next_cursor = self.get_next_comment_cursor(html_content)
247 return comments, next_cursor
248 except Exception as e:
249 self.logger.error(f"Error parsing comments: {e}")
250 raise
251
252 @log_execution_time
253 def get_next_comment_cursor(self, html_content: str) -> Optional[str]:
254 try:
255 soup = BeautifulSoup(html_content, 'html.parser')
256 faceplate_partial = soup.find('faceplate-partial', attrs={'loading': 'action'})
257
258 if faceplate_partial:
259 hidden_input = faceplate_partial.find('input', attrs={'type': 'hidden', 'name': 'cursor'})
260 if hidden_input:
261 return hidden_input.get('value')
262
263 except Exception as e:
264 self.logger.error(f"Error retrieving next comment cursor: {e}")
265 return None
266
267 @log_execution_time
268 def get_all_comments(self, subreddit: str, post_id: str, limit: int = 100) -> List[Dict[str, Any]]:
269 all_comments = []
270 cursor = None
271
272 try:
273 while len(all_comments) < limit:
274 comments, next_cursor = self.get_comments(subreddit, post_id, cursor)
275 all_comments.extend(comments)
276
277 if not next_cursor:
278 self.logger.info(f"Next cursor not found for post {post_id}.")
279 break
280
281 cursor = next_cursor
282 self.logger.info(f"Retrieved {len(all_comments)} comments.")
283 return all_comments[:limit]
284 except Exception as e:
285 self.logger.error(f"Error retrieving comments: {e}")
286 raise
287
288 @log_execution_time
289 def subreddit_link_from_query(self, query, sorting='top', period='week'):
290 try:
291 # If the input is just a subreddit name (with or without 'r/')
292 if not query.startswith('http'):
293 # Normalize input to the form 'r/subredditname'
294 if query.startswith('r/'):
295 query = f'https://www.reddit.com/{query}/'
296 else:
297 query = f'https://www.reddit.com/r/{query}/'
298
299 # Parse the subreddit link
300 parsed_url = urlparse(query)
301
302 # Ensure that the path ends with a trailing slash
303 path_parts = parsed_url.path.rstrip('/').split('/')
304
305 # Valid sorting options
306 valid_sorting = ['hot', 'new', 'rising', 'top']
307
308 # Check if the link is already sorted
309 if len(path_parts) > 3 and path_parts[3] in valid_sorting:
310 # Return the original link if already sorted
311 return query
312
313 # Otherwise, append the sorting method to the path
314 path_parts.append(sorting)
315
316 # Add the 't' parameter only if sorting is 'top'
317 query_params = parse_qs(parsed_url.query)
318 if sorting == 'top':
319 query_params['t'] = [period]
320
321 # Rebuild the URL
322 new_path = '/'.join(path_parts) + '/'
323 new_query = urlencode(query_params, doseq=True)
324
325 # Return the new URL
326 return urlunparse((parsed_url.scheme, parsed_url.netloc, new_path, parsed_url.params, new_query, parsed_url.fragment))
327
328 except Exception as e:
329 self.logger.error(f"Error constructing subreddit URL from query: {e}")
330 raise
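A minimal sketch of driving the Redditor class above outside the Actor. It assumes the cookies and headers shipped in session.py (next file) are still valid and that the package is importable as src; the import path is an assumption, so adjust it to your layout:

import logging

from src.redditor import Redditor  # inside the Actor this is a relative import

logging.basicConfig(level=logging.INFO)
scraper = Redditor(logger=logging.getLogger("redditor"))

# Build a sorted subreddit URL, e.g. https://www.reddit.com/r/AskReddit/top/?t=week
url = scraper.subreddit_link_from_query("AskReddit", sorting="top", period="week")

# Fetch up to five posts and skip comments
for post in scraper.get_all_posts(url, posts_limit=5, comments_limit=0):
    print(post["title"], post["permalink"])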
example/advanced-reddit-scraper/src/session.py
1cookies = {
2 'csv': '2',
3 'edgebucket': 'lH2AY01VDhdJZrAOeK',
4 'loid': '0000000000927nml2x.2.1606468634160.Z0FBQUFBQm1zN01jTVgwSUpobndnMVlHS2xfcGNXdk1SbXhpMjJtN0NNa2VCOFZBZ3Zlb3loSGFlZWtxWlNkdHk5cUxZVVZtNDdWQWl6M0xOdXhRc3FsWmVob0pfQXdjQjItZ1pkOHFmTWsxVVFQU194SjEwTi10MHI2ay1TU01EYjhDVjdpclUxVFg',
5 'pc': '81',
6 'reddit_session': '710093989689%2C2024-08-07T17%3A47%3A05%2C62b4116104fbf3597d47b0718c6986d009b6f8c6',
7 'token_v2': 'eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzI5MTk4MTQ0LjY5NDYyMSwiaWF0IjoxNzI5MTExNzQ0LjY5NDYyMSwianRpIjoiZXBsV3k0R1VURHl4aGwtdEhWZnI2U0lxY00xR0lnIiwiY2lkIjoiMFItV0FNaHVvby1NeVEiLCJsaWQiOiJ0Ml85MjdubWwyeCIsImFpZCI6InQyXzkyN25tbDJ4IiwibGNhIjoxNjA2NDY4NjM0MTYwLCJzY3AiOiJlSnhra2RHT3REQUloZC1sMXo3Ql95cF9OaHRzY1lhc0xRYW9rM243RFZvY2s3MDdjTDRpSFA4bktJcUZMRTJ1QktHa0tXRUZXdE9VTmlMdjU4eTlPWkVGU3lGVFI4NDN5d29rYVVwUFVtTjVweWxSd1daa0xsZmFzVUtEQjZZcFZTNloyMEtQUzV2UTNJMUZ6MDZNcWx4V0h0VFlvM0pwYkdNSzJ4UGp6Y1pxUXlxdXk2bE1ZRmtvbjhXTGZ2eUctdFktZjdiZmhIWXdyS2dLRF9UT3VGeHdZX0hERkhiX25wcjBiRjJ3cUwzWGc5US0xLU4yN2JObW9kbTVfVnpQdnphU2NUbUc1aWZZdjd0LUNSMTQ1SG1aVVFjd1lnMF95ckFqNl9Ddk9vREtCUVdNSlloUEk1QXJsMl9fSmRpdVRmOGF0eWQtLUdiRVRXXzRyUm1vNXhMRW9VX2o2emNBQVBfX1hEX2U0dyIsInJjaWQiOiJBNjE2cG1hN0taX1R1SzRVOFJlQlJUaXVKV3VBZ3lUY2VRTUpyS01NRk93IiwiZmxvIjoyfQ.FBxK7Xnbhy-bW3l71YopqqUBpjfkOdz8XBUauNi3o3pScQLvO0sOs72E2kMiaYX6iTfUPyklR5xRnGVF6PjQmurx2vu8XAm3W1IkGIYPZOOvjnWKhbzv1m8bzfOHGSIZg9bOy7RoCce6A-HCKfR6y4nQyMaiv5jCUdLILePHdUYw3kZEC_ASAXEXvv-dyyaO2GCW_Jxq95CU6lYxqLaO73xhPzR9YjNl_RaAC9xMip6d5Xe3n5wuMdY8bQ3dAfqNVNJKI4fkIij0v90-SJT7vKffNSbueqrckCPgDIXQrpJA1_bx-npHLl5gg7-uBLwDUzXpWMO_BTDxgekscFc6fQ',
8 'reddit_chat_view': 'closed',
9 't2_927nml2x_recentclicks3': 't3_1g57do3%2Ct3_e7ewoo%2Ct3_1fvuu0l%2Ct3_435p6x%2Ct3_d956ag%2Ct3_15svnqa%2Ct3_f2nxzt%2Ct3_e6ryal%2Ct3_79uq5s%2Ct3_7qry4j',
10 'csrf_token': 'd7886d7dde33b8ae9f535d8cf19dad8f',
11 'session_tracker': 'mifofnihaddjdlkjml.0.1729129739819.Z0FBQUFBQm5FRzBMWUZrSlZycUctVmcwZ25zZm9ZRTV4T1NMNjdQTW45dTI1eFQ1NDVqTWF2N20yQzlXNVFCUkEyNndKazVCbWJ1ZHFoVlFZMEFPS2xGYXpDY2Fxcm4xX1F6UEZfWFpfal92NTVuRDF6Q0EzTWtOT3lZOENQQUVBaFlScWQwMGpqZFk',
12}
13
14headers = {
15 'accept': 'text/vnd.reddit.partial+html, text/html;q=0.9',
16 'accept-language': 'en,en-US;q=0.9,tr-TR;q=0.8,tr;q=0.7,de;q=0.6',
17 'content-type': 'application/x-www-form-urlencoded',
18 # 'cookie': 'csv=2; edgebucket=lH2AY01VDhdJZrAOeK; loid=0000000000927nml2x.2.1606468634160.Z0FBQUFBQm1zN01jTVgwSUpobndnMVlHS2xfcGNXdk1SbXhpMjJtN0NNa2VCOFZBZ3Zlb3loSGFlZWtxWlNkdHk5cUxZVVZtNDdWQWl6M0xOdXhRc3FsWmVob0pfQXdjQjItZ1pkOHFmTWsxVVFQU194SjEwTi10MHI2ay1TU01EYjhDVjdpclUxVFg; pc=81; reddit_session=710093989689%2C2024-08-07T17%3A47%3A05%2C62b4116104fbf3597d47b0718c6986d009b6f8c6; token_v2=eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzI5MTk4MTQ0LjY5NDYyMSwiaWF0IjoxNzI5MTExNzQ0LjY5NDYyMSwianRpIjoiZXBsV3k0R1VURHl4aGwtdEhWZnI2U0lxY00xR0lnIiwiY2lkIjoiMFItV0FNaHVvby1NeVEiLCJsaWQiOiJ0Ml85MjdubWwyeCIsImFpZCI6InQyXzkyN25tbDJ4IiwibGNhIjoxNjA2NDY4NjM0MTYwLCJzY3AiOiJlSnhra2RHT3REQUloZC1sMXo3Ql95cF9OaHRzY1lhc0xRYW9rM243RFZvY2s3MDdjTDRpSFA4bktJcUZMRTJ1QktHa0tXRUZXdE9VTmlMdjU4eTlPWkVGU3lGVFI4NDN5d29rYVVwUFVtTjVweWxSd1daa0xsZmFzVUtEQjZZcFZTNloyMEtQUzV2UTNJMUZ6MDZNcWx4V0h0VFlvM0pwYkdNSzJ4UGp6Y1pxUXlxdXk2bE1ZRmtvbjhXTGZ2eUctdFktZjdiZmhIWXdyS2dLRF9UT3VGeHdZX0hERkhiX25wcjBiRjJ3cUwzWGc5US0xLU4yN2JObW9kbTVfVnpQdnphU2NUbUc1aWZZdjd0LUNSMTQ1SG1aVVFjd1lnMF95ckFqNl9Ddk9vREtCUVdNSlloUEk1QXJsMl9fSmRpdVRmOGF0eWQtLUdiRVRXXzRyUm1vNXhMRW9VX2o2emNBQVBfX1hEX2U0dyIsInJjaWQiOiJBNjE2cG1hN0taX1R1SzRVOFJlQlJUaXVKV3VBZ3lUY2VRTUpyS01NRk93IiwiZmxvIjoyfQ.FBxK7Xnbhy-bW3l71YopqqUBpjfkOdz8XBUauNi3o3pScQLvO0sOs72E2kMiaYX6iTfUPyklR5xRnGVF6PjQmurx2vu8XAm3W1IkGIYPZOOvjnWKhbzv1m8bzfOHGSIZg9bOy7RoCce6A-HCKfR6y4nQyMaiv5jCUdLILePHdUYw3kZEC_ASAXEXvv-dyyaO2GCW_Jxq95CU6lYxqLaO73xhPzR9YjNl_RaAC9xMip6d5Xe3n5wuMdY8bQ3dAfqNVNJKI4fkIij0v90-SJT7vKffNSbueqrckCPgDIXQrpJA1_bx-npHLl5gg7-uBLwDUzXpWMO_BTDxgekscFc6fQ; reddit_chat_view=closed; t2_927nml2x_recentclicks3=t3_1g57do3%2Ct3_e7ewoo%2Ct3_1fvuu0l%2Ct3_435p6x%2Ct3_d956ag%2Ct3_15svnqa%2Ct3_f2nxzt%2Ct3_e6ryal%2Ct3_79uq5s%2Ct3_7qry4j; csrf_token=d7886d7dde33b8ae9f535d8cf19dad8f; session_tracker=mifofnihaddjdlkjml.0.1729129739819.Z0FBQUFBQm5FRzBMWUZrSlZycUctVmcwZ25zZm9ZRTV4T1NMNjdQTW45dTI1eFQ1NDVqTWF2N20yQzlXNVFCUkEyNndKazVCbWJ1ZHFoVlFZMEFPS2xGYXpDY2Fxcm4xX1F6UEZfWFpfal92NTVuRDF6Q0EzTWtOT3lZOENQQUVBaFlScWQwMGpqZFk',
19 'origin': 'https://www.reddit.com',
20 'priority': 'u=1, i',
21 'referer': 'https://www.reddit.com/r/AskReddit/comments/1g57do3/whats_a_bitter_life_lesson_you_learned_from_your/',
22 'sec-ch-ua': '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
23 'sec-ch-ua-mobile': '?0',
24 'sec-ch-ua-platform': '"Windows"',
25 'sec-fetch-dest': 'empty',
26 'sec-fetch-mode': 'cors',
27 'sec-fetch-site': 'same-origin',
28 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
29}
example/advanced-reddit-scraper/src/__main__.py
1import asyncio
2
3from .main import main
4
5# Execute the Actor entrypoint.
6asyncio.run(main())
src/llm_providers/__pycache__/base_provider.cpython-312.pyc
Downloadsrc/llm_providers/__pycache__/factory.cpython-312.pyc
Downloadsrc/llm_providers/__pycache__/gemini_provider.cpython-312.pyc
Downloadsrc/llm_providers/__pycache__/__init__.cpython-312.pyc
Downloadexample/advanced-reddit-scraper/.git/info/exclude
1# git ls-files --others --exclude-from=.git/info/exclude
2# Lines that start with '#' are comments.
3# For a project mostly in C, the following would be a good set of
4# exclude patterns (uncomment them if you want to use them):
5# *.[oa]
6# *~
example/advanced-reddit-scraper/.git/hooks/applypatch-msg.sample
1#!/bin/sh
2#
3# An example hook script to check the commit log message taken by
4# applypatch from an e-mail message.
5#
6# The hook should exit with non-zero status after issuing an
7# appropriate message if it wants to stop the commit. The hook is
8# allowed to edit the commit message file.
9#
10# To enable this hook, rename this file to "applypatch-msg".
11
12. git-sh-setup
13commitmsg="$(git rev-parse --git-path hooks/commit-msg)"
14test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"}
15:
example/advanced-reddit-scraper/.git/hooks/commit-msg.sample
1#!/bin/sh
2#
3# An example hook script to check the commit log message.
4# Called by "git commit" with one argument, the name of the file
5# that has the commit message. The hook should exit with non-zero
6# status after issuing an appropriate message if it wants to stop the
7# commit. The hook is allowed to edit the commit message file.
8#
9# To enable this hook, rename this file to "commit-msg".
10
11# Uncomment the below to add a Signed-off-by line to the message.
12# Doing this in a hook is a bad idea in general, but the prepare-commit-msg
13# hook is more suited to it.
14#
15# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
16# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1"
17
18# This example catches duplicate Signed-off-by lines.
19
20test "" = "$(grep '^Signed-off-by: ' "$1" |
21 sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || {
22 echo >&2 Duplicate Signed-off-by lines.
23 exit 1
24}
example/advanced-reddit-scraper/.git/hooks/fsmonitor-watchman.sample
1#!/usr/bin/perl
2
3use strict;
4use warnings;
5use IPC::Open2;
6
7# An example hook script to integrate Watchman
8# (https://facebook.github.io/watchman/) with git to speed up detecting
9# new and modified files.
10#
11# The hook is passed a version (currently 2) and last update token
12# formatted as a string and outputs to stdout a new update token and
13# all files that have been modified since the update token. Paths must
14# be relative to the root of the working tree and separated by a single NUL.
15#
16# To enable this hook, rename this file to "query-watchman" and set
17# 'git config core.fsmonitor .git/hooks/query-watchman'
18#
19my ($version, $last_update_token) = @ARGV;
20
21# Uncomment for debugging
22# print STDERR "$0 $version $last_update_token\n";
23
24# Check the hook interface version
25if ($version ne 2) {
26 die "Unsupported query-fsmonitor hook version '$version'.\n" .
27 "Falling back to scanning...\n";
28}
29
30my $git_work_tree = get_working_dir();
31
32my $retry = 1;
33
34my $json_pkg;
35eval {
36 require JSON::XS;
37 $json_pkg = "JSON::XS";
38 1;
39} or do {
40 require JSON::PP;
41 $json_pkg = "JSON::PP";
42};
43
44launch_watchman();
45
46sub launch_watchman {
47 my $o = watchman_query();
48 if (is_work_tree_watched($o)) {
49 output_result($o->{clock}, @{$o->{files}});
50 }
51}
52
53sub output_result {
54 my ($clockid, @files) = @_;
55
56 # Uncomment for debugging watchman output
57 # open (my $fh, ">", ".git/watchman-output.out");
58 # binmode $fh, ":utf8";
59 # print $fh "$clockid\n@files\n";
60 # close $fh;
61
62 binmode STDOUT, ":utf8";
63 print $clockid;
64 print "\0";
65 local $, = "\0";
66 print @files;
67}
68
69sub watchman_clock {
70 my $response = qx/watchman clock "$git_work_tree"/;
71 die "Failed to get clock id on '$git_work_tree'.\n" .
72 "Falling back to scanning...\n" if $? != 0;
73
74 return $json_pkg->new->utf8->decode($response);
75}
76
77sub watchman_query {
78 my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty')
79 or die "open2() failed: $!\n" .
80 "Falling back to scanning...\n";
81
82 # In the query expression below we're asking for names of files that
83 # changed since $last_update_token but not from the .git folder.
84 #
85 # To accomplish this, we're using the "since" generator to use the
86 # recency index to select candidate nodes and "fields" to limit the
87 # output to file names only. Then we're using the "expression" term to
88 # further constrain the results.
89 my $last_update_line = "";
90 if (substr($last_update_token, 0, 1) eq "c") {
91 $last_update_token = "\"$last_update_token\"";
92 $last_update_line = qq[\n"since": $last_update_token,];
93 }
94 my $query = <<" END";
95 ["query", "$git_work_tree", {$last_update_line
96 "fields": ["name"],
97 "expression": ["not", ["dirname", ".git"]]
98 }]
99 END
100
101 # Uncomment for debugging the watchman query
102 # open (my $fh, ">", ".git/watchman-query.json");
103 # print $fh $query;
104 # close $fh;
105
106 print CHLD_IN $query;
107 close CHLD_IN;
108 my $response = do {local $/; <CHLD_OUT>};
109
110 # Uncomment for debugging the watch response
111 # open ($fh, ">", ".git/watchman-response.json");
112 # print $fh $response;
113 # close $fh;
114
115 die "Watchman: command returned no output.\n" .
116 "Falling back to scanning...\n" if $response eq "";
117 die "Watchman: command returned invalid output: $response\n" .
118 "Falling back to scanning...\n" unless $response =~ /^\{/;
119
120 return $json_pkg->new->utf8->decode($response);
121}
122
123sub is_work_tree_watched {
124 my ($output) = @_;
125 my $error = $output->{error};
126 if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) {
127 $retry--;
128 my $response = qx/watchman watch "$git_work_tree"/;
129 die "Failed to make watchman watch '$git_work_tree'.\n" .
130 "Falling back to scanning...\n" if $? != 0;
131 $output = $json_pkg->new->utf8->decode($response);
132 $error = $output->{error};
133 die "Watchman: $error.\n" .
134 "Falling back to scanning...\n" if $error;
135
136 # Uncomment for debugging watchman output
137 # open (my $fh, ">", ".git/watchman-output.out");
138 # close $fh;
139
140 # Watchman will always return all files on the first query so
141 # return the fast "everything is dirty" flag to git and do the
142 # Watchman query just to get it over with now so we won't pay
143 # the cost in git to look up each individual file.
144 my $o = watchman_clock();
145 $error = $output->{error};
146
147 die "Watchman: $error.\n" .
148 "Falling back to scanning...\n" if $error;
149
150 output_result($o->{clock}, ("/"));
151 $last_update_token = $o->{clock};
152
153 eval { launch_watchman() };
154 return 0;
155 }
156
157 die "Watchman: $error.\n" .
158 "Falling back to scanning...\n" if $error;
159
160 return 1;
161}
162
163sub get_working_dir {
164 my $working_dir;
165 if ($^O =~ 'msys' || $^O =~ 'cygwin') {
166 $working_dir = Win32::GetCwd();
167 $working_dir =~ tr/\\/\//;
168 } else {
169 require Cwd;
170 $working_dir = Cwd::cwd();
171 }
172
173 return $working_dir;
174}
example/advanced-reddit-scraper/.git/hooks/post-update.sample
1#!/bin/sh
2#
3# An example hook script to prepare a packed repository for use over
4# dumb transports.
5#
6# To enable this hook, rename this file to "post-update".
7
8exec git update-server-info
example/advanced-reddit-scraper/.git/hooks/pre-applypatch.sample
1#!/bin/sh
2#
3# An example hook script to verify what is about to be committed
4# by applypatch from an e-mail message.
5#
6# The hook should exit with non-zero status after issuing an
7# appropriate message if it wants to stop the commit.
8#
9# To enable this hook, rename this file to "pre-applypatch".
10
11. git-sh-setup
12precommit="$(git rev-parse --git-path hooks/pre-commit)"
13test -x "$precommit" && exec "$precommit" ${1+"$@"}
14:
example/advanced-reddit-scraper/.git/hooks/pre-commit.sample
1#!/bin/sh
2#
3# An example hook script to verify what is about to be committed.
4# Called by "git commit" with no arguments. The hook should
5# exit with non-zero status after issuing an appropriate message if
6# it wants to stop the commit.
7#
8# To enable this hook, rename this file to "pre-commit".
9
10if git rev-parse --verify HEAD >/dev/null 2>&1
11then
12 against=HEAD
13else
14 # Initial commit: diff against an empty tree object
15 against=$(git hash-object -t tree /dev/null)
16fi
17
18# If you want to allow non-ASCII filenames set this variable to true.
19allownonascii=$(git config --type=bool hooks.allownonascii)
20
21# Redirect output to stderr.
22exec 1>&2
23
24# Cross platform projects tend to avoid non-ASCII filenames; prevent
25# them from being added to the repository. We exploit the fact that the
26# printable range starts at the space character and ends with tilde.
27if [ "$allownonascii" != "true" ] &&
28 # Note that the use of brackets around a tr range is ok here, (it's
29 # even required, for portability to Solaris 10's /usr/bin/tr), since
30 # the square bracket bytes happen to fall in the designated range.
31 test $(git diff-index --cached --name-only --diff-filter=A -z $against |
32 LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0
33then
34 cat <<\EOF
35Error: Attempt to add a non-ASCII file name.
36
37This can cause problems if you want to work with people on other platforms.
38
39To be portable it is advisable to rename the file.
40
41If you know what you are doing you can disable this check using:
42
43 git config hooks.allownonascii true
44EOF
45 exit 1
46fi
47
48# If there are whitespace errors, print the offending file names and fail.
49exec git diff-index --check --cached $against --
example/advanced-reddit-scraper/.git/hooks/pre-merge-commit.sample
1#!/bin/sh
2#
3# An example hook script to verify what is about to be committed.
4# Called by "git merge" with no arguments. The hook should
5# exit with non-zero status after issuing an appropriate message to
6# stderr if it wants to stop the merge commit.
7#
8# To enable this hook, rename this file to "pre-merge-commit".
9
10. git-sh-setup
11test -x "$GIT_DIR/hooks/pre-commit" &&
12 exec "$GIT_DIR/hooks/pre-commit"
13:
example/advanced-reddit-scraper/.git/hooks/pre-push.sample
1#!/bin/sh
2
3# An example hook script to verify what is about to be pushed. Called by "git
4# push" after it has checked the remote status, but before anything has been
5# pushed. If this script exits with a non-zero status nothing will be pushed.
6#
7# This hook is called with the following parameters:
8#
9# $1 -- Name of the remote to which the push is being done
10# $2 -- URL to which the push is being done
11#
12# If pushing without using a named remote those arguments will be equal.
13#
14# Information about the commits which are being pushed is supplied as lines to
15# the standard input in the form:
16#
17# <local ref> <local oid> <remote ref> <remote oid>
18#
19# This sample shows how to prevent push of commits where the log message starts
20# with "WIP" (work in progress).
21
22remote="$1"
23url="$2"
24
25zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
26
27while read local_ref local_oid remote_ref remote_oid
28do
29 if test "$local_oid" = "$zero"
30 then
31 # Handle delete
32 :
33 else
34 if test "$remote_oid" = "$zero"
35 then
36 # New branch, examine all commits
37 range="$local_oid"
38 else
39 # Update to existing branch, examine new commits
40 range="$remote_oid..$local_oid"
41 fi
42
43 # Check for WIP commit
44 commit=$(git rev-list -n 1 --grep '^WIP' "$range")
45 if test -n "$commit"
46 then
47 echo >&2 "Found WIP commit in $local_ref, not pushing"
48 exit 1
49 fi
50 fi
51done
52
53exit 0
example/advanced-reddit-scraper/.git/hooks/pre-rebase.sample
1#!/bin/sh
2#
3# Copyright (c) 2006, 2008 Junio C Hamano
4#
5# The "pre-rebase" hook is run just before "git rebase" starts doing
6# its job, and can prevent the command from running by exiting with
7# non-zero status.
8#
9# The hook is called with the following parameters:
10#
11# $1 -- the upstream the series was forked from.
12# $2 -- the branch being rebased (or empty when rebasing the current branch).
13#
14# This sample shows how to prevent topic branches that are already
15# merged to 'next' branch from getting rebased, because allowing it
16# would result in rebasing already published history.
17
18publish=next
19basebranch="$1"
20if test "$#" = 2
21then
22 topic="refs/heads/$2"
23else
24 topic=`git symbolic-ref HEAD` ||
25 exit 0 ;# we do not interrupt rebasing detached HEAD
26fi
27
28case "$topic" in
29refs/heads/??/*)
30 ;;
31*)
32 exit 0 ;# we do not interrupt others.
33 ;;
34esac
35
36# Now we are dealing with a topic branch being rebased
37# on top of master. Is it OK to rebase it?
38
39# Does the topic really exist?
40git show-ref -q "$topic" || {
41 echo >&2 "No such branch $topic"
42 exit 1
43}
44
45# Is topic fully merged to master?
46not_in_master=`git rev-list --pretty=oneline ^master "$topic"`
47if test -z "$not_in_master"
48then
49 echo >&2 "$topic is fully merged to master; better remove it."
50 exit 1 ;# we could allow it, but there is no point.
51fi
52
53# Is topic ever merged to next? If so you should not be rebasing it.
54only_next_1=`git rev-list ^master "^$topic" ${publish} | sort`
55only_next_2=`git rev-list ^master ${publish} | sort`
56if test "$only_next_1" = "$only_next_2"
57then
58 not_in_topic=`git rev-list "^$topic" master`
59 if test -z "$not_in_topic"
60 then
61 echo >&2 "$topic is already up to date with master"
62 exit 1 ;# we could allow it, but there is no point.
63 else
64 exit 0
65 fi
66else
67 not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"`
68 /usr/bin/perl -e '
69 my $topic = $ARGV[0];
70 my $msg = "* $topic has commits already merged to public branch:\n";
71 my (%not_in_next) = map {
72 /^([0-9a-f]+) /;
73 ($1 => 1);
74 } split(/\n/, $ARGV[1]);
75 for my $elem (map {
76 /^([0-9a-f]+) (.*)$/;
77 [$1 => $2];
78 } split(/\n/, $ARGV[2])) {
79 if (!exists $not_in_next{$elem->[0]}) {
80 if ($msg) {
81 print STDERR $msg;
82 undef $msg;
83 }
84 print STDERR " $elem->[1]\n";
85 }
86 }
87 ' "$topic" "$not_in_next" "$not_in_master"
88 exit 1
89fi
90
91<<\DOC_END
92
93This sample hook safeguards topic branches that have been
94published from being rewound.
95
96The workflow assumed here is:
97
98 * Once a topic branch forks from "master", "master" is never
99 merged into it again (either directly or indirectly).
100
101 * Once a topic branch is fully cooked and merged into "master",
102 it is deleted. If you need to build on top of it to correct
103 earlier mistakes, a new topic branch is created by forking at
104 the tip of the "master". This is not strictly necessary, but
105 it makes it easier to keep your history simple.
106
107 * Whenever you need to test or publish your changes to topic
108 branches, merge them into "next" branch.
109
110The script, being an example, hardcodes the publish branch name
111to be "next", but it is trivial to make it configurable via
112$GIT_DIR/config mechanism.
113
114With this workflow, you would want to know:
115
116(1) ... if a topic branch has ever been merged to "next". Young
117 topic branches can have stupid mistakes you would rather
118 clean up before publishing, and things that have not been
119 merged into other branches can be easily rebased without
120 affecting other people. But once it is published, you would
121 not want to rewind it.
122
123(2) ... if a topic branch has been fully merged to "master".
124 Then you can delete it. More importantly, you should not
125 build on top of it -- other people may already want to
126 change things related to the topic as patches against your
127 "master", so if you need further changes, it is better to
128 fork the topic (perhaps with the same name) afresh from the
129 tip of "master".
130
131Let's look at this example:
132
133 o---o---o---o---o---o---o---o---o---o "next"
134 / / / /
135 / a---a---b A / /
136 / / / /
137 / / c---c---c---c B /
138 / / / \ /
139 / / / b---b C \ /
140 / / / / \ /
141 ---o---o---o---o---o---o---o---o---o---o---o "master"
142
143
144A, B and C are topic branches.
145
146 * A has one fix since it was merged up to "next".
147
148 * B has finished. It has been fully merged up to "master" and "next",
149 and is ready to be deleted.
150
151 * C has not merged to "next" at all.
152
153We would want to allow C to be rebased, refuse A, and encourage
154B to be deleted.
155
156To compute (1):
157
158 git rev-list ^master ^topic next
159 git rev-list ^master next
160
161 if these match, topic has not merged in next at all.
162
163To compute (2):
164
165 git rev-list master..topic
166
167 if this is empty, it is fully merged to "master".
168
169DOC_END
example/advanced-reddit-scraper/.git/hooks/pre-receive.sample
1#!/bin/sh
2#
3# An example hook script to make use of push options.
4# The example simply echoes all push options that start with 'echoback='
5# and rejects all pushes when the "reject" push option is used.
6#
7# To enable this hook, rename this file to "pre-receive".
8
9if test -n "$GIT_PUSH_OPTION_COUNT"
10then
11 i=0
12 while test "$i" -lt "$GIT_PUSH_OPTION_COUNT"
13 do
14 eval "value=\$GIT_PUSH_OPTION_$i"
15 case "$value" in
16 echoback=*)
17 echo "echo from the pre-receive-hook: ${value#*=}" >&2
18 ;;
19 reject)
20 exit 1
21 esac
22 i=$((i + 1))
23 done
24fi
example/advanced-reddit-scraper/.git/hooks/prepare-commit-msg.sample
1#!/bin/sh
2#
3# An example hook script to prepare the commit log message.
4# Called by "git commit" with the name of the file that has the
5# commit message, followed by the description of the commit
6# message's source. The hook's purpose is to edit the commit
7# message file. If the hook fails with a non-zero status,
8# the commit is aborted.
9#
10# To enable this hook, rename this file to "prepare-commit-msg".
11
12# This hook includes three examples. The first one removes the
13# "# Please enter the commit message..." help message.
14#
15# The second includes the output of "git diff --name-status -r"
16# into the message, just before the "git status" output. It is
17# commented because it doesn't cope with --amend or with squashed
18# commits.
19#
20# The third example adds a Signed-off-by line to the message, that can
21# still be edited. This is rarely a good idea.
22
23COMMIT_MSG_FILE=$1
24COMMIT_SOURCE=$2
25SHA1=$3
26
27/usr/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' "$COMMIT_MSG_FILE"
28
29# case "$COMMIT_SOURCE,$SHA1" in
30# ,|template,)
31# /usr/bin/perl -i.bak -pe '
32# print "\n" . `git diff --cached --name-status -r`
33# if /^#/ && $first++ == 0' "$COMMIT_MSG_FILE" ;;
34# *) ;;
35# esac
36
37# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
38# git interpret-trailers --in-place --trailer "$SOB" "$COMMIT_MSG_FILE"
39# if test -z "$COMMIT_SOURCE"
40# then
41# /usr/bin/perl -i.bak -pe 'print "\n" if !$first_line++' "$COMMIT_MSG_FILE"
42# fi
example/advanced-reddit-scraper/.git/hooks/push-to-checkout.sample
1#!/bin/sh
2
3# An example hook script to update a checked-out tree on a git push.
4#
5# This hook is invoked by git-receive-pack(1) when it reacts to git
6# push and updates reference(s) in its repository, and when the push
7# tries to update the branch that is currently checked out and the
8# receive.denyCurrentBranch configuration variable is set to
9# updateInstead.
10#
11# By default, such a push is refused if the working tree and the index
12# of the remote repository has any difference from the currently
13# checked out commit; when both the working tree and the index match
14# the current commit, they are updated to match the newly pushed tip
15# of the branch. This hook is to be used to override the default
16# behaviour; however the code below reimplements the default behaviour
17# as a starting point for convenient modification.
18#
19# The hook receives the commit with which the tip of the current
20# branch is going to be updated:
21commit=$1
22
23# It can exit with a non-zero status to refuse the push (when it does
24# so, it must not modify the index or the working tree).
25die () {
26 echo >&2 "$*"
27 exit 1
28}
29
30# Or it can make any necessary changes to the working tree and to the
31# index to bring them to the desired state when the tip of the current
32# branch is updated to the new commit, and exit with a zero status.
33#
34# For example, the hook can simply run git read-tree -u -m HEAD "$1"
35# in order to emulate git fetch that is run in the reverse direction
36# with git push, as the two-tree form of git read-tree -u -m is
37# essentially the same as git switch or git checkout that switches
38# branches while keeping the local changes in the working tree that do
39# not interfere with the difference between the branches.
40
41# The below is a more-or-less exact translation to shell of the C code
42# for the default behaviour for git's push-to-checkout hook defined in
43# the push_to_deploy() function in builtin/receive-pack.c.
44#
45# Note that the hook will be executed from the repository directory,
46# not from the working tree, so if you want to perform operations on
47# the working tree, you will have to adapt your code accordingly, e.g.
48# by adding "cd .." or using relative paths.
49
50if ! git update-index -q --ignore-submodules --refresh
51then
52 die "Up-to-date check failed"
53fi
54
55if ! git diff-files --quiet --ignore-submodules --
56then
57 die "Working directory has unstaged changes"
58fi
59
60# This is a rough translation of:
61#
62# head_has_history() ? "HEAD" : EMPTY_TREE_SHA1_HEX
63if git cat-file -e HEAD 2>/dev/null
64then
65 head=HEAD
66else
67 head=$(git hash-object -t tree --stdin </dev/null)
68fi
69
70if ! git diff-index --quiet --cached --ignore-submodules $head --
71then
72 die "Working directory has staged changes"
73fi
74
75if ! git read-tree -u -m "$commit"
76then
77 die "Could not update working tree to new HEAD"
78fi
example/advanced-reddit-scraper/.git/hooks/sendemail-validate.sample
1#!/bin/sh
2
3# An example hook script to validate a patch (and/or patch series) before
4# sending it via email.
5#
6# The hook should exit with non-zero status after issuing an appropriate
7# message if it wants to prevent the email(s) from being sent.
8#
9# To enable this hook, rename this file to "sendemail-validate".
10#
11# By default, it will only check that the patch(es) can be applied on top of
12# the default upstream branch without conflicts in a secondary worktree. After
13# validation (successful or not) of the last patch of a series, the worktree
14# will be deleted.
15#
16# The following config variables can be set to change the default remote and
17# remote ref that are used to apply the patches against:
18#
19# sendemail.validateRemote (default: origin)
20# sendemail.validateRemoteRef (default: HEAD)
21#
22# Replace the TODO placeholders with appropriate checks according to your
23# needs.
24
25validate_cover_letter () {
26 file="$1"
27 # TODO: Replace with appropriate checks (e.g. spell checking).
28 true
29}
30
31validate_patch () {
32 file="$1"
33 # Ensure that the patch applies without conflicts.
34 git am -3 "$file" || return
35 # TODO: Replace with appropriate checks for this patch
36 # (e.g. checkpatch.pl).
37 true
38}
39
40validate_series () {
41 # TODO: Replace with appropriate checks for the whole series
42 # (e.g. quick build, coding style checks, etc.).
43 true
44}
45
46# main -------------------------------------------------------------------------
47
48if test "$GIT_SENDEMAIL_FILE_COUNTER" = 1
49then
50 remote=$(git config --default origin --get sendemail.validateRemote) &&
51 ref=$(git config --default HEAD --get sendemail.validateRemoteRef) &&
52 worktree=$(mktemp --tmpdir -d sendemail-validate.XXXXXXX) &&
53 git worktree add -fd --checkout "$worktree" "refs/remotes/$remote/$ref" &&
54 git config --replace-all sendemail.validateWorktree "$worktree"
55else
56 worktree=$(git config --get sendemail.validateWorktree)
57fi || {
58 echo "sendemail-validate: error: failed to prepare worktree" >&2
59 exit 1
60}
61
62unset GIT_DIR GIT_WORK_TREE
63cd "$worktree" &&
64
65if grep -q "^diff --git " "$1"
66then
67 validate_patch "$1"
68else
69 validate_cover_letter "$1"
70fi &&
71
72if test "$GIT_SENDEMAIL_FILE_COUNTER" = "$GIT_SENDEMAIL_FILE_TOTAL"
73then
74 git config --unset-all sendemail.validateWorktree &&
75 trap 'git worktree remove -ff "$worktree"' EXIT &&
76 validate_series
77fi
example/advanced-reddit-scraper/.git/hooks/update.sample
1#!/bin/sh
2#
3# An example hook script to block unannotated tags from entering.
4# Called by "git receive-pack" with arguments: refname sha1-old sha1-new
5#
6# To enable this hook, rename this file to "update".
7#
8# Config
9# ------
10# hooks.allowunannotated
11# This boolean sets whether unannotated tags will be allowed into the
12# repository. By default they won't be.
13# hooks.allowdeletetag
14# This boolean sets whether deleting tags will be allowed in the
15# repository. By default they won't be.
16# hooks.allowmodifytag
17# This boolean sets whether a tag may be modified after creation. By default
18# it won't be.
19# hooks.allowdeletebranch
20# This boolean sets whether deleting branches will be allowed in the
21# repository. By default they won't be.
22# hooks.denycreatebranch
23# This boolean sets whether remotely creating branches will be denied
24# in the repository. By default this is allowed.
25#
26
27# --- Command line
28refname="$1"
29oldrev="$2"
30newrev="$3"
31
32# --- Safety check
33if [ -z "$GIT_DIR" ]; then
34 echo "Don't run this script from the command line." >&2
35 echo " (if you want, you could supply GIT_DIR then run" >&2
36 echo " $0 <ref> <oldrev> <newrev>)" >&2
37 exit 1
38fi
39
40if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then
41 echo "usage: $0 <ref> <oldrev> <newrev>" >&2
42 exit 1
43fi
44
45# --- Config
46allowunannotated=$(git config --type=bool hooks.allowunannotated)
47allowdeletebranch=$(git config --type=bool hooks.allowdeletebranch)
48denycreatebranch=$(git config --type=bool hooks.denycreatebranch)
49allowdeletetag=$(git config --type=bool hooks.allowdeletetag)
50allowmodifytag=$(git config --type=bool hooks.allowmodifytag)
51
52# check for no description
53projectdesc=$(sed -e '1q' "$GIT_DIR/description")
54case "$projectdesc" in
55"Unnamed repository"* | "")
56 echo "*** Project description file hasn't been set" >&2
57 exit 1
58 ;;
59esac
60
61# --- Check types
62# if $newrev is 0000...0000, it's a commit to delete a ref.
63zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
64if [ "$newrev" = "$zero" ]; then
65 newrev_type=delete
66else
67 newrev_type=$(git cat-file -t $newrev)
68fi
69
70case "$refname","$newrev_type" in
71 refs/tags/*,commit)
72 # un-annotated tag
73 short_refname=${refname##refs/tags/}
74 if [ "$allowunannotated" != "true" ]; then
75 echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2
76 echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2
77 exit 1
78 fi
79 ;;
80 refs/tags/*,delete)
81 # delete tag
82 if [ "$allowdeletetag" != "true" ]; then
83 echo "*** Deleting a tag is not allowed in this repository" >&2
84 exit 1
85 fi
86 ;;
87 refs/tags/*,tag)
88 # annotated tag
89 if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1
90 then
91 echo "*** Tag '$refname' already exists." >&2
92 echo "*** Modifying a tag is not allowed in this repository." >&2
93 exit 1
94 fi
95 ;;
96 refs/heads/*,commit)
97 # branch
98 if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then
99 echo "*** Creating a branch is not allowed in this repository" >&2
100 exit 1
101 fi
102 ;;
103 refs/heads/*,delete)
104 # delete branch
105 if [ "$allowdeletebranch" != "true" ]; then
106 echo "*** Deleting a branch is not allowed in this repository" >&2
107 exit 1
108 fi
109 ;;
110 refs/remotes/*,commit)
111 # tracking branch
112 ;;
113 refs/remotes/*,delete)
114 # delete tracking branch
115 if [ "$allowdeletebranch" != "true" ]; then
116 echo "*** Deleting a tracking branch is not allowed in this repository" >&2
117 exit 1
118 fi
119 ;;
120 *)
121 # Anything else (is there anything else?)
122 echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2
123 exit 1
124 ;;
125esac
126
127# --- Finished
128exit 0
example/advanced-reddit-scraper/.git/logs/HEAD
10000000000000000000000000000000000000000 68034258495c18bfb133b925e51bbce4b07c2cf2 deduble <yunusemremre@gmail.com> 1727917571 +0300 commit (initial): init
268034258495c18bfb133b925e51bbce4b07c2cf2 0000000000000000000000000000000000000000 deduble <yunusemremre@gmail.com> 1727917683 +0300 Branch: renamed refs/heads/master to refs/heads/main
30000000000000000000000000000000000000000 68034258495c18bfb133b925e51bbce4b07c2cf2 deduble <yunusemremre@gmail.com> 1727917683 +0300 Branch: renamed refs/heads/master to refs/heads/main
468034258495c18bfb133b925e51bbce4b07c2cf2 d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 deduble <yunusemremre@gmail.com> 1730031615 +0300 commit: using illegal api
5d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 deduble <yunusemremre@gmail.com> 1739419029 +0300 commit: v1
6c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 f36f5c6f3e4df6f647792db86053f288851ad990 deduble <yunusemremre@gmail.com> 1739420381 +0300 commit: release
Downloadexample/advanced-reddit-scraper/.git/refs/heads/main
f36f5c6f3e4df6f647792db86053f288851ad990
example/advanced-reddit-scraper/.git/logs/refs/heads/main
0000000000000000000000000000000000000000 68034258495c18bfb133b925e51bbce4b07c2cf2 deduble <yunusemremre@gmail.com> 1727917571 +0300 commit (initial): init
68034258495c18bfb133b925e51bbce4b07c2cf2 68034258495c18bfb133b925e51bbce4b07c2cf2 deduble <yunusemremre@gmail.com> 1727917683 +0300 Branch: renamed refs/heads/master to refs/heads/main
68034258495c18bfb133b925e51bbce4b07c2cf2 d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 deduble <yunusemremre@gmail.com> 1730031615 +0300 commit: using illegal api
d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 deduble <yunusemremre@gmail.com> 1739419029 +0300 commit: v1
c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 f36f5c6f3e4df6f647792db86053f288851ad990 deduble <yunusemremre@gmail.com> 1739420381 +0300 commit: release
example/advanced-reddit-scraper/.git/refs/remotes/origin/main
f36f5c6f3e4df6f647792db86053f288851ad990
example/advanced-reddit-scraper/.git/logs/refs/remotes/origin/main
0000000000000000000000000000000000000000 68034258495c18bfb133b925e51bbce4b07c2cf2 deduble <yunusemremre@gmail.com> 1727917690 +0300 update by push
68034258495c18bfb133b925e51bbce4b07c2cf2 d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 deduble <yunusemremre@gmail.com> 1730031636 +0300 update by push
d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 deduble <yunusemremre@gmail.com> 1739419037 +0300 update by push
c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 f36f5c6f3e4df6f647792db86053f288851ad990 deduble <yunusemremre@gmail.com> 1739420388 +0300 update by push
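For orientation, each line in these .git/logs files is a standard reflog entry: previous SHA, new SHA, committer identity, Unix timestamp with timezone offset, and the action that moved the ref. From a working clone the same history can be read back with Git itself, for example (illustrative commands, not part of the Actor):

git reflog show main
git log -g --format='%h %gd %gs' refs/heads/main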
Developer
Maintained by Community
Actor Metrics
1 monthly user
No bookmarks yet
>99% runs succeeded
Created in Mar 2025
Modified a day ago
Categories