AI Job Finder

stefanie-rink/ai-job-finder

Provide a prompt or a CV and find jobs that match your requirements.
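
For example, a minimal run can be configured with just a prompt plus LLM settings; the values below are illustrative and follow the input schema in .actor/INPUT_SCHEMA.json further down (the API key is a placeholder):

{
    "prompt": "Remote senior Python roles at AI companies, posted in the last week",
    "llm_settings": { "provider": "gemini", "model": "gemini-1.5-pro" },
    "api_keys": { "gemini": "<YOUR_GEMINI_API_KEY>" }
}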

requirements.txt

apify-client>=1.0.0,<2.0.0
openai>=1.0.0
anthropic>=0.5.0
google-generativeai==0.3.0
pydantic>=2.0.0

test.py

#!/usr/bin/env python3
import asyncio
import json
import sys
import os
from typing import Dict, Any, Optional

# Import from our modules
from src.llm_providers.factory import create_llm_provider
from src.cv_processor import process_cv
from src.prompt_processor import process_prompt
from src.parameter_handler import apply_parameter_defaults

async def test_cv_processing():
    """Test CV processing with a local file"""
    # Check if file path was provided
    if len(sys.argv) < 2:
        print("Usage: python test.py path/to/cv.pdf [prompt]")
        sys.exit(1)

    # Get CV file path and optional prompt
    cv_path = sys.argv[1]
    prompt = sys.argv[2] if len(sys.argv) > 2 else None

    # Check if API key is set
    openai_key = os.environ.get("OPENAI_API_KEY")
    if not openai_key:
        print("ERROR: OPENAI_API_KEY environment variable not set.")
        print("Please set it with: export OPENAI_API_KEY=your-api-key")
        sys.exit(1)

    # Read CV file
    try:
        with open(cv_path, "rb") as f:
            cv_data = f.read()

        # Convert to base64 for testing
        import base64
        import mimetypes
        mime_type, _ = mimetypes.guess_type(cv_path)
        if not mime_type:
            mime_type = "application/octet-stream"

        cv_data_base64 = f"data:{mime_type};base64,{base64.b64encode(cv_data).decode('utf-8')}"
    except Exception as e:
        print(f"Error reading CV file: {str(e)}")
        sys.exit(1)

    # Create LLM provider
    provider = create_llm_provider("openai", openai_key)

    # Process CV
    print("Processing CV...")
    cv_parameters = await process_cv(cv_data_base64, provider, "openai")
    print(f"CV Parameters: {json.dumps(cv_parameters, indent=2)}")

    # Process prompt if provided
    prompt_parameters = {}
    if prompt:
        print("\nProcessing prompt...")
        prompt_parameters = await process_prompt(prompt, provider)
        print(f"Prompt Parameters: {json.dumps(prompt_parameters, indent=2)}")

    # Merge and apply defaults
    parameters = {**cv_parameters, **prompt_parameters}
    final_parameters = apply_parameter_defaults(parameters)

    print("\nFinal LinkedIn Search Parameters:")
    print(json.dumps(final_parameters, indent=2))

    # Note: This test doesn't actually call the LinkedIn scraper
    print("\nTest complete. To perform a real LinkedIn search, upload this Actor to Apify.")

if __name__ == "__main__":
    asyncio.run(test_cv_processing())
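
For reference, the script can be exercised locally as follows; the key, CV path, and prompt are placeholders, and an OpenAI key is required because the test hard-codes the "openai" provider:

export OPENAI_API_KEY=your-api-key
python test.py path/to/cv.pdf "Remote senior Python roles in Berlin"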

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "ai-job-finder",
    "title": "AI Job Finder",
    "description": "An AI-powered tool that reads a CV and/or prompt to find relevant jobs on LinkedIn",
    "version": "0.1",
    "buildTag": "latest",
    "meta": {
        "templateId": "python-apify"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/Dockerfile

# First specify the base Docker image.
FROM apify/actor-python:3.12

# Copy requirements.txt into the Actor image
COPY requirements.txt ./

# Install the packages specified in requirements.txt
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Copy the remaining files and directories with the source code
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor
CMD ["python3", "-m", "src"]
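
Assuming the Apify CLI is installed and you are logged in, the Actor built from this Dockerfile can typically be run locally or deployed from the project root with:

apify run
apify push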

.actor/INPUT_SCHEMA.json

{
    "title": "AI Job Finder",
    "description": "An AI-powered tool that reads a CV and/or prompt to find relevant jobs on LinkedIn",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "cv": {
            "title": "CV/Resume",
            "type": "object",
            "description": "Upload your CV/resume (PDF, DOCX, TXT formats supported) as Base64 encoded string",
            "editor": "file",
            "nullable": true
        },
        "prompt": {
            "title": "Job Search Query",
            "type": "string",
            "description": "Describe the job you're looking for (e.g., 'Senior Python Developer in New York')",
            "editor": "textarea",
            "default": "I'm looking for remote senior software engineering roles in AI companies. I have 5 years of experience with Python and machine learning.",
            "nullable": true
        },
        "llm_settings": {
            "title": "LLM Provider Settings",
            "type": "object",
            "description": "Configure which LLM provider to use",
            "editor": "json",
            "default": {
                "provider": "gemini",
                "model": "gemini-1.5-pro"
            },
            "prefill": {
                "provider": "gemini",
                "model": "gemini-1.5-pro"
            }
        },
        "api_keys": {
            "title": "API Keys",
            "type": "object",
            "description": "API keys for LLM providers (optional - defaults to environment variables)",
            "editor": "json",
            "default": {},
            "prefill": {
                "openai": "",
                "claude": "",
                "gemini": ""
            }
        },
        "linkedin_search_params": {
            "title": "Additional LinkedIn Search Parameters",
            "type": "object",
            "description": "Override specific LinkedIn search parameters",
            "editor": "json",
            "nullable": true
        },
        "proxy": {
            "title": "Proxy Configuration",
            "type": "object",
            "description": "Configure Apify proxy for LinkedIn scraping",
            "editor": "proxy",
            "default": {
                "useApifyProxy": true,
                "apifyProxyGroups": ["RESIDENTIAL"]
            }
        }
    },
    "required": []
}

src/cv_processor.py

import logging
import base64
import json
import re
from typing import Dict, Any, Optional

logger = logging.getLogger(__name__)

async def process_cv(cv_data: str, llm_provider, provider_name: str) -> Dict[str, Any]:
    """
    Process CV data using the appropriate LLM provider

    Args:
        cv_data: CV data (either base64 encoded file or plain text)
        llm_provider: The LLM provider instance to use
        provider_name: Name of the provider ('openai', 'claude', or 'gemini')

    Returns:
        Dictionary of extracted parameters for LinkedIn job search
    """
    try:
        logger.info(f"Processing CV with {provider_name} provider")

        # Process CV with the provider
        cv_parameters = await llm_provider.process_cv(cv_data)

        # Validate and clean the parameters
        cv_parameters = validate_cv_parameters(cv_parameters)

        logger.info(f"Successfully extracted parameters from CV: {json.dumps(cv_parameters, indent=2)}")
        return cv_parameters

    except Exception as e:
        logger.error(f"Error processing CV: {str(e)}")
        # Return empty parameters, which will use defaults later
        return {}

def validate_cv_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate and clean the parameters extracted from the CV

    Args:
        parameters: Raw parameters extracted by the LLM

    Returns:
        Cleaned and validated parameters
    """
    cleaned = {}

    # Clean and validate title
    if "title" in parameters and parameters["title"]:
        cleaned["title"] = str(parameters["title"]).strip()

    # Clean and validate location
    if "location" in parameters and parameters["location"]:
        cleaned["location"] = str(parameters["location"]).strip()

    # Clean and validate experienceLevel
    if "experienceLevel" in parameters and parameters["experienceLevel"]:
        exp_level = str(parameters["experienceLevel"]).strip()
        # Ensure it's a number from 1-5
        if exp_level in ["1", "2", "3", "4", "5"]:
            cleaned["experienceLevel"] = exp_level

    # Clean and validate workType
    if "workType" in parameters and parameters["workType"]:
        work_type = str(parameters["workType"]).strip()
        # Ensure it's a valid work type (1, 2, or 3)
        if work_type in ["1", "2", "3"]:
            cleaned["workType"] = work_type

    # Clean and validate contractType
    if "contractType" in parameters and parameters["contractType"]:
        contract_type = str(parameters["contractType"]).strip().upper()
        # Ensure it's a valid contract type (F, P, C, T, I, or V)
        if contract_type in ["F", "P", "C", "T", "I", "V"]:
            cleaned["contractType"] = contract_type

    # Clean and validate skills (might be used for custom filtering later)
    if "skills" in parameters and isinstance(parameters["skills"], list):
        cleaned["skills"] = [str(skill).strip() for skill in parameters["skills"] if skill]

    return cleaned
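
As an illustration of the validation step, a raw LLM payload with an out-of-range experience level and an empty skill entry would be reduced to the valid fields only (the values are made up):

raw = {"title": " Data Scientist ", "experienceLevel": "7", "workType": "2", "skills": ["Python", ""]}
validate_cv_parameters(raw)
# -> {"title": "Data Scientist", "workType": "2", "skills": ["Python"]}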

src/main.py

#!/usr/bin/env python3
from apify import Actor
import logging
import json
import base64
import re
import os
from typing import Dict, List, Any, Optional

# Import providers
from .llm_providers.factory import create_llm_provider
from .cv_processor import process_cv
from .prompt_processor import process_prompt
from .parameter_handler import apply_parameter_defaults

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

async def main():
    """Main entry point for the Actor"""
    # Initialize the Actor
    await Actor.init()

    # Get input from the actor
    actor_input = await Actor.get_input() or {}

    # Validate input - require at least CV or prompt
    cv_data = actor_input.get("cv")
    prompt = actor_input.get("prompt")

    if not cv_data and not prompt:
        raise ValueError("At least one of CV or prompt must be provided")

    # Get LLM settings
    llm_settings = actor_input.get("llm_settings", {"provider": "gemini", "model": "gemini-1.5-pro"})
    provider_name = llm_settings.get("provider", "gemini")

    # Get API key - first from input, then from environment variables
    api_keys = actor_input.get("api_keys", {})
    api_key = api_keys.get(provider_name)

    # If no API key in input, try to get from environment variables
    if not api_key:
        if provider_name == "openai":
            api_key = os.getenv("OPENAI_API_KEY")
        elif provider_name == "gemini":
            api_key = os.getenv("GEMINI_API_KEY")
        elif provider_name == "claude":
            api_key = os.getenv("CLAUDE_API_KEY")

    # If no API key was found, we can't proceed with LLM processing
    if not api_key:
        logger.warning(f"No API key provided for {provider_name}")
        await Actor.push_data([{
            "title": "LLM API KEY IS NEEDED",
            "description": f"Please provide an API key for {provider_name.upper()} to use this Actor",
            "instructions": f"Set the {provider_name.upper()}_API_KEY environment variable or provide it in the api_keys input parameter",
            "location": "N/A",
            "companyName": "AI Job Finder",
            "experienceLevel": "N/A",
            "workType": "N/A",
            "contractType": "N/A",
            "publishedAt": "N/A",
            "message": f"API key for {provider_name} is required to get real results"
        }])
        logger.info("Returned message indicating API key is needed")
        return

    # Create LLM provider for processing
    model = llm_settings.get("model")
    if provider_name == "gemini" and not model:
        model = "gemini-1.5-pro"

    logger.info(f"Using LLM provider: {provider_name} with model: {model}")
    llm_provider = create_llm_provider(provider_name, api_key, model)

    # Process parameters
    parameters = {}

    # Extract parameters from CV and/or prompt
    if cv_data:
        logger.info("Processing CV...")
        cv_parameters = await process_cv(cv_data, llm_provider, provider_name)
        parameters.update(cv_parameters)

    if prompt:
        logger.info("Processing prompt...")
        try:
            prompt_parameters = await process_prompt(prompt, llm_provider)
            # Prompt parameters override CV parameters
            parameters.update(prompt_parameters)
        except Exception as e:
            logger.error(f"Error processing prompt: {str(e)}")
            # Continue with default parameters

    # Apply any explicit parameters from input
    linkedin_params = actor_input.get("linkedin_search_params", {})
    if linkedin_params:
        parameters.update(linkedin_params)

    # Apply defaults for missing parameters
    parameters = apply_parameter_defaults(parameters)

    # Set proxy configuration
    if "proxy_configuration" in actor_input:
        parameters["proxy"] = actor_input["proxy_configuration"]
    elif "proxy" in actor_input:
        parameters["proxy"] = actor_input["proxy"]

    # Log the parameters we'll use
    logger.info(f"Using LinkedIn search parameters: {json.dumps(parameters, indent=2)}")

    # Call LinkedIn scraper
    logger.info("Calling LinkedIn scraper with parameters")
    try:
        jobs = await call_linkedin_scraper(parameters)

        # Save output
        await Actor.push_data(jobs)
        logger.info(f"Found {len(jobs)} matching jobs")
    except Exception as e:
        logger.error(f"Error calling LinkedIn scraper: {str(e)}")
        # Return a meaningful error to the user
        await Actor.push_data([{
            "title": "Error Connecting to LinkedIn Scraper",
            "description": f"An error occurred while trying to connect to the LinkedIn Jobs Scraper: {str(e)}",
            "error": True,
            "parameters": parameters
        }])

async def call_linkedin_scraper(parameters):
    """Call the LinkedIn scraper with the given parameters"""
    # Prepare the Actor input
    run_input = {
        "title": parameters.get("title", ""),
        "location": parameters.get("location", ""),
        "companyName": parameters.get("companyName", []),
        "companyId": parameters.get("companyId", []),
        "workType": parameters.get("workType", ""),
        "experienceLevel": parameters.get("experienceLevel", ""),
        "contractType": parameters.get("contractType", ""),
        "publishedAt": parameters.get("publishedAt", ""),
        "rows": parameters.get("rows", 10),
        "proxy": parameters.get("proxy", {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"]
        })
    }

    # Run the Actor and wait for it to finish using Actor.apify_client
    # This automatically handles the authentication - no need for explicit API key
    run = await Actor.apify_client.actor("BHzefUZlZRKWxkTck").call(run_input=run_input)

    # Fetch and return the Actor's output
    dataset_items = await Actor.apify_client.dataset(run["defaultDatasetId"]).list_items()
    return dataset_items.items
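
For orientation, a prompt such as "remote senior Python roles in New York posted last week" would typically reach the LinkedIn scraper as a run_input along these lines once defaults are applied (values are illustrative):

{
    "title": "Senior Python Developer",
    "location": "New York",
    "companyName": [],
    "companyId": [],
    "workType": "2",
    "experienceLevel": "4",
    "contractType": "F",
    "publishedAt": "r604800",
    "rows": 10,
    "proxy": { "useApifyProxy": true, "apifyProxyGroups": ["RESIDENTIAL"] }
}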

src/parameter_handler.py

import logging
from typing import Dict, Any

logger = logging.getLogger(__name__)

def apply_parameter_defaults(parameters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply default values for missing parameters

    Args:
        parameters: Current set of parameters

    Returns:
        Parameters with defaults applied
    """
    # Create a copy of the parameters to avoid modifying the original
    final_params = parameters.copy()

    # Check for title (required parameter)
    if "title" not in final_params or not final_params["title"]:
        final_params["title"] = "Software Engineer"  # Default job title
        logger.info("Using default job title: 'Software Engineer'")

    # Set default location if not provided
    if "location" not in final_params or not final_params["location"]:
        final_params["location"] = "United States"  # Country is required, default to United States
        logger.info("Using default location: United States")

    # Set default experience level if not provided
    if "experienceLevel" not in final_params or not final_params["experienceLevel"]:
        final_params["experienceLevel"] = "3"  # Associate
        logger.info("Using default experience level: 3 (Associate)")

    # Set default work type if not provided
    if "workType" not in final_params or not final_params["workType"]:
        final_params["workType"] = ""  # Empty string means any work type
        logger.info("Using default work type: any")

    # Set default contract type if not provided
    if "contractType" not in final_params or not final_params["contractType"]:
        final_params["contractType"] = "F"  # Full-time
        logger.info("Using default contract type: F (Full-Time)")

    # Set default published at if not provided
    if "publishedAt" not in final_params or not final_params["publishedAt"]:
        final_params["publishedAt"] = ""  # Empty string means any time
        logger.info("Using default time frame: any time")

    # Set default company name if not provided
    if "companyName" not in final_params or not final_params["companyName"]:
        final_params["companyName"] = []  # Empty list means any company
        logger.info("Using default company name: any company")

    # Set default company ID if not provided
    if "companyId" not in final_params or not final_params["companyId"]:
        final_params["companyId"] = []  # Empty list means any company ID
        logger.info("Using default company ID: any company ID")

    # Set default rows if not provided
    if "rows" not in final_params or not final_params["rows"]:
        final_params["rows"] = 10  # Default to 10 results
        logger.info("Using default rows: 10")

    # Ensure we have proper proxy configuration
    if "proxy" not in final_params or not final_params["proxy"]:
        final_params["proxy"] = {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"]
        }
        logger.info("Using default proxy configuration")

    return final_params
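
Calling the function with an empty dictionary shows exactly which defaults end up in the LinkedIn query:

apply_parameter_defaults({})
# -> {"title": "Software Engineer", "location": "United States", "experienceLevel": "3",
#     "workType": "", "contractType": "F", "publishedAt": "", "companyName": [], "companyId": [],
#     "rows": 10, "proxy": {"useApifyProxy": True, "apifyProxyGroups": ["RESIDENTIAL"]}}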

src/prompt_processor.py

import logging
import json
from typing import Dict, Any, Optional

logger = logging.getLogger(__name__)

async def process_prompt(prompt: str, llm_provider) -> Dict[str, Any]:
    """
    Process user prompt and extract job search parameters

    Args:
        prompt: User's job search query
        llm_provider: The LLM provider instance to use

    Returns:
        Dictionary of extracted parameters for LinkedIn job search
    """
    try:
        logger.info("Processing user prompt")

        # Process prompt with the provider
        prompt_parameters = await llm_provider.process_prompt(prompt)

        # Validate and clean the parameters
        prompt_parameters = validate_prompt_parameters(prompt_parameters)

        logger.info(f"Successfully extracted parameters from prompt: {json.dumps(prompt_parameters, indent=2)}")
        return prompt_parameters

    except Exception as e:
        logger.error(f"Error processing prompt: {str(e)}")
        # Return empty parameters, which will use defaults later
        return {}

def validate_prompt_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate and clean the parameters extracted from the prompt

    Args:
        parameters: Raw parameters extracted by the LLM

    Returns:
        Cleaned and validated parameters
    """
    cleaned = {}

    # Clean and validate title
    if "title" in parameters and parameters["title"]:
        cleaned["title"] = str(parameters["title"]).strip()

    # Clean and validate location
    if "location" in parameters and parameters["location"]:
        cleaned["location"] = str(parameters["location"]).strip()

    # Clean and validate experienceLevel
    if "experienceLevel" in parameters and parameters["experienceLevel"]:
        exp_level = str(parameters["experienceLevel"]).strip()
        # Ensure it's a number from 1-5
        if exp_level in ["1", "2", "3", "4", "5"]:
            cleaned["experienceLevel"] = exp_level

    # Clean and validate workType
    if "workType" in parameters and parameters["workType"]:
        work_type = str(parameters["workType"]).strip()
        # Ensure it's a valid work type (1, 2, or 3)
        if work_type in ["1", "2", "3"]:
            cleaned["workType"] = work_type

    # Clean and validate contractType
    if "contractType" in parameters and parameters["contractType"]:
        contract_type = str(parameters["contractType"]).strip().upper()
        # Ensure it's a valid contract type (F, P, C, T, I, or V)
        if contract_type in ["F", "P", "C", "T", "I", "V"]:
            cleaned["contractType"] = contract_type

    # Clean and validate publishedAt
    if "publishedAt" in parameters and parameters["publishedAt"]:
        published_at = str(parameters["publishedAt"]).strip()
        # Ensure it's a valid time frame
        if published_at in ["r86400", "r604800", "r2592000", ""]:
            cleaned["publishedAt"] = published_at

    # Clean and validate rows
    if "rows" in parameters and parameters["rows"]:
        try:
            rows = int(parameters["rows"])
            if rows > 0:
                cleaned["rows"] = rows
        except (ValueError, TypeError):
            pass

    # Clean and validate companyName
    if "companyName" in parameters and isinstance(parameters["companyName"], list):
        cleaned["companyName"] = [str(company).strip() for company in parameters["companyName"] if company]

    # Clean and validate companyId
    if "companyId" in parameters and isinstance(parameters["companyId"], list):
        cleaned["companyId"] = [str(company_id).strip() for company_id in parameters["companyId"] if company_id]

    return cleaned
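
For example, a query like "Data engineer jobs at Google, remote, posted in the last week, show 20 results" would, with a well-behaved LLM response, be cleaned down to something like this (illustrative):

{"title": "Data Engineer", "companyName": ["Google"], "workType": "2", "publishedAt": "r604800", "rows": 20}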

src/__init__.py

# AI Job Finder package

src/__main__.py

import asyncio

from .main import main

# Execute the Actor entrypoint
asyncio.run(main())


example/advanced-reddit-scraper/.dockerignore

1.git
2.mise.toml
3.nvim.lua
4storage
5
6# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
7
8# Byte-compiled / optimized / DLL files
9__pycache__/
10*.py[cod]
11*$py.class
12
13# C extensions
14*.so
15
16# Distribution / packaging
17.Python
18build/
19develop-eggs/
20dist/
21downloads/
22eggs/
23.eggs/
24lib/
25lib64/
26parts/
27sdist/
28var/
29wheels/
30share/python-wheels/
31*.egg-info/
32.installed.cfg
33*.egg
34MANIFEST
35
36# PyInstaller
37#  Usually these files are written by a python script from a template
38#  before PyInstaller builds the exe, so as to inject date/other infos into it.
39*.manifest
40*.spec
41
42# Installer logs
43pip-log.txt
44pip-delete-this-directory.txt
45
46# Unit test / coverage reports
47htmlcov/
48.tox/
49.nox/
50.coverage
51.coverage.*
52.cache
53nosetests.xml
54coverage.xml
55*.cover
56*.py,cover
57.hypothesis/
58.pytest_cache/
59cover/
60
61# Translations
62*.mo
63*.pot
64
65# Django stuff:
66*.log
67local_settings.py
68db.sqlite3
69db.sqlite3-journal
70
71# Flask stuff:
72instance/
73.webassets-cache
74
75# Scrapy stuff:
76.scrapy
77
78# Sphinx documentation
79docs/_build/
80
81# PyBuilder
82.pybuilder/
83target/
84
85# Jupyter Notebook
86.ipynb_checkpoints
87
88# IPython
89profile_default/
90ipython_config.py
91
92# pyenv
93#   For a library or package, you might want to ignore these files since the code is
94#   intended to run in multiple environments; otherwise, check them in:
95.python-version
96
97# pdm
98#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
99#pdm.lock
100#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
101#   in version control.
102#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
103.pdm.toml
104.pdm-python
105.pdm-build/
106
107# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
108__pypackages__/
109
110# Celery stuff
111celerybeat-schedule
112celerybeat.pid
113
114# SageMath parsed files
115*.sage.py
116
117# Environments
118.env
119.venv
120env/
121venv/
122ENV/
123env.bak/
124venv.bak/
125
126# Spyder project settings
127.spyderproject
128.spyproject
129
130# Rope project settings
131.ropeproject
132
133# mkdocs documentation
134/site
135
136# mypy
137.mypy_cache/
138.dmypy.json
139dmypy.json
140
141# Pyre type checker
142.pyre/
143
144# pytype static type analyzer
145.pytype/
146
147# Cython debug symbols
148cython_debug/
149
150# PyCharm
151#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
152#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
153#  and can be added to the global gitignore or merged into this file.  For a more nuclear
154#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
155.idea/

example/advanced-reddit-scraper/.gitignore

1.mise.toml
2.nvim.lua
3storage
4
5# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
6
7# Byte-compiled / optimized / DLL files
8__pycache__/
9*.py[cod]
10*$py.class
11
12# C extensions
13*.so
14
15# Distribution / packaging
16.Python
17build/
18develop-eggs/
19dist/
20downloads/
21eggs/
22.eggs/
23lib/
24lib64/
25parts/
26sdist/
27var/
28wheels/
29share/python-wheels/
30*.egg-info/
31.installed.cfg
32*.egg
33MANIFEST
34
35# PyInstaller
36#  Usually these files are written by a python script from a template
37#  before PyInstaller builds the exe, so as to inject date/other infos into it.
38*.manifest
39*.spec
40
41# Installer logs
42pip-log.txt
43pip-delete-this-directory.txt
44
45# Unit test / coverage reports
46htmlcov/
47.tox/
48.nox/
49.coverage
50.coverage.*
51.cache
52nosetests.xml
53coverage.xml
54*.cover
55*.py,cover
56.hypothesis/
57.pytest_cache/
58cover/
59
60# Translations
61*.mo
62*.pot
63
64# Django stuff:
65*.log
66local_settings.py
67db.sqlite3
68db.sqlite3-journal
69
70# Flask stuff:
71instance/
72.webassets-cache
73
74# Scrapy stuff:
75.scrapy
76
77# Sphinx documentation
78docs/_build/
79
80# PyBuilder
81.pybuilder/
82target/
83
84# Jupyter Notebook
85.ipynb_checkpoints
86
87# IPython
88profile_default/
89ipython_config.py
90
91# pyenv
92#   For a library or package, you might want to ignore these files since the code is
93#   intended to run in multiple environments; otherwise, check them in:
94.python-version
95
96# pdm
97#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
98#pdm.lock
99#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
100#   in version control.
101#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
102.pdm.toml
103.pdm-python
104.pdm-build/
105
106# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
107__pypackages__/
108
109# Celery stuff
110celerybeat-schedule
111celerybeat.pid
112
113# SageMath parsed files
114*.sage.py
115
116# Environments
117.env
118.venv
119env/
120venv/
121ENV/
122env.bak/
123venv.bak/
124
125# Spyder project settings
126.spyderproject
127.spyproject
128
129# Rope project settings
130.ropeproject
131
132# mkdocs documentation
133/site
134
135# mypy
136.mypy_cache/
137.dmypy.json
138dmypy.json
139
140# Pyre type checker
141.pyre/
142
143# pytype static type analyzer
144.pytype/
145
146# Cython debug symbols
147cython_debug/
148
149# PyCharm
150#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
151#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
152#  and can be added to the global gitignore or merged into this file.  For a more nuclear
153#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
154.idea/
155
156# Added by Apify CLI
157node_modules

example/advanced-reddit-scraper/README.md

# 🚀 Advanced Reddit Scraper for Apify Actors - Lightning Fast & SEO Optimized

Unlock the full potential of Reddit data with our Advanced Reddit Scraper designed for the Apify platform. This high-performance tool uses lightning-fast requests to extract extensive subreddit information, providing researchers, marketers, and data enthusiasts with unparalleled social media insights.

## 📊 Comprehensive Reddit Data Extraction

Our Reddit Scraper offers a robust set of features that allow you to collect detailed data from any subreddit using Apify's powerful actor architecture. Enjoy rapid scraping with optimal performance while taking advantage of customizable settings tailored for your data requirements.

### 🔥 Key SEO-Optimized Features

- **Full Subreddit Scraping**: Extract every detail from the target subreddits, capturing posts, comments, and metadata.
- **Customizable Data Fields**: Configure exactly what you're after, ensuring that you only get the data that matters.
- **Lightning Fast Performance**: Utilizes Python requests for rapid data retrieval, so you never miss a trending topic.
- **Scalable Data Collection**: Effortlessly scrape multiple subreddits simultaneously, ideal for large-scale data mining.
- **Real-Time Insights**: Obtain the most current Reddit information, perfect for real-time analytics and trend monitoring.
- **Easy Integration with Data Pipelines**: Seamlessly export data in various formats (JSON, CSV, etc.) for immediate analysis.

### 🌟 Use Cases for Maximum Impact

1. **Market Research & Trend Analysis**: Monitor public opinion and identify trending topics across subreddits.
2. **Content Creation & Optimization**: Discover viral posts and themes to inspire your content strategy.
3. **Sentiment Analysis**: Analyze user reactions and sentiments using detailed comment extraction.
4. **Competitive Intelligence**: Stay ahead by tracking competitor mentions and industry-specific discussions.
5. **Academic & Social Media Research**: Gather comprehensive data for scholarly studies and social behavior analysis.

### 🛠️ How It Works

1. **Input Parameters**:
   - **Queries**: Provide one or more subreddit URLs in the format `https://reddit.com/r/<subreddit>`.
   - **Post Sorting**: Choose how posts are sorted (e.g., `hot`, `new`, `top`, or `rising`).
   - **Top Period**: Specify the period for top posts (e.g., `day`, `week`, or `all`).
   - **Max Posts**: Set the maximum number of posts to scrape per subreddit.
   - **Comment Sorting**: Define the method to sort comments (e.g., `best`, `top`, `new`).
   - **Number of Comments**: Determine how many comments (and a few nested replies) to extract per post.
2. **Execution**: Our scraper efficiently navigates Reddit using HTTP requests, ensuring quick and reliable data extraction while strictly following Reddit's guidelines.
3. **Output**: Receive clean, structured data ready for analysis and integration into your existing workflows.

### 📈 Why Our Reddit Scraper Stands Out

- **Comprehensive Data Collection**: Capture every available piece of information from the subreddits you track.
- **High-Speed Requests**: Leveraging the fastest possible scraping techniques to give you immediate insights.
- **Customizable & Flexible**: Tailor the scraping process to meet diverse and specific data needs.
- **Enterprise-Grade Scalability**: Perfect for both small-scale projects and large-scale data operations.
- **Ethical & Compliant**: Adheres to Reddit's data usage policies, including respecting robots.txt and API guidelines.

### 🔗 Essential Resources

- [Apify Platform](https://apify.com)
- [Actor Documentation](https://docs.apify.com/actors)
- [API Reference](https://docs.apify.com/api/v2)

### 📞 Expert Support When You Need It

For further assistance or inquiries, feel free to reach out:
- 📧 Email: tnot2652@gmail.com

### 🚀 Ready to Upgrade Your Data Game?

Don't miss out on vital Reddit insights. Enhance your data strategy and make informed decisions with our Advanced Reddit Scraper. Start scraping smarter and faster on Apify today!

example/advanced-reddit-scraper/requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify == 2.2.1
beautifulsoup4[lxml]
httpx
types-beautifulsoup4

src/llm_providers/base_provider.py

from abc import ABC, abstractmethod
from typing import Dict, Any, Optional

class LLMProvider(ABC):
    """Base abstract class for LLM providers"""

    def __init__(self, api_key: str, model: Optional[str] = None):
        """
        Initialize the LLM provider

        Args:
            api_key: API key for the provider
            model: Optional specific model to use
        """
        self.api_key = api_key
        self.model = model

    @abstractmethod
    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
        """
        Process CV data and extract job search parameters

        Args:
            cv_data: CV content (could be base64 encoded file or text)

        Returns:
            Dictionary of extracted parameters
        """
        pass

    @abstractmethod
    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
        """
        Process user prompt and extract job search parameters

        Args:
            prompt: User's job search query

        Returns:
            Dictionary of extracted parameters
        """
        pass

    @abstractmethod
    async def validate_api_key(self) -> bool:
        """
        Validate that the API key is correct

        Returns:
            True if valid, False otherwise
        """
        pass

    def supports_document_processing(self) -> bool:
        """
        Check if the provider and model support direct document processing

        Returns:
            True if document processing is supported, False otherwise
        """
        return False
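
New providers plug in by subclassing LLMProvider and implementing the three abstract methods. A minimal offline sketch (a hypothetical EchoProvider that makes no real API calls) could look like this, assuming it sits in the same package as the base class:

from typing import Any, Dict

class EchoProvider(LLMProvider):
    """Toy provider returning canned parameters; useful for tests without API keys."""

    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
        # Ignore the CV content and return a fixed, schema-compatible payload
        return {"title": "Software Engineer", "skills": []}

    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
        # Use the start of the prompt as the job title
        return {"title": prompt.strip()[:50]}

    async def validate_api_key(self) -> bool:
        # Nothing to validate for the offline provider
        return True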

src/llm_providers/claude_provider.py

1import json
2import logging
3import re
4import base64
5from typing import Dict, Any, Optional, List
6
7from anthropic import AsyncAnthropic
8from src.llm_providers.base_provider import LLMProvider
9
10logger = logging.getLogger(__name__)
11
12class ClaudeProvider(LLMProvider):
13    """Implementation of LLM provider for Anthropic Claude"""
14    
15    def __init__(self, api_key: str, model: Optional[str] = None):
16        """Initialize the Claude provider"""
17        super().__init__(api_key, model)
18        self.client = AsyncAnthropic(api_key=api_key)
19        self.model = model or "claude-3-opus-20240229"  # Default to most capable model
20    
21    def supports_document_processing(self) -> bool:
22        """Check if this provider/model supports direct document processing"""
23        return "claude-3" in self.model  # All Claude 3 models support document processing
24    
25    async def validate_api_key(self) -> bool:
26        """Validate the API key by making a simple models call"""
27        try:
28            # There's no direct way to validate the key without making a message request
29            # Use a minimal request to check if the key works
30            await self.client.messages.create(
31                model=self.model,
32                max_tokens=10,
33                messages=[{"role": "user", "content": "Hello"}]
34            )
35            return True
36        except Exception as e:
37            logger.error(f"Claude API key validation failed: {str(e)}")
38            return False
39    
40    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
41        """
42        Process CV with Claude
43        
44        Args:
45            cv_data: CV content (could be base64 encoded file or text)
46            
47        Returns:
48            Dictionary of extracted parameters
49        """
50        if self.supports_document_processing() and cv_data.startswith("data:"):
51            return await self._process_cv_with_document_api(cv_data)
52        else:
53            # Assume it's already text
54            return await self._process_cv_text(cv_data)
55    
56    async def _process_cv_with_document_api(self, cv_data: str) -> Dict[str, Any]:
57        """Process CV using Claude's document capabilities"""
58        try:
59            # Extract the mime type and base64 data
60            mime_type, encoded_data = cv_data.split(';base64,', 1)
61            mime_type = mime_type.replace('data:', '')
62            
63            response = await self.client.messages.create(
64                model=self.model,
65                max_tokens=4000,
66                system="Extract job search parameters from this CV/resume.",
67                messages=[
68                    {"role": "user", "content": [
69                        {"type": "text", "text": self._get_cv_prompt()},
70                        {"type": "image", "source": {
71                            "type": "base64",
72                            "media_type": mime_type,
73                            "data": encoded_data
74                        }}
75                    ]}
76                ]
77            )
78            
79            # Extract JSON from response
80            content = response.content[0].text
81            # Find JSON in the content (handle potential text wrapping)
82            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
83            if json_match:
84                return json.loads(json_match.group(1))
85            
86            # If no JSON block, try to parse the entire content
87            return json.loads(content)
88        except Exception as e:
89            logger.error(f"Claude document processing failed: {str(e)}")
90            raise
91    
92    async def _process_cv_text(self, cv_text: str) -> Dict[str, Any]:
93        """Process CV text with Claude"""
94        try:
95            response = await self.client.messages.create(
96                model=self.model,
97                max_tokens=4000,
98                system="Extract job search parameters from this CV/resume.",
99                messages=[
100                    {"role": "user", "content": self._get_cv_prompt() + f"\n\nCV TEXT:\n{cv_text}"}
101                ]
102            )
103            
104            # Extract JSON from response
105            content = response.content[0].text
106            # Find JSON in the content (handle potential text wrapping)
107            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
108            if json_match:
109                return json.loads(json_match.group(1))
110            
111            # If no JSON block, try to parse the entire content
112            return json.loads(content)
113        except Exception as e:
114            logger.error(f"Claude text processing failed: {str(e)}")
115            raise
116    
117    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
118        """Process user prompt and extract job search parameters"""
119        try:
120            response = await self.client.messages.create(
121                model=self.model,
122                max_tokens=4000,
123                system="Extract job search parameters from this query.",
124                messages=[
125                    {"role": "user", "content": self._get_prompt_extraction_prompt() + f"\n\nUSER QUERY:\n{prompt}"}
126                ]
127            )
128            
129            # Extract JSON from response
130            content = response.content[0].text
131            # Find JSON in the content (handle potential text wrapping)
132            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
133            if json_match:
134                return json.loads(json_match.group(1))
135            
136            # If no JSON block, try to parse the entire content
137            return json.loads(content)
138        except Exception as e:
139            logger.error(f"Claude prompt processing failed: {str(e)}")
140            raise
141    
142    def _get_cv_prompt(self) -> str:
143        """Get the prompt for CV analysis"""
144        return """
145        Extract the following job search parameters from this CV/resume.
146        
147        Return your response as valid JSON object inside ```json code blocks with the following structure:
148        
149        ```json
150        {
151          "title": "The most recent job title or professional role",
152          "location": "Current or preferred location",
153          "experienceLevel": "A numeric value from 1-5 where:
154            1 = Internship
155            2 = Entry Level
156            3 = Associate
157            4 = Mid-Senior Level
158            5 = Director",
159          "workType": "Either:
160            1 = On-Site
161            2 = Remote
162            3 = Hybrid
163           Based on any workstyle preferences found in the CV",
164          "contractType": "A single letter representing employment type preference:
165            F = Full-Time
166            P = Part-Time
167            C = Contract
168            T = Temporary
169            I = Internship
170            V = Volunteer",
171          "skills": ["list", "of", "key", "technical", "and", "soft", "skills"]
172        }
173        ```
174        
175        If a piece of information is not clearly stated in the CV, make a reasonable inference based on the available information. If inference is not possible, use null.
176        
177        IMPORTANT: Your output must be a valid JSON object wrapped in ```json code blocks.
178        """
179    
180    def _get_prompt_extraction_prompt(self) -> str:
181        """Get the prompt for extracting parameters from user query"""
182        return """
183        Extract LinkedIn job search parameters from this query.
184        
185        Return your response as valid JSON object inside ```json code blocks with the following structure:
186        
187        ```json
188        {
189          "title": "Job title or role to search for",
190          "location": "Geographic location for job search",
191          "companyName": ["array of specific companies mentioned"],
192          "companyId": ["array of LinkedIn company IDs if mentioned"],
193          "workType": "Either:
194            1 = On-Site
195            2 = Remote
196            3 = Hybrid",
197          "experienceLevel": "A numeric value from 1-5 where:
198            1 = Internship
199            2 = Entry Level
200            3 = Associate
201            4 = Mid-Senior Level
202            5 = Director",
203          "contractType": "A single letter representing employment type:
204            F = Full-Time
205            P = Part-Time
206            C = Contract
207            T = Temporary
208            I = Internship
209            V = Volunteer",
210          "publishedAt": "Time frame:
211            r86400 = Last 24 hours
212            r604800 = Last week
213            r2592000 = Last month
214            empty string = Any time",
215          "rows": "Number of job listings to return (integer)"
216        }
217        ```
218        
219        For any parameters not explicitly mentioned in the query, use null.
220        
221        IMPORTANT: Your output must be a valid JSON object wrapped in ```json code blocks.
222        """

src/llm_providers/factory.py

import logging
from typing import Optional, Any

logger = logging.getLogger(__name__)

def create_llm_provider(provider_name: str, api_key: str, model: Optional[str] = None) -> Any:
    """
    Create and return an instance of the specified LLM provider.

    Args:
        provider_name: Name of the LLM provider ('openai', 'claude', or 'gemini')
        api_key: API key for the provider
        model: Optional specific model to use

    Returns:
        An instance of the appropriate LLM provider

    Raises:
        ValueError: If the provider is not supported
    """
    if provider_name.lower() == "openai":
        from src.llm_providers.openai_provider import OpenAIProvider
        return OpenAIProvider(api_key, model)
    elif provider_name.lower() == "claude":
        from src.llm_providers.claude_provider import ClaudeProvider
        return ClaudeProvider(api_key, model)
    elif provider_name.lower() == "gemini":
        from src.llm_providers.gemini_provider import GeminiProvider
        return GeminiProvider(api_key, model)
    else:
        raise ValueError(f"Unsupported LLM provider: {provider_name}")
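
Typical usage mirrors src/main.py; the key below is a placeholder:

provider = create_llm_provider("gemini", api_key="<GEMINI_API_KEY>", model="gemini-1.5-pro")
parameters = await provider.process_prompt("Senior Python Developer in Berlin, remote")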

src/llm_providers/gemini_provider.py

1import json
2import logging
3import re
4import base64
5from typing import Dict, Any, Optional, List
6
7import google.generativeai as genai
8from src.llm_providers.base_provider import LLMProvider
9
10logger = logging.getLogger(__name__)
11
12class GeminiProvider(LLMProvider):
13    """Implementation of LLM provider for Google Gemini"""
14
15    def __init__(self, api_key: str, model: Optional[str] = None):
16        """Initialize the Gemini provider"""
17        super().__init__(api_key, model)
18        genai.configure(api_key=api_key)
19        self.model_name = model or "gemini-1.5-pro"
20        self.model = genai.GenerativeModel(self.model_name)
21
22    def supports_document_processing(self) -> bool:
23        """Check if this provider/model supports direct document processing"""
24        vision_capable_models = ["gemini-pro-vision", "gemini-1.5-pro", "gemini-1.5-flash"]
25        return any(model_name in self.model_name for model_name in vision_capable_models)
26
27    async def validate_api_key(self) -> bool:
28        """Validate the API key by making a simple models call"""
29        try:
30            # Gemini doesn't have a dedicated validate endpoint, use a simple generation
31            response = self.model.generate_content("Hello")
32            return True
33        except Exception as e:
34            logger.error(f"Gemini API key validation failed: {str(e)}")
35            return False
36
37    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
38        """
39        Process CV with Gemini
40
41        Args:
42            cv_data: CV content (could be base64 encoded file or text)
43
44        Returns:
45            Dictionary of extracted parameters
46        """
47        if self.supports_document_processing() and cv_data.startswith("data:"):
48            return await self._process_cv_with_vision(cv_data)
49        else:
50            # Assume it's already text
51            return await self._process_cv_text(cv_data)
52
53    async def _process_cv_with_vision(self, cv_data: str) -> Dict[str, Any]:
54        """Process CV using Gemini's vision capabilities"""
55        try:
56            # Extract the mime type and base64 data
57            mime_type, encoded_data = cv_data.split(';base64,', 1)
58            mime_type = mime_type.replace('data:', '')
59
60            # Create a content parts list with prompt and image
61            parts = [
62                self._get_cv_prompt(),
63                {"mime_type": mime_type, "data": base64.b64decode(encoded_data)}
64            ]
65
66            response = self.model.generate_content(
67                parts,
68                generation_config={
69                    "temperature": 0.1
70                }
71            )
72
73            # Extract JSON from response
74            content = response.text
75
76            # Try to parse as JSON directly
77            try:
78                return json.loads(content)
79            except json.JSONDecodeError:
80                # If direct parsing fails, look for JSON in code blocks
81                json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', content, re.DOTALL)
82                if json_match:
83                    return json.loads(json_match.group(1))
84
85                # If still no match, try to find anything that looks like JSON
86                json_pattern = r'{.*}'
87                json_match = re.search(json_pattern, content, re.DOTALL)
88                if json_match:
89                    return json.loads(json_match.group(0))
90
91                logger.error(f"Could not parse Gemini response as JSON: {content}")
92                raise ValueError("Failed to parse Gemini response as JSON")
93
94        except Exception as e:
95            logger.error(f"Gemini vision processing failed: {str(e)}")
96            raise
97
98    async def _process_cv_text(self, cv_text: str) -> Dict[str, Any]:
99        """Process CV text with Gemini"""
100        try:
101            response = self.model.generate_content(
102                self._get_cv_prompt() + f"\n\nCV TEXT:\n{cv_text}",
103                generation_config={
104                    "temperature": 0.1
105                }
106            )
107
108            # Extract JSON from response
109            content = response.text
110
111            # Try to parse as JSON directly
112            try:
113                return json.loads(content)
114            except json.JSONDecodeError:
115                # If direct parsing fails, look for JSON in code blocks
116                json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', content, re.DOTALL)
117                if json_match:
118                    return json.loads(json_match.group(1))
119
120                # If still no match, try to find anything that looks like JSON
121                json_pattern = r'{.*}'
122                json_match = re.search(json_pattern, content, re.DOTALL)
123                if json_match:
124                    return json.loads(json_match.group(0))
125
126                logger.error(f"Could not parse Gemini response as JSON: {content}")
127                raise ValueError("Failed to parse Gemini response as JSON")
128
129        except Exception as e:
130            logger.error(f"Gemini text processing failed: {str(e)}")
131            raise
132
133    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
134        """Process user prompt and extract job search parameters"""
135        try:
136            response = self.model.generate_content(
137                self._get_prompt_extraction_prompt() + f"\n\nUSER QUERY:\n{prompt}",
138                generation_config={
139                    "temperature": 0.1
140                }
141            )
142
143            # Extract JSON from response
144            content = response.text
145
146            # Try to parse as JSON directly
147            try:
148                return json.loads(content)
149            except json.JSONDecodeError:
150                # If direct parsing fails, look for JSON in code blocks
151                json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', content, re.DOTALL)
152                if json_match:
153                    return json.loads(json_match.group(1))
154
155                # If still no match, try to find anything that looks like JSON
156                json_pattern = r'{.*}'
157                json_match = re.search(json_pattern, content, re.DOTALL)
158                if json_match:
159                    return json.loads(json_match.group(0))
160
161                logger.error(f"Could not parse Gemini response as JSON: {content}")
162                raise ValueError("Failed to parse Gemini response as JSON")
163
164        except Exception as e:
165            logger.error(f"Gemini prompt processing failed: {str(e)}")
166            raise
167
168    def _get_cv_prompt(self) -> str:
169        """Get the prompt for CV analysis"""
170        return """
171        Extract the following job search parameters from this CV/resume:
172
173        Follow these steps:
174        1. Identify the job title
175        2. Determine the location
176        3. Assess experience level (1-5)
177        4. Identify work type preference (1-3)
178        5. Determine contract type (FPCTIV)
179        6. List key skills
180
181        Return ONLY a JSON object with this format:
182        {
183          "title": "The most recent job title or professional role",
184          "location": "Current or preferred location",
185          "experienceLevel": "A numeric value from 1-5 where:
186            1 = Internship
187            2 = Entry Level
188            3 = Associate
189            4 = Mid-Senior Level
190            5 = Director",
191          "workType": "Either:
192            1 = On-Site
193            2 = Remote
194            3 = Hybrid
195           Based on any workstyle preferences found in the CV",
196          "contractType": "A single letter representing employment type preference:
197            F = Full-Time
198            P = Part-Time
199            C = Contract
200            T = Temporary
201            I = Internship
202            V = Volunteer",
203          "skills": ["list", "of", "key", "technical", "and", "soft", "skills"]
204        }
205
206        If a piece of information is not clearly stated in the CV, make a reasonable inference based on the available information. If inference is not possible, use null.
207
208        IMPORTANT: Your output must be ONLY the JSON object with no additional text.
209        """
210
211    def _get_prompt_extraction_prompt(self) -> str:
212        """Get the prompt for extracting parameters from user query"""
213        return """
214        Extract LinkedIn job search parameters from this query.
215
216        Follow these steps:
217        1. Identify job title or role
218        2. Determine geographic location
219        3. Note any specific companies mentioned
220        4. Assess experience level (1-5)
221        5. Identify work type (1-3)
222        6. Determine contract type (FPCTIV)
223        7. Identify time frame for job postings
224
225        Return ONLY a JSON object with this format:
226        {
227          "title": "Job title or role to search for",
228          "location": "Geographic location for job search",
229          "companyName": ["array of specific companies mentioned"],
230          "companyId": ["array of LinkedIn company IDs if mentioned"],
231          "workType": "Either:
232            1 = On-Site
233            2 = Remote
234            3 = Hybrid",
235          "experienceLevel": "A numeric value from 1-5 where:
236            1 = Internship
237            2 = Entry Level
238            3 = Associate
239            4 = Mid-Senior Level
240            5 = Director",
241          "contractType": "A single letter representing employment type:
242            F = Full-Time
243            P = Part-Time
244            C = Contract
245            T = Temporary
246            I = Internship
247            V = Volunteer",
248          "publishedAt": "Time frame:
249            r86400 = Last 24 hours
250            r604800 = Last week
251            r2592000 = Last month
252            empty string = Any time",
253          "rows": "Number of job listings to return (integer)"
254        }
255
256        For any parameters not explicitly mentioned in the query, use null.
257
258        IMPORTANT: Your output must be ONLY the JSON object with no additional text.
259        """
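
For orientation, both prompts above are designed so that the model's reply parses into a flat dictionary of LinkedIn search parameters. A minimal, purely illustrative example of such a result (all values are hypothetical):

# Hypothetical example of the dictionary obtained after json.loads() on the model output.
example_parameters = {
    "title": "Data Engineer",
    "location": "Berlin, Germany",
    "experienceLevel": "4",   # Mid-Senior Level
    "workType": "2",          # Remote
    "contractType": "F",      # Full-Time
    "skills": ["Python", "SQL", "Airflow", "communication"],
}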

src/llm_providers/openai_provider.py

1import json
2import logging
3import base64
4import re
5from typing import Dict, Any, Optional, List
6
7from openai import AsyncOpenAI
8from src.llm_providers.base_provider import LLMProvider
9
10logger = logging.getLogger(__name__)
11
12class OpenAIProvider(LLMProvider):
13    """Implementation of LLM provider for OpenAI"""
14    
15    def __init__(self, api_key: str, model: Optional[str] = None):
16        """Initialize the OpenAI provider"""
17        super().__init__(api_key, model)
18        self.client = AsyncOpenAI(api_key=api_key)
19        self.model = model or "gpt-4o"  # Default to a vision-capable model that can also handle documents
20    
21    def supports_document_processing(self) -> bool:
22        """Check if this provider/model supports direct document processing"""
23        document_capable_models = ["gpt-4-vision", "gpt-4o"]
24        return any(model_name in self.model for model_name in document_capable_models)
25    
26    async def validate_api_key(self) -> bool:
27        """Validate the API key by making a simple models.list call"""
28        try:
29            await self.client.models.list()
30            return True
31        except Exception as e:
32            logger.error(f"OpenAI API key validation failed: {str(e)}")
33            return False
34    
35    async def process_cv(self, cv_data: str) -> Dict[str, Any]:
36        """
37        Process CV with OpenAI
38        
39        Args:
40            cv_data: CV content (could be base64 encoded file or text)
41            
42        Returns:
43            Dictionary of extracted parameters
44        """
45        if self.supports_document_processing() and cv_data.startswith("data:"):
46            return await self._process_cv_with_vision(cv_data)
47        else:
48            # Assume it's already text
49            return await self._process_cv_text(cv_data)
50    
51    async def _process_cv_with_vision(self, cv_data: str) -> Dict[str, Any]:
52        """Process CV using OpenAI's vision capabilities"""
53        try:
54            response = await self.client.chat.completions.create(
55                model=self.model,
56                messages=[
57                    {"role": "system", "content": "Extract job search parameters from this CV/resume."},
58                    {"role": "user", "content": [
59                        {"type": "text", "text": self._get_cv_prompt()},
60                        {"type": "image_url", "image_url": {"url": cv_data}}
61                    ]}
62                ],
63                response_format={"type": "json_object"}
64            )
65            return json.loads(response.choices[0].message.content)
66        except Exception as e:
67            logger.error(f"OpenAI vision processing failed: {str(e)}")
68            raise
69    
70    async def _process_cv_text(self, cv_text: str) -> Dict[str, Any]:
71        """Process CV text with OpenAI"""
72        try:
73            response = await self.client.chat.completions.create(
74                model=self.model,
75                messages=[
76                    {"role": "system", "content": "Extract job search parameters from this CV/resume."},
77                    {"role": "user", "content": self._get_cv_prompt() + f"\n\nCV TEXT:\n{cv_text}"}
78                ],
79                response_format={"type": "json_object"}
80            )
81            return json.loads(response.choices[0].message.content)
82        except Exception as e:
83            logger.error(f"OpenAI text processing failed: {str(e)}")
84            raise
85    
86    async def process_prompt(self, prompt: str) -> Dict[str, Any]:
87        """Process user prompt and extract job search parameters"""
88        try:
89            response = await self.client.chat.completions.create(
90                model=self.model,
91                messages=[
92                    {"role": "system", "content": "Extract job search parameters from this query."},
93                    {"role": "user", "content": self._get_prompt_extraction_prompt() + f"\n\nUSER QUERY:\n{prompt}"}
94                ],
95                response_format={"type": "json_object"}
96            )
97            return json.loads(response.choices[0].message.content)
98        except Exception as e:
99            logger.error(f"OpenAI prompt processing failed: {str(e)}")
100            raise
101    
102    def _get_cv_prompt(self) -> str:
103        """Get the prompt for CV analysis"""
104        return """
105        Extract the following job search parameters from this CV/resume in JSON format:
106
107        Required JSON format:
108        {
109          "title": "The most recent job title or professional role",
110          "location": "Current or preferred location",
111          "experienceLevel": "A numeric value from 1-5 where:
112            1 = Internship
113            2 = Entry Level
114            3 = Associate
115            4 = Mid-Senior Level
116            5 = Director",
117          "workType": "Either:
118            1 = On-Site
119            2 = Remote
120            3 = Hybrid
121           Based on any workstyle preferences found in the CV",
122          "contractType": "A single letter representing employment type preference:
123            F = Full-Time
124            P = Part-Time
125            C = Contract
126            T = Temporary
127            I = Internship
128            V = Volunteer",
129          "skills": ["list", "of", "key", "technical", "and", "soft", "skills"]
130        }
131
132        If a piece of information is not clearly stated in the CV, make a reasonable inference based on the available information. If inference is not possible, use null.
133        """
134    
135    def _get_prompt_extraction_prompt(self) -> str:
136        """Get the prompt for extracting parameters from user query"""
137        return """
138        Extract LinkedIn job search parameters from this query in JSON format:
139
140        Required JSON format:
141        {
142          "title": "Job title or role to search for",
143          "location": "Geographic location for job search",
144          "companyName": ["array of specific companies mentioned"],
145          "companyId": ["array of LinkedIn company IDs if mentioned"],
146          "workType": "Either:
147            1 = On-Site
148            2 = Remote
149            3 = Hybrid",
150          "experienceLevel": "A numeric value from 1-5 where:
151            1 = Internship
152            2 = Entry Level
153            3 = Associate
154            4 = Mid-Senior Level
155            5 = Director",
156          "contractType": "A single letter representing employment type:
157            F = Full-Time
158            P = Part-Time
159            C = Contract
160            T = Temporary
161            I = Internship
162            V = Volunteer",
163          "publishedAt": "Time frame:
164            r86400 = Last 24 hours
165            r604800 = Last week
166            r2592000 = Last month
167            empty string = Any time",
168          "rows": "Number of job listings to return (integer)"
169        }
170
171        For any parameters not explicitly mentioned in the query, use null.
172        """
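
As a quick usage sketch (not part of the Actor code), the provider above would typically be created through the package's factory and driven asynchronously. This assumes the factory exposes create_llm_provider(provider_name, api_key) and that OPENAI_API_KEY is set; adjust to your setup:

import asyncio
import os

from src.llm_providers.factory import create_llm_provider

async def demo() -> None:
    # Create the OpenAI-backed provider (assumed factory signature).
    provider = create_llm_provider("openai", os.environ["OPENAI_API_KEY"])
    if not await provider.validate_api_key():
        raise RuntimeError("OpenAI API key was rejected")
    # Extract LinkedIn search parameters from a free-form query.
    params = await provider.process_prompt("Remote senior Python jobs in Berlin posted last week")
    print(params)

if __name__ == "__main__":
    asyncio.run(demo())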

src/llm_providers/__init__.py

1# LLM Providers package

example/advanced-reddit-scraper/.actor/actor.json

1{
2	"actorSpecification": 1,
3	"name": "reddit-subreddit-scraper",
4	"title": "Reddit Subreddit Scraper",
5	"description": "Scrapes Reddit subreddits.",
6	"version": "0.0",
7	"buildTag": "latest",
8	"meta": {
9		"templateId": "python-beautifulsoup"
10	},
11	"input": "./input_schema.json",
12	"dockerfile": "./Dockerfile"
13}

example/advanced-reddit-scraper/.actor/Dockerfile

1# First, specify the base Docker image.
2# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
3# You can also use any other image from Docker Hub.
4FROM apify/actor-python:3.12
5
6# Second, copy just requirements.txt into the Actor image,
7# since it should be the only file that affects the dependency install in the next step,
8# in order to speed up the build
9COPY requirements.txt ./
10
11# Install the packages specified in requirements.txt,
12# Print the installed Python version, pip version
13# and all installed packages with their versions for debugging
14RUN echo "Python version:" \
15 && python --version \
16 && echo "Pip version:" \
17 && pip --version \
18 && echo "Installing dependencies:" \
19 && pip install -r requirements.txt \
20 && echo "All installed Python packages:" \
21 && pip freeze
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after installing the dependencies, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28# Use compileall to ensure the runnability of the Actor Python code.
29RUN python3 -m compileall -q .
30
31# Specify how to launch the source code of your Actor.
32# By default, the "python3 -m src" command is run
33CMD ["python3", "-m", "src"]

example/advanced-reddit-scraper/.actor/input_schema.json

1{
2    "title": "Advanced Reddit Scraper",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "queries": {
7            "title": "Start URLs or subreddits",
8            "type": "array",
9            "description": "Subreddits to scrape in the format of https://reddit.com/r/<subreddit>",
10            "prefill": [
11                "https://reddit.com/r/AskReddit"
12            ],
13            "default": [
14                "https://reddit.com/r/AskReddit"
15            ],
16            "editor": "stringList"
17        },
18        "postSort": {
19            "title": "Sorting",
20            "type": "string",
21            "enum": [
22                "hot",
23                "new",
24                "top",
25                "rising"
26            ],
27            "description": "Sorting of posts in the subreddit (hot, new, top, rising). If a pre-sorted subreddit link is provided (e.g. https://reddit.com/r/eli5/top/?t=day), this setting is ignored.",
28            "default": "top"
29        },
30        "topPeriod": {
31            "title": "Top posts period",
32            "type": "string",
33            "enum": [
34                "hour",
35                "day",
36                "week",
37                "month",
38                "year",
39                "all"
40            ],
41            "description": "Top posts period - (only works when sorting is top)",
42            "default": "week"
43        },
44        "limit": {
45            "title": "Max Posts",
46            "type": "integer",
47            "description": "Maximum number of posts to scrape per URL (default: 10)",
48            "default": 10
49        },
50
51        "commentSort": {
52            "title": "Comment sorting",
53            "description": "Sorting of comments in the post - (best, top, new, controversial, old, qa)",
54            "type": "string",
55            "enum": [
56                "best", 
57                "top", 
58                "new", 
59                "controversial", 
60                "old", 
61                "qa"
62            ],
63            "default": "top"
64        },
65        "numComments": {
66            "title": "Number of comments to scrape",
67            "type": "integer",
68            "description": "A few replies to each comment are also returned",
69            "default": 0
70        }
71    },
72    "required": ["queries"]
73}
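
For illustration, an input object that satisfies this schema can be passed to the Actor through the Apify Python client. A minimal sketch; the token and Actor reference are placeholders you would replace with your own:

from apify_client import ApifyClient

client = ApifyClient("<APIFY_TOKEN>")  # placeholder token

run_input = {
    "queries": ["https://reddit.com/r/AskReddit"],  # required
    "postSort": "top",
    "topPeriod": "week",
    "limit": 10,
    "commentSort": "top",
    "numComments": 0,
}

# Start the Actor run, wait for it to finish, then read the scraped items.
run = client.actor("<username>/advanced-reddit-scraper").call(run_input=run_input)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item.get("title"))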

example/advanced-reddit-scraper/src/cookies.json

1[
2    {
3        "domain": ".reddit.com",
4        "hostOnly": false,
5        "httpOnly": false,
6        "name": "csrf_token",
7        "path": "/",
8        "sameSite": "strict",
9        "secure": true,
10        "session": true,
11        "storeId": "0",
12        "value": "eb3157bc50b012701ac2f7aab49fcc4c",
13        "id": 1
14    },
15    {
16        "domain": ".reddit.com",
17        "expirationDate": 1757533137,
18        "hostOnly": false,
19        "httpOnly": false,
20        "name": "csv",
21        "path": "/",
22        "sameSite": "no_restriction",
23        "secure": true,
24        "session": false,
25        "storeId": "0",
26        "value": "2",
27        "id": 2
28    },
29    {
30        "domain": ".reddit.com",
31        "expirationDate": 1757533137,
32        "hostOnly": false,
33        "httpOnly": false,
34        "name": "edgebucket",
35        "path": "/",
36        "sameSite": "unspecified",
37        "secure": true,
38        "session": false,
39        "storeId": "0",
40        "value": "lH2AY01VDhdJZrAOeK",
41        "id": 3
42    },
43    {
44        "domain": ".reddit.com",
45        "expirationDate": 1761371475,
46        "hostOnly": false,
47        "httpOnly": false,
48        "name": "loid",
49        "path": "/",
50        "sameSite": "no_restriction",
51        "secure": true,
52        "session": false,
53        "storeId": "0",
54        "value": "0000000000927nml2x.2.1606468634160.Z0FBQUFBQm1zN01jTVgwSUpobndnMVlHS2xfcGNXdk1SbXhpMjJtN0NNa2VCOFZBZ3Zlb3loSGFlZWtxWlNkdHk5cUxZVVZtNDdWQWl6M0xOdXhRc3FsWmVob0pfQXdjQjItZ1pkOHFmTWsxVVFQU194SjEwTi10MHI2ay1TU01EYjhDVjdpclUxVFg",
55        "id": 4
56    },
57    {
58        "domain": ".reddit.com",
59        "expirationDate": 1759110222,
60        "hostOnly": false,
61        "httpOnly": false,
62        "name": "pc",
63        "path": "/",
64        "sameSite": "unspecified",
65        "secure": true,
66        "session": false,
67        "storeId": "0",
68        "value": "81",
69        "id": 5
70    },
71    {
72        "domain": ".reddit.com",
73        "expirationDate": 1757612826,
74        "hostOnly": false,
75        "httpOnly": true,
76        "name": "reddit_session",
77        "path": "/",
78        "sameSite": "unspecified",
79        "secure": true,
80        "session": false,
81        "storeId": "0",
82        "value": "710093989689%2C2024-08-07T17%3A47%3A05%2C62b4116104fbf3597d47b0718c6986d009b6f8c6",
83        "id": 6
84    },
85    {
86        "domain": ".reddit.com",
87        "hostOnly": false,
88        "httpOnly": false,
89        "name": "session_tracker",
90        "path": "/",
91        "sameSite": "no_restriction",
92        "secure": true,
93        "session": true,
94        "storeId": "0",
95        "value": "mdmqaqfjmphfropmga.0.1727922697221.Z0FBQUFBQm1fZ0lKOVhiTHFwazVhYXBQa0FSS2VUTllqd2ljRmhuNFozRHVnZmkxU1JOcmZBd1dteXRPSmJxS0x3S2s0YVE2VEVRaGk1M0JMei1TV1Q2RGN4STZ4aHhCWnJhSEtsRDZsdEZveFVxeUhnVjNrSFNjOFpJRmM0bEREdVZfR2UyYTdZM2U",
96        "id": 7
97    },
98    {
99        "domain": ".reddit.com",
100        "expirationDate": 1759458549,
101        "hostOnly": false,
102        "httpOnly": false,
103        "name": "t2_927nml2x_recentclicks3",
104        "path": "/",
105        "sameSite": "strict",
106        "secure": false,
107        "session": false,
108        "storeId": "0",
109        "value": "t3_pgyvok%2Ct3_1fsuzj4%2Ct3_1fk6551%2Ct3_eokkto%2Ct3_14x7ys7%2Ct3_17wo9ms%2Ct3_dpcb2z%2Ct3_16fac9r%2Ct3_analu0%2Ct3_142jsph",
110        "id": 8
111    },
112    {
113        "domain": ".reddit.com",
114        "expirationDate": 1728008948.6718,
115        "hostOnly": false,
116        "httpOnly": true,
117        "name": "token_v2",
118        "path": "/",
119        "sameSite": "unspecified",
120        "secure": true,
121        "session": false,
122        "storeId": "0",
123        "value": "eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzI4MDA4OTQ4LjE0MDk0MywiaWF0IjoxNzI3OTIyNTQ4LjE0MDk0MiwianRpIjoiNE5wUE5zejMzWkhrWXI0cktxZU9hazJiY0tYMkRRIiwiY2lkIjoiMFItV0FNaHVvby1NeVEiLCJsaWQiOiJ0Ml85MjdubWwyeCIsImFpZCI6InQyXzkyN25tbDJ4IiwibGNhIjoxNjA2NDY4NjM0MTYwLCJzY3AiOiJlSnhra2RHT3REQUloZC1sMXo3Ql95cF9OaHRzY1lhc0xRYW9rM243RFZvY2s3MDdjTDRpSFA4bktJcUZMRTJ1QktHa0tXRUZXdE9VTmlMdjU4eTlPWkVGU3lGVFI4NDN5d29rYVVwUFVtTjVweWxSd1daa0xsZmFzVUtEQjZZcFZTNloyMEtQUzV2UTNJMUZ6MDZNcWx4V0h0VFlvM0pwYkdNSzJ4UGp6Y1pxUXlxdXk2bE1ZRmtvbjhXTGZ2eUctdFktZjdiZmhIWXdyS2dLRF9UT3VGeHdZX0hERkhiX25wcjBiRjJ3cUwzWGc5US0xLU4yN2JObW9kbTVfVnpQdnphU2NUbUc1aWZZdjd0LUNSMTQ1SG1aVVFjd1lnMF95ckFqNl9Ddk9vREtCUVdNSlloUEk1QXJsMl9fSmRpdVRmOGF0eWQtLUdiRVRXXzRyUm1vNXhMRW9VX2o2emNBQVBfX1hEX2U0dyIsInJjaWQiOiJBNjE2cG1hN0taX1R1SzRVOFJlQlJUaXVKV3VBZ3lUY2VRTUpyS01NRk93IiwiZmxvIjoyfQ.GX8N8AYcgK2DWqWPqiclkljcwEawb7GFRw6QMdL9C7lb5FS-_ofuZpR0bx77pgWjWJ9uOczItTUfZvjx9u4CgeS9dK3U8G1apuqUW9YWDrgxfQeFWNMPVd0IjDTEt6Sn8vrdWb5cjv_SsGzxHgtC2RjdDLQYfQu2ud-Qp_1sELlBDPHDfhgOPbuOpzuFz2NJ8ifj623r2a8XOgQi5UaAHEClgleVAdkN2bpMd1kUsYh0PmMZOpN2XqvgdwKJUuyce-9yAqhMLiIPneVJnaytpth0jeRkT5-Fyt-_CgsXYphTG9T9u8Q2Z5JwOrwiosBPEokbhjculNQ78QlUUlC7UA",
124        "id": 9
125    }
126    ]

example/advanced-reddit-scraper/src/main.py

1"""This module defines the main entry point for the Apify Actor.
2
3Feel free to modify this file to suit your specific needs.
4
5To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
6https://docs.apify.com/sdk/python
7"""
8
9import sys
10import os
11
12from httpx import AsyncClient
13
14from apify import Actor, Request
15from .redditor import Redditor
16
17
18async def main() -> None:
19    """Main entry point for the Apify Actor.
20
21    This coroutine is executed using `asyncio.run()`, so it must remain an asynchronous function for proper execution.
22    Asynchronous execution is required for communication with the Apify platform, and it also
23    significantly improves performance for web scraping workloads.
24    """
25    async with Actor:
26        # Retrieve the Actor input, and use default values if not provided.
27        actor_input = await Actor.get_input() or {}
28        queries = actor_input.get('queries', ['https://reddit.com/r/AskReddit'])
29        limit = actor_input.get('limit', 10)
30        num_comments = actor_input.get('numComments', 0)
31        sorting = actor_input.get('postSort', 'top')
32        comment_sort = actor_input.get('commentSort', 'top')
33        sorting_period = actor_input.get('topPeriod', 'week')  # default matches the input schema and the allowed values below
34        if sorting_period not in {'hour', 'day', 'week', 'month', 'year', 'all'}:
35            raise ValueError('topPeriod must be one of hour, day, week, month, year, all')
36        if sorting not in {'hot', 'new', 'top', 'rising'}:
37            raise ValueError('postSort must be one of hot, new, top, rising')
38        if comment_sort not in {'best', 'top', 'new', 'controversial', 'old', 'qa'}:
39            raise ValueError('commentSort must be one of best, top, new, controversial, old, qa')
40        reddit_scraper = Redditor(logger=Actor.log)
41
42        # Exit if no start URLs are provided.
43        if not queries:
44            Actor.log.info('No queries specified in Actor input, exiting...')
45            await Actor.exit()
46
47        # Open the default request queue for handling URLs to be processed.
48        request_queue = await Actor.open_request_queue()
49
50        # Enqueue the start URLs with an initial crawl depth of 0.
51        for query in queries:
52            url = reddit_scraper.subreddit_link_from_query(query, sorting=sorting, period=sorting_period)
53            Actor.log.info(f'Enqueuing {url} ...')
54            request = Request.from_url(url, user_data={'limit': limit, 'numComments':  num_comments, 'query': query})
55            await request_queue.add_request(request)
56
57        # Process the URLs from the request queue.
58        while request := await request_queue.fetch_next_request():
59            url = request.url
60            query = request.user_data['query']
61            posts_limit = request.user_data['limit']
62            num_comments = request.user_data['numComments']
63            Actor.log.info(f'Scraping {request.url} ...')
64
65            try:
66                # Scrape and push the posts. Redditor performs its own HTTP requests
67                # internally, so the HTTPX client opened below is not used directly.
68                async with AsyncClient():
69                    for post in reddit_scraper.get_all_posts(url, posts_limit=posts_limit, comments_limit=num_comments):
70                        await Actor.push_data(post)
71
72            except Exception:
73                Actor.log.exception(f'Failed to scrape {url}. The request will be marked as handled and not retried.')
74                try:
75                    Actor.log.exception(f'Data that failed to be pushed: {post}')
76                except Exception:
77                    pass  # 'post' may not exist if the failure occurred before any post was scraped
78            finally:
79                # Mark the request as handled to ensure it is not processed again.
80                await request_queue.mark_request_as_handled(request)
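
For reference, an Actor input that passes the validation at the top of main() could look like the following (values are illustrative; the allowed options mirror the checks above):

actor_input = {
    "queries": ["https://reddit.com/r/AskReddit", "r/learnpython"],
    "limit": 25,            # max posts per query
    "numComments": 5,       # comments per post; 0 skips comment scraping
    "postSort": "top",      # hot, new, top, rising
    "commentSort": "best",  # best, top, new, controversial, old, qa
    "topPeriod": "week",    # hour, day, week, month, year, all
}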

example/advanced-reddit-scraper/src/redditor.py

1import requests
2from bs4 import BeautifulSoup
3from typing import Dict, Any, List, Optional, Tuple
4import re
5import base64
6import urllib.parse
7from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
8from .session import cookies, headers
10
11import time
12import functools
13
14def log_execution_time(func):
15    @functools.wraps(func)
16    def wrapper(self, *args, **kwargs):
17        start_time = time.time()
18        result = func(self, *args, **kwargs)
19        end_time = time.time()
20        duration = end_time - start_time
21        self.logger.debug(f"{func.__name__} took {duration:.2f} seconds to execute")
22        return result
23    return wrapper
24
25class Redditor:
26    BASE_URL = "https://www.reddit.com"
27
28    def __init__(self, logger):
29        self.logger = logger
30        self.cookies = cookies
31        self.headers = headers
32        self.session = requests.Session()
33        self.session.cookies.update(cookies)
34        self.session.headers.update(headers)
35
36    @log_execution_time
37    def get_community_posts(self, url: str, after: Optional[str] = None) -> str:
38        try:
39            parsed_query = self.parse_url(url)
40            sort = parsed_query.get('sort', 'top')
41            url = f"{self.BASE_URL}/svc/shreddit/community-more-posts/{sort}/"
42            params = {
43                "after": after,
44                "t": parsed_query['time'] or 'day',
45                "name": parsed_query['sub'],
46                "navigationSessionId": "a10adc86-f1ec-4221-9179-d9613e4c7d05",
47                "feedLength": "28"
48            }
49
50            response = self.session.get(url, params=params)
51            response.raise_for_status()
52
53            return response.text
54        except requests.exceptions.RequestException as e:
55            self.logger.error(f"Error fetching community posts: {e}")
56            raise
57        except Exception as e:
58            self.logger.error(f"Unexpected error: {e}")
59            raise
60
61    @log_execution_time
62    def get_post_content(self, permalink: str) -> str:
63        """Get the content of a post using its permalink"""
64        try:
65            url = f"{self.BASE_URL}{permalink}"
66            response = self.session.get(url)
67            response.raise_for_status()
68            soup = BeautifulSoup(response.text, 'html.parser')
69            
70            # Find the post content in the text-body slot
71            text_body = soup.find('div', {'slot': 'text-body'})
72            if text_body:
73                md_div = text_body.find('div', {'class': 'md'})
74                if md_div:
75                    paragraphs = md_div.find_all('p')
76                    return '\n'.join(p.get_text(strip=True) for p in paragraphs)
77            
78            # If no text content, check for media content
79            shreddit_post = soup.find('shreddit-post')
80            if shreddit_post:
81                content_href = shreddit_post.get('content-href')
82                if content_href:
83                    return content_href
84                    
85            return ''
86        except Exception as e:
87            self.logger.error(f"Error getting post content: {e}")
88            return ''
89
90    @log_execution_time
91    def parse_posts(self, html_content: str) -> List[Dict[str, Any]]:
92        try:
93            soup = BeautifulSoup(html_content, 'html.parser')
94            posts = []
95            for article in soup.find_all('article'):
96                shreddit_post = article.find('shreddit-post')
97                if shreddit_post:
98                    permalink = shreddit_post.get('permalink')
99                    post = {
100                        "id": shreddit_post.get('id'),
101                        "title": shreddit_post.get('post-title'),
102                        "author": shreddit_post.get('author'),
103                        "subreddit": shreddit_post.get('subreddit-prefixed-name'),
104                        "score": shreddit_post.get('score'),
105                        "num_comments": shreddit_post.get('comment-count'),
106                        "created_timestamp": shreddit_post.get('created-timestamp'),
107                        "permalink": permalink,
108                        "content": self.get_post_content(permalink)
109                    }
110                    posts.append(post)
111            return posts
112        except Exception as e:
113            self.logger.error(f"Error parsing posts: {e}")
114            raise
115    @log_execution_time
116    def get_next_cursor(self, html_content: str) -> Optional[str]:
117        try:
118            soup = BeautifulSoup(html_content, 'html.parser')
119            load_after = soup.find('faceplate-partial', slot='load-after')
120
121            if load_after:
122                src = load_after.get('src', '')
123                match = re.search(r'after=([^&]+)', src)
124                if match:
125                    encoded_cursor = match.group(1)
126                    decoded_cursor = urllib.parse.unquote(encoded_cursor)
127                    padding = '=' * ((4 - len(decoded_cursor) % 4) % 4)
128                    padded_cursor = decoded_cursor + padding
129                    return base64.b64decode(padded_cursor).decode('utf-8')
130
131        except Exception as e:
132            self.logger.error(f"Error retrieving next cursor: {e}")
133        return None
134    @log_execution_time
135    def get_all_posts(self, subreddit: str, posts_limit: int = 100, comments_limit: int = 0) -> List[Dict[str, Any]]:
136        all_posts = []
137        after = None
138
139        try:
140            while len(all_posts) < posts_limit:
141                self.logger.info(f"Fetching posts for subreddit {subreddit}...")
142                html_content = self.get_community_posts(subreddit, after)
143                new_posts = self.parse_posts(html_content)[:posts_limit - len(all_posts)]
144
145                if not new_posts:
146                    break
147
148                for post in new_posts:
149                    if comments_limit > 0:
150                        post['comments'] = self.get_all_comments(post['subreddit'].split('/')[1], post['id'], comments_limit)
151
152                all_posts.extend(new_posts)
153                after = self.get_next_cursor(html_content)
154
155                if not after:
156                    break
157            
158            self.logger.info(f"Retrieved {len(all_posts[:posts_limit])} posts.")
159            return all_posts[:posts_limit]
160        except Exception as e:
161            self.logger.error(f"Error retrieving posts: {e}")
162            raise
163
164    @log_execution_time
165    def parse_url(self, url: str) -> Dict[str, str]:
166        result = {'sub': '', 'sort': 'none', 'time': None}
167
168        try:
169            subreddit_pattern = re.compile(r'(?:/r/|reddit\.com/r/|^)(\w+)')
170            sort_pattern = re.compile(r'/(hot|new|top|rising)')
171            time_pattern = re.compile(r'[?&]t=(hour|day|week|month|year|all)')
172
173            if not url.startswith('http'):
174                match = subreddit_pattern.search(url)
175                if match:
176                    result['sub'] = match.group(1)
177                return result
178
179            path = urlparse(url).path
180            query_string = urlparse(url).query
181
182            sub_match = subreddit_pattern.search(path)
183            if sub_match:
184                result['sub'] = sub_match.group(1)
185
186            sort_match = sort_pattern.search(path)
187            if sort_match:
188                result['sort'] = sort_match.group(1)
189
190            time_match = time_pattern.search(query_string)
191            if time_match:
192                result['time'] = time_match.group(1)
193
194            return result
195        except Exception as e:
196            self.logger.error(f"Error parsing URL: {e}")
197            raise
198
199    @log_execution_time
200    def get_comments(self, subreddit: str, post_id: str, cursor: Optional[str] = None, sort: str = 'hot') -> Tuple[List[Dict[str, Any]], Optional[str]]:
201        try:
202            url = f"{self.BASE_URL}/svc/shreddit/more-comments/{subreddit}/t3_{post_id.split('_')[1]}"
203            params = {'sort': sort, 'top-level': '1'}
204            data = {}
205
206            if cursor:
207                params['cursor'] = cursor
208
209            response = self.session.post(url, params=params, data=data)
210            response.raise_for_status()
211
212            return self.parse_comments(response.text)
213        except requests.exceptions.RequestException as e:
214            self.logger.error(f"Error fetching comments: {e}")
215            raise
216        except Exception as e:
217            self.logger.error(f"Unexpected error: {e}")
218            raise
219
220    @log_execution_time
221    def parse_comments(self, html_content: str) -> Tuple[List[Dict[str, Any]], Optional[str]]:
222        try:
223            soup = BeautifulSoup(html_content, 'html.parser')
224            comments = []
225
226            for comment in soup.find_all('shreddit-comment'):
227                content_div = comment.find('div', {'class': 'md'})
228                # Extract clean comment text if content div exists
229                if content_div:
230                    # Get all paragraphs from the content
231                    paragraphs = content_div.find_all('p')
232                    # Join paragraphs with newlines, strip whitespace
233                    content = '\n'.join(p.get_text(strip=True) for p in paragraphs)
234                else:
235                    content = ''
236                parsed_comment = {
237                    "id": comment.get('thingid'),
238                    "author": comment.get('author'),
239                    "score": comment.get('score'),
240                    "depth": comment.get('depth'),
241                    "permalink": comment.get('permalink'),
242                    "content": content.strip()
243                }
244                comments.append(parsed_comment)
245
246            next_cursor = self.get_next_comment_cursor(html_content)
247            return comments, next_cursor
248        except Exception as e:
249            self.logger.error(f"Error parsing comments: {e}")
250            raise
251
252    @log_execution_time
253    def get_next_comment_cursor(self, html_content: str) -> Optional[str]:
254        try:
255            soup = BeautifulSoup(html_content, 'html.parser')
256            faceplate_partial = soup.find('faceplate-partial', attrs={'loading': 'action'})
257
258            if faceplate_partial:
259                hidden_input = faceplate_partial.find('input', attrs={'type': 'hidden', 'name': 'cursor'})
260                if hidden_input:
261                    return hidden_input.get('value')
262
263        except Exception as e:
264            self.logger.error(f"Error retrieving next comment cursor: {e}")
265        return None
266
267    @log_execution_time
268    def get_all_comments(self, subreddit: str, post_id: str, limit: int = 100) -> List[Dict[str, Any]]:
269        all_comments = []
270        cursor = None
271
272        try:
273            while len(all_comments) < limit:
274                comments, next_cursor = self.get_comments(subreddit, post_id, cursor)
275                all_comments.extend(comments)
276
277                if not next_cursor:
278                    self.logger.info(f"Next cursor not found for post {post_id}.")
279                    break
280
281                cursor = next_cursor
282            self.logger.info(f"Retrieved {len(all_comments)} comments.")
283            return all_comments[:limit]
284        except Exception as e:
285            self.logger.error(f"Error retrieving comments: {e}")
286            raise
287
288    @log_execution_time
289    def subreddit_link_from_query(self, query, sorting='top', period='week'):
290        try:
291            # If the input is just a subreddit name (with or without 'r/')
292            if not query.startswith('http'):
293                # Normalize input to the form 'r/subredditname'
294                if query.startswith('r/'):
295                    query = f'https://www.reddit.com/{query}/'
296                else:
297                    query = f'https://www.reddit.com/r/{query}/'
298
299            # Parse the subreddit link
300            parsed_url = urlparse(query)
301
302            # Ensure that the path ends with a trailing slash
303            path_parts = parsed_url.path.rstrip('/').split('/')
304
305            # Valid sorting options
306            valid_sorting = ['hot', 'new', 'rising', 'top']
307
308            # Check if the link is already sorted
309            if len(path_parts) > 3 and path_parts[3] in valid_sorting:
310                # Return the original link if already sorted
311                return query
312
313            # Otherwise, append the sorting method to the path
314            path_parts.append(sorting)
315
316            # Add the 't' parameter only if sorting is 'top'
317            query_params = parse_qs(parsed_url.query)
318            if sorting == 'top':
319                query_params['t'] = [period]
320
321            # Rebuild the URL
322            new_path = '/'.join(path_parts) + '/'
323            new_query = urlencode(query_params, doseq=True)
324
325            # Return the new URL
326            return urlunparse((parsed_url.scheme, parsed_url.netloc, new_path, parsed_url.params, new_query, parsed_url.fragment))
327        
328        except Exception as e:
329            self.logger.error(f"Error constructing subreddit URL from query: {e}")
330            raise
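
Outside the Apify runtime, the Redditor class can also be exercised directly with a standard logger, which is handy for debugging the parsing logic. A minimal sketch, assuming it is run from the project root so the src package resolves and that the session cookies in session.py are still valid:

import logging

from src.redditor import Redditor

logging.basicConfig(level=logging.INFO)
scraper = Redditor(logger=logging.getLogger("redditor-demo"))

# Build a sorted subreddit URL and fetch a handful of posts with a few comments each.
url = scraper.subreddit_link_from_query("r/AskReddit", sorting="top", period="day")
posts = scraper.get_all_posts(url, posts_limit=5, comments_limit=3)

for post in posts:
    print(post["title"], "-", post["num_comments"], "comments")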

example/advanced-reddit-scraper/src/session.py

1cookies = {
2    'csv': '2',
3    'edgebucket': 'lH2AY01VDhdJZrAOeK',
4    'loid': '0000000000927nml2x.2.1606468634160.Z0FBQUFBQm1zN01jTVgwSUpobndnMVlHS2xfcGNXdk1SbXhpMjJtN0NNa2VCOFZBZ3Zlb3loSGFlZWtxWlNkdHk5cUxZVVZtNDdWQWl6M0xOdXhRc3FsWmVob0pfQXdjQjItZ1pkOHFmTWsxVVFQU194SjEwTi10MHI2ay1TU01EYjhDVjdpclUxVFg',
5    'pc': '81',
6    'reddit_session': '710093989689%2C2024-08-07T17%3A47%3A05%2C62b4116104fbf3597d47b0718c6986d009b6f8c6',
7    'token_v2': 'eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzI5MTk4MTQ0LjY5NDYyMSwiaWF0IjoxNzI5MTExNzQ0LjY5NDYyMSwianRpIjoiZXBsV3k0R1VURHl4aGwtdEhWZnI2U0lxY00xR0lnIiwiY2lkIjoiMFItV0FNaHVvby1NeVEiLCJsaWQiOiJ0Ml85MjdubWwyeCIsImFpZCI6InQyXzkyN25tbDJ4IiwibGNhIjoxNjA2NDY4NjM0MTYwLCJzY3AiOiJlSnhra2RHT3REQUloZC1sMXo3Ql95cF9OaHRzY1lhc0xRYW9rM243RFZvY2s3MDdjTDRpSFA4bktJcUZMRTJ1QktHa0tXRUZXdE9VTmlMdjU4eTlPWkVGU3lGVFI4NDN5d29rYVVwUFVtTjVweWxSd1daa0xsZmFzVUtEQjZZcFZTNloyMEtQUzV2UTNJMUZ6MDZNcWx4V0h0VFlvM0pwYkdNSzJ4UGp6Y1pxUXlxdXk2bE1ZRmtvbjhXTGZ2eUctdFktZjdiZmhIWXdyS2dLRF9UT3VGeHdZX0hERkhiX25wcjBiRjJ3cUwzWGc5US0xLU4yN2JObW9kbTVfVnpQdnphU2NUbUc1aWZZdjd0LUNSMTQ1SG1aVVFjd1lnMF95ckFqNl9Ddk9vREtCUVdNSlloUEk1QXJsMl9fSmRpdVRmOGF0eWQtLUdiRVRXXzRyUm1vNXhMRW9VX2o2emNBQVBfX1hEX2U0dyIsInJjaWQiOiJBNjE2cG1hN0taX1R1SzRVOFJlQlJUaXVKV3VBZ3lUY2VRTUpyS01NRk93IiwiZmxvIjoyfQ.FBxK7Xnbhy-bW3l71YopqqUBpjfkOdz8XBUauNi3o3pScQLvO0sOs72E2kMiaYX6iTfUPyklR5xRnGVF6PjQmurx2vu8XAm3W1IkGIYPZOOvjnWKhbzv1m8bzfOHGSIZg9bOy7RoCce6A-HCKfR6y4nQyMaiv5jCUdLILePHdUYw3kZEC_ASAXEXvv-dyyaO2GCW_Jxq95CU6lYxqLaO73xhPzR9YjNl_RaAC9xMip6d5Xe3n5wuMdY8bQ3dAfqNVNJKI4fkIij0v90-SJT7vKffNSbueqrckCPgDIXQrpJA1_bx-npHLl5gg7-uBLwDUzXpWMO_BTDxgekscFc6fQ',
8    'reddit_chat_view': 'closed',
9    't2_927nml2x_recentclicks3': 't3_1g57do3%2Ct3_e7ewoo%2Ct3_1fvuu0l%2Ct3_435p6x%2Ct3_d956ag%2Ct3_15svnqa%2Ct3_f2nxzt%2Ct3_e6ryal%2Ct3_79uq5s%2Ct3_7qry4j',
10    'csrf_token': 'd7886d7dde33b8ae9f535d8cf19dad8f',
11    'session_tracker': 'mifofnihaddjdlkjml.0.1729129739819.Z0FBQUFBQm5FRzBMWUZrSlZycUctVmcwZ25zZm9ZRTV4T1NMNjdQTW45dTI1eFQ1NDVqTWF2N20yQzlXNVFCUkEyNndKazVCbWJ1ZHFoVlFZMEFPS2xGYXpDY2Fxcm4xX1F6UEZfWFpfal92NTVuRDF6Q0EzTWtOT3lZOENQQUVBaFlScWQwMGpqZFk',
12}
13
14headers = {
15    'accept': 'text/vnd.reddit.partial+html, text/html;q=0.9',
16    'accept-language': 'en,en-US;q=0.9,tr-TR;q=0.8,tr;q=0.7,de;q=0.6',
17    'content-type': 'application/x-www-form-urlencoded',
18    # 'cookie': 'csv=2; edgebucket=lH2AY01VDhdJZrAOeK; loid=0000000000927nml2x.2.1606468634160.Z0FBQUFBQm1zN01jTVgwSUpobndnMVlHS2xfcGNXdk1SbXhpMjJtN0NNa2VCOFZBZ3Zlb3loSGFlZWtxWlNkdHk5cUxZVVZtNDdWQWl6M0xOdXhRc3FsWmVob0pfQXdjQjItZ1pkOHFmTWsxVVFQU194SjEwTi10MHI2ay1TU01EYjhDVjdpclUxVFg; pc=81; reddit_session=710093989689%2C2024-08-07T17%3A47%3A05%2C62b4116104fbf3597d47b0718c6986d009b6f8c6; token_v2=eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzI5MTk4MTQ0LjY5NDYyMSwiaWF0IjoxNzI5MTExNzQ0LjY5NDYyMSwianRpIjoiZXBsV3k0R1VURHl4aGwtdEhWZnI2U0lxY00xR0lnIiwiY2lkIjoiMFItV0FNaHVvby1NeVEiLCJsaWQiOiJ0Ml85MjdubWwyeCIsImFpZCI6InQyXzkyN25tbDJ4IiwibGNhIjoxNjA2NDY4NjM0MTYwLCJzY3AiOiJlSnhra2RHT3REQUloZC1sMXo3Ql95cF9OaHRzY1lhc0xRYW9rM243RFZvY2s3MDdjTDRpSFA4bktJcUZMRTJ1QktHa0tXRUZXdE9VTmlMdjU4eTlPWkVGU3lGVFI4NDN5d29rYVVwUFVtTjVweWxSd1daa0xsZmFzVUtEQjZZcFZTNloyMEtQUzV2UTNJMUZ6MDZNcWx4V0h0VFlvM0pwYkdNSzJ4UGp6Y1pxUXlxdXk2bE1ZRmtvbjhXTGZ2eUctdFktZjdiZmhIWXdyS2dLRF9UT3VGeHdZX0hERkhiX25wcjBiRjJ3cUwzWGc5US0xLU4yN2JObW9kbTVfVnpQdnphU2NUbUc1aWZZdjd0LUNSMTQ1SG1aVVFjd1lnMF95ckFqNl9Ddk9vREtCUVdNSlloUEk1QXJsMl9fSmRpdVRmOGF0eWQtLUdiRVRXXzRyUm1vNXhMRW9VX2o2emNBQVBfX1hEX2U0dyIsInJjaWQiOiJBNjE2cG1hN0taX1R1SzRVOFJlQlJUaXVKV3VBZ3lUY2VRTUpyS01NRk93IiwiZmxvIjoyfQ.FBxK7Xnbhy-bW3l71YopqqUBpjfkOdz8XBUauNi3o3pScQLvO0sOs72E2kMiaYX6iTfUPyklR5xRnGVF6PjQmurx2vu8XAm3W1IkGIYPZOOvjnWKhbzv1m8bzfOHGSIZg9bOy7RoCce6A-HCKfR6y4nQyMaiv5jCUdLILePHdUYw3kZEC_ASAXEXvv-dyyaO2GCW_Jxq95CU6lYxqLaO73xhPzR9YjNl_RaAC9xMip6d5Xe3n5wuMdY8bQ3dAfqNVNJKI4fkIij0v90-SJT7vKffNSbueqrckCPgDIXQrpJA1_bx-npHLl5gg7-uBLwDUzXpWMO_BTDxgekscFc6fQ; reddit_chat_view=closed; t2_927nml2x_recentclicks3=t3_1g57do3%2Ct3_e7ewoo%2Ct3_1fvuu0l%2Ct3_435p6x%2Ct3_d956ag%2Ct3_15svnqa%2Ct3_f2nxzt%2Ct3_e6ryal%2Ct3_79uq5s%2Ct3_7qry4j; csrf_token=d7886d7dde33b8ae9f535d8cf19dad8f; session_tracker=mifofnihaddjdlkjml.0.1729129739819.Z0FBQUFBQm5FRzBMWUZrSlZycUctVmcwZ25zZm9ZRTV4T1NMNjdQTW45dTI1eFQ1NDVqTWF2N20yQzlXNVFCUkEyNndKazVCbWJ1ZHFoVlFZMEFPS2xGYXpDY2Fxcm4xX1F6UEZfWFpfal92NTVuRDF6Q0EzTWtOT3lZOENQQUVBaFlScWQwMGpqZFk',
19    'origin': 'https://www.reddit.com',
20    'priority': 'u=1, i',
21    'referer': 'https://www.reddit.com/r/AskReddit/comments/1g57do3/whats_a_bitter_life_lesson_you_learned_from_your/',
22    'sec-ch-ua': '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
23    'sec-ch-ua-mobile': '?0',
24    'sec-ch-ua-platform': '"Windows"',
25    'sec-fetch-dest': 'empty',
26    'sec-fetch-mode': 'cors',
27    'sec-fetch-site': 'same-origin',
28    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
29}

example/advanced-reddit-scraper/src/__main__.py

1import asyncio
2
3from .main import main
4
5# Execute the Actor entrypoint.
6asyncio.run(main())

src/llm_providers/__pycache__/base_provider.cpython-312.pyc

Download

src/llm_providers/__pycache__/factory.cpython-312.pyc

Download

src/llm_providers/__pycache__/gemini_provider.cpython-312.pyc

Download

src/llm_providers/__pycache__/__init__.cpython-312.pyc

Download

example/advanced-reddit-scraper/.git/info/exclude

1# git ls-files --others --exclude-from=.git/info/exclude
2# Lines that start with '#' are comments.
3# For a project mostly in C, the following would be a good set of
4# exclude patterns (uncomment them if you want to use them):
5# *.[oa]
6# *~

example/advanced-reddit-scraper/.git/hooks/applypatch-msg.sample

1#!/bin/sh
2#
3# An example hook script to check the commit log message taken by
4# applypatch from an e-mail message.
5#
6# The hook should exit with non-zero status after issuing an
7# appropriate message if it wants to stop the commit.  The hook is
8# allowed to edit the commit message file.
9#
10# To enable this hook, rename this file to "applypatch-msg".
11
12. git-sh-setup
13commitmsg="$(git rev-parse --git-path hooks/commit-msg)"
14test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"}
15:

example/advanced-reddit-scraper/.git/hooks/commit-msg.sample

1#!/bin/sh
2#
3# An example hook script to check the commit log message.
4# Called by "git commit" with one argument, the name of the file
5# that has the commit message.  The hook should exit with non-zero
6# status after issuing an appropriate message if it wants to stop the
7# commit.  The hook is allowed to edit the commit message file.
8#
9# To enable this hook, rename this file to "commit-msg".
10
11# Uncomment the below to add a Signed-off-by line to the message.
12# Doing this in a hook is a bad idea in general, but the prepare-commit-msg
13# hook is more suited to it.
14#
15# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
16# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1"
17
18# This example catches duplicate Signed-off-by lines.
19
20test "" = "$(grep '^Signed-off-by: ' "$1" |
21	 sort | uniq -c | sed -e '/^[ 	]*1[ 	]/d')" || {
22	echo >&2 Duplicate Signed-off-by lines.
23	exit 1
24}

example/advanced-reddit-scraper/.git/hooks/fsmonitor-watchman.sample

1#!/usr/bin/perl
2
3use strict;
4use warnings;
5use IPC::Open2;
6
7# An example hook script to integrate Watchman
8# (https://facebook.github.io/watchman/) with git to speed up detecting
9# new and modified files.
10#
11# The hook is passed a version (currently 2) and last update token
12# formatted as a string and outputs to stdout a new update token and
13# all files that have been modified since the update token. Paths must
14# be relative to the root of the working tree and separated by a single NUL.
15#
16# To enable this hook, rename this file to "query-watchman" and set
17# 'git config core.fsmonitor .git/hooks/query-watchman'
18#
19my ($version, $last_update_token) = @ARGV;
20
21# Uncomment for debugging
22# print STDERR "$0 $version $last_update_token\n";
23
24# Check the hook interface version
25if ($version ne 2) {
26	die "Unsupported query-fsmonitor hook version '$version'.\n" .
27	    "Falling back to scanning...\n";
28}
29
30my $git_work_tree = get_working_dir();
31
32my $retry = 1;
33
34my $json_pkg;
35eval {
36	require JSON::XS;
37	$json_pkg = "JSON::XS";
38	1;
39} or do {
40	require JSON::PP;
41	$json_pkg = "JSON::PP";
42};
43
44launch_watchman();
45
46sub launch_watchman {
47	my $o = watchman_query();
48	if (is_work_tree_watched($o)) {
49		output_result($o->{clock}, @{$o->{files}});
50	}
51}
52
53sub output_result {
54	my ($clockid, @files) = @_;
55
56	# Uncomment for debugging watchman output
57	# open (my $fh, ">", ".git/watchman-output.out");
58	# binmode $fh, ":utf8";
59	# print $fh "$clockid\n@files\n";
60	# close $fh;
61
62	binmode STDOUT, ":utf8";
63	print $clockid;
64	print "\0";
65	local $, = "\0";
66	print @files;
67}
68
69sub watchman_clock {
70	my $response = qx/watchman clock "$git_work_tree"/;
71	die "Failed to get clock id on '$git_work_tree'.\n" .
72		"Falling back to scanning...\n" if $? != 0;
73
74	return $json_pkg->new->utf8->decode($response);
75}
76
77sub watchman_query {
78	my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty')
79	or die "open2() failed: $!\n" .
80	"Falling back to scanning...\n";
81
82	# In the query expression below we're asking for names of files that
83	# changed since $last_update_token but not from the .git folder.
84	#
85	# To accomplish this, we're using the "since" generator to use the
86	# recency index to select candidate nodes and "fields" to limit the
87	# output to file names only. Then we're using the "expression" term to
88	# further constrain the results.
89	my $last_update_line = "";
90	if (substr($last_update_token, 0, 1) eq "c") {
91		$last_update_token = "\"$last_update_token\"";
92		$last_update_line = qq[\n"since": $last_update_token,];
93	}
94	my $query = <<"	END";
95		["query", "$git_work_tree", {$last_update_line
96			"fields": ["name"],
97			"expression": ["not", ["dirname", ".git"]]
98		}]
99	END
100
101	# Uncomment for debugging the watchman query
102	# open (my $fh, ">", ".git/watchman-query.json");
103	# print $fh $query;
104	# close $fh;
105
106	print CHLD_IN $query;
107	close CHLD_IN;
108	my $response = do {local $/; <CHLD_OUT>};
109
110	# Uncomment for debugging the watch response
111	# open ($fh, ">", ".git/watchman-response.json");
112	# print $fh $response;
113	# close $fh;
114
115	die "Watchman: command returned no output.\n" .
116	"Falling back to scanning...\n" if $response eq "";
117	die "Watchman: command returned invalid output: $response\n" .
118	"Falling back to scanning...\n" unless $response =~ /^\{/;
119
120	return $json_pkg->new->utf8->decode($response);
121}
122
123sub is_work_tree_watched {
124	my ($output) = @_;
125	my $error = $output->{error};
126	if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) {
127		$retry--;
128		my $response = qx/watchman watch "$git_work_tree"/;
129		die "Failed to make watchman watch '$git_work_tree'.\n" .
130		    "Falling back to scanning...\n" if $? != 0;
131		$output = $json_pkg->new->utf8->decode($response);
132		$error = $output->{error};
133		die "Watchman: $error.\n" .
134		"Falling back to scanning...\n" if $error;
135
136		# Uncomment for debugging watchman output
137		# open (my $fh, ">", ".git/watchman-output.out");
138		# close $fh;
139
140		# Watchman will always return all files on the first query so
141		# return the fast "everything is dirty" flag to git and do the
142		# Watchman query just to get it over with now so we won't pay
143		# the cost in git to look up each individual file.
144		my $o = watchman_clock();
145		$error = $output->{error};
146
147		die "Watchman: $error.\n" .
148		"Falling back to scanning...\n" if $error;
149
150		output_result($o->{clock}, ("/"));
151		$last_update_token = $o->{clock};
152
153		eval { launch_watchman() };
154		return 0;
155	}
156
157	die "Watchman: $error.\n" .
158	"Falling back to scanning...\n" if $error;
159
160	return 1;
161}
162
163sub get_working_dir {
164	my $working_dir;
165	if ($^O =~ 'msys' || $^O =~ 'cygwin') {
166		$working_dir = Win32::GetCwd();
167		$working_dir =~ tr/\\/\//;
168	} else {
169		require Cwd;
170		$working_dir = Cwd::cwd();
171	}
172
173	return $working_dir;
174}

example/advanced-reddit-scraper/.git/hooks/post-update.sample

1#!/bin/sh
2#
3# An example hook script to prepare a packed repository for use over
4# dumb transports.
5#
6# To enable this hook, rename this file to "post-update".
7
8exec git update-server-info

example/advanced-reddit-scraper/.git/hooks/pre-applypatch.sample

1#!/bin/sh
2#
3# An example hook script to verify what is about to be committed
4# by applypatch from an e-mail message.
5#
6# The hook should exit with non-zero status after issuing an
7# appropriate message if it wants to stop the commit.
8#
9# To enable this hook, rename this file to "pre-applypatch".
10
11. git-sh-setup
12precommit="$(git rev-parse --git-path hooks/pre-commit)"
13test -x "$precommit" && exec "$precommit" ${1+"$@"}
14:

example/advanced-reddit-scraper/.git/hooks/pre-commit.sample

1#!/bin/sh
2#
3# An example hook script to verify what is about to be committed.
4# Called by "git commit" with no arguments.  The hook should
5# exit with non-zero status after issuing an appropriate message if
6# it wants to stop the commit.
7#
8# To enable this hook, rename this file to "pre-commit".
9
10if git rev-parse --verify HEAD >/dev/null 2>&1
11then
12	against=HEAD
13else
14	# Initial commit: diff against an empty tree object
15	against=$(git hash-object -t tree /dev/null)
16fi
17
18# If you want to allow non-ASCII filenames set this variable to true.
19allownonascii=$(git config --type=bool hooks.allownonascii)
20
21# Redirect output to stderr.
22exec 1>&2
23
24# Cross platform projects tend to avoid non-ASCII filenames; prevent
25# them from being added to the repository. We exploit the fact that the
26# printable range starts at the space character and ends with tilde.
27if [ "$allownonascii" != "true" ] &&
28	# Note that the use of brackets around a tr range is ok here, (it's
29	# even required, for portability to Solaris 10's /usr/bin/tr), since
30	# the square bracket bytes happen to fall in the designated range.
31	test $(git diff-index --cached --name-only --diff-filter=A -z $against |
32	  LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0
33then
34	cat <<\EOF
35Error: Attempt to add a non-ASCII file name.
36
37This can cause problems if you want to work with people on other platforms.
38
39To be portable it is advisable to rename the file.
40
41If you know what you are doing you can disable this check using:
42
43  git config hooks.allownonascii true
44EOF
45	exit 1
46fi
47
48# If there are whitespace errors, print the offending file names and fail.
49exec git diff-index --check --cached $against --

example/advanced-reddit-scraper/.git/hooks/pre-merge-commit.sample

1#!/bin/sh
2#
3# An example hook script to verify what is about to be committed.
4# Called by "git merge" with no arguments.  The hook should
5# exit with non-zero status after issuing an appropriate message to
6# stderr if it wants to stop the merge commit.
7#
8# To enable this hook, rename this file to "pre-merge-commit".
9
10. git-sh-setup
11test -x "$GIT_DIR/hooks/pre-commit" &&
12        exec "$GIT_DIR/hooks/pre-commit"
13:

example/advanced-reddit-scraper/.git/hooks/pre-push.sample

1#!/bin/sh
2
3# An example hook script to verify what is about to be pushed.  Called by "git
4# push" after it has checked the remote status, but before anything has been
5# pushed.  If this script exits with a non-zero status nothing will be pushed.
6#
7# This hook is called with the following parameters:
8#
9# $1 -- Name of the remote to which the push is being done
10# $2 -- URL to which the push is being done
11#
12# If pushing without using a named remote those arguments will be equal.
13#
14# Information about the commits which are being pushed is supplied as lines to
15# the standard input in the form:
16#
17#   <local ref> <local oid> <remote ref> <remote oid>
18#
19# This sample shows how to prevent push of commits where the log message starts
20# with "WIP" (work in progress).
21
22remote="$1"
23url="$2"
24
25zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
26
27while read local_ref local_oid remote_ref remote_oid
28do
29	if test "$local_oid" = "$zero"
30	then
31		# Handle delete
32		:
33	else
34		if test "$remote_oid" = "$zero"
35		then
36			# New branch, examine all commits
37			range="$local_oid"
38		else
39			# Update to existing branch, examine new commits
40			range="$remote_oid..$local_oid"
41		fi
42
43		# Check for WIP commit
44		commit=$(git rev-list -n 1 --grep '^WIP' "$range")
45		if test -n "$commit"
46		then
47			echo >&2 "Found WIP commit in $local_ref, not pushing"
48			exit 1
49		fi
50	fi
51done
52
53exit 0

example/advanced-reddit-scraper/.git/hooks/pre-rebase.sample

1#!/bin/sh
2#
3# Copyright (c) 2006, 2008 Junio C Hamano
4#
5# The "pre-rebase" hook is run just before "git rebase" starts doing
6# its job, and can prevent the command from running by exiting with
7# non-zero status.
8#
9# The hook is called with the following parameters:
10#
11# $1 -- the upstream the series was forked from.
12# $2 -- the branch being rebased (or empty when rebasing the current branch).
13#
14# This sample shows how to prevent topic branches that are already
15# merged to 'next' branch from getting rebased, because allowing it
16# would result in rebasing already published history.
17
18publish=next
19basebranch="$1"
20if test "$#" = 2
21then
22	topic="refs/heads/$2"
23else
24	topic=`git symbolic-ref HEAD` ||
25	exit 0 ;# we do not interrupt rebasing detached HEAD
26fi
27
28case "$topic" in
29refs/heads/??/*)
30	;;
31*)
32	exit 0 ;# we do not interrupt others.
33	;;
34esac
35
36# Now we are dealing with a topic branch being rebased
37# on top of master.  Is it OK to rebase it?
38
39# Does the topic really exist?
40git show-ref -q "$topic" || {
41	echo >&2 "No such branch $topic"
42	exit 1
43}
44
45# Is topic fully merged to master?
46not_in_master=`git rev-list --pretty=oneline ^master "$topic"`
47if test -z "$not_in_master"
48then
49	echo >&2 "$topic is fully merged to master; better remove it."
50	exit 1 ;# we could allow it, but there is no point.
51fi
52
53# Is topic ever merged to next?  If so you should not be rebasing it.
54only_next_1=`git rev-list ^master "^$topic" ${publish} | sort`
55only_next_2=`git rev-list ^master           ${publish} | sort`
56if test "$only_next_1" = "$only_next_2"
57then
58	not_in_topic=`git rev-list "^$topic" master`
59	if test -z "$not_in_topic"
60	then
61		echo >&2 "$topic is already up to date with master"
62		exit 1 ;# we could allow it, but there is no point.
63	else
64		exit 0
65	fi
66else
67	not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"`
68	/usr/bin/perl -e '
69		my $topic = $ARGV[0];
70		my $msg = "* $topic has commits already merged to public branch:\n";
71		my (%not_in_next) = map {
72			/^([0-9a-f]+) /;
73			($1 => 1);
74		} split(/\n/, $ARGV[1]);
75		for my $elem (map {
76				/^([0-9a-f]+) (.*)$/;
77				[$1 => $2];
78			} split(/\n/, $ARGV[2])) {
79			if (!exists $not_in_next{$elem->[0]}) {
80				if ($msg) {
81					print STDERR $msg;
82					undef $msg;
83				}
84				print STDERR " $elem->[1]\n";
85			}
86		}
87	' "$topic" "$not_in_next" "$not_in_master"
88	exit 1
89fi
90
91<<\DOC_END
92
93This sample hook safeguards topic branches that have been
94published from being rewound.
95
96The workflow assumed here is:
97
98 * Once a topic branch forks from "master", "master" is never
99   merged into it again (either directly or indirectly).
100
101 * Once a topic branch is fully cooked and merged into "master",
102   it is deleted.  If you need to build on top of it to correct
103   earlier mistakes, a new topic branch is created by forking at
104   the tip of the "master".  This is not strictly necessary, but
105   it makes it easier to keep your history simple.
106
107 * Whenever you need to test or publish your changes to topic
108   branches, merge them into "next" branch.
109
110The script, being an example, hardcodes the publish branch name
111to be "next", but it is trivial to make it configurable via
112$GIT_DIR/config mechanism.
113
114With this workflow, you would want to know:
115
116(1) ... if a topic branch has ever been merged to "next".  Young
117    topic branches can have stupid mistakes you would rather
118    clean up before publishing, and things that have not been
119    merged into other branches can be easily rebased without
120    affecting other people.  But once it is published, you would
121    not want to rewind it.
122
123(2) ... if a topic branch has been fully merged to "master".
124    Then you can delete it.  More importantly, you should not
125    build on top of it -- other people may already want to
126    change things related to the topic as patches against your
127    "master", so if you need further changes, it is better to
128    fork the topic (perhaps with the same name) afresh from the
129    tip of "master".
130
131Let's look at this example:
132
133		   o---o---o---o---o---o---o---o---o---o "next"
134		  /       /           /           /
135		 /   a---a---b A     /           /
136		/   /               /           /
137	       /   /   c---c---c---c B         /
138	      /   /   /             \         /
139	     /   /   /   b---b C     \       /
140	    /   /   /   /             \     /
141    ---o---o---o---o---o---o---o---o---o---o---o "master"
142
143
144A, B and C are topic branches.
145
146 * A has one fix since it was merged up to "next".
147
148 * B has finished.  It has been fully merged up to "master" and "next",
149   and is ready to be deleted.
150
151 * C has not merged to "next" at all.
152
153We would want to allow C to be rebased, refuse A, and encourage
154B to be deleted.
155
156To compute (1):
157
158	git rev-list ^master ^topic next
159	git rev-list ^master        next
160
161	if these match, topic has not merged in next at all.
162
163To compute (2):
164
165	git rev-list master..topic
166
167	if this is empty, it is fully merged to "master".
168
169DOC_END
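
The two checks described in the documentation block above can also be run by hand outside the hook; a rough sketch, where the branch name ab/topic and the publish branch next are placeholders:

topic=refs/heads/ab/topic
publish=next
# (1) if the two listings match, the topic has never been merged to "next" and is safe to rebase
git rev-list ^master "^$topic" "$publish" | sort > /tmp/only_next_1
git rev-list ^master           "$publish" | sort > /tmp/only_next_2
cmp -s /tmp/only_next_1 /tmp/only_next_2 && echo "not merged to $publish yet"
# (2) if this range is empty, the topic is fully merged to "master" and should simply be deleted
test -z "$(git rev-list master..$topic)" && echo "fully merged to master"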

example/advanced-reddit-scraper/.git/hooks/pre-receive.sample

1#!/bin/sh
2#
3# An example hook script to make use of push options.
4# The example simply echoes all push options that start with 'echoback='
5# and rejects all pushes when the "reject" push option is used.
6#
7# To enable this hook, rename this file to "pre-receive".
8
9if test -n "$GIT_PUSH_OPTION_COUNT"
10then
11	i=0
12	while test "$i" -lt "$GIT_PUSH_OPTION_COUNT"
13	do
14		eval "value=\$GIT_PUSH_OPTION_$i"
15		case "$value" in
16		echoback=*)
17			echo "echo from the pre-receive-hook: ${value#*=}" >&2
18			;;
19		reject)
20			exit 1
21		esac
22		i=$((i + 1))
23	done
24fi
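
Push options are supplied from the client with -o/--push-option; a hedged example of driving the hook above (it also assumes the server has receive.advertisePushOptions enabled, otherwise the client refuses to send options):

git push -o echoback=hello -o reject origin main
# the hook prints "echo from the pre-receive-hook: hello" back to the client, then rejects the push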

example/advanced-reddit-scraper/.git/hooks/prepare-commit-msg.sample

1#!/bin/sh
2#
3# An example hook script to prepare the commit log message.
4# Called by "git commit" with the name of the file that has the
5# commit message, followed by the description of the commit
6# message's source.  The hook's purpose is to edit the commit
7# message file.  If the hook fails with a non-zero status,
8# the commit is aborted.
9#
10# To enable this hook, rename this file to "prepare-commit-msg".
11
12# This hook includes three examples. The first one removes the
13# "# Please enter the commit message..." help message.
14#
15# The second includes the output of "git diff --name-status -r"
16# into the message, just before the "git status" output.  It is
17# commented because it doesn't cope with --amend or with squashed
18# commits.
19#
20# The third example adds a Signed-off-by line to the message, that can
21# still be edited.  This is rarely a good idea.
22
23COMMIT_MSG_FILE=$1
24COMMIT_SOURCE=$2
25SHA1=$3
26
27/usr/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' "$COMMIT_MSG_FILE"
28
29# case "$COMMIT_SOURCE,$SHA1" in
30#  ,|template,)
31#    /usr/bin/perl -i.bak -pe '
32#       print "\n" . `git diff --cached --name-status -r`
33# 	 if /^#/ && $first++ == 0' "$COMMIT_MSG_FILE" ;;
34#  *) ;;
35# esac
36
37# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
38# git interpret-trailers --in-place --trailer "$SOB" "$COMMIT_MSG_FILE"
39# if test -z "$COMMIT_SOURCE"
40# then
41#   /usr/bin/perl -i.bak -pe 'print "\n" if !$first_line++' "$COMMIT_MSG_FILE"
42# fi
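
A minimal sketch of enabling this hook in a local clone; the paths assume a standard .git layout:

cp .git/hooks/prepare-commit-msg.sample .git/hooks/prepare-commit-msg
chmod +x .git/hooks/prepare-commit-msg
git commit    # the hook now rewrites .git/COMMIT_EDITMSG (dropping the "# Please enter..." block) before the editor opens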

example/advanced-reddit-scraper/.git/hooks/push-to-checkout.sample

1#!/bin/sh
2
3# An example hook script to update a checked-out tree on a git push.
4#
5# This hook is invoked by git-receive-pack(1) when it reacts to git
6# push and updates reference(s) in its repository, and when the push
7# tries to update the branch that is currently checked out and the
8# receive.denyCurrentBranch configuration variable is set to
9# updateInstead.
10#
11# By default, such a push is refused if the working tree and the index
12# of the remote repository has any difference from the currently
13# checked out commit; when both the working tree and the index match
14# the current commit, they are updated to match the newly pushed tip
15# of the branch. This hook is to be used to override the default
16# behaviour; however the code below reimplements the default behaviour
17# as a starting point for convenient modification.
18#
19# The hook receives the commit with which the tip of the current
20# branch is going to be updated:
21commit=$1
22
23# It can exit with a non-zero status to refuse the push (when it does
24# so, it must not modify the index or the working tree).
25die () {
26	echo >&2 "$*"
27	exit 1
28}
29
30# Or it can make any necessary changes to the working tree and to the
31# index to bring them to the desired state when the tip of the current
32# branch is updated to the new commit, and exit with a zero status.
33#
34# For example, the hook can simply run git read-tree -u -m HEAD "$1"
35# in order to emulate git fetch that is run in the reverse direction
36# with git push, as the two-tree form of git read-tree -u -m is
37# essentially the same as git switch or git checkout that switches
38# branches while keeping the local changes in the working tree that do
39# not interfere with the difference between the branches.
40
41# The below is a more-or-less exact translation to shell of the C code
42# for the default behaviour for git's push-to-checkout hook defined in
43# the push_to_deploy() function in builtin/receive-pack.c.
44#
45# Note that the hook will be executed from the repository directory,
46# not from the working tree, so if you want to perform operations on
47# the working tree, you will have to adapt your code accordingly, e.g.
48# by adding "cd .." or using relative paths.
49
50if ! git update-index -q --ignore-submodules --refresh
51then
52	die "Up-to-date check failed"
53fi
54
55if ! git diff-files --quiet --ignore-submodules --
56then
57	die "Working directory has unstaged changes"
58fi
59
60# This is a rough translation of:
61#
62#   head_has_history() ? "HEAD" : EMPTY_TREE_SHA1_HEX
63if git cat-file -e HEAD 2>/dev/null
64then
65	head=HEAD
66else
67	head=$(git hash-object -t tree --stdin </dev/null)
68fi
69
70if ! git diff-index --quiet --cached --ignore-submodules $head --
71then
72	die "Working directory has staged changes"
73fi
74
75if ! git read-tree -u -m "$commit"
76then
77	die "Could not update working tree to new HEAD"
78fi
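
A minimal sketch of wiring this up on the receiving side, assuming a non-bare repository that is pushed to directly:

# on the server, inside the checked-out repository
git config receive.denyCurrentBranch updateInstead
cp .git/hooks/push-to-checkout.sample .git/hooks/push-to-checkout
chmod +x .git/hooks/push-to-checkout
# pushes to the checked-out branch now run this hook, which refuses them
# whenever the index or working tree differs from the current commit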

example/advanced-reddit-scraper/.git/hooks/sendemail-validate.sample

1#!/bin/sh
2
3# An example hook script to validate a patch (and/or patch series) before
4# sending it via email.
5#
6# The hook should exit with non-zero status after issuing an appropriate
7# message if it wants to prevent the email(s) from being sent.
8#
9# To enable this hook, rename this file to "sendemail-validate".
10#
11# By default, it will only check that the patch(es) can be applied on top of
12# the default upstream branch without conflicts in a secondary worktree. After
13# validation (successful or not) of the last patch of a series, the worktree
14# will be deleted.
15#
16# The following config variables can be set to change the default remote and
17# remote ref that are used to apply the patches against:
18#
19#   sendemail.validateRemote (default: origin)
20#   sendemail.validateRemoteRef (default: HEAD)
21#
22# Replace the TODO placeholders with appropriate checks according to your
23# needs.
24
25validate_cover_letter () {
26	file="$1"
27	# TODO: Replace with appropriate checks (e.g. spell checking).
28	true
29}
30
31validate_patch () {
32	file="$1"
33	# Ensure that the patch applies without conflicts.
34	git am -3 "$file" || return
35	# TODO: Replace with appropriate checks for this patch
36	# (e.g. checkpatch.pl).
37	true
38}
39
40validate_series () {
41	# TODO: Replace with appropriate checks for the whole series
42	# (e.g. quick build, coding style checks, etc.).
43	true
44}
45
46# main -------------------------------------------------------------------------
47
48if test "$GIT_SENDEMAIL_FILE_COUNTER" = 1
49then
50	remote=$(git config --default origin --get sendemail.validateRemote) &&
51	ref=$(git config --default HEAD --get sendemail.validateRemoteRef) &&
52	worktree=$(mktemp --tmpdir -d sendemail-validate.XXXXXXX) &&
53	git worktree add -fd --checkout "$worktree" "refs/remotes/$remote/$ref" &&
54	git config --replace-all sendemail.validateWorktree "$worktree"
55else
56	worktree=$(git config --get sendemail.validateWorktree)
57fi || {
58	echo "sendemail-validate: error: failed to prepare worktree" >&2
59	exit 1
60}
61
62unset GIT_DIR GIT_WORK_TREE
63cd "$worktree" &&
64
65if grep -q "^diff --git " "$1"
66then
67	validate_patch "$1"
68else
69	validate_cover_letter "$1"
70fi &&
71
72if test "$GIT_SENDEMAIL_FILE_COUNTER" = "$GIT_SENDEMAIL_FILE_TOTAL"
73then
74	git config --unset-all sendemail.validateWorktree &&
75	trap 'git worktree remove -ff "$worktree"' EXIT &&
76	validate_series
77fi
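
A hedged usage sketch; the remote/ref values simply exercise the config knobs named in the header comment, and the patch paths are placeholders:

cp .git/hooks/sendemail-validate.sample .git/hooks/sendemail-validate
chmod +x .git/hooks/sendemail-validate
git config sendemail.validateRemote origin
git config sendemail.validateRemoteRef main
git send-email --to=list@example.com outgoing/*.patch
# each patch is applied with "git am -3" in a temporary worktree first; a failing patch aborts the send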

example/advanced-reddit-scraper/.git/hooks/update.sample

1#!/bin/sh
2#
3# An example hook script to block unannotated tags from entering.
4# Called by "git receive-pack" with arguments: refname sha1-old sha1-new
5#
6# To enable this hook, rename this file to "update".
7#
8# Config
9# ------
10# hooks.allowunannotated
11#   This boolean sets whether unannotated tags will be allowed into the
12#   repository.  By default they won't be.
13# hooks.allowdeletetag
14#   This boolean sets whether deleting tags will be allowed in the
15#   repository.  By default they won't be.
16# hooks.allowmodifytag
17#   This boolean sets whether a tag may be modified after creation. By default
18#   it won't be.
19# hooks.allowdeletebranch
20#   This boolean sets whether deleting branches will be allowed in the
21#   repository.  By default they won't be.
22# hooks.denycreatebranch
23#   This boolean sets whether remotely creating branches will be denied
24#   in the repository.  By default this is allowed.
25#
26
27# --- Command line
28refname="$1"
29oldrev="$2"
30newrev="$3"
31
32# --- Safety check
33if [ -z "$GIT_DIR" ]; then
34	echo "Don't run this script from the command line." >&2
35	echo " (if you want, you could supply GIT_DIR then run" >&2
36	echo "  $0 <ref> <oldrev> <newrev>)" >&2
37	exit 1
38fi
39
40if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then
41	echo "usage: $0 <ref> <oldrev> <newrev>" >&2
42	exit 1
43fi
44
45# --- Config
46allowunannotated=$(git config --type=bool hooks.allowunannotated)
47allowdeletebranch=$(git config --type=bool hooks.allowdeletebranch)
48denycreatebranch=$(git config --type=bool hooks.denycreatebranch)
49allowdeletetag=$(git config --type=bool hooks.allowdeletetag)
50allowmodifytag=$(git config --type=bool hooks.allowmodifytag)
51
52# check for no description
53projectdesc=$(sed -e '1q' "$GIT_DIR/description")
54case "$projectdesc" in
55"Unnamed repository"* | "")
56	echo "*** Project description file hasn't been set" >&2
57	exit 1
58	;;
59esac
60
61# --- Check types
62# if $newrev is 0000...0000, it's a commit to delete a ref.
63zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
64if [ "$newrev" = "$zero" ]; then
65	newrev_type=delete
66else
67	newrev_type=$(git cat-file -t $newrev)
68fi
69
70case "$refname","$newrev_type" in
71	refs/tags/*,commit)
72		# un-annotated tag
73		short_refname=${refname##refs/tags/}
74		if [ "$allowunannotated" != "true" ]; then
75			echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2
76			echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2
77			exit 1
78		fi
79		;;
80	refs/tags/*,delete)
81		# delete tag
82		if [ "$allowdeletetag" != "true" ]; then
83			echo "*** Deleting a tag is not allowed in this repository" >&2
84			exit 1
85		fi
86		;;
87	refs/tags/*,tag)
88		# annotated tag
89		if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1
90		then
91			echo "*** Tag '$refname' already exists." >&2
92			echo "*** Modifying a tag is not allowed in this repository." >&2
93			exit 1
94		fi
95		;;
96	refs/heads/*,commit)
97		# branch
98		if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then
99			echo "*** Creating a branch is not allowed in this repository" >&2
100			exit 1
101		fi
102		;;
103	refs/heads/*,delete)
104		# delete branch
105		if [ "$allowdeletebranch" != "true" ]; then
106			echo "*** Deleting a branch is not allowed in this repository" >&2
107			exit 1
108		fi
109		;;
110	refs/remotes/*,commit)
111		# tracking branch
112		;;
113	refs/remotes/*,delete)
114		# delete tracking branch
115		if [ "$allowdeletebranch" != "true" ]; then
116			echo "*** Deleting a tracking branch is not allowed in this repository" >&2
117			exit 1
118		fi
119		;;
120	*)
121		# Anything else (is there anything else?)
122		echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2
123		exit 1
124		;;
125esac
126
127# --- Finished
128exit 0
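
A server-side configuration sketch for the policy switches above (a bare repository layout is assumed; the chosen values are only an example):

cp hooks/update.sample hooks/update && chmod +x hooks/update
echo "Advanced Reddit Scraper" > description   # the hook refuses all updates while the description is unset
git config hooks.allowunannotated true         # let lightweight (unannotated) tags through
git config hooks.allowdeletetag true           # allow "git push origin :v1.0"
git config hooks.denycreatebranch true         # refuse creating new branches via push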

example/advanced-reddit-scraper/.git/logs/HEAD

10000000000000000000000000000000000000000 68034258495c18bfb133b925e51bbce4b07c2cf2 deduble <yunusemremre@gmail.com> 1727917571 +0300	commit (initial): init
268034258495c18bfb133b925e51bbce4b07c2cf2 0000000000000000000000000000000000000000 deduble <yunusemremre@gmail.com> 1727917683 +0300	Branch: renamed refs/heads/master to refs/heads/main
30000000000000000000000000000000000000000 68034258495c18bfb133b925e51bbce4b07c2cf2 deduble <yunusemremre@gmail.com> 1727917683 +0300	Branch: renamed refs/heads/master to refs/heads/main
468034258495c18bfb133b925e51bbce4b07c2cf2 d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 deduble <yunusemremre@gmail.com> 1730031615 +0300	commit: using illegal api
5d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 deduble <yunusemremre@gmail.com> 1739419029 +0300	commit: v1
6c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 f36f5c6f3e4df6f647792db86053f288851ad990 deduble <yunusemremre@gmail.com> 1739420381 +0300	commit: release

(binary files under example/advanced-reddit-scraper — the src/__pycache__/*.cpython-312.pyc bytecode and the .git/objects/* blobs — are exposed only as download links and are omitted here)

example/advanced-reddit-scraper/.git/refs/heads/main

1f36f5c6f3e4df6f647792db86053f288851ad990

example/advanced-reddit-scraper/.git/logs/refs/heads/main

10000000000000000000000000000000000000000 68034258495c18bfb133b925e51bbce4b07c2cf2 deduble <yunusemremre@gmail.com> 1727917571 +0300	commit (initial): init
268034258495c18bfb133b925e51bbce4b07c2cf2 68034258495c18bfb133b925e51bbce4b07c2cf2 deduble <yunusemremre@gmail.com> 1727917683 +0300	Branch: renamed refs/heads/master to refs/heads/main
368034258495c18bfb133b925e51bbce4b07c2cf2 d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 deduble <yunusemremre@gmail.com> 1730031615 +0300	commit: using illegal api
4d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 deduble <yunusemremre@gmail.com> 1739419029 +0300	commit: v1
5c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 f36f5c6f3e4df6f647792db86053f288851ad990 deduble <yunusemremre@gmail.com> 1739420381 +0300	commit: release

example/advanced-reddit-scraper/.git/refs/remotes/origin/main

1f36f5c6f3e4df6f647792db86053f288851ad990

example/advanced-reddit-scraper/.git/logs/refs/remotes/origin/main

10000000000000000000000000000000000000000 68034258495c18bfb133b925e51bbce4b07c2cf2 deduble <yunusemremre@gmail.com> 1727917690 +0300	update by push
268034258495c18bfb133b925e51bbce4b07c2cf2 d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 deduble <yunusemremre@gmail.com> 1730031636 +0300	update by push
3d39c6a0e47a92ee8fc7f4baace6c8c2ef406bb45 c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 deduble <yunusemremre@gmail.com> 1739419037 +0300	update by push
4c8a66ed48d5f42d9fea6c24f6722018721d2c0a1 f36f5c6f3e4df6f647792db86053f288851ad990 deduble <yunusemremre@gmail.com> 1739420388 +0300	update by push

Developer: Maintained by Community

Actor Metrics

  • 1 monthly user

  • No bookmarks yet

  • >99% runs succeeded

  • Created in Mar 2025

  • Modified a day ago