Street Fighter 6 CFN Scraper avatar

Street Fighter 6 CFN Scraper

Try for free

No credit card required

Go to Store
Street Fighter 6 CFN Scraper

Street Fighter 6 CFN Scraper

3ternal/street-fighter-6-cfn-scraper
Try for free

No credit card required

SF6 scraper for the Capcom Fighters Network (Buckler's Boot Camp)

.actor/Dockerfile

1# First, specify the base Docker image.
2# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
3# You can also use any other image from Docker Hub.
4FROM apify/actor-python-playwright
5
6# Second, copy just requirements.txt into the Actor image,
7# since it should be the only file that affects the dependency install in the next step,
8# in order to speed up the build
9COPY requirements.txt ./
10
11# Install the packages specified in requirements.txt,
12# Print the installed Python version, pip version
13# and all installed packages with their versions for debugging
14RUN echo "Python version:" \
15 && python --version \
16 && echo "Pip version:" \
17 && pip --version \
18 && echo "Installing dependencies:" \
19 && pip install -r requirements.txt \
20 && echo "All installed Python packages:" \
21 && pip freeze
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after installing the dependencies, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28# Use compileall to ensure the runnability of the Actor Python code.
29RUN python3 -m compileall -q .
30
31# Specify how to launch the source code of your Actor.
32# By default, the "python3 -m src" command is run
33CMD ["python3", "-m", "src"]

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "street-fighter-6-cfn-scraper",
4    "title": "Street Fighter 6 CFN Scraper",
5    "description": "SF6 scraper for the Capcom Fighters Network (Buckler's Boot Camp)",
6    "version": "1.0",
7    "meta": {
8        "templateId": "python-beautifulsoup"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile",
12    "storages": {
13        "dataset": {
14            "actorSpecification": 1,
15            "title": "URLs and their titles",
16            "views": {
17                "titles": {
18                    "title": "URLs and their titles",
19                    "transformation": {
20                        "fields": [
21                            "url",
22                            "title"
23                        ]
24                    },
25                    "display": {
26                        "component": "table",
27                        "properties": {
28                            "url": {
29                                "label": "URL",
30                                "format": "text"
31                            },
32                            "title": {
33                                "label": "Title",
34                                "format": "text"
35                            }
36                        }
37                    }
38                }
39            }
40        }
41    }
42}

.actor/input_schema.json

1{
2    "title": "Street Fighter 6 CFN Scraper",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "email": {
7            "title": "CFN Email",
8            "type": "string",
9            "description": "Your CFN email address",
10            "editor": "textfield"
11        },
12        "password": {
13            "title": "CFN Password",
14            "type": "string",
15            "description": "Your CFN password",
16            "editor": "textfield"
17        },
18        "rank_to_search": {
19            "title": "Only Search for Specific Rank (Index)",
20            "type": "integer",
21            "description": "Finds the starting point of a specific rank (0 is Master, 35 is Rookie 1). If blank, the scraper will start from Master and attempt to find all of the ranks. However, the site will usually block you after a few tries, so it might be better to search for a specific rank instead.",
22            "editor": "number"
23        },
24        "initial_page_jump": {
25            "title": "Initial Page Jump",
26            "type": "integer",
27            "description": "How many pages should we skip per attempt? Leave blank unless your page to start searching is very close to the target page.",
28            "editor": "number"
29        },
30        "start_page": {
31            "title": "Start Searching at This Page",
32            "type": "integer",
33            "description": "Start searching for target rank on this page (will speed up execution if accurate). If blank, we'll use our predetermined page numbers (accurate as of March 2024).",
34            "editor": "number"
35        }    
36    },
37    "required": ["email", "password"]
38}

src/__main__.py

1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
# Configure loggers
# One stream handler, using the Apify SDK's log formatter, is shared by both
# loggers configured below.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

# The 'apify_client' logger is limited to INFO and above.
apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

# The 'apify' logger is set to DEBUG, so it emits everything.
apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

# Execute the Actor main coroutine
# This runs at import time of this module, i.e. whenever `python3 -m src` is invoked.
asyncio.run(main())

src/main.py

1from urllib.parse import urljoin
2from apify import Actor
3from apify.log import ActorLogFormatter
4import logging
5from playwright.async_api import async_playwright, Page, expect
6from bs4 import BeautifulSoup
7import math
8
# --- Playwright handles, assigned by SetUp() -------------------------------
browser = None
context = None
page = None
# Prettified HTML of the most recently inspected page; written by GetPageHtml().
readable_content = None

# --- Search state, shared between the search functions ---------------------
# Direction flag maintained by SearchForBeginningOfRank(); None until the first
# page has been classified.
was_above_target = None
# NOTE(review): FindPlacingOfFirstUserInRank() keeps its own local `iterations`,
# so this module-level value does not appear to be updated anywhere.
iterations = 0
# Signed number of pages to move on the next navigation (negative = go back).
page_jump = 1000
# NOTE(review): not referenced by any function visible in this file.
user_dict = {}
# Page number read from the pagination widget by SearchForBeginningOfRank().
current_page_int = 1
# LP threshold of the rank currently being searched for; set by
# FindPlacingOfFirstUserInRank().
current_target_lp = 0

# --- Values taken from the Actor input by GetInfoFromActorInput() ----------
email = ""
password = ""
start_pages = []
search_only_this_rank = 0
# Read by FindPlacingOfFirstUserInRank(); a falsy value means "use 1000".
initial_page_jump = 0

# Ranking URL without the page number; callers append the page number to it.
base_start_url = "https://www.streetfighter.com/6/buckler/ranking/league?character_filter=2&character_id=luke&platform=1&user_status=1&home_filter=1&home_category_id=0&home_id=1&league_rank=0&page="

# LP value used as the search target for each rank. Indexed in parallel with
# `ranks` and `estimated_start_pages` (index 0 = Master, index 35 = Rookie 1).
target_lp_per_rank = [
    25000,                                      # Master
    23800, 22600, 21400, 20200, 19000,          # Diamond
    17800, 16600, 15400, 14200, 13000,          # Platinum
    12200, 11400, 10600, 9800, 9000,            # Gold
    8200, 7400, 6600, 5800, 5000,               # Silver
    4600, 4200, 3800, 3400, 3000,               # Bronze
    2600, 2200, 1800, 1400, 1000,               # Iron
    800, 600, 400, 200, 0                       # Rookie
]

# Display name of each rank, in the same order as the tables above and below.
ranks = [
    "Master",
    "Diamond 5", "Diamond 4", "Diamond 3", "Diamond 2", "Diamond 1",
    "Platinum 5", "Platinum 4", "Platinum 3", "Platinum 2", "Platinum 1",
    "Gold 5", "Gold 4", "Gold 3", "Gold 2", "Gold 1",
    "Silver 5", "Silver 4", "Silver 3", "Silver 2", "Silver 1",
    "Bronze 5", "Bronze 4", "Bronze 3", "Bronze 2", "Bronze 1",
    "Iron 5", "Iron 4", "Iron 3", "Iron 2", "Iron 1",
    "Rookie 5", "Rookie 4", "Rookie 3", "Rookie 2", "Rookie 1",
]

# Ranking page on which the search for each rank starts when the user does not
# supply a start page. These are hard-coded estimates (the input schema
# describes them as accurate as of March 2024).
# NOTE(review): the last two entries are identical (61552) - confirm that this
# is intended rather than a copy/paste slip.
estimated_start_pages = [
    6102,                                   # Master
    6462, 7248, 8300, 9637, 12306,          # Diamond
    13994, 15815, 18101, 20841, 26407,      # Platinum
    28248, 29682, 31164, 32676, 35215,      # Gold
    36138, 37770, 39772, 41749, 44936,      # Silver
    45672, 47114, 48594, 49774, 51799,      # Bronze
    52580, 54017, 55291, 56674, 59273,      # Iron
    59795, 60333, 60962, 61552, 61552       # Rookie
]

# Results accumulated by DoSearch().
placement_of_users_per_rank = []
players_in_each_rank = []
64
async def main():
    """Actor entry point: read the input, then run the search in a Playwright session."""
    async with Actor:
        # An Actor started without any input yields None; fall back to an empty
        # dict so the settings loader can use .get() uniformly.
        raw_input = await Actor.get_input()
        GetInfoFromActorInput(raw_input if raw_input else {})

        # Playwright is only kept alive for the duration of the search.
        async with async_playwright() as pw:
            await DoSearch(pw)
74
def GetInfoFromActorInput(actor_input):
    """Copy the Actor's input values into this module's global settings.

    Args:
        actor_input: Mapping with the Actor input. The keys read are ``email``,
            ``password``, ``rank_to_search``, ``initial_page_jump`` and
            ``start_page``; a missing key is treated as ``None``.

    Raises:
        ValueError: If ``rank_to_search`` is given but is not a valid index into
            the list of start pages.
    """
    global email
    global password
    global start_pages
    global search_only_this_rank
    global initial_page_jump

    email = actor_input.get('email')
    password = actor_input.get('password')

    # Work on a copy so that a user override never alters the built-in estimates.
    start_pages = list(estimated_start_pages)
    start_page_override = actor_input.get('start_page')

    search_only_this_rank = actor_input.get('rank_to_search')
    # This must be declared global above; otherwise the assignment only creates
    # a local and the user's page jump is silently ignored.
    initial_page_jump = actor_input.get('initial_page_jump')

    if search_only_this_rank is not None:
        # Reject out-of-range indices up front. A negative index would otherwise
        # wrap around and quietly search for a different rank.
        if not 0 <= search_only_this_rank < len(start_pages):
            raise ValueError(
                "rank_to_search must be between 0 and " + str(len(start_pages) - 1)
                + ", got " + str(search_only_this_rank)
            )

        if start_page_override is not None:
            start_pages[search_only_this_rank] = start_page_override
92    
async def DoSearch(playwright):
    """Log in, then find where each requested rank begins on the leaderboard.

    Either a single rank (when ``search_only_this_rank`` is set) or every rank
    in ``ranks`` is searched. Each placement found is appended to
    ``placement_of_users_per_rank``. When more than one rank was found, and none
    was missed, a per-rank player count and percentage is printed and the counts
    are appended to ``players_in_each_rank``.

    Args:
        playwright: The Playwright instance used to launch the browser.
    """
    global placement_of_users_per_rank
    global players_in_each_rank

    # create the browser and log in
    await SetUp(playwright)

    # get the total number of players (the first character of the label is dropped)
    total_players_str = await page.locator("span[class='ranking_ranking_now__last__oqSXS']").last.text_content()
    total_players_int = int(total_players_str[1:].strip())

    # we might want to only search for one rank at a time, because the site will kick us out after we load a few pages
    if search_only_this_rank is not None:
        start_index = search_only_this_rank
        end_index = search_only_this_rank + 1
    # otherwise, we'll attempt to search all of them
    else:
        start_index = 0
        end_index = len(ranks)

    print("Start rank index: " + str(start_index) + "\nEnd rank index: " + str(end_index - 1))

    # (rank name, placement) for every rank located during this call
    found = []
    every_rank_found = True

    # find the starting point of each rank in our list
    for i in range(start_index, end_index):
        target_lp = target_lp_per_rank[i]
        start_page = start_pages[i]
        rank_name = ranks[i]
        print("Searching for " + rank_name + "...")

        # find where the rank begins
        placement_of_first_user = await FindPlacingOfFirstUserInRank(playwright, start_page, target_lp)

        # the search returns None when it cannot produce a placement; don't record that
        if placement_of_first_user is None:
            print("\nCould not find where " + rank_name + " begins\n")
            every_rank_found = False
            continue

        print("\n" + rank_name + " begins at #" + str(placement_of_first_user) + "\nThere are " + str(total_players_int) + " players in total\n")

        # and add it to a list
        placement_of_users_per_rank.append(placement_of_first_user)
        found.append((rank_name, placement_of_first_user))

    # this part is only relevant if we somehow managed to get a list of all ranks in a single run of this Actor, which is unlikely
    # (a gap in the list would attribute the missing rank's players to its neighbour, so skip the summary then)
    if every_rank_found and len(found) > 1:
        prev_rank_starts_at = 0
        for rank_name, current_rank_starts_at in found:
            players_in_rank = current_rank_starts_at - prev_rank_starts_at
            prev_rank_starts_at = current_rank_starts_at

            percentage = GetPercentageString(players_in_rank, total_players_int)
            print(rank_name + " contains " + str(players_in_rank) + " players\nIt represents " + percentage + " of the playerbase")

            players_in_each_rank.append(players_in_rank)

        # print the complete lists once, after the loop has filled them
        print("Placements of all users: " + str(placement_of_users_per_rank))
        print("Players in each rank: " + str(players_in_each_rank))
146
def GetPercentageString(players_in_rank, total_players):
    """Return ``players_in_rank`` as a percentage of ``total_players``, e.g. ``"25.0%"``."""
    share = players_in_rank / total_players
    return f"{share * 100}%"
150
async def SetUp(playwright):
    """Launch Firefox, pass the age check, log in to CFN and open ranking page 1.

    Call this first, and only call it once: it assigns the module-level
    ``browser``, ``context`` and ``page`` that every other function relies on.
    """
    global browser, context, page

    # create the Playwright browser
    browser = await playwright.firefox.launch(headless=Actor.config.headless)
    context = await browser.new_context()
    page = await context.new_page()

    # open the login page; GetPageHtml() refreshes readable_content after each
    # step, so it can be printed here to see whether Cloudflare is blocking us
    await page.goto("https://www.streetfighter.com/6/buckler/auth/loginep?redirect_url=/?status=login")
    await GetPageHtml()

    # the age check dropdowns come before the login form
    await InputAgeCheck()
    await GetPageHtml()

    await LogIn()
    await GetPageHtml()

    # it'll be easier if we just start on the ranking page
    await page.goto(base_start_url + str(1), timeout=60000)
186
async def FindPlacingOfFirstUserInRank(playwright, start_page, target_lp):
    """Search the leaderboard for the point where a rank begins.

    Starting at ``start_page``, this repeatedly calls SearchForBeginningOfRank(),
    which moves between ranking pages until it returns the list entry of the
    first user whose LP is below ``target_lp``.

    Args:
        playwright: Unused here; kept so existing callers keep working.
        start_page: Ranking page number to start from.
        target_lp: LP threshold of the rank being searched for.

    Returns:
        The placement of the located user plus one, or ``None`` when
        ``start_page`` is missing or the search gave up before finding a user.
    """
    global current_target_lp
    global page_jump
    global was_above_target

    current_target_lp = target_lp

    if start_page is None:
        Actor.log.error('Start page is null!')
        return None

    # go to the ranking page
    start_url = base_start_url + str(start_page)
    await page.goto(start_url, timeout=60000)

    print("Start URL: " + page.url)

    # figure out what page we're on
    pagination = page.locator("div[class='ranking_pc__LlGv4']").locator("div[class='ranking_ranking_pager__top__etBHR']").locator("ul[class='pagination']").first
    await expect(pagination).to_be_visible(timeout=30000)

    page_jump = initial_page_jump or 1000
    # Forget the direction of the previous rank's search; a stale value would
    # make the first page of this search look like an overshoot and halve the jump.
    was_above_target = None

    # just in case something went wrong, we should prevent the loop from running infinitely
    max_attempts = 31
    highest_user_in_last_rank = None

    for _ in range(max_attempts):
        # each time this is called, we'll navigate to a new page
        highest_user_in_last_rank = await SearchForBeginningOfRank(pagination)

        # if we reached our goal, then stop
        if highest_user_in_last_rank is not None:
            break

    # the attempt limit was hit without a result, so there is nothing to parse
    if highest_user_in_last_rank is None:
        Actor.log.error('Gave up after ' + str(max_attempts) + ' page loads without finding the start of the rank')
        return None

    # the placement label has one leading character in front of the number
    placement_str = await highest_user_in_last_rank.locator("dt").text_content()
    placement_str = placement_str.strip()[1:]
    placement_int = int(placement_str)

    # the LP label ends with three characters that are not part of the number
    lp_str = await highest_user_in_last_rank.locator("dd").text_content()
    lp_str = lp_str[:-3]
    int(lp_str)  # fail loudly here if the LP label is not numeric

    username = await highest_user_in_last_rank.locator("span[class='ranking_name__El29_']").text_content()

    # print the results
    print("\nHighest ranked user in previous rank: " + str(username))
    print("LP: " + lp_str + "\nPosition: " + placement_str + "\nPage: " + str(current_page_int) + "\nURL: " + page.url)

    # placement_int refers to the highest-placed user in the prev rank
    # so we'll add 1 to give us the first user in the current rank
    return placement_int + 1
245
async def SearchForBeginningOfRank(pagination):
    """Inspect the current ranking page, then either return the target user or move on.

    The LP of the last user on the page decides what happens:

    * above the target range: jump forward by ``page_jump`` pages;
    * more than 20 LP below it: jump backward by ``page_jump`` pages;
    * otherwise: scan the page. If nobody on it has exactly ``current_target_lp``,
      step back a single page. If somebody does, return the first list entry
      whose LP is below ``current_target_lp``.

    Whenever the direction flips, the jump size is halved, but never below one page.

    Args:
        pagination: Locator of the pagination list used to read the current page number.

    Returns:
        The list entry of the located user, or ``None`` after navigating to
        another page (call this function again in that case).
    """
    global was_above_target
    global page_jump
    global current_page_int

    # get info about the current page
    current_page = pagination.locator("xpath=/li[@class='active']").first
    await expect(current_page).to_be_visible()

    current_page_str = str(await current_page.text_content())
    current_page_int = int(current_page_str)
    print("\nCurrent page: " + current_page_str)

    # first, we need to find the ranking list
    ranking_list = page.locator("xpath=//ul[@class='ranking_ranking_list__szajj']").first
    await expect(ranking_list).to_be_visible()

    # find the last user's lp
    all_users_on_page = await ranking_list.locator("xpath=/li").all()
    last_user = all_users_on_page[-1]
    await expect(last_user).to_be_visible()

    last_user_lp_str = await last_user.locator("dd").text_content()
    last_user_lp_int = int(last_user_lp_str[:-3])
    print("LP of the last user on the page: " + str(last_user_lp_int) + "\nWe're looking for " + str(current_target_lp))

    # we're trying to roughly find the last page of the lower rank
    highest_lp_in_last_rank = current_target_lp - 1

    # if we're above the target, then count downwards
    if last_user_lp_int > highest_lp_in_last_rank:
        # every time we overshoot, we'll halve the size of the jump
        # (halve the magnitude, and keep at least one page so we never stall)
        if was_above_target is True:
            page_jump = max(1, abs(page_jump) // 2)

        # if we're getting close, we'll need a much lower page jump
        if last_user_lp_int == current_target_lp and abs(page_jump) > 50:
            page_jump = 50

        page_jump = abs(page_jump)
        print("Page jump: " + str(page_jump))

        was_above_target = False

    # if we're below the target (most likely if we overshot), then count upwards
    elif last_user_lp_int < highest_lp_in_last_rank - 20:
        # every time we overshoot, we'll halve the size of the jump
        # (a jump of 1 used to become 0 here, reloading the same page forever)
        if was_above_target is False:
            page_jump = max(1, abs(page_jump) // 2)

        page_jump = -abs(page_jump)
        print("Page jump: " + str(page_jump))

        was_above_target = True

    # once you've found the new rank, we have no choice but to iterate slowly and find where the old rank ends
    else:
        print("We're very close to our target! It's time to start incrementing one page at a time")

        # get all lp on page
        lp_list = await GetAllLpOnPage(ranking_list)

        # basically, we want to find the first person with the LP of the target rank
        if current_target_lp not in lp_list:
            print("We overshot a little, so we'll have to move backward one page at a time to find the first user with an LP of " + str(current_target_lp))
            page_jump = -1

        # somebody sits exactly at current_target_lp, so the boundary is on this page
        else:
            # index of the first user below the target; -1 (the last entry) if there is none
            target_index = next(
                (index for index, lp in enumerate(lp_list) if lp < current_target_lp),
                -1,
            )

            # we've found our target, so let's return the user and let the main method take control
            return all_users_on_page[target_index]

    # figure out the URL to move to
    target_page = current_page_int + page_jump
    print("Target page: " + str(target_page))

    target_url = GetURLForPage(target_page)

    await page.goto(target_url, timeout=180000)
    await page.wait_for_url(target_url)
    await page.content()
340
def GetURLForPage(new_page_int):
    """Return the current page's URL with its ``page`` query value set to ``new_page_int``.

    Args:
        new_page_int: Page number the returned URL should point to.

    Returns:
        The URL of the module-level ``page`` with the number after ``page=``
        replaced. A URL without a ``page`` parameter is returned unchanged.
    """
    import re

    # Rewrite whatever number currently follows "page=" instead of searching for
    # the page number we believe we are on. Requiring "?" or "&" in front keeps
    # other parameters whose names merely end in "page" untouched.
    return re.sub(
        r"([?&]page=)\d+",
        lambda match: match.group(1) + str(new_page_int),
        page.url,
    )
347
async def GetAllLpOnPage(ranking_list):
    """Return the LP of every user listed on the page, in page order.

    Args:
        ranking_list: Locator of the ranking list whose direct ``li`` children
            are the user entries.

    Returns:
        A list of ints, one per user entry.
    """
    lp_values = []

    for user_li in await ranking_list.locator("xpath=/li").all():
        # the label's last three characters are not part of the number
        lp_label = str(await user_li.locator("dd").text_content())
        lp_values.append(int(lp_label[:-3]))

    print("All LP on page: " + str(lp_values))
    return lp_values
367
async def GetPageHtml():
    """Store a prettified copy of the current page's HTML in ``readable_content``.

    Nothing depends on the result; it exists so the markup can be printed while debugging.
    """
    global readable_content

    markup = await page.content()
    readable_content = BeautifulSoup(markup, features="html.parser").prettify()
375    
async def InputAgeCheck():
    """Fill out the age check dropdowns and submit the form."""
    # The country comes first: if that dropdown cannot be found we fail before
    # touching the date-of-birth fields.
    selections = (
        ("country", "Canada"),
        ("birthYear", "1992"),
        ("birthMonth", "1"),
        ("birthDay", "15"),
    )
    for element_id, choice in selections:
        await page.locator(f"select[id='{element_id}']").select_option(choice)

    # press submit
    submit_button = page.locator("button[name='submit']")
    await submit_button.click()

    # give the next page three seconds to load
    await page.wait_for_timeout(3000)

    print("Passed age check!\n")
394
async def LogIn():
    """Enter the configured email and password into the login form and submit it."""
    # fill out email and pw
    await page.locator("input[type='email']").fill(email)
    await page.locator("input[type='password']").fill(password)

    # press submit
    submit_button = page.locator("button[name='submit']")
    await submit_button.click()

    # give the post-login page ten seconds to load
    await page.wait_for_timeout(10000)

    print("Logged in!\n")

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10.venv
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.DS_Store
5
6apify_storage
7storage
8
9.venv/
10.env/
11__pypackages__
12dist/
13build/
14*.egg-info/
15*.egg
16
17__pycache__
18
19.mypy_cache
20.dmypy.json
21dmypy.json
22.pytest_cache
23.ruff_cache
24
25.scrapy
26*.log

requirements.txt

1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify ~= 1.6.0
5beautifulsoup4 ~= 4.12.2
6httpx ~= 0.25.2
7types-beautifulsoup4 ~= 4.12.0.7
Developer
Maintained by Community

Actor Metrics

  • 5 monthly users

  • 2 stars

  • 0% runs succeeded

  • Created in Mar 2024

  • Modified 3 months ago

Categories