import requests
from bs4 import BeautifulSoup
url = 'https://www.basketball-reference.com/players/a/'
urlb = 'https://www.basketball-reference.com/players/b/'
urlc = 'https://www.basketball-reference.com/players/c/'
result = requests.get(url)
doc = BeautifulSoup(result.text, 'lxml')
college = doc.find_all(string="Kentucky")
result = requests.get(urlb)
doc = BeautifulSoup(result.text, 'lxml')
collegeb = doc.find_all(string='Kentucky')
result = requests.get(urlc)
doc = BeautifulSoup(result.text, 'lxml')
collegec = doc.find_all(string='Kentucky')
print(college)
print(collegeb)
print(collegec)
I need to do this for every letter of the alphabet for like at least 30 schools and I would really like to know how to do this more efficiently
CodePudding user response:
Deduplicate nearly identical code with a loop over inputs and a list
or dict
of results:
import requests
from bs4 import BeautifulSoup
url_template = 'https://www.basketball-reference.com/players/{}/'
folders = ['a', 'b', 'c'] # The only varying thing in your original tripled code
colleges = [] # Store the results for each varied thing here in same order
for folder in folders: # Loop over varying component
result = requests.get(url_template.format(folder)) # Substitute it in template
doc = BeautifulSoup(result.text, 'lxml')
colleges.append(doc.find_all(string="Kentucky")) # Append result in same order
# Loop over results to print them
for college in colleges:
print(college)
If you're having it work for many schools, for every letter of the alphabet, you'd likely use a dict
(better, a defaultdict
) for the results instead of a list
(so you can group the results by school) with an inner loop parsing out data by school:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from string import ascii_lowercase
url_template = 'https://www.basketball-reference.com/players/{}/'
folders = ascii_lowercase # Will run for every lowercase alphabet letter
schoolnames = ("Kentucky", "Gonzaga", ...)
colleges = defaultdict(list) # Store a list of results for each school
for folder in folders: # Loop over varying component
result = requests.get(url_template.format(folder)) # Substitute it in template
doc = BeautifulSoup(result.text, 'lxml')
for schoolname in schoolnames:
colleges[schoolname].append(doc.find_all(schoolname=school))
# Loop over results to print them
for collegename, results in colleges.items():
print(collegename)
for result in results:
print(result)
CodePudding user response:
you can write a function with the instructions you need to "re-do" for every school. Then you deploy a main()
func with every parameter or characteristic/trait for every school. Your code seems a big chunk of lines, you should separate them in different instructions and rely your work more on "tidy-coding"
CodePudding user response:
Consider using a for loop:
"""Scrapes No. of Basketball Players who reached NBA/ABA from major US Colleges."""
from collections import defaultdict
from string import ascii_lowercase
import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable
BASE_PLAYERS_URL = 'https://www.basketball-reference.com/players'
COLLEGES = frozenset({
'Gonzaga',
'Houston',
'Kansas',
'Arizona',
'Baylor',
'Villanova',
'Tennessee',
'Texas Tech',
'Kentucky',
'Duke',
})
def main() -> None:
college_player_counts = defaultdict(int)
for letter in ascii_lowercase:
url = f'{BASE_PLAYERS_URL}/{letter}/'
result = requests.get(url)
doc = BeautifulSoup(result.text, 'lxml')
for college in COLLEGES:
college_player_counts[college] = len(doc.find_all(string=college))
results_table = PrettyTable(['College', 'Player Count'])
for college, player_count in sorted(college_player_counts.items(),
key=lambda item: item[1], reverse=True):
results_table.add_row([college, player_count])
print(results_table)
if __name__ == '__main__':
main()
Output:
------------ --------------
| College | Player Count |
------------ --------------
| Kentucky | 125 |
| Duke | 91 |
| Kansas | 81 |
| Houston | 64 |
| Arizona | 62 |
| Villanova | 52 |
| Tennessee | 44 |
| Gonzaga | 25 |
| Baylor | 23 |
| Texas Tech | 17 |
------------ --------------
CodePudding user response:
You can just use a for
loop.
import requests
from bs4 import BeautifulSoup
colleges = []
for char in "abcdefghijklmnopqrstuvwxyz":
url = f"https://www.basketball-reference.com/players/{char}/"
result = requests.get(url)
doc = BeautifulSoup(result.text, 'lxml')
college = doc.find_all(string="Kentucky")
colleges.append(college)
print(*colleges, sep = "\n")