I have little issue. I wrote a webscrapper which is downloading table tennis data. There is info about players, match score etc. I would like to display players which lost the most matches per day. I've created data frame like this, but I'm struggling with this. I would like to sum p1_status and p2_status, then I would like to display Surname and number of loses next to player.
https://gyazo.com/19c70e071db78071e83045bfcea0e772
Here is my code:
s = Service("D:/setka/chromedriver.exe")
option = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=s)
hall = 10
num =1
filename = "C:/Users/filip/result2.csv"
f=open(filename,"w")
headers = "p1_surname, p1_name, p1_score, p2_surname, p2_name, p2_score, p1_status, p2_status \n"
f.write(headers)
while hall <= 10:
for period in [1]:
url = 'https://tabletennis.setkacup.com/en/schedule?date=2021-12-04&hall=' \
str(hall) '&' 'period=' str(period)
driver.get(url)
time.sleep(5)
divs = driver.find_elements(By.CSS_SELECTOR, "div.score-result")
for div in divs:
data = div.text.split()
#print(data)
if(num % 2) == 0:
f.write(str(data[0]) "," str(data[1]) "," str(data[2] "," "\n"))
else:
f.write(str(data[0]) "," str(data[1]) "," str(data[2] ","))
num = num 1
hall =hall 1
f.close()
df_results=pd.read_csv('C:/Users/filip/result2.csv', sep = r',',
skipinitialspace = True)
df_results.reset_index(drop=True, inplace=True)
df_results.loc[df_results['p1_score'] > df_results['p2_score'], ['p1_status','p2_status']] = ['won','lost']
df_results.loc[df_results['p1_score'] < df_results['p2_score'], ['p1_status','p2_status']] = ['lost','won']
df_results.loc[df_results['p1_score'] == df_results['p2_score'], ['p1_status','p2_status']] = ['not played','not played']
df_results.loc[((df_results['p1_score'] < 3) & (df_results['p1_score']!=0) & (df_results['p2_score'] <3) & (df_results['p2_score']!=0)), ['p1_status','p2_status']] = ['inplay','inplays']
df_results.loc[df_results['p1_status'] != df_results['p2_status'], ['match_status']] = ['finished']
df_results.loc[df_results['p1_status'] == df_results['p2_status'], ['match_status']] = ['not played']
df_results.loc[((df_results['p1_status'] =='inplay') & (df_results['p2_status']=='inplays')), ['match_status']] = ['inplay']
df_results = df_results.dropna(axis=1)
df_results.head(30)
CodePudding user response:
Split your dataframe in 2 parts (p1_, p2_) to count defeats of each player then merge them:
Setup a MRE:
df = pd.DataFrame({'p1_surname': list('AABB'), 'p2_surname': list('CDCD'),
'p1_status': list('LWWW'), 'p2_status': list('WLLL')})
print(df)
# Output:
p1_surname p2_surname p1_status p2_status
0 A C L W
1 A D W L
2 B C W L
3 B D W L
>>> pd.concat([
df.filter(like='p1_').set_index('p1_surname')['p1_status'].eq('L').rename('loses'),
df.filter(like='p2_').set_index('p2_surname')['p2_status'].eq('L').rename('loses')]) \
.groupby(level=0).sum().rename_axis('surname').reset_index()
surname loses
0 A 1
1 B 0
2 C 1
3 D 2