data = [[12, 112, 14],
[120, 112, 114],
[88, 92, 74],
[17, 118, 133],
[19, 19, 14],
[11, 12, 14]]
columns = ['Subject_1', 'Subject_2', 'Subject_3']
dataframe = spark.createDataFrame(data, columns)
dataframe.show()
# --------- --------- ---------
# |Subject_1|Subject_2|Subject_3|
# --------- --------- ---------
# | 12| 112| 14|
# | 120| 112| 114|
# | 88| 92| 74|
# | 17| 118| 133|
# | 19| 19| 14|
# | 11| 12| 14|
# --------- --------- ---------
Consider that a school only wants to publish the subjects where it performs best.
How to have a DF2
such that DF2
would only have a few columns from dataframe
such that subjects where:
average marks is greater than a threshold
average marks is 75th percentile and above given there are too many subjects
sort the columns ascending order of average score
PySpark and Pandas DataFrame are considered.
CodePudding user response:
Using this dataframe...
from pyspark.sql import functions as F
data = [[12, 112, 14, 15, 100],
[120, 112, 114, 17, 87],
[88, 92, 74, 76, 11],
[17, 118, 133, 99, 51],
[19, 19, 14, 54, 48],
[11, 12, 14, 75, 60]]
columns = ['Subject_1', 'Subject_2', 'Subject_3', 'Subject_4', 'Subject_5']
df = spark.createDataFrame(data, columns)
this is how averages look like:
df_avg = df.agg(*[F.avg(c).alias(c) for c in df.columns])
df_avg.show()
# --------- --------- --------- --------- ---------
# |Subject_1|Subject_2|Subject_3|Subject_4|Subject_5|
# --------- --------- --------- --------- ---------
# | 44.5| 77.5| 60.5| 56.0| 59.5|
# --------- --------- --------- --------- ---------
sort the columns ascending order of average score
d = df.agg(*[F.avg(c).alias(c) for c in df.columns]).head().asDict()
df = df.select(*sorted(d, key=d.get, reverse=False))
df.show()
# --------- --------- --------- --------- ---------
# |Subject_1|Subject_4|Subject_5|Subject_3|Subject_2|
# --------- --------- --------- --------- ---------
# | 12| 15| 100| 14| 112|
# | 120| 17| 87| 114| 112|
# | 88| 76| 11| 74| 92|
# | 17| 99| 51| 133| 118|
# | 19| 54| 48| 14| 19|
# | 11| 75| 60| 14| 12|
# --------- --------- --------- --------- ---------
average marks is greater than a threshold
threshold = 58
d = df.agg(*[F.avg(c).alias(c) for c in df.columns]).head().asDict()
df = df.select([k for k, v in d.items() if v >= threshold])
df.show()
# --------- --------- ---------
# |Subject_2|Subject_3|Subject_5|
# --------- --------- ---------
# | 112| 14| 100|
# | 112| 114| 87|
# | 92| 74| 11|
# | 118| 133| 51|
# | 19| 14| 48|
# | 12| 14| 60|
# --------- --------- ---------
average marks is 75th percentile and above
import numpy as np
d = df.agg(*[F.avg(c).alias(c) for c in df.columns]).head().asDict()
perc_75 = np.percentile(list(d.values()), 75)
df = df.select([k for k, v in d.items() if v >= perc_75])
df.show()
# --------- ---------
# |Subject_2|Subject_3|
# --------- ---------
# | 112| 14|
# | 112| 114|
# | 92| 74|
# | 118| 133|
# | 19| 14|
# | 12| 14|
# --------- ---------
Pandas:
sort the columns ascending order of average score
df[df.mean().sort_values(ascending=True).index]
average marks is greater than a threshold
threshold = 58
df[df.mean()[lambda x: x >= threshold].index]
average marks is 75th percentile and above
perc_75 = df.mean().quantile(.75)
df[df.mean()[lambda x: x >= perc_75].index]