I want to add a new column "NUll_Values" to a PySpark dataframe as below
=======================================================|
ID | Maths | Science | English | NUll_Values |
=======================================================|
11 | 80 | NULL | 89 | Science |
12 | NULL | NULL | 89 | Maths,Science |
13 | 90 | 95 | 85 | NULL |
14 | NULL | NULL | NULL | Maths,Science,English |
=======================================================|
CodePudding user response:
from pyspark.sql import functions as F  # fix: `F` was used below without being imported

# For each original column, emit the column NAME when that column's value is
# NULL, and NULL otherwise.  concat_ws silently skips NULL inputs, so the
# result is a comma-separated list of exactly the columns that are NULL in
# each row.  NOTE: rows with no NULLs get an empty string "" (not NULL) —
# see the variant below that maps "" back to NULL with .replace().
df = df.withColumn(
    "NUll_Values",
    F.concat_ws(",", *[F.when(F.col(c).isNull(), c) for c in df.columns]),
)
df.show(truncate=False)
# --- ----- ------- ------- ---------------------
# |ID |Maths|Science|English|NUll_Values |
# --- ----- ------- ------- ---------------------
# |11 |80 |null |89 |Science |
# |12 |null |null |89 |Maths,Science |
# |13 |90 |95 |85 | |
# |14 |null |null |null |Maths,Science,English|
# --- ----- ------- ------- ---------------------
or
# Build one marker expression per column: the column's name when its value
# is NULL, otherwise NULL (which concat_ws ignores).
null_markers = [F.when(F.col(name).isNull(), name) for name in df.columns]

df = (
    df.withColumn("NUll_Values", F.concat_ws(",", *null_markers))
    # concat_ws produces "" for rows with no NULL columns; convert that
    # empty string back into a genuine NULL in the new column.
    .replace("", None, subset=["NUll_Values"])
)
df.show(truncate=False)
# --- ----- ------- ------- ---------------------
# |ID |Maths|Science|English|NUll_Values |
# --- ----- ------- ------- ---------------------
# |11 |80 |null |89 |Science |
# |12 |null |null |89 |Maths,Science |
# |13 |90 |95 |85 |null |
# |14 |null |null |null |Maths,Science,English|
# --- ----- ------- ------- ---------------------