I have a use case where I want to replace nulls in StructType columns with empty values. Below is a sample you can use to recreate the scenario:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Sample rows; note that for ID 4 the nested "Name" struct is entirely null.
data = [(1, [1, "Aman"]), (2, [2, "Raman"]), (3, [3, "Baman"]), (4, None)]

# Build the nested struct type first, then compose the full schema.
name_type = StructType([
    StructField("NameID", IntegerType(), True),
    StructField("FirstName", StringType(), True),
])
schema = StructType([
    StructField("ID", IntegerType(), True),
    StructField("Name", name_type, True),
])

# `spark` is the active SparkSession (assumed from the shell/session context).
df = spark.createDataFrame(data, schema)
df.show()
I want to know how to replace the nulls in the Name column.
CodePudding user response:
Using when
expression:
from pyspark.sql import functions as F

# Fallback struct used when the whole "Name" column is null.
# NameID stays null here; swap F.lit(None) for a default ID value if needed.
empty_name = F.struct(
    F.lit(None).alias("NameID"),
    F.lit("").alias("FirstName"),
)

# Replace a fully-null struct with the empty struct, keep it otherwise.
df1 = df.withColumn(
    "Name",
    F.when(F.col("Name").isNull(), empty_name).otherwise(F.col("Name")),
)
df1.show()
# --- ----------
#| ID| Name|
# --- ----------
#| 1| {1, Aman}|
#| 2|{2, Raman}|
#| 3|{3, Baman}|
#| 4| {null, }|
# --- ----------
CodePudding user response:
Here is a solution using coalesce:
from pyspark.sql import functions as F

# coalesce returns the first non-null argument, so null "Name" rows
# fall back to an empty struct with the same field names.
default_name = F.struct(
    F.lit(None).alias("NameID"),
    F.lit("").alias("FirstName"),
)
df1 = df.withColumn("Name", F.coalesce(F.col("Name"), default_name))