I'm using PySpark to apply a function that takes the cell value, splits it on ' ', and keeps the first and last elements of the split. However, the column contains null values, and I haven't managed to handle the null before the split.
Here is my code:
def get_name(full_name):
    """Collapse a space-separated full name to "<first> <last>".

    Null-safe: when *full_name* is None (a null cell in the column),
    it is returned unchanged so the UDF yields null for null input.

    The original version iterated the *characters* of the string
    (``for i in full_name``) and tested each character for None, which
    both never detects a null cell and raises TypeError when
    ``full_name`` itself is None. Check the value directly instead.
    """
    if full_name is None:
        return full_name
    name_list = full_name.split(' ')
    # first and last item of the split
    return f"{name_list[0]} {name_list[-1]}"
# Register the Python function as a Spark UDF returning a string.
# Pass get_name directly: wrapping it in `lambda x: get_name(x)` adds
# nothing but an extra call layer.
udf_get_name = udf(get_name, StringType())
df_parquet = df_parquet.withColumn("NameReduz", udf_get_name(col("FullName")))
It fails with an error about NoneType whenever the column value is null.
This is what I'm expecting:
| FullName | NameReduz |
|---|---|
| NAME SURNAME LAST | NAME LAST |
| NAME SURNAME1 SURNAME2 LAST | NAME LAST |
| null | null |
CodePudding user response:
I would suggest not using a `udf`:
from pyspark.sql import functions as F

# Sample data reproducing the question, including a null name.
df_parquet = spark.createDataFrame(
    [('NAME SURNAME LAST',),
     ('NAME SURNAME1 SURNAME2 LAST',),
     (None,)],
    ['FullName'])

# Tokenize once, then stitch the first and last tokens back together.
tokens = F.split("FullName", " ")
first_last = F.concat_ws(" ", tokens.getItem(0), F.element_at(tokens, -1))

# when() with no otherwise() leaves null rows as null.
df_parquet = df_parquet.withColumn(
    "NameReduz",
    F.when(F.col("FullName").isNotNull(), first_last))

df_parquet.show(truncate=0)
# +---------------------------+---------+
# |FullName                   |NameReduz|
# +---------------------------+---------+
# |NAME SURNAME LAST          |NAME LAST|
# |NAME SURNAME1 SURNAME2 LAST|NAME LAST|
# |null                       |null     |
# +---------------------------+---------+
But if you prefer a `udf`, the following one should work:
def get_name(full_name):
    """Null-safe helper for the UDF: "<first> ... <last>" -> "<first> <last>"."""
    # Guard clause: a null cell stays null.
    if full_name is None:
        return None
    parts = full_name.split(' ')
    return f"{parts[0]} {parts[-1]}"