I have a PySpark DataFrame with the structure below.
Current Schema:
root
|-- ID
|-- Information
| |-- Name
| |-- Age
| |-- Gender
|-- Description
I would like to add first name and last name fields under Information.Name.
Is there a way to add new columns to multi-level struct types in PySpark?
Expected Schema:
root
|-- ID
|-- Information
| |-- Name
| | |-- firstName
| | |-- lastName
| |-- Age
| |-- Gender
|-- Description
CodePudding user response:
Use `Column.withField` (available in Spark 3.1+); this would work:
# Replace the string field Information.Name with a two-field struct:
# FName is seeded from the current Name value, LName starts as an empty string.
# Assumes `F` is pyspark.sql.functions -- the import is not shown in this snippet.
name_struct = F.struct(
    F.col('Information.Name').alias('FName'),
    F.lit('').alias('LName'),
)
df = df.withColumn('Information', F.col('Information').withField('Name', name_struct))
Schema Before:
root
|-- Id: string (nullable = true)
|-- Information: struct (nullable = true)
| |-- Name: string (nullable = true)
| |-- Age: integer (nullable = true)
Schema After:
root
|-- Id: string (nullable = true)
|-- Information: struct (nullable = true)
| |-- Name: struct (nullable = false)
| | |-- FName: string (nullable = true)
| | |-- LName: string (nullable = false)
| |-- Age: integer (nullable = true)
I initialized the value of FName with the current value of Name; you can use substring instead if you only need part of it.
CodePudding user response:
If all names follow the pattern below, you can split on whitespace:
FirstName LastName
Example code with data.
from pyspark.sql.types import *
import pyspark.sql.functions as sqlf
# One sample record; the nested "Information" dict carries the Name/Age/Gender values.
data = [
    {
        "ID": 1,
        "Information": {
            "Name": "Alice Wonderland",
            "Age": 20,
            "Gender": "Female",
        },
        "Description": "Test data",
    },
]
# Explicit schema for the sample data: two top-level scalar columns plus the
# nested "Information" struct. Every field is declared nullable.
schema = (
    StructType()
    .add("Description", StringType(), True)
    .add("ID", IntegerType(), True)
    .add(
        "Information",
        StructType()
        .add("Name", StringType(), True)
        .add("Age", IntegerType(), True)
        .add("Gender", StringType(), True),
        True,
    )
)
# Build the DataFrame from the sample rows using the explicit schema.
df = spark.createDataFrame(data, schema)

# Split on the FIRST space only (limit=2). Without a limit, a name with more
# than two tokens (e.g. "Alice van Wonderland") would produce three array
# elements and indexing [1] would silently drop everything after the second
# token. With limit=2 the remainder of the string stays in the second element.
# For a plain "FirstName LastName" value the result is unchanged.
name_parts = sqlf.split(sqlf.col('Information.Name'), ' ', limit=2)

# Overwrite Information.Name (a string) with a struct of the two name parts;
# withField leaves the other fields of Information (Age, Gender) untouched.
df = df.withColumn(
    'Information',
    sqlf.col('Information').withField(
        'Name',
        sqlf.struct(
            name_parts[0].alias('firstName'),
            name_parts[1].alias('lastName'),
        ),
    ),
)
df.printSchema()
root
|-- Description: string (nullable = true)
|-- ID: integer (nullable = true)
|-- Information: struct (nullable = true)
| |-- Name: struct (nullable = false)
| | |-- firstName: string (nullable = true)
| | |-- lastName: string (nullable = true)
| |-- Age: integer (nullable = true)
| |-- Gender: string (nullable = true)
# truncate=False prints each cell in full so the nested struct is not cut off.
df.show(truncate=False)
+-----------+---+---------------------------------+
|Description|ID |Information                      |
+-----------+---+---------------------------------+
|Test data  |1  |{{Alice, Wonderland}, 20, Female}|
+-----------+---+---------------------------------+