In my dataframe, I need to convert an array type column to string without losing the element names/schema for the data in the column.
My dataframe schema:
root
|-- accountId: string (nullable = true)
|-- documents: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- accountId: string (nullable = true)
| | |-- agreementId: string (nullable = true)
| | |-- createdBy: string (nullable = true)
| | |-- createdDate: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- obligations: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- resourceVersion: long (nullable = true)
| | |-- updatedBy: string (nullable = true)
| | |-- updatedDate: string (nullable = true)
Dataframe sample data (I am showing it in JSON format, but it's columns in Spark dataframe):
{
"accountId":"1",
"documents":{
"list":[{
"element":{
"accountId":"1",
"agreementId":"1.2",
"createdDate":"2022-10-06T19:33:42.539646Z",
"externalId":"16",
"id":"123",
"name":"test1.docx",
"obligations":{},
"resourceVersion":1,
"updatedDate":"2022-10-06T19:33:42.680233Z"
}
}]
}
}
{
"accountId":"2",
"documents":{
"list":[{
"element":{
"accountId":"2",
"agreementId":"2.2",
"createdDate":"2022-10-06T19:33:42.539646Z",
"externalId":"18",
"id":"123",
"name":"test2.docx",
"obligations":{},
"resourceVersion":1,
"updatedDate":"2022-10-06T19:33:42.680233Z"
}
}]
}
}
My current code:
df_string = df.select([col(c).cast("string") for c in df.columns])
What it can do (column names disappear in documents):
{
"accountId":"1",
"documents":[{"1","1.2","2022-10-06T19:33:42.539646Z","16",:"123","test1.docx","",1,"2022-10-06T19:33:42.680233Z"}]
}
{
"accountId":"2",
"documents":[{"2","2.2","2022-10-06T19:33:42.539646Z","18","123","test2.docx","","1","2022-10-06T19:33:42.680233Z"}]
}
What I need to accomplish (column names must remain in documents):
{
"accountId":"1",
"documents":[{"accountId":"1","agreementId":"1.2","createdDate":"2022-10-06T19:33:42.539646Z","externalId":"16","id":"123","name":"test1.docx","obligations":"","resourceVersion":"1","updatedDate":"2022-10-06T19:33:42.680233Z"}]
}
{
"accountId":"2",
"documents":[{"accountId":"2","agreementId":"2.2","createdDate":"2022-10-06T19:33:42.539646Z","externalId":"18","id":"123","name":"test2.docx","obligations":"","resourceVersion":"1","updatedDate":"2022-10-06T19:33:42.680233Z"}]
}
CodePudding user response:
It seems, to_json
could be your answer. It transforms a complex type column (structs, arrays, maps) into a JSON string.
Input dataframe:
from pyspark.sql import functions as F
df = spark.createDataFrame(
[("1", [("1", "1.2", "Tom", "2022-10-06T19:33:42.539646Z", "123", [""], 1, "Tom", "2022-10-06T19:33:42.680233Z")]),
("2", [("2", "2.2", "Tim", "2022-10-06T19:33:42.539646Z", "123", [""], 1, "Tim", "2022-10-06T19:33:42.680233Z")])],
'accountId:string,documents:array<struct<accountId:string,agreementId:string,createdBy:string,createdDate:string,id:string,obligations:array<string>,resourceVersion:bigint,updatedBy:string,updatedDate:string>>')
df.printSchema()
# root
# |-- accountId: string (nullable = true)
# |-- documents: array (nullable = true)
# | |-- element: struct (containsNull = true)
# | | |-- accountId: string (nullable = true)
# | | |-- agreementId: string (nullable = true)
# | | |-- createdBy: string (nullable = true)
# | | |-- createdDate: string (nullable = true)
# | | |-- id: string (nullable = true)
# | | |-- obligations: array (nullable = true)
# | | | |-- element: string (containsNull = true)
# | | |-- resourceVersion: long (nullable = true)
# | | |-- updatedBy: string (nullable = true)
# | | |-- updatedDate: string (nullable = true)
Script:
df = df.withColumn("documents", F.to_json("documents"))
df.show(truncate=0)
# --------- ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# |accountId|documents |
# --------- ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# |1 |[{"accountId":"1","agreementId":"1.2","createdBy":"Tom","createdDate":"2022-10-06T19:33:42.539646Z","id":"123","obligations":[""],"resourceVersion":1,"updatedBy":"Tom","updatedDate":"2022-10-06T19:33:42.680233Z"}]|
# |2 |[{"accountId":"2","agreementId":"2.2","createdBy":"Tim","createdDate":"2022-10-06T19:33:42.539646Z","id":"123","obligations":[""],"resourceVersion":1,"updatedBy":"Tim","updatedDate":"2022-10-06T19:33:42.680233Z"}]|
# --------- ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
df.printSchema()
# root
# |-- accountId: string (nullable = true)
# |-- documents: string (nullable = true)