I have a column in my DF where data type is :
--testcolumn:array
--element: map
-----key:string
-----value: string
testcolumn
Row1:
[{"class":"6","Roll:"1","Name":"Ram1"},{"class":"6","Roll:"2","Name":"Ram2"},{"class":"6","Roll:"3","Name":"Ram3"},{"":""},{"":""}]
Row2:
[{"class":"6","Roll:"1","Name":"Ram1"}]
[{"":""},{"":""}{"class":"6","Roll:"1","Name":"Ram1"},{"class":"6","Roll:"2","Name":"Ram2"},{"class":"6","Roll:"3","Name":"Ram3"}]
Row3:
[{"class":"6","Roll:"1","Name":"Ram1"},{"class":"6","Roll:"2","Name":"Ram2"},{"class":"6","Roll:"3","Name":"Ram3"}]
Row4:
[{"":""},{"":""}{"class":"6","Roll:"1","Name":"Ram1"},{"":""},{"":""}]
expecting output:
outputcolumn
Row1:
[{"class":"6","Roll:"1","Name":"Ram1"},{"class":"6","Roll:"2","Name":"Ram2"},{"class":"6","Roll:"3","Name":"Ram3"}]
Row2:
[{"class":"6","Roll:"1","Name":"Ram1"},{"class":"6","Roll:"2","Name":"Ram2"},{"class":"6","Roll:"3","Name":"Ram3"}]
Row3:
[{"class":"6","Roll:"1","Name":"Ram1"},{"class":"6","Roll:"2","Name":"Ram2"},{"class":"6","Roll:"3","Name":"Ram3"}]
Row4:
[{"class":"6","Roll:"1","Name":"Ram1"}]
I have tried this code but it is not giving me output:
test=df.withColumn('outputcolumn',F.expr("translate"(testcolumn,x-> replace(x,'{"":""}','')))
it will be really great if someone can help me.
CodePudding user response:
Assuming all your values remain strings, and that your data frame does look something like this:
df = spark.createDataFrame([
{"testcolumn": [{"class":"6","Roll":"1","Name":"Ram1"},{"class":"6","Roll":"2","Name":"Ram2"},{"class":"6","Roll":"3","Name":"Ram3"},{"":""},{"":""}]},
{"testcolumn": [{"class":"6","Roll":"1","Name":"Ram1"},{"":""},{"":""},{"class":"6","Roll":"1","Name":"Ram1"},{"class":"6","Roll":"2","Name":"Ram2"},{"class":"6","Roll":"3","Name":"Ram3"}]},
{"testcolumn": [{"class":"6","Roll":"1","Name":"Ram1"},{"class":"6","Roll":"2","Name":"Ram2"},{"class":"6","Roll":"3","Name":"Ram3"}]},
{"testcolumn": [{"":""},{"":""},{"class":"6","Roll":"1","Name":"Ram1"},{"":""},{"":""}]}
])
df.show(truncate=False)
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# |testcolumn |
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# |[{Roll -> 1, class -> 6, Name -> Ram1}, {Roll -> 2, class -> 6, Name -> Ram2}, {Roll -> 3, class -> 6, Name -> Ram3}, { -> }, { -> }] |
# |[{Roll -> 1, class -> 6, Name -> Ram1}, { -> }, { -> }, {Roll -> 1, class -> 6, Name -> Ram1}, {Roll -> 2, class -> 6, Name -> Ram2}, {Roll -> 3, class -> 6, Name -> Ram3}]|
# |[{Roll -> 1, class -> 6, Name -> Ram1}, {Roll -> 2, class -> 6, Name -> Ram2}, {Roll -> 3, class -> 6, Name -> Ram3}] |
# |[{ -> }, { -> }, {Roll -> 1, class -> 6, Name -> Ram1}, { -> }, { -> }] |
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
You could accomplish this by a) casting the whole column to a string with to_json
, b) cleaning up the empty maps with regexp_replace
, and c) casting the column back with from_json
.
import pyspark.sql.functions as F
df_out = df \
.withColumn("testcolumn", F.to_json(F.col("testcolumn"))) \
.withColumn("testcolumn", F.regexp_replace(F.col("testcolumn"), ',?\{"":""\}', '')) \
.withColumn("testcolumn", F.regexp_replace(F.col("testcolumn"), "(?<=\[),", "")) \
.withColumn("testcolumn", F.from_json(F.col("testcolumn"), "ARRAY<MAP<STRING, STRING>>"))
df_out.show(truncate=False)
# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# |testcolumn |
# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# |[{Roll -> 1, class -> 6, Name -> Ram1}, {Roll -> 2, class -> 6, Name -> Ram2}, {Roll -> 3, class -> 6, Name -> Ram3}] |
# |[{Roll -> 1, class -> 6, Name -> Ram1}, {Roll -> 1, class -> 6, Name -> Ram1}, {Roll -> 2, class -> 6, Name -> Ram2}, {Roll -> 3, class -> 6, Name -> Ram3}]|
# |[{Roll -> 1, class -> 6, Name -> Ram1}, {Roll -> 2, class -> 6, Name -> Ram2}, {Roll -> 3, class -> 6, Name -> Ram3}] |
# |[{Roll -> 1, class -> 6, Name -> Ram1}] |
# ------------------------------------------------------------------------------------------------------------------------------------------------------------