I have a dataframe that looks like this:
+-----------------------------+
| Item                        |
+-----------------------------+
| [[a,b,c], [d,e,f], [g,h,i]] |
+-----------------------------+
How can I transform it into the table below?
a b c
d e f
g h i
I have tried using the explode and withColumn functions, but I end up with every combination of the elements instead (a sketch of code that reproduces this follows the output below):
a b c
a e c
a h c
d b c
d e c
d h c
... (many other combinations)
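Something like the following reproduces those combinations (a sketch; the exact code may have differed slightly): each chained explode multiplies the rows, so three of them produce every combination.
import pyspark.sql.functions as F

df = spark.createDataFrame(
    [([["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]],)],
    ["Item"]
)

# Each explode multiplies the rows by 3, so three chained explodes
# yield 3 * 3 * 3 = 27 rows -- every combination of the inner arrays.
(df.withColumn("r0", F.explode("Item"))
   .withColumn("r1", F.explode("Item"))
   .withColumn("r2", F.explode("Item"))
   .select(
       F.col("r0")[0].alias("col_0"),
       F.col("r1")[1].alias("col_1"),
       F.col("r2")[2].alias("col_2"),
   ).show())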
CodePudding user response:
You need to explode only the first-level array; then you can select the array elements as columns:
import pyspark.sql.functions as F

df = spark.createDataFrame(
    [([["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]],)],
    ["Item"]
)

# Explode the outer array (one row per inner array), then pull each
# position of the inner array out into its own column.
df.withColumn("Item", F.explode("Item")).select(
    *[F.col("Item")[i].alias(f"col_{i}") for i in range(3)]
).show()
#+-----+-----+-----+
#|col_0|col_1|col_2|
#+-----+-----+-----+
#|    a|    b|    c|
#|    d|    e|    f|
#|    g|    h|    i|
#+-----+-----+-----+
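Note that explode drops rows where Item is null or an empty array. If such rows should survive as all-null rows, use explode_outer instead:
# explode_outer keeps rows whose Item is null/empty, emitting nulls
# instead of dropping the row as explode would.
df.withColumn("Item", F.explode_outer("Item")).select(
    *[F.col("Item")[i].alias(f"col_{i}") for i in range(3)]
).show()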
CodePudding user response:
@blackbishop, improving your answer to handle inner arrays of different lengths:
import pyspark.sql.functions as F

df = spark.createDataFrame(
    [([["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i", "j"]],)],
    ["data"]
)
df.show(20, False)

# Explode the outer array so each inner array becomes a row.
df = df.withColumn("data1", F.explode("data"))
df.select("data1").show()

# Find the length of the longest inner array instead of hardcoding it.
max_size = df.select(F.max(F.size("data1"))).collect()[0][0]  # Row(max(size(data1))=4) ---> 4

df.select(
    *[F.col("data1")[i].alias(f"col_{i}") for i in range(max_size)]
).show()
+------------------------------------+
|data                                |
+------------------------------------+
|[[a, b, c], [d, e, f], [g, h, i, j]]|
+------------------------------------+

+------------+
|       data1|
+------------+
|   [a, b, c]|
|   [d, e, f]|
|[g, h, i, j]|
+------------+

+-----+-----+-----+-----+
|col_0|col_1|col_2|col_3|
+-----+-----+-----+-----+
|    a|    b|    c| null|
|    d|    e|    f| null|
|    g|    h|    i|    j|
+-----+-----+-----+-----+
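The same result can also be written without the intermediate withColumn, computing the width from the exploded column directly (a compact variant of the answer above):
import pyspark.sql.functions as F

df = spark.createDataFrame(
    [([["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i", "j"]],)],
    ["data"]
)

exploded = df.select(F.explode("data").alias("data1"))

# first() returns the single Row; [0] unpacks the scalar max size.
max_size = exploded.select(F.max(F.size("data1"))).first()[0]

exploded.select(
    *[F.col("data1")[i].alias(f"col_{i}") for i in range(max_size)]
).show()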