import pyspark.sql.functions as F
data1 = [("10/05/21", 1, "White", 3000), ("10/06/21", 2, "Blue", 4100), ("10/07/21", 3, "Green", 6200)]
df1 = spark.createDataFrame(data1, ["START", "KEY1", "Color", "OTHER"])
data2 = [(1, 2, 3, 3000), (2, 3, 2, 4100), (3, 1, 2, 6200)]
df2 = spark.createDataFrame(data2, ["KEY2", "KEY3", "KEY4", "NUMBER"])
df_result = df1.withColumnRenamed("START", "DATE1").join(
    df2,
    F.col("KEY1") == F.col("KEY2")
).select("DATE1", "KEY3").join(
    df1.withColumnRenamed("START", "DATE2"),
    F.col("KEY1") == F.col("KEY3")
).select("DATE1", "DATE2", "KEY4").join(
    df1.withColumnRenamed("START", "DATE3"),
    F.col("KEY1") == F.col("KEY4")
).select("DATE1", "DATE2", "DATE3")
df_result.show()
I am trying to use the approach from Combine two dataframes with seperate keys for one dataframe so can select two column based on keys, but the join fails with an error saying it cannot resolve KEY4.
CodePudding user response:
You have not selected KEY4 after the first join, which makes it unavailable to the select that follows the join on KEY1 = KEY3.
df_result = df1.withColumnRenamed("START", "DATE1").join(
    df2,
    F.col("KEY1") == F.col("KEY2")
).select("DATE1", "KEY3", "KEY4").join(  # keep KEY4 so the last join can resolve it
    df1.withColumnRenamed("START", "DATE2"),
    F.col("KEY1") == F.col("KEY3")
).select("DATE1", "DATE2", "KEY4").join(  # KEY4 is still carried through here
    df1.withColumnRenamed("START", "DATE3"),
    F.col("KEY1") == F.col("KEY4")
).select("DATE1", "DATE2", "DATE3")
Output:
+--------+--------+--------+
|   DATE1|   DATE2|   DATE3|
+--------+--------+--------+
|10/05/21|10/06/21|10/07/21|
|10/07/21|10/05/21|10/06/21|
|10/06/21|10/07/21|10/06/21|
+--------+--------+--------+
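As an aside, here is a minimal alternative sketch (assuming the same df1 and df2 as above; the result name is just illustrative): aliasing each copy of df1 lets every join key be qualified explicitly, so no intermediate select is needed just to carry KEY4 through the chain.

result = (
    df2
    # one aliased copy of df1 per key column we want to resolve to a date
    .join(df1.alias("a"), F.col("a.KEY1") == F.col("KEY2"))
    .join(df1.alias("b"), F.col("b.KEY1") == F.col("KEY3"))
    .join(df1.alias("c"), F.col("c.KEY1") == F.col("KEY4"))
    .select(
        F.col("a.START").alias("DATE1"),
        F.col("b.START").alias("DATE2"),
        F.col("c.START").alias("DATE3"),
    )
)
result.show()

This trades the intermediate selects for qualified column references, which also sidesteps the ambiguity that can arise when the same KEY1 column name appears on both sides of a join.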