I have a table like this:
src_id | src_source | dst_id | dst_source | metadata
--------------------------------------------------------
123 | A | 345 | B | some_string
234 | B | 567 | A | some_other_string
498 | A | 432 | A | another_one # this line should be ignored
765 | B | 890 | B | another_one # this line should be ignored
What I would like is:
A_id | B_id | metadata
-----------------------
123 | 345 | some string
567 | 234 | some_other_string
Here's the data to replicate:
data = [
("123", "A", "345", "B", "some_string"),
("234", "B", "567", "A", "some_other_string"),
("498", "A", "432", "A", "another_one"),
("765", "B", "890", "B", "another_two"),
]
cols = ["src_id", "src_source", "dst_id", "dst_source", "metadata"]
df = spark.createDataFrame(data).toDF(*cols)
I am a bit confused as to how to do this - I got to here:
output = (
df
.filter(F.col("src_source") != F.col("dst_source"))
.withColumn("A_id",
F.when(F.col("src_source") == "A", F.col("src_id")))
.withColumn("B_id",
F.when(F.col("src_source") == "B", F.col("src_id")))
)
CodePudding user response:
I think i figured it out - I need to split the df and union again!
ab_df = (
df
.filter(F.col("src_source") != F.col("dst_source"))
.filter((F.col("src_source") == "A") & (F.col("dst_source") == "B"))
.select(F.col("src_id").alias("A_id"),
F.col("dst_id").alias("B_id"),
"metadata")
)
ba_df = (
df
.filter(F.col("src_source") != F.col("dst_source"))
.filter((F.col("src_source") == "B") & (F.col("dst_source") == "A"))
.select(F.col("src_id").alias("B_id"),
F.col("dst_id").alias("A_id"),
"metadata")
)
all = ab_df.unionByName(ba_df)
CodePudding user response:
You can do it without union, just in one select, without the need to write the same filter twice.
output = (
df
.filter(F.col("src_source") != F.col("dst_source"))
.select(
F.when(F.col("src_source") == "A", F.col("src_id")).otherwise(F.col("dst_id")).alias("A_id"),
F.when(F.col("src_source") == "A", F.col("dst_id")).otherwise(F.col("src_id")).alias("B_id"),
"metadata"
)
)
output.show()
# ---- ---- -----------------
# |A_id|B_id| metadata|
# ---- ---- -----------------
# | 123| 345| some_string|
# | 567| 234|some_other_string|
# ---- ---- -----------------