Home > database >  Swap values between columns based on third column
Swap values between columns based on third column

Time:09-13

I have a table like this:

src_id  | src_source | dst_id | dst_source | metadata
--------------------------------------------------------
123     | A          | 345    | B          | some_string
234     | B          | 567    | A          | some_other_string
498     | A          | 432    | A          | another_one        # this line should be ignored
765     | B          | 890    | B          | another_two        # this line should be ignored

What I would like is:

A_id  | B_id | metadata
-----------------------
123   | 345  | some_string
567   | 234  | some_other_string

Here's the data to replicate:

# Sample rows mirroring the question's table:
# (src_id, src_source, dst_id, dst_source, metadata)
rows = [
    ("123", "A", "345", "B", "some_string"),
    ("234", "B", "567", "A", "some_other_string"),
    ("498", "A", "432", "A", "another_one"),
    ("765", "B", "890", "B", "another_two"),
]

column_names = ["src_id", "src_source", "dst_id", "dst_source", "metadata"]

# Passing the column names directly to createDataFrame is equivalent to
# createDataFrame(rows).toDF(*column_names).
df = spark.createDataFrame(rows, column_names)

I am a bit confused as to how to do this - I got to here:

# Keep only rows where the two sources differ, then route each id to the
# column named after its source.
#
# Fix vs. the original attempt: F.when(...) WITHOUT .otherwise(...) returns
# NULL for non-matching rows, and the dst_id side was never consulted — so
# every row lost one of its two ids. Because rows with equal sources are
# already filtered out, "src_source is not A" implies the A-side id must be
# dst_id (and symmetrically for B).
output = (
    df
    .filter(F.col("src_source") != F.col("dst_source"))
    .withColumn("A_id",
                F.when(F.col("src_source") == "A", F.col("src_id"))
                 .otherwise(F.col("dst_id")))
    .withColumn("B_id",
                F.when(F.col("src_source") == "B", F.col("src_id"))
                 .otherwise(F.col("dst_id")))
)

CodePudding user response:

I think I figured it out - I need to split the df and union again!

# A->B rows: src is already the A side, so columns map straight across.
# NOTE: the equality filter below already implies src_source != dst_source,
# so the original extra .filter(src != dst) was redundant and is dropped.
ab_df = (
    df
    .filter((F.col("src_source") == "A") & (F.col("dst_source") == "B"))
    .select(F.col("src_id").alias("A_id"),
            F.col("dst_id").alias("B_id"),
            "metadata")
)

# B->A rows: swap the mapping so src_id lands in B_id and dst_id in A_id.
ba_df = (
    df
    .filter((F.col("src_source") == "B") & (F.col("dst_source") == "A"))
    .select(F.col("src_id").alias("B_id"),
            F.col("dst_id").alias("A_id"),
            "metadata")
)

# unionByName matches columns by name, so the differing select order of the
# two halves is safe. Renamed from `all`, which shadowed the builtin all().
combined = ab_df.unionByName(ba_df)

CodePudding user response:

You can do it without union, just in one select, without the need to write the same filter twice.

# Single-pass version: after dropping rows whose sources match, each row has
# exactly one "A" side and one "B" side, so a pair of conditional column
# expressions routes the ids without any union.
src_is_a = F.col("src_source") == "A"

output = (
    df
    .filter(F.col("src_source") != F.col("dst_source"))
    .select(
        # If src is the A side take src_id, otherwise the A id is on dst.
        F.when(src_is_a, F.col("src_id")).otherwise(F.col("dst_id")).alias("A_id"),
        # Mirror image for the B side.
        F.when(src_is_a, F.col("dst_id")).otherwise(F.col("src_id")).alias("B_id"),
        "metadata",
    )
)
output.show()
#  ---- ---- ----------------- 
# |A_id|B_id|         metadata|
#  ---- ---- ----------------- 
# | 123| 345|      some_string|
# | 567| 234|some_other_string|
#  ---- ---- ----------------- 
  • Related