Home > database >  Apache Beam Left Join in Go
Apache Beam Left Join in Go

Time:02-03

Is there a simple way to perform a left join of 2 PCollections using Go? I see that the SQL joins are available only in Java.

package main

import (
    "context"
    "flag"

    "github.com/apache/beam/sdks/v2/go/pkg/beam"
    "github.com/apache/beam/sdks/v2/go/pkg/beam/log"
    "github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
)

// customer is one row of the left-hand side of the join.
//
// Field order matters: main builds values with positional literals
// ({CustID, FName}).
type customer struct {
    // CustID is the join key; it is compared against order.Cust_ID.
    CustID int
    // FName is the name shown in the FName column of the join output.
    FName  string
}

// order is one row of the right-hand side of the join.
//
// Field order matters: main builds values with positional literals
// ({OrderID, Amount, Cust_ID}).
type order struct {
    // OrderID identifies the order.
    OrderID int
    // Amount is the order amount shown in the join output.
    Amount  int
    // Cust_ID is the join key; it refers to customer.CustID.
    Cust_ID int
}

// main builds a Beam pipeline holding two in-memory PCollections (customers
// and orders) and runs it. The left join itself is not implemented here; the
// comment table below describes the output the join is expected to produce.
func main() {

    // Parse command-line flags, then initialise Beam.
    flag.Parse()
    beam.Init()

    ctx := context.Background()

    p := beam.NewPipeline()
    s := p.Root()

    // Left-hand side of the join: positional fields are {CustID, FName}.
    var custList = []customer{
        {1, "Bob"},
        {2, "Adam"},
        {3, "John"},
        {4, "Ben"},
        {5, "Jose"},
        {6, "Bryan"},
        {7, "Kim"},
        {8, "Tim"},
    }

    // Right-hand side of the join: positional fields are
    // {OrderID, Amount, Cust_ID}. Only customers 1, 3 and 7 have an order.
    var orderList = []order{
        {123, 100, 1},
        {125, 30, 3},
        {128, 50, 7},
    }

    custPCol := beam.CreateList(s, custList)

    orderPCol := beam.CreateList(s, orderList)

    // NOTE: until the join is added, custPCol and orderPCol are unused, which
    // the Go compiler rejects ("declared and not used").

    // Left join custPCol with orderPCol.
    // Expected result (values taken from custList and orderList above):
    // CustID | FName   |OrderID| Amount
    //     1  | Bob     |   123 | 100
    //     2  | Adam    |       |
    //     3  | John    |   125 |  30
    //     4  | Ben     |       |
    //     5  | Jose    |       |
    //     6  | Bryan   |       |
    //     7  | Kim     |   128 |  50
    //     8  | Tim     |       |

    // Execute the pipeline; log the error and exit if the job fails.
    if err := beamx.Run(ctx, p); err != nil {
        log.Exitf(ctx, "Failed to execute job: %v", err)
    }

}

I want to join these 2 PCollections and perform further operations on the result. I saw the documentation about CoGroupByKey, but I was unable to get its output into the format that a normal SQL join would produce.

Any suggestions on this?

CodePudding user response:

Try it like this, passing the orders as a side input:

// resultType is one row of the left join of customer with order.
//
// CustID and FName come from the customer; OrderID and Amount come from the
// matching order. For a customer without a matching order the join below
// leaves OrderID and Amount at their zero value (0), so such a row cannot be
// told apart from a real order whose ID and amount are both 0.
type resultType struct {
    CustID  int
    FName   string
    OrderID int
    Amount  int
}

// result is the left join of custPCol with orderPCol.
//
// For every customer it emits one resultType per order whose Cust_ID equals
// the customer's CustID. A customer with no matching order still produces
// exactly one row, with OrderID and Amount left at zero.
//
// orderPCol is supplied as a side input, so the full set of orders is scanned
// once per customer.
//
// An emitter is used instead of a return value so that a customer with
// several orders yields several rows; returning on the first match would
// silently drop the remaining orders.
result := beam.ParDo(s, func(c customer, iterOrder func(*order) bool, emit func(resultType)) {
    var o order
    matched := false

    for iterOrder(&o) {
        if c.CustID != o.Cust_ID {
            continue
        }
        matched = true
        emit(resultType{
            CustID:  c.CustID,
            FName:   c.FName,
            OrderID: o.OrderID,
            Amount:  o.Amount,
        })
    }

    // Left-join semantics: keep the customer even when nothing matched.
    if !matched {
        emit(resultType{
            CustID: c.CustID,
            FName:  c.FName,
        })
    }
}, custPCol, beam.SideInput{Input: orderPCol})

Or, if you want to use CoGroupByKey:

custWithKeyPCol := beam.ParDo(s, func(c customer) (int, customer) {
    return c.CustID, c
}, custPCol)

orderWithKeyPCol := beam.ParDo(s, func(o order) (int, order) {
    return o.Cust_ID, o
}, orderPCol)

resultPCol := beam.CoGroupByKey(s, custWithKeyPCol, orderWithKeyPCol)

beam.ParDo0(s, func(CustID int, custIter func(*customer) bool, orderIter func(*order) bool) {
    c, o := customer{}, order{}
    for custIter(&c) {
        if ok := orderIter(&o); ok {
            fmt.Println(CustID, c.FName, o.OrderID, o.Amount)
        }
        fmt.Println(CustID, c.FName)
    }
}, resultPCol)
  • Related