Home > Back-end >  merge two dataframes with two columns with condition
merge two dataframes with two columns with condition

Time:02-02

I have two tables (A and B) , I want to get the value for each latest data_date of each ID in every fiscal year from the latest data_date in the same fiscal year of the same ID from table B. I have the results in the third table. I am trying to code this in python

Table A

ID          data_date   fiscal_year
A           2016-03-31  2016          
A           2016-03-31  2016      
A           2018-09-31  2018            
B           2017-06-30  2017         
B           2017-09-30  2017     
B           2018-06-30  2018     
C           2013-03-31  2013     

Table B

ID          data_date   Value
A           2015-12-31     1   
A           2016-12-31     4  
A           2018-03-30     85           
B           2015-12-31     7  
B           2016-12-31     14   
B           2017-12-31     12 
C           2013-03-30     45 
C           2013-12-31     9  
C           2014-12-31     64 
C           2015-12-31     25

Results

ID          data_date   fiscal_year Value
A           2016-03-31  2016          4
A           2016-03-31  2016          4
A           2018-09-31  2018          85 [85 is the value of the latest date of B in 2018
B           2017-06-30  2017          NA [It is not the latest data date in 2017 for B]
B           2017-09-30  2017          12
B           2018-06-30  2018          NA [There is no value for B in 2018]  
C           2013-03-31  2013          9 

CodePudding user response:

IIUC, use merge_asof with a forward direction.

Assuming (df1) and (df2) are your two tables/dataframes, you can use this :

df1["data_date"] = pd.to_datetime(df1["data_date"])
df2["data_date"] = pd.to_datetime(df2["data_date"])
df1 = df_A.sort_values(by="data_date")
df2 = df_B.sort_values(by="data_date")
​
out = (pd.merge_asof(df1, df2, on="data_date", by="ID",
                    allow_exact_matches=False, direction="forward")
           .sort_values(by="ID", ignore_index=True))

Output :

print(out)
​
  ID  data_date  fiscal_year  Value
0  A 2016-03-31         2016    4.0
1  A 2016-03-31         2016    4.0
2  A 2018-09-30         2018    NaN
3  B 2017-06-30         2017   12.0
4  B 2017-09-30         2017   12.0
5  B 2018-06-30         2018    NaN
6  C 2013-03-31         2013    9.0

CodePudding user response:

To get the same output

#1. If your data uses dictionaries

dataDictA = [
#ID          data_date   fiscal_year
{"ID": "A", "data_date": "2016-03-31", "fiscal_year": 2016},
{"ID": "A", "data_date": "2016-03-31", "fiscal_year": 2016},
{"ID": "A", "data_date": "2018-09-31", "fiscal_year": 2018},
{"ID": "B", "data_date": "2017-06-30", "fiscal_year": 2017},
{"ID": "B", "data_date": "2017-09-30", "fiscal_year": 2017},
{"ID": "B", "data_date": "2018-06-30", "fiscal_year": 2018},
{"ID": "C", "data_date": "2013-03-31", "fiscal_year": 2013},
]

dataDictB = [
#ID          data_date   Value
{"ID": "A", "data_date": "2015-12-31", "Value": 1},
{"ID": "A", "data_date": "2016-12-31", "Value": 4},
{"ID": "A", "data_date": "2018-03-30", "Value": 85},
{"ID": "B", "data_date": "2015-12-31", "Value": 7},
{"ID": "B", "data_date": "2016-12-31", "Value": 14},
{"ID": "B", "data_date": "2017-12-31", "Value": 12},
{"ID": "C", "data_date": "2013-03-30", "Value": 45},
{"ID": "C", "data_date": "2013-12-31", "Value": 9},
{"ID": "C", "data_date": "2014-12-31", "Value": 64},
{"ID": "C", "data_date": "2015-12-31", "Value": 25},
]

def mergeDict(a,b):
    res = []
    for listA in a:
        year = listA["fiscal_year"]# or listA["data_date"].split("-")[0]
        tick = dateTick(listA["data_date"])

        latest = 0
        latestVal = "NA"
        
        #is it the latest date?
        previous = False
        for listA2 in a:
            if listA["ID"]==listA2["ID"]:#same ID
                if listA2["fiscal_year"] == year:#same year
                    if tick<dateTick(listA2["data_date"]):#not the latest
                        previous = True
                        break
        if previous:
            res.append(listA)
            res[-1]["Value"] = "NA"
            res[-1]["Reason"] = f"-not the lastest data in {year} for {listA2['ID']}"
            continue#skip if not the latest
        
        #check latest in dataB
        for listB in b:
            if listA["ID"]==listB["ID"]:#same ID
                if int(listB["data_date"].split("-")[0]) == year:#same year
                    tickB = dateTick(listB["data_date"])
                    if latest<tick:#get the latest value (compared in seconds)
                        latest = tickB
                        latestVal = listB["Value"]
        
        res.append(listA)
        res[-1]["Value"] = latestVal
        if latestVal == "NA":
            res[-1]["Reason"] = f"-no value for {listA['ID']} in {year}"
    return res

for v in mergeDict(dataDictA,dataDictB):
    print(v)

Output:

{'ID': 'A', 'data_date': '2016-03-31', 'fiscal_year': 2016, 'Value': 4}
{'ID': 'A', 'data_date': '2016-03-31', 'fiscal_year': 2016, 'Value': 4}
{'ID': 'A', 'data_date': '2018-09-31', 'fiscal_year': 2018, 'Value': 85}
{'ID': 'B', 'data_date': '2017-06-30', 'fiscal_year': 2017, 'Value': 'NA', 'Reason': '-not the lastest data in 2017 for B'}
{'ID': 'B', 'data_date': '2017-09-30', 'fiscal_year': 2017, 'Value': 12}
{'ID': 'B', 'data_date': '2018-06-30', 'fiscal_year': 2018, 'Value': 'NA', 'Reason': '-no value for B in 2018'}
{'ID': 'C', 'data_date': '2013-03-31', 'fiscal_year': 2013, 'Value': 9}

#2. If your data uses arrays

dataA = [
#ID          data_date   fiscal_year
["A", "2016-03-31", 2016],
["A", "2016-03-31", 2016],
["A", "2018-09-31", 2018],
["B", "2017-06-30", 2017],
["B", "2017-09-30", 2017],
["B", "2018-06-30", 2018],
["C", "2013-03-31", 2013],
]

dataB = [
#ID          data_date   Value
["A", "2015-12-31", 1],
["A", "2016-12-31", 4],
["A", "2018-03-30", 85],
["B", "2015-12-31", 7],
["B", "2016-12-31", 14],
["B", "2017-12-31", 12],
["C", "2013-03-30", 45],
["C", "2013-12-31", 9],
["C", "2014-12-31", 64],
["C", "2015-12-31", 25],
]

#date to seconds
def dateTick(date):
    tick = date.split("-")
    return int(tick[0])*31536000 int(tick[1])*2628000 int(tick[2])*86400

def merge(a,b):
    res = []
    for listA in a:
        year = listA[2]# or listA[1].split("-")[0]
        tick = dateTick(listA[1])

        latest = 0
        latestVal = ["NA",f"-no value for {listA[0]} in {year}"]
        
        #is it the latest date?
        previous = False
        for listA2 in a:
            if listA[0]==listA2[0]:#same ID
                if listA2[2] == year:#same year
                    if tick<dateTick(listA2[1]):#not the latest
                        previous = True
                        break
        if previous:
            res.append(listA ["NA",f"-not the lastest data in {year} for {listA2[0]}"])
            continue#skip if not the latest
        
        #check latest in dataB
        for listB in b:
            if listA[0]==listB[0]:#same ID
                if int(listB[1].split("-")[0]) == year:#same year
                    tickB = dateTick(listB[1])
                    if latest<tick:#get the latest value (compared in seconds)
                        latest = tickB
                        latestVal = [listB[2]]
        
        res.append(listA latestVal)
    return res

for v in merge(dataA,dataB):
    print(v)

Output:

['A', '2016-03-31', 2016, 4]
['A', '2016-03-31', 2016, 4]
['A', '2018-09-31', 2018, 85]
['B', '2017-06-30', 2017, 'NA', '-not the lastest data in 2017 for B']
['B', '2017-09-30', 2017, 12]
['B', '2018-06-30', 2018, 'NA', '-no value for B in 2018']
['C', '2013-03-31', 2013, 9]
  • Related