I have two tables (A and B) , I want to get the value for each latest data_date of each ID in every fiscal year from the latest data_date in the same fiscal year of the same ID from table B. I have the results in the third table. I am trying to code this in python
Table A
ID data_date fiscal_year
A 2016-03-31 2016
A 2016-03-31 2016
A 2018-09-31 2018
B 2017-06-30 2017
B 2017-09-30 2017
B 2018-06-30 2018
C 2013-03-31 2013
Table B
ID data_date Value
A 2015-12-31 1
A 2016-12-31 4
A 2018-03-30 85
B 2015-12-31 7
B 2016-12-31 14
B 2017-12-31 12
C 2013-03-30 45
C 2013-12-31 9
C 2014-12-31 64
C 2015-12-31 25
Results
ID data_date fiscal_year Value
A 2016-03-31 2016 4
A 2016-03-31 2016 4
A 2018-09-31 2018 85 [85 is the value of the latest date of B in 2018
B 2017-06-30 2017 NA [It is not the latest data date in 2017 for B]
B 2017-09-30 2017 12
B 2018-06-30 2018 NA [There is no value for B in 2018]
C 2013-03-31 2013 9
CodePudding user response:
IIUC, use merge_asof
with a forward
direction.
Assuming (df1
) and (df2
) are your two tables/dataframes, you can use this :
df1["data_date"] = pd.to_datetime(df1["data_date"])
df2["data_date"] = pd.to_datetime(df2["data_date"])
df1 = df_A.sort_values(by="data_date")
df2 = df_B.sort_values(by="data_date")
out = (pd.merge_asof(df1, df2, on="data_date", by="ID",
allow_exact_matches=False, direction="forward")
.sort_values(by="ID", ignore_index=True))
Output :
print(out)
ID data_date fiscal_year Value
0 A 2016-03-31 2016 4.0
1 A 2016-03-31 2016 4.0
2 A 2018-09-30 2018 NaN
3 B 2017-06-30 2017 12.0
4 B 2017-09-30 2017 12.0
5 B 2018-06-30 2018 NaN
6 C 2013-03-31 2013 9.0
CodePudding user response:
To get the same output
#1. If your data uses dictionaries
dataDictA = [
#ID data_date fiscal_year
{"ID": "A", "data_date": "2016-03-31", "fiscal_year": 2016},
{"ID": "A", "data_date": "2016-03-31", "fiscal_year": 2016},
{"ID": "A", "data_date": "2018-09-31", "fiscal_year": 2018},
{"ID": "B", "data_date": "2017-06-30", "fiscal_year": 2017},
{"ID": "B", "data_date": "2017-09-30", "fiscal_year": 2017},
{"ID": "B", "data_date": "2018-06-30", "fiscal_year": 2018},
{"ID": "C", "data_date": "2013-03-31", "fiscal_year": 2013},
]
dataDictB = [
#ID data_date Value
{"ID": "A", "data_date": "2015-12-31", "Value": 1},
{"ID": "A", "data_date": "2016-12-31", "Value": 4},
{"ID": "A", "data_date": "2018-03-30", "Value": 85},
{"ID": "B", "data_date": "2015-12-31", "Value": 7},
{"ID": "B", "data_date": "2016-12-31", "Value": 14},
{"ID": "B", "data_date": "2017-12-31", "Value": 12},
{"ID": "C", "data_date": "2013-03-30", "Value": 45},
{"ID": "C", "data_date": "2013-12-31", "Value": 9},
{"ID": "C", "data_date": "2014-12-31", "Value": 64},
{"ID": "C", "data_date": "2015-12-31", "Value": 25},
]
def mergeDict(a,b):
res = []
for listA in a:
year = listA["fiscal_year"]# or listA["data_date"].split("-")[0]
tick = dateTick(listA["data_date"])
latest = 0
latestVal = "NA"
#is it the latest date?
previous = False
for listA2 in a:
if listA["ID"]==listA2["ID"]:#same ID
if listA2["fiscal_year"] == year:#same year
if tick<dateTick(listA2["data_date"]):#not the latest
previous = True
break
if previous:
res.append(listA)
res[-1]["Value"] = "NA"
res[-1]["Reason"] = f"-not the lastest data in {year} for {listA2['ID']}"
continue#skip if not the latest
#check latest in dataB
for listB in b:
if listA["ID"]==listB["ID"]:#same ID
if int(listB["data_date"].split("-")[0]) == year:#same year
tickB = dateTick(listB["data_date"])
if latest<tick:#get the latest value (compared in seconds)
latest = tickB
latestVal = listB["Value"]
res.append(listA)
res[-1]["Value"] = latestVal
if latestVal == "NA":
res[-1]["Reason"] = f"-no value for {listA['ID']} in {year}"
return res
for v in mergeDict(dataDictA,dataDictB):
print(v)
Output:
{'ID': 'A', 'data_date': '2016-03-31', 'fiscal_year': 2016, 'Value': 4}
{'ID': 'A', 'data_date': '2016-03-31', 'fiscal_year': 2016, 'Value': 4}
{'ID': 'A', 'data_date': '2018-09-31', 'fiscal_year': 2018, 'Value': 85}
{'ID': 'B', 'data_date': '2017-06-30', 'fiscal_year': 2017, 'Value': 'NA', 'Reason': '-not the lastest data in 2017 for B'}
{'ID': 'B', 'data_date': '2017-09-30', 'fiscal_year': 2017, 'Value': 12}
{'ID': 'B', 'data_date': '2018-06-30', 'fiscal_year': 2018, 'Value': 'NA', 'Reason': '-no value for B in 2018'}
{'ID': 'C', 'data_date': '2013-03-31', 'fiscal_year': 2013, 'Value': 9}
#2. If your data uses arrays
dataA = [
#ID data_date fiscal_year
["A", "2016-03-31", 2016],
["A", "2016-03-31", 2016],
["A", "2018-09-31", 2018],
["B", "2017-06-30", 2017],
["B", "2017-09-30", 2017],
["B", "2018-06-30", 2018],
["C", "2013-03-31", 2013],
]
dataB = [
#ID data_date Value
["A", "2015-12-31", 1],
["A", "2016-12-31", 4],
["A", "2018-03-30", 85],
["B", "2015-12-31", 7],
["B", "2016-12-31", 14],
["B", "2017-12-31", 12],
["C", "2013-03-30", 45],
["C", "2013-12-31", 9],
["C", "2014-12-31", 64],
["C", "2015-12-31", 25],
]
#date to seconds
def dateTick(date):
tick = date.split("-")
return int(tick[0])*31536000 int(tick[1])*2628000 int(tick[2])*86400
def merge(a,b):
res = []
for listA in a:
year = listA[2]# or listA[1].split("-")[0]
tick = dateTick(listA[1])
latest = 0
latestVal = ["NA",f"-no value for {listA[0]} in {year}"]
#is it the latest date?
previous = False
for listA2 in a:
if listA[0]==listA2[0]:#same ID
if listA2[2] == year:#same year
if tick<dateTick(listA2[1]):#not the latest
previous = True
break
if previous:
res.append(listA ["NA",f"-not the lastest data in {year} for {listA2[0]}"])
continue#skip if not the latest
#check latest in dataB
for listB in b:
if listA[0]==listB[0]:#same ID
if int(listB[1].split("-")[0]) == year:#same year
tickB = dateTick(listB[1])
if latest<tick:#get the latest value (compared in seconds)
latest = tickB
latestVal = [listB[2]]
res.append(listA latestVal)
return res
for v in merge(dataA,dataB):
print(v)
Output:
['A', '2016-03-31', 2016, 4]
['A', '2016-03-31', 2016, 4]
['A', '2018-09-31', 2018, 85]
['B', '2017-06-30', 2017, 'NA', '-not the lastest data in 2017 for B']
['B', '2017-09-30', 2017, 12]
['B', '2018-06-30', 2018, 'NA', '-no value for B in 2018']
['C', '2013-03-31', 2013, 9]