Combining two pandas dataframes together Python [duplicate]-CodePudding

The code below computes the statistics named in list_val for the values in vals and vals_2, which correspond to the dates in month_changes and month_changes_2. It calculates either the 'mean' and 'median' or the 'max' and 'min', grouping the values into yearly intervals. I want to add a step that combines the two outputs, graph and graph_2, and returns the Expected Output below. How would I be able to do that? The code below has been derived from the answer to this question: link

Code:

import numpy as np 
import pandas as pd 

# First series: timestamp strings paired element-wise with `vals`.
month_changes = np.array([
    "2018-04-01 00:00:00",
    "2018-05-01 00:00:00",
    "2019-03-01 00:00:00",
    "2019-04-01 00:00:00",
    "2019-08-01 00:00:00",
    "2019-11-01 00:00:00",
    "2019-12-01 00:00:00",
    "2021-01-01 00:00:00",
])
vals = np.array([10, 23, 45, 4, 5, 12, 4, -6])

# Second series: timestamp strings paired element-wise with `vals_2`.
month_changes_2 = np.array([
    "2018-04-06 00:00:00",
    "2018-05-13 00:00:00",
    "2018-03-01 00:00:00",
    "2019-02-01 00:00:00",
    "2019-03-12 00:00:00",
    "2019-12-01 00:00:00",
    "2019-12-22 00:00:00",
    "2020-04-01 00:00:00",
    "2021-01-01 00:00:00",
])
vals_2 = np.array([140, 213, 15, 4, 53, 1, 42, -63, 120])

# Aggregation names understood by pandas; `series_val` slices into this list.
list_val = ['mean', 'median', 'max', 'min']


def yearly_intervals(mc, vs, start_year, end_year, series_val):
    """Aggregate values per calendar year and optionally pad the year range.

    Args:
        mc: Date-like values (e.g. timestamp strings) accepted by
            ``pd.to_datetime``; one entry per value in ``vs``.
        vs: Values to aggregate, aligned element-wise with ``mc``.
        start_year: First year of the output index, or ``None`` to return
            only the years present in the data.
        end_year: Last year (inclusive) of the output index. When ``None``,
            the latest year found in the data is used. Ignored when
            ``start_year`` is ``None``.
        series_val: Sequence whose first and last items are the start and
            stop of the slice taken from ``list_val``; e.g. ``[0, 2]``
            selects ``['mean', 'median']``.

    Returns:
        A DataFrame indexed by year with one column per selected statistic
        ('mean' is renamed to 'Average', the others are title-cased). When
        ``start_year`` is given, the index is exactly
        ``start_year..last year`` and years without data are filled with 0.
    """
    print(series_val)
    data = pd.DataFrame({
        "Date": pd.to_datetime(mc),  # Convert to datetime immediately
        "Averages": vs,
    })
    stats = list_val[series_val[0]:series_val[-1]]
    out = (
        data.groupby(data["Date"].dt.year)["Averages"]  # Access Series
            .agg(stats)
            .rename(columns=lambda x: 'Average' if x == 'mean' else x.title())
    )
    if start_year is not None:
        # Use the user-defined last year, or the latest year in the data.
        last_year = end_year if end_year is not None else out.index.max()
        # range() excludes its stop value, so add 1 to keep last_year itself.
        out = out.reindex(range(start_year, last_year + 1), fill_value=0)
    return out

# series_val=[0, 2] selects list_val[0:2]; series_val=[2, 4] selects list_val[2:4].
graph = yearly_intervals(
    month_changes,
    vals,
    start_year=2016,
    end_year=2021,
    series_val=[0, 2],
)
graph_2 = yearly_intervals(
    month_changes_2,
    vals_2,
    start_year=2016,
    end_year=2021,
    series_val=[2, 4],
)

Output:

      Average  Median
Date                 
2016      0.0     0.0
2017      0.0     0.0
2018     16.5    16.5
2019     14.0     5.0
2020      0.0     0.0
2021     -6.0    -6.0

      Max  Min
Date          
2016    0    0
2017    0    0
2018  213   15
2019   53    1
2020  -63  -63
2021  120  120

Expected Output

      Average  Median  Max  Min
Date                 
2016      0.0     0.0   0    0
2017      0.0     0.0   0    0
2018     16.5    16.5  213   15
2019     14.0     5.0   53    1
2020      0.0     0.0  -63  -63
2021     -6.0    -6.0  120  120

CodePudding user response：

Something like this?


import pandas as pd
# Both frames share the same year index, so they line up row by row.
years = [2016, 2017, 2018]

df1 = pd.DataFrame(
    {
        'Average': [0.0, 0.0, 16.5],
        'Median': [0.0, 0.0, 16.5],
    },
    index=years,
)
df2 = pd.DataFrame(
    {
        'Max': [0, 0, 213],
        'Min': [0, 0, 15],
    },
    index=years,
)

print(df1)
print(df2)

# Concatenate column-wise: rows are matched on the index.
df = pd.concat([df1, df2], axis="columns")

print(df)

CodePudding user response：

I assume that you have the two dataframes graph and graph_2 already created and processed.

Try this

# Place the two frames side by side; rows are aligned on their index.
combined_df = pd.concat((graph, graph_2), axis="columns")
print(combined_df)

It will output:

      Average  Median  Max  Min
Date
2016      0.0     0.0    0    0
2017      0.0     0.0    0    0
2018     16.5    16.5  213   15
2019     14.0     5.0   53    1
2020      0.0     0.0  -63  -63
2021     -6.0    -6.0  120  120

CodePudding user response：

Just take your existing work and run graph.join(graph_2):

import numpy as np 
import pandas as pd 

# First series: timestamp strings paired element-wise with `vals`.
month_changes = np.array([
    "2018-04-01 00:00:00",
    "2018-05-01 00:00:00",
    "2019-03-01 00:00:00",
    "2019-04-01 00:00:00",
    "2019-08-01 00:00:00",
    "2019-11-01 00:00:00",
    "2019-12-01 00:00:00",
    "2021-01-01 00:00:00",
])
vals = np.array([10, 23, 45, 4, 5, 12, 4, -6])

# Second series: timestamp strings paired element-wise with `vals_2`.
month_changes_2 = np.array([
    "2018-04-06 00:00:00",
    "2018-05-13 00:00:00",
    "2018-03-01 00:00:00",
    "2019-02-01 00:00:00",
    "2019-03-12 00:00:00",
    "2019-12-01 00:00:00",
    "2019-12-22 00:00:00",
    "2020-04-01 00:00:00",
    "2021-01-01 00:00:00",
])
vals_2 = np.array([140, 213, 15, 4, 53, 1, 42, -63, 120])

# Aggregation names understood by pandas; `series_val` slices into this list.
list_val = ['mean', 'median', 'max', 'min']


def yearly_intervals(mc, vs, start_year, end_year, series_val):
    """Aggregate values per calendar year and optionally pad the year range.

    Args:
        mc: Date-like values (e.g. timestamp strings) accepted by
            ``pd.to_datetime``; one entry per value in ``vs``.
        vs: Values to aggregate, aligned element-wise with ``mc``.
        start_year: First year of the output index, or ``None`` to return
            only the years present in the data.
        end_year: Last year (inclusive) of the output index. When ``None``,
            the latest year found in the data is used. Ignored when
            ``start_year`` is ``None``.
        series_val: Sequence whose first and last items are the start and
            stop of the slice taken from ``list_val``; e.g. ``[2, 4]``
            selects ``['max', 'min']``.

    Returns:
        A DataFrame indexed by year with one column per selected statistic
        ('mean' is renamed to 'Average', the others are title-cased). When
        ``start_year`` is given, the index is exactly
        ``start_year..last year`` and years without data are filled with 0.
    """
    print(series_val)
    data = pd.DataFrame({
        "Date": pd.to_datetime(mc),  # Convert to datetime immediately
        "Averages": vs,
    })
    stats = list_val[series_val[0]:series_val[-1]]
    out = (
        data.groupby(data["Date"].dt.year)["Averages"]  # Access Series
            .agg(stats)
            .rename(columns=lambda x: 'Average' if x == 'mean' else x.title())
    )
    if start_year is not None:
        # Use the user-defined last year, or the latest year in the data.
        last_year = end_year if end_year is not None else out.index.max()
        # range() excludes its stop value, so add 1 to keep last_year itself.
        out = out.reindex(range(start_year, last_year + 1), fill_value=0)
    return out

# series_val=[0, 2] selects list_val[0:2]; series_val=[2, 4] selects list_val[2:4].
graph = yearly_intervals(
    month_changes,
    vals,
    start_year=2016,
    end_year=2021,
    series_val=[0, 2],
)
graph_2 = yearly_intervals(
    month_changes_2,
    vals_2,
    start_year=2016,
    end_year=2021,
    series_val=[2, 4],
)

# join() aligns graph_2's columns onto graph's index.
combined = graph.join(graph_2)
print(combined)

which prints

[0, 2]
[2, 4]

      Average  Median  Max  Min
Date                           
2016      0.0     0.0    0    0
2017      0.0     0.0    0    0
2018     16.5    16.5  213   15
2019     14.0     5.0   53    1
2020      0.0     0.0  -63  -63
2021     -6.0    -6.0  120  120