Say I have a DataFrame
consisting of the following four Series
:
use polars::prelude::*;
use chrono::prelude::*;
use chrono::Duration;
fn main() {
let series_one = Series::new(
"a",
(0..4).into_iter().map(|v| v as f64).collect::<Vec<_>>(),
);
let series_two = Series::new(
"a",
(4..8).into_iter().map(|v| v as f64).collect::<Vec<_>>(),
);
let series_three = Series::new(
"a",
(8..12).into_iter().map(|v| v as f64).collect::<Vec<_>>(),
);
let series_dates = Series::new(
"date",
(0..4)
.into_iter()
.map(|v| NaiveDate::default() Duration::days(v))
.collect::<Vec<_>>(),
);
and I join them as such:
let df_one = DataFrame::new(vec![series_one, series_dates.clone()]).unwrap();
let df_two = DataFrame::new(vec![series_two, series_dates.clone()]).unwrap();
let df_three = DataFrame::new(vec![series_three, series_dates.clone()]).unwrap();
let df = df_one
.join(
&df_two,
["date"],
["date"],
JoinType::Outer,
Some("1".into()),
)
.unwrap()
.join(
&df_three,
["date"],
["date"],
JoinType::Outer,
Some("2".into()),
)
.unwrap();
which produces the following DataFrame
:
shape: (4, 4)
┌─────┬────────────┬─────┬──────┐
│ a ┆ date ┆ a1 ┆ a2 │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ date ┆ f64 ┆ f64 │
╞═════╪════════════╪═════╪══════╡
│ 0.0 ┆ 1970-01-01 ┆ 4.0 ┆ 8.0 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 1.0 ┆ 1970-01-02 ┆ 5.0 ┆ 9.0 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2.0 ┆ 1970-01-03 ┆ 6.0 ┆ 10.0 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3.0 ┆ 1970-01-04 ┆ 7.0 ┆ 11.0 │
└─────┴────────────┴─────┴──────┘
How can I make a new DataFrame
which contains a date
column and a a_median
column like so?:
┌────────────┬────────────┐
│ a_median ┆ date ┆
│ --- ┆ --- ┆
│ f64 ┆ date ┆
╞════════════╪════════════╡
│ 4.0 ┆ 1970-01-01 ┆
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5.0 ┆ 1970-01-02 ┆
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6.0 ┆ 1970-01-03 ┆
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7.0 ┆ 1970-01-04 ┆
└────────────┴────────────┘
I think this is best accomplished via LazyFrame
s but I'm not sure how to get this exact result.
CodePudding user response:
To get the results you're looking for, you can union the three DataFrames using the vstack method:
let mut unioned = df_one.vstack(&df_two).unwrap();
unioned = unioned.vstack(&df_three).unwrap();
Once you have a single DataFrame with all the records, you can group and aggregate them:
let aggregated = unioned.lazy()
.groupby(["date"])
.agg([
col("a").median().alias("a_median")
])
.sort(
"date",
SortOptions {
descending: false,
nulls_last: true
}
)
.collect()
.unwrap();
Which gives the expected results:
┌────────────┬──────────┐
│ date ┆ a_median │
│ --- ┆ --- │
│ date ┆ f64 │
╞════════════╪══════════╡
│ 1970-01-01 ┆ 4.0 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1970-01-02 ┆ 5.0 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1970-01-03 ┆ 6.0 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1970-01-04 ┆ 7.0 │
└────────────┴──────────┘