Home > Net >  How do I take the median of several columns in polars-rs?
How do I take the median of several columns in polars-rs?

Time:12-01

Say I have a DataFrame consisting of the following four Series:

use polars::prelude::*;
use chrono::prelude::*;
use chrono::Duration;

fn main() {

    let series_one = Series::new(
        "a",
        (0..4).into_iter().map(|v| v as f64).collect::<Vec<_>>(),
    );
    let series_two = Series::new(
        "a",
        (4..8).into_iter().map(|v| v as f64).collect::<Vec<_>>(),
    );
    let series_three = Series::new(
        "a",
        (8..12).into_iter().map(|v| v as f64).collect::<Vec<_>>(),
    );

    let series_dates = Series::new(
        "date",
        (0..4)
            .into_iter()
            .map(|v| NaiveDate::default()   Duration::days(v))
            .collect::<Vec<_>>(),
    );

and I join them as such:

    let df_one = DataFrame::new(vec![series_one, series_dates.clone()]).unwrap();
    let df_two = DataFrame::new(vec![series_two, series_dates.clone()]).unwrap();
    let df_three = DataFrame::new(vec![series_three, series_dates.clone()]).unwrap();
    let df = df_one
        .join(
            &df_two,
            ["date"],
            ["date"],
            JoinType::Outer,
            Some("1".into()),
        )
        .unwrap()
        .join(
            &df_three,
            ["date"],
            ["date"],
            JoinType::Outer,
            Some("2".into()),
        )
        .unwrap();

which produces the following DataFrame:

shape: (4, 4)
┌─────┬────────────┬─────┬──────┐
│ a   ┆ date       ┆ a1  ┆ a2   │
│ --- ┆ ---        ┆ --- ┆ ---  │
│ f64 ┆ date       ┆ f64 ┆ f64  │
╞═════╪════════════╪═════╪══════╡
│ 0.0 ┆ 1970-01-01 ┆ 4.0 ┆ 8.0  │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 1.0 ┆ 1970-01-02 ┆ 5.0 ┆ 9.0  │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2.0 ┆ 1970-01-03 ┆ 6.0 ┆ 10.0 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3.0 ┆ 1970-01-04 ┆ 7.0 ┆ 11.0 │
└─────┴────────────┴─────┴──────┘

How can I make a new DataFrame which contains a date column and a a_median column like so?:

┌────────────┬────────────┐
│ a_median   ┆ date       ┆
│ ---        ┆ ---        ┆
│ f64        ┆ date       ┆
╞════════════╪════════════╡
│ 4.0        ┆ 1970-01-01 ┆
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5.0        ┆ 1970-01-02 ┆
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6.0        ┆ 1970-01-03 ┆
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7.0        ┆ 1970-01-04 ┆
└────────────┴────────────┘

I think this is best accomplished via LazyFrames but I'm not sure how to get this exact result.

CodePudding user response:

To get the results you're looking for, you can union the three DataFrames using the vstack method:

let mut unioned = df_one.vstack(&df_two).unwrap();
unioned = unioned.vstack(&df_three).unwrap();

Once you have a single DataFrame with all the records, you can group and aggregate them:

let aggregated = unioned.lazy()
                        .groupby(["date"])
                        .agg([
                            col("a").median().alias("a_median")
                         ])
                         .sort(
                            "date", 
                            SortOptions {
                                descending: false, 
                                nulls_last: true
                            }
                         )
                         .collect()
                         .unwrap();

Which gives the expected results:

    ┌────────────┬──────────┐
    │ date       ┆ a_median │
    │ ---        ┆ ---      │
    │ date       ┆ f64      │
    ╞════════════╪══════════╡
    │ 1970-01-01 ┆ 4.0      │
    ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
    │ 1970-01-02 ┆ 5.0      │
    ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
    │ 1970-01-03 ┆ 6.0      │
    ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
    │ 1970-01-04 ┆ 7.0      │
    └────────────┴──────────┘
  • Related