I have a polars dataframe with two date columns that represent a start and end date and then a value that I want to repeat for all dates in between those two dates so that I can join those on other tables.
Example input is
id | start | end | value |
---|---|---|---|
123 | 2022-01-01 | 2022-01-04 | 10 |
abc | 2022-03-04 | 2022-03-04 | 3 |
456 | 2022-05-11 | 2022-05-16 | 4 |
and expected output is
id | date | value |
---|---|---|
123 | 2022-01-01 | 10 |
123 | 2022-01-02 | 10 |
123 | 2022-01-03 | 10 |
123 | 2022-01-04 | 10 |
abc | 2022-03-04 | 3 |
456 | 2022-05-11 | 4 |
456 | 2022-05-12 | 4 |
456 | 2022-05-13 | 4 |
456 | 2022-05-14 | 4 |
456 | 2022-05-15 | 4 |
456 | 2022-05-16 | 4 |
CodePudding user response:
I struggled today with the same problem and I thought I share my solution.
As cbilot already mentions pl.dat_range doesn't take expressions as low and high value. So I worked around by using apply.
Data:
import polars as pl
from datetime import date
df = pl.DataFrame(
{
"id": ["123", "abc", "456"],
"start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
"end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
"value": [10, 3, 4],
}
)
Solution:
(
df.with_columns(
[(pl.struct(["start", "end"])
.apply(lambda x: pl.date_range(x["start"], x["end"], "1d"))
.alias("date"))])
.explode(pl.col("date"))
.select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
CodePudding user response:
Starting with this data:
import polars as pl
from datetime import date
df = pl.DataFrame(
{
"id": ["123", "abc", "456"],
"start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
"end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
"value": [10, 3, 4],
}
)
df
shape: (3, 4)
┌─────┬────────────┬────────────┬───────┐
│ id ┆ start ┆ end ┆ value │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 │
╞═════╪════════════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴────────────┴───────┘
The Algorithm
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") 1).alias("date")])
.explode("date")
.with_column(pl.col("date").cast(pl.Date))
.select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
In steps
Normally, we create a range of dates using the date_range
expression. However, date_range
does not take an Expression as its low
and high
parameters.
However, arange
does allow Expressions as its low
and high
parameters. We can (implicitly) cast the start
and end
dates to integers, which represent the number of days since the UNIX epoch.
The result is a list of integers which represents the days between (and including) the start
and end
dates (expressed as days since the UNIX epoch)..
Notice that we have to add 1 to the high
parameter to make sure we capture the end date.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") 1).alias("date")])
)
shape: (3, 5)
┌─────┬────────────┬────────────┬───────┬───────────────────────────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ list[i64] │
╞═════╪════════════╪════════════╪═══════╪═══════════════════════════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ [18993, 18994, ... 18996] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ [19055] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ [19123, 19124, ... 19128] │
└─────┴────────────┴────────────┴───────┴───────────────────────────┘
Next we can use explode
to place each of the integers on a separate row.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") 1).alias("date")])
.explode("date")
)
shape: (11, 5)
┌─────┬────────────┬────────────┬───────┬───────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ i64 │
╞═════╪════════════╪════════════╪═══════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18993 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18994 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18995 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18996 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ 19055 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19123 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19124 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19125 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19126 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19127 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19128 │
└─────┴────────────┴────────────┴───────┴───────┘
The final step is to cast the date
column back to a pl.Date
, and then select only the columns that we want.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") 1).alias("date")])
.explode("date")
.with_column(pl.col("date").cast(pl.Date))
.select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘