Home > Blockchain >  How can I split or cut a dataframe based on a date interval
How can I split or cut a dataframe based on a date interval

Time:10-18

I got a big data file (x) with attributes such as the the dataframe, the frequency, filename, header and missingness. The dataframe (data) has 4 columns (date, X, Y, and Z) containing over 20 million datapoints. I want to split up the dataframe to a new dateframe based on an date interval. E.g., the 20 million datapoints range from 2016-09-28 09:30:00 to 2016-10-06 20:30:00 and I only want to keep the datapoints X, Y, and Z where the date is between 2016-09-30 08:00:00 to 2016-10-02 20:15:00. I tried that with the following code:

This is to create a new dataframe with only the data and not the other attributes

df1 = x$data
as.POSIXct(df1$time[1],format="%Y-%m-%dT%H:%M")

Then I try to make a date interval where the new df2 should only contain the values that range within that date zone

df2 <- df1[df1$time > "2016-09-30 09:30:00" &   df1$time <= "2016-10-03 20:15:00",]

However, the new dataframe does not start from 09:30:01 and ends on 20:14:59. So the question is how can I change the code so that I will be between this interval.

structure(list(time = structure(c(1475220600.03333, 1475220600.06667, 
1475220600.1, 1475220600.13333, 1475220600.16667, 1475220600.2, 
1475220600.23333, 1475220600.26667, 1475220600.3, 1475220600.33333, 
1475220600.36667, 1475220600.4, 1475220600.43333, 1475220600.46667, 
1475220600.5, 1475220600.53333, 1475220600.56667, 1475220600.6, 
1475220600.63333, 1475220600.66667, 1475220600.7, 1475220600.73333, 
1475220600.76667, 1475220600.8, 1475220600.83333, 1475220600.86667, 
1475220600.9, 1475220600.93333, 1475220600.96667, 1475220601, 
1475220601.03333, 1475220601.06667, 1475220601.1, 1475220601.13333, 
1475220601.16667, 1475220601.2, 1475220601.23333, 1475220601.26667, 
1475220601.3, 1475220601.33333), class = c("POSIXct", "POSIXt"
), tzone = "GMT"), X = c(0.039, 0.043, 0.043, 0.043, 0.039, 0.043, 
0.035, 0.039, 0.039, 0.043, 0.039, 0.039, 0.043, 0.043, 0.035, 
0.043, 0.035, 0.043, 0.035, 0.039, 0.039, 0.043, 0.043, 0.039, 
0.035, 0.035, 0.039, 0.039, 0.039, 0.031, 0.035, 0.035, 0.035, 
0.039, 0.035, 0.035, 0.039, 0.035, 0.039, 0.043), Y = c(0.016, 
0.012, 0.012, 0.02, 0.016, 0.02, 0.016, 0.012, 0.012, 0.02, 0.012, 
0.016, 0.012, 0.016, 0.02, 0.02, 0.012, 0.02, 0.016, 0.012, 0.02, 
0.012, 0.02, 0.023, 0.016, 0.016, 0.016, 0.02, 0.016, 0.012, 
0.016, 0.012, 0.016, 0.012, 0.016, 0.016, 0.02, 0.016, 0.012, 
0.012), Z = c(-0.977, -0.977, -0.969, -0.977, -0.969, -0.969, 
-0.977, -0.969, -0.973, -0.965, -0.973, -0.977, -0.977, -0.973, 
-0.969, -0.977, -0.973, -0.973, -0.973, -0.977, -0.973, -0.969, 
-0.969, -0.969, -0.973, -0.969, -0.969, -0.973, -0.973, -0.977, 
-0.973, -0.969, -0.973, -0.973, -0.973, -0.977, -0.973, -0.977, 
-0.973, -0.973)), subject_name = "1", time_zone = "02:00:00", missingness = structure(list(
    time = structure(c(1475747248, 1475747249, 1475747250, 1475747292, 
    1475747293, 1475747294), class = c("POSIXct", "POSIXt"), tzone = "GMT"), 
    n_missing = c(30L, 30L, 1230L, 30L, 30L, 32490L)), class = "data.frame", row.names = c("1475747248", 
"1475747249", "1475747250", "1475747292", "1475747293", "1475747294"
)), old_version = FALSE, firmware = "1.5.0", last_sample_time = structure(1475748377, tzone = "GMT", class = c("POSIXct", 
"POSIXt")), serial_prefix = "TAS", sample_rate = 30L, acceleration_min = "-8.0", acceleration_max = "8.0", header = structure(list(
    Field = c("Serial Number", "Device Type", "Firmware", "Battery Voltage", 
    "Sample Rate", "Start Date", "Stop Date", "Last Sample Time", 
    "TimeZone", "Download Date", "Board Revision", "Unexpected Resets", 
    "Acceleration Scale", "Acceleration Min", "Acceleration Max", 
    "Mass", "Age", "Limb", "DateOfBirth", "Subject Name", "Serial Prefix"
    ), Value = c(`Serial Number` = "TAS1E44150325", `Device Type` = "Link", 
    Firmware = "1.5.0", `Battery Voltage` = "3,88", `Sample Rate` = "30", 
    `Start Date` = "2016-09-28 08:00:00", `Stop Date` = "2016-10-07 17:00:00", 
    `Last Sample Time` = "2016-10-06 10:06:17", TimeZone = "02:00:00", 
    `Download Date` = "2016-10-06 10:06:17", `Board Revision` = "5", 
    `Unexpected Resets` = "0", `Acceleration Scale` = "256", 
    `Acceleration Min` = "-8.0", `Acceleration Max` = "8.0", 
    Mass = "77,1107028999572", Age = "22", Limb = "Waist", DateOfBirth = "627890912111111100", 
    `Subject Name` = "1", `Serial Prefix` = "TAS")), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -21L)), start_time = structure(1475049600, tzone = "GMT", class = c("POSIXct", 
"POSIXt")), stop_time = structure(1475859600, tzone = "GMT", class = c("POSIXct", 
"POSIXt")), total_records = 20930820L, bad_samples = FALSE, row.names = 5130002:14067001, class = c("activity_df", 
"activity_df", "data.frame"), n_head = 40)

CodePudding user response:

To expand on my comment...

Your test data frame doesn't include any data in the date range you specify and "Hmm it still does not work" is not very informative. So I'll demonstrate with some artificial data.

library(tidyverse)
library(lubridate)

df1 <- tibble(
        time=seq.POSIXt(
               ymd_hms("2016-09-30 07:30:00"), 
               ymd_hms("2016-10-06 20:30:00"), 
               120
             )
       )
df1
# A tibble: 4,711 × 1
   time               
   <dttm>             
 1 2016-09-30 07:30:00
 2 2016-09-30 07:32:00
 3 2016-09-30 07:34:00
 4 2016-09-30 07:36:00
 5 2016-09-30 07:38:00
 6 2016-09-30 07:40:00
 7 2016-09-30 07:42:00
 8 2016-09-30 07:44:00
 9 2016-09-30 07:46:00
10 2016-09-30 07:48:00
# … with 4,701 more rows

So, 4711 rows with datetimes every two minutes between 2016-09-30 07:30:00 and 2016-10-06 20:30:00.

Filter as required.

df1 %>% 
  filter(
    time > ymd_hms("2016-09-30 09:30:00") & 
    time < ymd_hms("2016-10-03 20:15:00")
  )
# A tibble: 2,482 × 1
   time               
   <dttm>             
 1 2016-09-30 09:32:00
 2 2016-09-30 09:34:00
 3 2016-09-30 09:36:00
 4 2016-09-30 09:38:00
 5 2016-09-30 09:40:00
 6 2016-09-30 09:42:00
 7 2016-09-30 09:44:00
 8 2016-09-30 09:46:00
 9 2016-09-30 09:48:00
10 2016-09-30 09:50:00
# … with 2,472 more rows
  • Related