Home > Software design >  How to choose columns if they do not contain "NA" in the rows when using pivot longer
How to choose columns if they do not contain "NA" in the rows when using pivot longer

Time:01-30

I am trying to use the pivot longer function, except for my date-time column, and except for the columns that have "NA"'s in them. Previously I was using this code because instead of choosing not to pivot columns with NA's I was choosing not to pivot with columns that had the name stationary in it's title.

results <-
  as_tibble(LaHave_WA2021) %>%
  # Pivot non-stationary loggers
  pivot_longer(
    -c(Date_Time_GMT_3, ends_with("Stationary")), #this is the line I need to change
    names_to = "response",
    values_to = "y"
  )

I was thinking of using !contains, but I think that is looking if the column name (not the values within it) does not contain NA, for e.g.

results <-
  as_tibble(LaHave_WA2021) %>%
  # Pivot non-stationary loggers
  pivot_longer(
    -c(Date_Time_GMT_3, !contains("NA")), #changed to contains here
    names_to = "response",
    values_to = "y"
  )

Any ideas how I can do this?

here is a sample of my datafram

structure(list(Date_Time_GMT_3 = structure(c(1622552400, 1622553300, 
1622554200, 1622555100, 1622556000, 1622556900), class = c("POSIXct", 
"POSIXt"), tzone = "EST"), X20822244_27LH_Stationary = c(13.173, 
13.269, 13.269, 13.269, 13.269, 13.269), X20874232_B2LH_Stationary = c(14.517, 
14.517, 14.517, 14.517, 14.517, 14.517), X20874286_X1LH_Stationary = c(13.558, 
13.558, 13.558, 13.654, 13.654, 13.75), X2322844_X1LH_AIR = c(12.11, 
12.207, 12.304, 12.401, 12.401, 12.594), X20817728_X3LH_Stationary = c(14.709, 
14.709, 14.709, 14.709, 14.804, 14.804), X20676884_X3LH_U_StationaryCompare = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), X20819743_X6LH_Stationary = c(15.282, 
15.091, 15.282, 15.091, 14.996, 15.187), X20676900_50LH_U = c(12.207, 
12.207, 12.207, 12.207, 12.207, 12.207), X20822214_73LH_U = c(12.401, 
12.304, 12.304, 12.304, 12.304, 12.401), X20676887_44LH_U = c(11.722, 
11.722, 11.722, 11.722, 11.722, 11.722), X20822223_46LH_U = c(12.401, 
12.401, 12.401, 12.401, 12.497, 12.497), X20874231_56LH_U = c(15.282, 
15.187, 15.187, 15.187, 15.187, 15.187), X20874287_86LH_U = c(11.722, 
11.722, 11.722, 11.722, 11.819, 11.819), X20874298_71LH_U = c(12.69, 
12.594, 12.594, 12.594, 12.594, 12.594), X20874309_51LH_U = c(12.594, 
12.594, 12.69, 12.69, 12.69, 12.787), X20676887_X2LH_S = c(16.9674670250765, 
16.9674670250765, 16.9674670250765, 16.9674670250765, 17.0422755866248, 
17.0422755866248), X20819831_11LH_S = c(12.8720832098541, 12.8720832098541, 
12.8720832098541, 12.8720832098541, 12.8720832098541, 12.8720832098541
), X20822214_X4LH_S = c(17.5627317604855, 17.4148767005618, 17.5627317604855, 
17.4148767005618, 17.3413362257306, 17.4891912856543), X20822223_B3LH_S = c(19.4491451568519, 
19.4491451568519, 19.4491451568519, 19.4491451568519, 19.4491451568519, 
19.4491451568519), X20874231_62LH_S = c(15.271566973638, 15.271566973638, 
15.271566973638, 15.271566973638, 15.352583630419, 15.352583630419
), X20874298_B5LH_S = c(16.18926182318, 16.0019888653251, 16.18926182318, 
16.0019888653251, 15.9088426297429, 16.0961155875977), X20874309_B4LH_S = c(16.4017335880702, 
16.4017335880702, 16.4017335880702, 16.4017335880702, 16.4017335880702, 
16.4017335880702), X20676900_X5LH_S = c(13.1517913863509, 13.1517913863509, 
13.1517913863509, 13.2570884294554, 13.2570884294554, 13.3623854725599
), X20874287_58LH_U = c(12.913887422357, 12.913887422357, 12.913887422357, 
13.0000515069191, 13.0000515069191, 13.0862155914811), X20676900_54LH_S = c(11.405378599334, 
11.405378599334, 11.405378599334, 11.5019113471797, 11.5019113471797, 
11.5984440950255), X20819831_B1LH_S = c(16.3133453742116, 16.3133453742116, 
16.3133453742116, 16.3133453742116, 16.3133453742116, 16.3133453742116
), X20822214_76LH_S = c(12.704891245155, 12.704891245155, 12.704891245155, 
12.7949447496304, 12.7949447496304, 12.8849982541058), X20874231_31LH_S = c(13.8728743143662, 
13.8728743143662, 13.8728743143662, 13.8728743143662, 13.9537598661273, 
13.9537598661273), X20874298_55LHA_S = c(15.4572659453372, 15.2879259022681, 
15.4572659453372, 15.2879259022681, 15.2036991792756, 15.3730392223447
), X20822223_55LHB_S = c(15.4378290607062, 15.2700462580074, 
15.4378290607062, 15.2700462580074, 15.1865940786546, 15.3543768813534
), X20874287_70LH_S = c(16.429020745951, 16.429020745951, 16.429020745951, 
16.429020745951, 16.505152496343, 16.505152496343), X20874309_69LH_S = c(12.693578938679, 
12.693578938679, 12.693578938679, 12.7722769090551, 12.7722769090551, 
12.8509748794313), X10694645_20LH_S = c(14.1427870767824, 14.2416357374311, 
14.2416357374311, 14.2416357374311, 14.2416357374311, 14.2416357374311
), X20676900_57LH_S = c(14.4625399465288, 14.5352849038226, 14.5352849038226, 
14.5352849038226, 14.5352849038226, 14.5352849038226), X20819831_3LH_S = c(13.7114345882018, 
13.8015037541561, 13.8015037541561, 13.8015037541561, 13.8015037541561, 
13.8015037541561), X20822223_4LH_S = c(14.5467983081353, 14.628865937343, 
14.628865937343, 14.628865937343, 14.628865937343, 14.628865937343
), X20874231_79LH_S = c(13.5990052211839, 13.5990052211839, 13.5990052211839, 
13.5990052211839, 13.5990052211839, 13.5990052211839), X20874287_67LH_S = c(16.7919014799185, 
16.7919014799185, 16.7919014799185, 16.7919014799185, 16.8571822586659, 
16.8571822586659), X20874298_47LH_S = c(13.1257581381716, 13.2199456919256, 
13.2199456919256, 13.2199456919256, 13.2199456919256, 13.2199456919256
), X20874309_74LH_S = c(14.2622498116035, 14.2622498116035, 14.2622498116035, 
14.2622498116035, 14.3482543034687, 14.3482543034687), X20676887_49LH_S = c(12.3859934583537, 
12.3859934583537, 12.3859934583537, 12.461253579944, 12.461253579944, 
12.5365137015344)), row.names = c(NA, 6L), class = "data.frame")

CodePudding user response:

We may select to remove the columns that have all NAs, and this will remove those columns as part of the output

library(dplyr)
library(tidyr)
LaHave_WA2021 %>%
   select(where(~ !all(is.na(.x)))) %>% 
   pivot_longer(cols = -c(Date_Time_GMT_3), names_to = 'response', 
   values_to = 'y')

-output

# A tibble: 240 × 3
   Date_Time_GMT_3     response                      y
   <dttm>              <chr>                     <dbl>
 1 2021-06-01 08:00:00 X20822244_27LH_Stationary  13.2
 2 2021-06-01 08:00:00 X20874232_B2LH_Stationary  14.5
 3 2021-06-01 08:00:00 X20874286_X1LH_Stationary  13.6
 4 2021-06-01 08:00:00 X2322844_X1LH_AIR          12.1
 5 2021-06-01 08:00:00 X20817728_X3LH_Stationary  14.7
 6 2021-06-01 08:00:00 X20819743_X6LH_Stationary  15.3
 7 2021-06-01 08:00:00 X20676900_50LH_U           12.2
 8 2021-06-01 08:00:00 X20822214_73LH_U           12.4
 9 2021-06-01 08:00:00 X20676887_44LH_U           11.7
10 2021-06-01 08:00:00 X20822223_46LH_U           12.4
# … with 230 more rows

If we want to remove the NA rows, it is also possible with values_drop_na = TRUE

LaHave_WA2021 %>%
  pivot_longer(cols = -c(Date_Time_GMT_3), names_to = 'response', 
   values_to = 'y', values_drop_na = TRUE)

-output

# A tibble: 240 × 3
   Date_Time_GMT_3     response                      y
   <dttm>              <chr>                     <dbl>
 1 2021-06-01 08:00:00 X20822244_27LH_Stationary  13.2
 2 2021-06-01 08:00:00 X20874232_B2LH_Stationary  14.5
 3 2021-06-01 08:00:00 X20874286_X1LH_Stationary  13.6
 4 2021-06-01 08:00:00 X2322844_X1LH_AIR          12.1
 5 2021-06-01 08:00:00 X20817728_X3LH_Stationary  14.7
 6 2021-06-01 08:00:00 X20819743_X6LH_Stationary  15.3
 7 2021-06-01 08:00:00 X20676900_50LH_U           12.2
 8 2021-06-01 08:00:00 X20822214_73LH_U           12.4
 9 2021-06-01 08:00:00 X20676887_44LH_U           11.7
10 2021-06-01 08:00:00 X20822223_46LH_U           12.4
# … with 230 more rows

CodePudding user response:

Try:

results <-
  as_tibble(LaHave_WA2021) %>%
  pivot_longer(
    -c(Date_Time_GMT_3, where(anyNA)), # or where(~ all(is.na(.)))
    names_to = "response",
    values_to = "y"
  )

Output:

results[1:10, ]

# A tibble: 10 × 4
   Date_Time_GMT_3     X20676884_X3LH_U_StationaryCompare response                      y
   <dttm>                                           <dbl> <chr>                     <dbl>
 1 2021-06-01 08:00:00                                 NA X20822244_27LH_Stationary  13.2
 2 2021-06-01 08:00:00                                 NA X20874232_B2LH_Stationary  14.5
 3 2021-06-01 08:00:00                                 NA X20874286_X1LH_Stationary  13.6
 4 2021-06-01 08:00:00                                 NA X2322844_X1LH_AIR          12.1
 5 2021-06-01 08:00:00                                 NA X20817728_X3LH_Stationary  14.7
 6 2021-06-01 08:00:00                                 NA X20819743_X6LH_Stationary  15.3
 7 2021-06-01 08:00:00                                 NA X20676900_50LH_U           12.2
 8 2021-06-01 08:00:00                                 NA X20822214_73LH_U           12.4
 9 2021-06-01 08:00:00                                 NA X20676887_44LH_U           11.7
10 2021-06-01 08:00:00                                 NA X20822223_46LH_U           12.4

This is excluding columns from pivoting which have any missing values. If you're looking to exclude only columns that only have missing values, you could replace the where part with e.g. where(~ all(is.na(.))).

CodePudding user response:

We could use the values_drop_na = TRUE argument:

library(dplyr)
library(tidyr)

df %>% 
  pivot_longer(
    -Date_Time_GMT_3,
      names_to = "response",
      values_to = "y",
    values_drop_na = TRUE
  ) 
  Date_Time_GMT_3     response                      y
   <dttm>              <chr>                     <dbl>
 1 2021-06-01 08:00:00 X20822244_27LH_Stationary  13.2
 2 2021-06-01 08:00:00 X20874232_B2LH_Stationary  14.5
 3 2021-06-01 08:00:00 X20874286_X1LH_Stationary  13.6
 4 2021-06-01 08:00:00 X2322844_X1LH_AIR          12.1
 5 2021-06-01 08:00:00 X20817728_X3LH_Stationary  14.7
 6 2021-06-01 08:00:00 X20819743_X6LH_Stationary  15.3
 7 2021-06-01 08:00:00 X20676900_50LH_U           12.2
 8 2021-06-01 08:00:00 X20822214_73LH_U           12.4
 9 2021-06-01 08:00:00 X20676887_44LH_U           11.7
10 2021-06-01 08:00:00 X20822223_46LH_U           12.4
# … with 230 more rows

Another verbose solution is:

library(dplyr)
library(tidyr)
library(purrr)
with_na <- df %>%
  purrr::map_df(~sum(is.na(.))) %>% 
  select(where(~sum(.) != 0)) %>% 
  colnames()

df %>% 
  pivot_longer(
    -c(Date_Time_GMT_3, with_na),
      names_to = "response",
      values_to = "y"
  ) %>% 
  select(contains("Date"), response, y)
 Date_Time_GMT_3     response                      y
   <dttm>              <chr>                     <dbl>
 1 2021-06-01 08:00:00 X20822244_27LH_Stationary  13.2
 2 2021-06-01 08:00:00 X20874232_B2LH_Stationary  14.5
 3 2021-06-01 08:00:00 X20874286_X1LH_Stationary  13.6
 4 2021-06-01 08:00:00 X2322844_X1LH_AIR          12.1
 5 2021-06-01 08:00:00 X20817728_X3LH_Stationary  14.7
 6 2021-06-01 08:00:00 X20819743_X6LH_Stationary  15.3
 7 2021-06-01 08:00:00 X20676900_50LH_U           12.2
 8 2021-06-01 08:00:00 X20822214_73LH_U           12.4
 9 2021-06-01 08:00:00 X20676887_44LH_U           11.7
10 2021-06-01 08:00:00 X20822223_46LH_U           12.4
# … with 230 more rows
# ℹ Use `print(n = ...)` to see more rows
  • Related