Home > Software design >  How to remove NAs based on column data in data frames in a list?
How to remove NAs based on column data in data frames in a list?

Time:06-21

I have a list (my.list) that looks like this:

$S1
  Study_ID   B   C         D
1      100  NA  C1 0.9124000
2      100 1.5 PTA        NA
3      200 1.8  C1 0.5571429
4      200 2.1 PTA 0.7849462
5      300 3.2  C1 0.3271900
6      300 1.4 PTA        NA
7      400  NA  C1 0.8248200
8      400 9.3 PTA 0.2847020

$S2
  Study_ID    B   C         D
1      100   NA  C1 0.9124000
2      100 0.70 PTA        NA
3      200   NA  C1 0.5571429
4      200 0.45 PTA 0.7849462
5      300 0.91  C1 0.3271900
6      300 0.78 PTA 0.6492000
7      400 0.65  C1 0.8248200
8      400   NA PTA        NA

If a patient has 'NA' in column D, I would like to remove the entire patient from the list - that is, remove them based on Study_ID.

In other words, if there is an NA in Column D, I would like to remove the two rows that have the same Study_ID.

My desired output would look like this:

$S1
  Study_ID   B   C         D
1      200 1.8  C1 0.5571429
2      200 2.1 PTA 0.7849462
3      400  NA  C1 0.8248200
4      400 9.3 PTA 0.2847020

$S2
  Study_ID    B   C         D
1      200   NA  C1 0.5571429
2      200 0.45 PTA 0.7849462
3      300 0.91  C1 0.3271900
4      300 0.78 PTA 0.6492000

How can I go about doing this?

Reproducible Data:

# Reproducible input: a named list holding two data frames (S1 and S2).
# Each data frame has two rows per Study_ID (one "C1" row, one "PTA" row);
# B and D contain missing values.
my.list <- list(
  S1 = structure(
    list(
      Study_ID = rep(c(100, 200, 300, 400), each = 2),
      B = c(NA, 1.5, 1.8, 2.1, 3.2, 1.4, NA, 9.3),
      C = rep(c("C1", "PTA"), times = 4),
      D = c(0.9124, NA, 0.5571429, 0.7849462, 0.32719, NA, 0.82482, 0.284702)
    ),
    class = "data.frame",
    # Character row names "1".."8", exactly as in the dput() form.
    row.names = as.character(1:8)
  ),
  S2 = structure(
    list(
      Study_ID = rep(c(100, 200, 300, 400), each = 2),
      B = c(NA, 0.7, NA, 0.45, 0.91, 0.78, 0.65, NA),
      C = rep(c("C1", "PTA"), times = 4),
      D = c(0.9124, NA, 0.5571429, 0.7849462, 0.32719, 0.6492, 0.82482, NA)
    ),
    class = "data.frame",
    row.names = as.character(1:8)
  )
)

CodePudding user response:

tidyverse

library(tidyverse)

# For every data frame in the list: group rows by patient, keep only the
# patients (groups) that have no NA in D, then drop the grouping so the
# returned tibbles do not carry Study_ID groups into later operations.
my.list %>% 
  map(~ group_by(.x, Study_ID)) %>% 
  map(~ filter(.x, !anyNA(D))) %>% 
  map(ungroup)

#> $S1
#> # A tibble: 4 × 4
#>   Study_ID     B C         D
#>      <dbl> <dbl> <chr> <dbl>
#> 1      200   1.8 C1    0.557
#> 2      200   2.1 PTA   0.785
#> 3      400  NA   C1    0.825
#> 4      400   9.3 PTA   0.285
#> 
#> $S2
#> # A tibble: 4 × 4
#>   Study_ID     B C         D
#>      <dbl> <dbl> <chr> <dbl>
#> 1      200 NA    C1    0.557
#> 2      200  0.45 PTA   0.785
#> 3      300  0.91 C1    0.327
#> 4      300  0.78 PTA   0.649

data.table

library(magrittr)
library(data.table)

# as.data.table() returns a converted copy of each data frame. setDT() would
# instead convert the elements of my.list by reference, silently turning the
# caller's data frames into data.tables.
my.list %>% 
  lapply(as.data.table) %>% 
  # Per Study_ID group: the scalar TRUE keeps every row of the group when D
  # has no NA; the scalar FALSE keeps none, which removes the whole patient.
  lapply(function(x) x[, .SD[!anyNA(D)], by = Study_ID])
#> $S1
#>    Study_ID   B   C         D
#> 1:      200 1.8  C1 0.5571429
#> 2:      200 2.1 PTA 0.7849462
#> 3:      400  NA  C1 0.8248200
#> 4:      400 9.3 PTA 0.2847020
#> 
#> $S2
#>    Study_ID    B   C         D
#> 1:      200   NA  C1 0.5571429
#> 2:      200 0.45 PTA 0.7849462
#> 3:      300 0.91  C1 0.3271900
#> 4:      300 0.78 PTA 0.6492000

data

# Example input: a list of two data frames, S1 and S2. Both share the same
# Study_ID and C columns and differ only in B and D, so a small constructor
# (kept private inside local()) builds each one.
my.list <- local({
  make_study_df <- function(B, D) {
    structure(
      list(
        Study_ID = c(100, 100, 200, 200, 300, 300, 400, 400),
        B = B,
        C = c("C1", "PTA", "C1", "PTA", "C1", "PTA", "C1", "PTA"),
        D = D
      ),
      class = "data.frame",
      # Character row names, matching the original dput() output.
      row.names = c("1", "2", "3", "4", "5", "6", "7", "8")
    )
  }

  list(
    S1 = make_study_df(
      B = c(NA, 1.5, 1.8, 2.1, 3.2, 1.4, NA, 9.3),
      D = c(0.9124, NA, 0.5571429, 0.7849462, 0.32719, NA, 0.82482, 0.284702)
    ),
    S2 = make_study_df(
      B = c(NA, 0.7, NA, 0.45, 0.91, 0.78, 0.65, NA),
      D = c(0.9124, NA, 0.5571429, 0.7849462, 0.32719, 0.6492, 0.82482, NA)
    )
  )
})

CodePudding user response:

A small alternative to @Yuriy's answer:

library(dplyr)
library(purrr)

# One pass over the list: within each data frame, keep only the Study_ID
# groups whose D column has no missing value, then remove the grouping.
my.list %>% 
  map(~ .x %>% 
        group_by(Study_ID) %>% 
        filter(!anyNA(D)) %>% 
        ungroup())

In base R:

# For each data frame, collect the Study_IDs that have at least one NA in D
# and drop every row belonging to those patients.
lapply(my.list, function(x) {
  # Extract the IDs with `$` so the result is always a plain vector, even if
  # x is a tibble or data.table (where x[rows, "Study_ID"] stays a table and
  # %in% would then match nothing).
  ids_with_na <- unique(x$Study_ID[is.na(x$D)])
  x[!x$Study_ID %in% ids_with_na, , drop = FALSE]
})
  • Related