Weird behavior when wrapping purrr::map within dplyr::mutate-CodePudding

I am running into some errors I do not fully understand when trying to call purrr::map around dplyr::mutate. The reproducible code is as follows:

library(purrr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tibble)
# Example data: a three-row tibble whose class vector already includes
# "rowwise_df" (see `class =` below), i.e. it is a rowwise data frame.
# `ncbi_id` and `g_test` are list-columns; in the third row both hold
# zero-length vectors, which is the case that triggers the error further down.
test_dset <- structure(list(genus = c("Aureitalea", "Aureivirga", "Auricoccucs"), 
                            t_count = c(0L, 0L, 0L), n = c(1L, 1L, 1L), 
                            ncbi_id = list("1176327", "1433990", character(0)), 
                            g_test = list(c(`1176327` = 0), 
                                          c(`1433990` = 0), 
                                          structure(numeric(0), .Names = character(0)))), 
                       class = c("rowwise_df", "tbl_df", "tbl", "data.frame"), 
                       row.names = c(NA, -3L), 
                       # One group per row: this is what makes the tibble rowwise.
                       groups = structure(list(.rows = structure(list(1L, 2L, 3L), 
                                                                 ptype = integer(0), 
                                                                 class = c("vctrs_list_of","vctrs_vctr", "list"))), 
                                          row.names = c(NA, -3L), 
                                          class = c("tbl_df", "tbl", "data.frame")))
test_dset
#> # A tibble: 3 × 5
#> # Rowwise: 
#>   genus       t_count     n ncbi_id   g_test   
#>   <chr>         <int> <int> <list>    <list>   
#> 1 Aureitalea        0     1 <chr [1]> <dbl [1]>
#> 2 Aureivirga        0     1 <chr [1]> <dbl [1]>
#> 3 Auricoccucs       0     1 <chr [0]> <dbl [0]>
# process a vector of pvals 
#' Pick the names of the significant entries in a vector of p-values.
#'
#' @param pvals A (typically named) numeric vector of p-values; may be empty.
#' @return The names of the entries of `pvals` that are below 0.05, or
#'   `NA_character_` when `pvals` is empty or no entry is below 0.05.
proc_gtest <- function(pvals) {
  # Skip the comparison entirely for empty input; `which()` also ignores NAs.
  hits <- if (length(pvals) > 0) which(pvals < 0.05) else integer(0)
  if (length(hits) > 0) names(pvals)[hits] else NA_character_
}

# returns errors 
test_dset |> mutate(ncbi_filt = map(g_test, proc_gtest))
#> Error: Problem with `mutate()` column `ncbi_filt`.
#> ℹ `ncbi_filt = map(g_test, proc_gtest)`.
#> ℹ `ncbi_filt` must be size 1, not 0.
#> ℹ Did you mean: `ncbi_filt = list(map(g_test, proc_gtest))` ?
#> ℹ The error occurred in row 3.
# this is okay 
map(test_dset$g_test, proc_gtest)
#> [[1]]
#> [1] "1176327"
#> 
#> [[2]]
#> [1] "1433990"
#> 
#> [[3]]
#> [1] NA
# adding list doesn't work because it returns a list of NULL 
# with names as the quantities I wanted. 
test_dset |> mutate(ncbi_filt = list(map(g_test, proc_gtest))) |> pull(ncbi_filt)
#> [[1]]
#> [[1]]$`1176327`
#> NULL
#> 
#> 
#> [[2]]
#> [[2]]$`1433990`
#> NULL
#> 
#> 
#> [[3]]
#> named list()

Created on 2021-10-13 by the reprex package (v2.0.1)

Session info

sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 4.1.1 (2021-08-10)
#>  os       macOS Mojave 10.14.6        
#>  system   x86_64, darwin17.0          
#>  ui       X11                         
#>  language (EN)                        
#>  collate  en_US.UTF-8                 
#>  ctype    en_US.UTF-8                 
#>  tz       America/New_York            
#>  date     2021-10-13                  
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  ! package     * version date       lib source        
#>    assertthat    0.2.1   2019-03-21 [1] CRAN (R 4.1.0)
#>    backports     1.2.1   2020-12-09 [1] CRAN (R 4.1.0)
#>    cli           3.0.1   2021-07-17 [1] CRAN (R 4.1.0)
#>    crayon        1.4.1   2021-02-08 [1] CRAN (R 4.1.0)
#>    DBI           1.1.1   2021-01-15 [1] CRAN (R 4.1.0)
#>    digest        0.6.27  2020-10-24 [1] CRAN (R 4.1.0)
#>    dplyr       * 1.0.7   2021-06-18 [1] CRAN (R 4.1.0)
#>    ellipsis      0.3.2   2021-04-29 [1] CRAN (R 4.1.0)
#>    evaluate      0.14    2019-05-28 [1] CRAN (R 4.1.0)
#>    fansi         0.5.0   2021-05-25 [1] CRAN (R 4.1.0)
#>    fastmap       1.1.0   2021-01-25 [1] CRAN (R 4.1.0)
#>    fs            1.5.0   2020-07-31 [1] CRAN (R 4.1.0)
#>    generics      0.1.0   2020-10-31 [1] CRAN (R 4.1.0)
#>    glue          1.4.2   2020-08-27 [1] CRAN (R 4.1.0)
#>    highr         0.9     2021-04-16 [1] CRAN (R 4.1.0)
#>    htmltools     0.5.2   2021-08-25 [1] CRAN (R 4.1.0)
#>    knitr         1.34    2021-09-09 [1] CRAN (R 4.1.0)
#>    lifecycle     1.0.0   2021-02-15 [1] CRAN (R 4.1.0)
#>    magrittr      2.0.1   2020-11-17 [1] CRAN (R 4.1.0)
#>    pillar        1.6.2   2021-07-29 [1] CRAN (R 4.1.0)
#>    pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.1.0)
#>    purrr       * 0.3.4   2020-04-17 [1] CRAN (R 4.1.0)
#>  P R.cache       0.15.0  2021-04-30 [?] CRAN (R 4.1.0)
#>  P R.methodsS3   1.8.1   2020-08-26 [?] CRAN (R 4.1.0)
#>  P R.oo          1.24.0  2020-08-26 [?] CRAN (R 4.1.0)
#>  P R.utils       2.11.0  2021-09-26 [?] CRAN (R 4.1.0)
#>    R6            2.5.1   2021-08-19 [1] CRAN (R 4.1.0)
#>    reprex        2.0.1   2021-08-05 [1] CRAN (R 4.1.0)
#>    rlang         0.4.11  2021-04-30 [1] CRAN (R 4.1.0)
#>    rmarkdown     2.11    2021-09-14 [1] CRAN (R 4.1.0)
#>    rstudioapi    0.13    2020-11-12 [1] CRAN (R 4.1.0)
#>    sessioninfo   1.1.1   2018-11-05 [3] CRAN (R 4.1.0)
#>    stringi       1.7.4   2021-08-25 [1] CRAN (R 4.1.0)
#>    stringr       1.4.0   2019-02-10 [1] CRAN (R 4.1.0)
#>  P styler        1.6.2   2021-09-23 [?] CRAN (R 4.1.0)
#>    tibble      * 3.1.4   2021-08-25 [1] CRAN (R 4.1.0)
#>    tidyselect    1.1.1   2021-04-30 [1] CRAN (R 4.1.0)
#>    utf8          1.2.2   2021-07-24 [1] CRAN (R 4.1.0)
#>    vctrs         0.3.8   2021-04-29 [1] CRAN (R 4.1.0)
#>    withr         2.4.2   2021-04-18 [1] CRAN (R 4.1.0)
#>    xfun          0.26    2021-09-14 [1] CRAN (R 4.1.0)
#>    yaml          2.2.1   2020-02-01 [1] CRAN (R 4.1.0)
#> 
#> [1] /Users/quangnguyen/research/microbe_set_trait/renv/library/R-4.1/x86_64-apple-darwin17.0
#> [2] /private/var/folders/fs/hp4_8vfs665_nqytkhjc8s6w0000gn/T/Rtmp6ZA9pW/renv-system-library
#> [3] /Library/Frameworks/R.framework/Versions/4.1/Resources/library
#> 
#>  P ── Loaded and on-disk path mismatch.

My understanding is that the error is due to the fact that the function being mapped returns nothing at row 3. The solution dplyr gave is that I should wrap everything in a list.

However:

I am using the original map, which should already return a list (other tutorials on using map to transform list columns in tibbles also did not wrap everything in list()). Wrapping this inside a list returns a list of NULL elements, where the things that I want to extract are set as the names of this new list.
My function does return a value even if the element in the list is empty (it returns NA_character_).

As seen in the reprex, the normal map function works and returns a list of length 3, with the empty row having an NA assigned to it as per the logic of the custom function. Right now I'm working around this by generating a separate list and attaching it to the data frame later; however, I would love to understand what I'm looking at!

CodePudding user response：

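It is an issue with the rowwise grouping attribute. In a rowwise data frame, mutate() evaluates the expression one row at a time and unwraps list-columns, so g_test is already that row's numeric vector and map() iterates over its elements rather than over rows. For row 3 the vector is empty, so map() returns a list of length 0, hence "must be size 1, not 0". For the other rows, each p-value reaches proc_gtest without its name, which is why wrapping in list() gives a named list of NULL. As map() already loops over each element, just ungroup first: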

library(dplyr)
library(purrr)
test_dset %>% 
   ungroup %>% 
   mutate(ncbi_filt = map(g_test, proc_gtest))
# A tibble: 3 × 6
  genus       t_count     n ncbi_id   g_test    ncbi_filt
  <chr>         <int> <int> <list>    <list>    <list>   
1 Aureitalea        0     1 <chr [1]> <dbl [1]> <chr [1]>
2 Aureivirga        0     1 <chr [1]> <dbl [1]> <chr [1]>
3 Auricoccucs       0     1 <chr [0]> <dbl [0]> <chr [1]>

Or use map_chr to return as a vector (as there is one single value returned)

test_dset %>% 
   ungroup %>% 
   mutate(ncbi_filt = map_chr(g_test, proc_gtest))
# A tibble: 3 × 6
  genus       t_count     n ncbi_id   g_test    ncbi_filt
  <chr>         <int> <int> <list>    <list>    <chr>    
1 Aureitalea        0     1 <chr [1]> <dbl [1]> 1176327  
2 Aureivirga        0     1 <chr [1]> <dbl [1]> 1433990  
3 Auricoccucs       0     1 <chr [0]> <dbl [0]> <NA>

If we keep the rowwise attribute, we can apply the function directly and wrap the output in a list (needed when the output has length > 1 or differs in structure between rows)

test_dset %>%
    mutate(ncbi_filt = list(proc_gtest(g_test)))
# A tibble: 3 × 6
# Rowwise: 
  genus       t_count     n ncbi_id   g_test    ncbi_filt
  <chr>         <int> <int> <list>    <list>    <list>   
1 Aureitalea        0     1 <chr [1]> <dbl [1]> <chr [1]>
2 Aureivirga        0     1 <chr [1]> <dbl [1]> <chr [1]>
3 Auricoccucs       0     1 <chr [0]> <dbl [0]> <chr [1]>

Here the function returns a single value per row, so we don't need to wrap it in list() either

test_dset %>% 
    mutate(ncbi_filt = proc_gtest(g_test))
# A tibble: 3 × 6
# Rowwise: 
  genus       t_count     n ncbi_id   g_test    ncbi_filt
  <chr>         <int> <int> <list>    <list>    <chr>    
1 Aureitalea        0     1 <chr [1]> <dbl [1]> 1176327  
2 Aureivirga        0     1 <chr [1]> <dbl [1]> 1433990  
3 Auricoccucs       0     1 <chr [0]> <dbl [0]> <NA>