Home > database >  create a function using data masking in R
create a function using data masking in R

Time:02-12

inorder to do a paired analysis i need to write a function that sums integer counts. The total number required to be summed is specified in the corresponding "Yrs_Before" and "Yrs_After" columns in "df". Is there a way i can avoid writing "fm_after" with just one function? Is this a variable masking for "Yrs_Before", "Yrs_After","Before.Yr_1...n 1" and "After.Yr_1...n 1" columns?

data frame

set.seed(123)
(df=data.frame(
  Yrs_Before=sample(1:8, 3),
  Yrs_After=sample(1:8, 3),
  Before.Yr_1=sample(1:8, 3),
  Before.Yr_2=sample(1:8, 3),
  Before.Yr_3=sample(1:8, 3),
  Before.Yr_4=sample(1:8, 3),
  Before.Yr_5=sample(1:8, 3),
  Before.Yr_6=sample(1:8, 3),
  Before.Yr_7=sample(1:8, 3),
  Before.Yr_8=sample(1:8, 3),
  After.Yr_1=sample(1:8, 3),
  After.Yr_2=sample(1:8, 3),
  After.Yr_3=sample(1:8, 3),
  After.Yr_4=sample(1:8, 3),
  After.Yr_5=sample(1:8, 3),
  After.Yr_6=sample(1:8, 3),
  After.Yr_7=sample(1:8, 3),
  After.Yr_8=sample(1:8, 3)
  
))

function sums the corresponding rows based on the number of years in the before period.

fm=function(data,Yrs_Before){
  data |> dplyr::mutate(sums=
  ifelse(
  Yrs_Before == 1, rowSums(across(Before.Yr_1)),
  ifelse(
    Yrs_Before == 2, rowSums(across(Before.Yr_1:Before.Yr_2)),
    ifelse(
      Yrs_Before == 3, rowSums(across(Before.Yr_1:Before.Yr_3)),
      ifelse(
        Yrs_Before == 4, rowSums(across(Before.Yr_1:Before.Yr_4)),
        ifelse(
          Yrs_Before == 5, rowSums(across(Before.Yr_1:Before.Yr_5)),
          ifelse(
            Yrs_Before == 6, rowSums(across(Before.Yr_1:Before.Yr_6)),
            ifelse(
              Yrs_Before == 7, rowSums(across(Before.Yr_1:Before.Yr_7)),
              ifelse(
                Yrs_Before == 8, rowSums(across(Before.Yr_1:Before.Yr_8)),"")))))))))
}

output


fm(df,Yrs_Before)
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 sums
1          7         6           2           5           6           3           3           1           3           1          4          3          4          5          1          4          3          2   23
2          8         3           6           4           1           5           1           8           2           6          6          7          7          7          2          5          6          5   33
3          3         2           3           6           2           8           4           5           7           3          1          5          2          1          3          7          1          7   11

duplicated function with variable name changed to "After"


fm_after=function(data,Yrs_After){
  data |> dplyr::mutate(sums=
                   ifelse(
                     Yrs_After == 1, rowSums(across(After.Yr_1)),
                     ifelse(
                       Yrs_After == 2, rowSums(across(After.Yr_1:After.Yr_2)),
                       ifelse(
                         Yrs_After == 3, rowSums(across(After.Yr_1:After.Yr_3)),
                         ifelse(
                           Yrs_After == 4, rowSums(across(After.Yr_1:After.Yr_4)),
                           ifelse(
                             Yrs_After == 5, rowSums(across(After.Yr_1:After.Yr_5)),
                             ifelse(
                               Yrs_After == 6, rowSums(across(After.Yr_1:After.Yr_6)),
                               ifelse(
                                 Yrs_After == 7, rowSums(across(After.Yr_1:After.Yr_7)),
                                 ifelse(
                                   Yrs_After == 8, rowSums(across(After.Yr_1:After.Yr_8)),"")))))))))
}

output

fm_after(df,Yrs_After)
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 sums
1          7         6           2           5           6           3           3           1           3           1          4          3          4          5          1          4          3          2   21
2          8         3           6           4           1           5           1           8           2           6          6          7          7          7          2          5          6          5   20
3          3         2           3           6           2           8           4           5           7           3          1          5          2          1          3          7          1          7    6

CodePudding user response:

We may do this by pivoting to long format

library(dplyr)
library(tidyr)
library(stringr)
df %>%  
  mutate(rn = row_number()) %>% 
  pivot_longer(cols = -c(rn, Yrs_Before, Yrs_After)) %>%
   mutate(yrs = as.numeric(str_extract(name, "\\d $"))) %>% 
  group_by(rn, grp = str_extract(name, "\\w ")) %>%
  summarise(Sum = if(cur_group()$grp == 'Before')  
   sum(value[yrs <= Yrs_Before], na.rm = TRUE)  else 
   sum(value[yrs <= Yrs_After], na.rm = TRUE), .groups = 'drop') %>% 
  pivot_wider(names_from = grp, values_from = Sum) %>%
  select(-rn) %>%
  bind_cols(df, .)

-output

 Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1
1          7         6           2           5           6           3           3           1           3           1          4
2          8         3           6           4           1           5           1           8           2           6          6
3          3         2           3           6           2           8           4           5           7           3          1
  After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 After Before
1          3          4          5          1          4          3          2    21     23
2          7          7          7          2          5          6          5    20     33
3          5          2          1          3          7          1          7     6     11

It can be wrapped in a function with just input data

fm <- function(data) {
  data %>%  
  mutate(rn = row_number()) %>% 
  pivot_longer(cols = -c(rn, Yrs_Before, Yrs_After)) %>%
   mutate(yrs = as.numeric(str_extract(name, "\\d $"))) %>% 
  group_by(rn, grp = str_extract(name, "\\w ")) %>%
  summarise(Sum = if(cur_group()$grp == 'Before')  
   sum(value[yrs <= Yrs_Before], na.rm = TRUE)  else 
   sum(value[yrs <= Yrs_After], na.rm = TRUE), .groups = 'drop') %>% 
  pivot_wider(names_from = grp, values_from = Sum) %>%
  select(-rn) %>%
  bind_cols(df, .)
  }

-testing

> fm(df)
  Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1
1          7         6           2           5           6           3           3           1           3           1          4
2          8         3           6           4           1           5           1           8           2           6          6
3          3         2           3           6           2           8           4           5           7           3          1
  After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 After Before
1          3          4          5          1          4          3          2    21     23
2          7          7          7          2          5          6          5    20     33
3          5          2          1          3          7          1          7     6     11

CodePudding user response:

EDIT:

in base R you could do:

A <- split.default(df, sub('.*(After|Before).*', '\\1', names(df)))  
sapply(A, \(x) rowSums(x[-1] * (col(x[-1]) <= x[[1]])))
    After Before
[1,]    21     23
[2,]    20     33
[3,]     6     11

You can then cbind this to the original dataframe

tidyverse:

use mutate instead of transmute in order to retain the original dataframe.

df %>%
  rowwise() %>%
  transmute(After = cumsum(c_across(matches('Before.Yr')))[Yrs_Before],
            Before = cumsum(c_across(matches('After.Yr')))[Yrs_After]) 

# A tibble: 3 x 2
# Rowwise: 
  After Before
  <int>  <int>
1    23     21
2    33     20
3    11      6

CodePudding user response:

To avoid so much code you could use paste.

f <- \(x, dat=df) sapply(seq_len(nrow(dat)), \(i, ...) 
                      sum(dat[i, paste0(x, '.Yr_', 1:dat[i, paste0('Yrs_', x)])]))

res <- transform(df, sums_bef=f('Before'), sums_aft=f('After'))
res
#   Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3
# 1          7         6           2           5           6
# 2          8         3           6           4           1
# 3          3         2           3           6           2
#   Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8
# 1           3           3           1           3           1
# 2           5           1           8           2           6
# 3           8           4           5           7           3
#   After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5
# 1          4          3          4          5          1
# 2          6          7          7          7          2
# 3          1          5          2          1          3
#   After.Yr_6 After.Yr_7 After.Yr_8 sums_bef sums_aft
# 1          4          3          2       23       21
# 2          5          6          5       33       20
# 3          7          1          7       11        6

Data:

df <- structure(list(Yrs_Before = c(7L, 8L, 3L), Yrs_After = c(6L, 
3L, 2L), Before.Yr_1 = c(2L, 6L, 3L), Before.Yr_2 = c(5L, 4L, 
6L), Before.Yr_3 = c(6L, 1L, 2L), Before.Yr_4 = c(3L, 5L, 8L), 
    Before.Yr_5 = c(3L, 1L, 4L), Before.Yr_6 = c(1L, 8L, 5L), 
    Before.Yr_7 = c(3L, 2L, 7L), Before.Yr_8 = c(1L, 6L, 3L), 
    After.Yr_1 = c(4L, 6L, 1L), After.Yr_2 = c(3L, 7L, 5L), After.Yr_3 = c(4L, 
    7L, 2L), After.Yr_4 = c(5L, 7L, 1L), After.Yr_5 = 1:3, After.Yr_6 = c(4L, 
    5L, 7L), After.Yr_7 = c(3L, 6L, 1L), After.Yr_8 = c(2L, 5L, 
    7L)), class = "data.frame", row.names = c(NA, -3L))
  • Related