Add multiple rows based on column string R-CodePudding

I have a data frame that looks like this...

    patient_ID  CBC  CBN totindex samp index
120       1007 BLOQ BLOQ        7    8     1
121       1007 BLOQ BLOQ        8    9     1
122       1007 BLOQ BLOQ        9   10     1
123       1007 BLOQ BLOQ       10   11     1
124       1007 BLOQ BLOQ       11   12     1
125       1007 BLOQ BLOQ       12   15     4
126       1007 BLOQ BLOQ       13   16     1
127       1007 BLOQ BLOQ       14   17     1
128       1007 BLOQ BLOQ       15   18     1
129       1007 BLOQ BLOQ       16   19     1
130       1007 BLOQ BLOQ       17   20     1

I created the index column to denote any rows in the samp column to show where the previous row is not in numerical order. For example, if the numbers are a difference of one, I make the index of the row 3. In this case, there was a difference of 2 in the samp column between rows 124 and 125. So I placed a 4. I do this for other examples as well like a difference of 3 or 4 is indexed as a 5 or 6.

How can I insert 1 or 2 or 3 or 4 new blank lines in between the discrepancies?

In this example, I would like this output...

    patient_ID  CBC  CBN totindex samp index
120       1007 BLOQ BLOQ        7    8     1
121       1007 BLOQ BLOQ        8    9     1
122       1007 BLOQ BLOQ        9   10     1
123       1007 BLOQ BLOQ       10   11     1
124       1007 BLOQ BLOQ       11   12     1
125       1007    0    0        0   13     1
126       1007    0    0        0   14     1
127       1007 BLOQ BLOQ       12   15     1
128       1007 BLOQ BLOQ       13   16     1
129       1007 BLOQ BLOQ       14   17     1
130       1007 BLOQ BLOQ       15   18     1
131       1007 BLOQ BLOQ       16   19     1
132       1007 BLOQ BLOQ       17   20     1

Thus, adding 2 rows equal to 0 before the 4 index, keep the patient_ID, remove the 4, and add the 13 and 14 to the samp column.

I have tried a for loop like this...

nidx4 <- as.numeric(rownames(df[grep("4", df$index), ]))
dfnew <- data.frame()

for (idx in 1:length(nidx4)) {
  if (idx==1){
    df1 = df[1:(nidx4[idx]-1),]
  }
  else if (idx == length(nidx4)) {
    df1 = df[nidx4[idx-1]:nrow(df),]
  }
  else {
    df1 = df[(nidx4[idx-1]):(nidx4[idx]-1),]
  }
  df1[nrow(df1) 1,] = 0
  df1[nrow(df1) 1,] = 0
  df1[nrow(df1)-1,21] = df1[nrow(df1)-2,21] 1
  df1[nrow(df1),21] = df1[nrow(df1)-1,21] 1
  dfnew = rbind(dfnew,df1)
}

for (row in 1:nrow(dfnew)){
  if (dfnew[row,"index"] == 0) {dfnew[row,"index"] = 1}
  if (dfnew[row,"index"] == 4) {dfnew[row,"index"] = 1}
}

rownames(dfnew) <- NULL
df <- dfnew

But that only accounts for the first few indices. The last iteration of 4 in the index is not accounted for. Also, I had to change the loop code to this because, before, it was stopping at the last iteration of 4 and not including the rest of the data frame.

Any help would be great.

Edit Answer

This worked for me by adding in all of the column names that I wanted to iterate over.

dfnew <- complete(df, patient_ID, samp = full_seq(samp, period = 1),
         fill = list("Sample_Name_(run_ID)" = "no_sample",
                     Sample_Name = "no_sample", THC = "0", OH_THC = "0",
                     THC_COOH = "0", THC_COO_gluc = "0", THC_gluc = "0", 
                     CBD = "0", "6aOH_CBD" = "0", "7OH_CBD" = "0", 
                     "6bOH_CBD" = "0", CBD_COOH = "0", CBD_gluc = "0", 
                     CBC = "0", CBN = "0", CBG = "0", THCV = "0", CBDV = "0",
                     totindex = 0, 
                     index = 1)) %>%
  mutate(index = 1)

CodePudding user response：

We may use complete

library(tidyr)
library(dplyr)
complete(df1, patient_ID, samp = full_seq(samp, period = 1),
     fill = list(CBC = "0", CBN = "0", totindex = 0, index = 1)) %>%
   mutate(index = 1)

-output

# A tibble: 13 × 6
   patient_ID  samp CBC   CBN   totindex index
        <int> <dbl> <chr> <chr>    <int> <dbl>
 1       1007     8 BLOQ  BLOQ         7     1
 2       1007     9 BLOQ  BLOQ         8     1
 3       1007    10 BLOQ  BLOQ         9     1
 4       1007    11 BLOQ  BLOQ        10     1
 5       1007    12 BLOQ  BLOQ        11     1
 6       1007    13 0     0            0     1
 7       1007    14 0     0            0     1
 8       1007    15 BLOQ  BLOQ        12     1
 9       1007    16 BLOQ  BLOQ        13     1
10       1007    17 BLOQ  BLOQ        14     1
11       1007    18 BLOQ  BLOQ        15     1
12       1007    19 BLOQ  BLOQ        16     1
13       1007    20 BLOQ  BLOQ        17     1

Or use

complete(df1, patient_ID, samp = full_seq(samp, period = 1)) %>% 
    mutate(across(CBC:CBN, replace_na, "0"), 
     totindex = replace_na(totindex, 0), index = 1)

Or use

complete(df1, patient_ID, samp = full_seq(samp, period = 1),
  fill =  setNames(c(as.list(rep('0', 2)), 0, 1), 
   names(df1)[c(2:4, 6)]))

data

df1 <- structure(list(patient_ID = c(1007L, 1007L, 1007L, 1007L, 1007L, 
1007L, 1007L, 1007L, 1007L, 1007L, 1007L), CBC = c("BLOQ", "BLOQ", 
"BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ", 
"BLOQ"), CBN = c("BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ", 
"BLOQ", "BLOQ", "BLOQ", "BLOQ", "BLOQ"), totindex = 7:17, samp = c(8L, 
9L, 10L, 11L, 12L, 15L, 16L, 17L, 18L, 19L, 20L), index = c(1L, 
1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L)), class = "data.frame", 
row.names = c("120", 
"121", "122", "123", "124", "125", "126", "127", "128", "129", 
"130"))