melt dataframe with IDs based on start of column name-CodePudding

I have a dataset with counts for different genes (Locus) on different days (d0-d24) - dX.2 is the count of the 2nd allele for a given day/locus combination. Right now this dataset is in a sort of "wide" format, with paired columns based on the prefix (dX):

df<-data.frame(Locus=c("Locus_1","Locus_10","Locus_100","Locus_101","Locus_102","Locus_103")
,d0.1=c(248,20,95,13,0,33),d0.2=c(252,480,405,487,500,467),
d2.1=c(252,24,84,14,0,43),d2.2=c(248,476,416,486,500,457),
d6.1=c(256,30,82,15,0,41),d6.2=c(244,470,418,485,500,459),
d10.1=c(280,21,84,18,0,36),d10.2=c(220,479,416,482,500,464))

df
      Locus d0.1 d0.2 d2.1 d2.2 d6.1 d6.2 d10.1 d10.2
1   Locus_1  248  252  252  248  256  244   280   220
2  Locus_10   20  480   24  476   30  470    21   479
3 Locus_100   95  405   84  416   82  418    84   416
4 Locus_101   13  487   14  486   15  485    18   482
5 Locus_102    0  500    0  500    0  500     0   500
6 Locus_103   33  467   43  457   41  459    36   464

What Id like to do is use reshape (or similar, tidyr) to reformat this dataframe to the 'long' format, with variables that incorporate the day (e.g. d0) and the allele ID (.1 or .2)

i.e.

long.df<-reshape_code(df...)
long.df
       Locus Allele.1 Allele.2 Day
1    Locus_1      248      252   0
2   Locus_10       20      480   0
3  Locus_100       95      405   0
4  Locus_101       13      487   0
5  Locus_102        0      500   0
6  Locus_103       33      467   0
7    Locus_1      252      248   2
8   Locus_10       24      476   2
9  Locus_100       84      416   2
10 Locus_101       14      486   2
11 Locus_102        0      500   2
12 Locus_103       43      457   2
13   Locus_1      256      244   6
14  Locus_10       30      470   6
15 Locus_100       82      418   6
16 Locus_101       15      485   6
17 Locus_102        0      500   6
18 Locus_103       41      459   6
19   Locus_1      280      220  10
20  Locus_10       21      479  10
21 Locus_100       84      416  10
22 Locus_101       18      482  10
23 Locus_102        0      500  10
24 Locus_103       36      464  10

Perhaps using the package nc (named capture)? Im not quite sure where to start. Standard code/approach would be reshape(df,id=c("Day","Locus",...)) but of course there is no 'day' variable yet.

I also saw this solution to a similar but perhaps more simple problem, using gather separate spread but im not sure how create the sets by Alleles 1 and 2. Thanks for any advice!

CodePudding user response：

dplyr

library(dplyr)
library(tidyr)
tidyr::pivot_longer(df, -Locus, names_pattern = "d([0-9] )[.]([0-9] )", names_to = c("Day", ".value")) %>%
  rename_with(~ paste0("Allele.", .), .cols = `1`:`2`)
# # A tibble: 24 x 4
#    Locus     Day   Allele.1 Allele.2
#    <chr>     <chr>    <dbl>    <dbl>
#  1 Locus_1   0          248      252
#  2 Locus_1   2          252      248
#  3 Locus_1   6          256      244
#  4 Locus_1   10         280      220
#  5 Locus_10  0           20      480
#  6 Locus_10  2           24      476
#  7 Locus_10  6           30      470
#  8 Locus_10  10          21      479
#  9 Locus_100 0           95      405
# 10 Locus_100 2           84      416
# # ... with 14 more rows

data.table

library(data.table)
DT <- as.data.table(df)
tmp <- melt(DT, id.vars = "Locus"
  )[, c("Day", "Allele") := strcapture("d([0-9] )[.]([0-9] )", variable, proto = list(Day = "", Allele = ""))
  ][, variable := NULL]
tmp <- dcast(tmp, Locus   Day ~ Allele, value.var = "value")
setnames(tmp, c("1", "2"), paste0("Allele.", c("1", "2")))
tmp
#         Locus    Day Allele.1 Allele.2
#        <char> <char>    <num>    <num>
#  1:   Locus_1      0      248      252
#  2:   Locus_1     10      280      220
#  3:   Locus_1      2      252      248
#  4:   Locus_1      6      256      244
#  5:  Locus_10      0       20      480
#  6:  Locus_10     10       21      479
#  7:  Locus_10      2       24      476
#  8:  Locus_10      6       30      470
#  9: Locus_100      0       95      405
# 10: Locus_100     10       84      416
# 11: Locus_100      2       84      416
# 12: Locus_100      6       82      418
# 13: Locus_101      0       13      487
# 14: Locus_101     10       18      482
# 15: Locus_101      2       14      486
# 16: Locus_101      6       15      485
# 17: Locus_102      0        0      500
# 18: Locus_102     10        0      500
# 19: Locus_102      2        0      500
# 20: Locus_102      6        0      500
# 21: Locus_103      0       33      467
# 22: Locus_103     10       36      464
# 23: Locus_103      2       43      457
# 24: Locus_103      6       41      459
#         Locus    Day Allele.1 Allele.2