Home > Software design >  Reshape long to wide with
Reshape long to wide with

Time:03-22

I need to reshape a dataframe csf_antibio (sample below) from long to wide format, separating all values per patient by row.

study_id,proanbx_dt,proanbx_tm,name,othername,route,dosage,units,doses,freq
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13
CHLA_0001,2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9
CHLA_0001,2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6
CHLA_0001,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8
CHLA_0002,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13
CHLA_0002,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8
CHLA_0002,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12
CHLA_0003,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1
CHLA_0003,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8
CHLA_0003,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12

I tried the following without success:

csfmelt <- melt(csf_antibio, id.vars=1:1)
csf <- dcast(csfmelt, study_id ~ variable, value.var = "value", fun.aggregate = sum)

I want the final dataframe to have each study id per row with variables

study_id,proanbx_dt1,proanbx_tm1,name1,othername1,route1,dosage1,units1,doses1,freq1,proanbx_dt2,proanbx_tm2,name2,othername2,route2,dosage2,units2,doses2,freq2,proanbx_dt3,proanbx_tm3,name3,othername3,route3,dosage3,units3,doses3,freq3,proanbx_dt4,proanbx_tm4,name4,othername4,route4,dosage4,units4,doses4,freq4
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13, 2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9, 2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12

Thanks in advance!

CodePudding user response:

Your desired output has a "number" component that is not naturally inferred by dcast. We can add it relatively easily with ave (base R, certainly this can be done just as easily in data.table or dplyr groupings).

reshape2 and base R

csfmelt$num <- ave(seq(nrow(csfmelt)), csfmelt[c("study_id","variable")], FUN = seq_along)
head(csfmelt)
#    study_id   variable      value num
# 1 CHLA_0001 proanbx_dt 2021-07-22   1
# 2 CHLA_0001 proanbx_dt 2021-07-22   2
# 3 CHLA_0001 proanbx_dt 2021-07-22   3
# 4 CHLA_0001 proanbx_dt 2021-08-09   4
# 5 CHLA_0002 proanbx_dt 2020-12-18   1
# 6 CHLA_0002 proanbx_dt 2020-12-18   2
csfwide <- reshape2::dcast(csfmelt, study_id ~ variable   num, value.var = "value")
csfwide
#    study_id proanbx_dt_1 proanbx_dt_2 proanbx_dt_3 proanbx_dt_4 proanbx_tm_1 proanbx_tm_2 proanbx_tm_3 proanbx_tm_4      name_1        name_2     name_3     name_4 othername_1 othername_2                   othername_3 othername_4 route_1 route_2 route_3 route_4 dosage_1 dosage_2 dosage_3 dosage_4 units_1 units_2 units_3 units_4 doses_1 doses_2 doses_3 doses_4 freq_1 freq_2 freq_3 freq_4
# 1 CHLA_0001   2021-07-22   2021-07-22   2021-07-22   2021-08-09     20:01:00     20:19:00     23:17:00     19:34:00 ceftriaxone metronidazole vancomycin  cefazolin                                                                        IV      IV      IV      IV     1250      250      350      738      mg      mg      mg      mg       4       5       3       1     13      9      6      8
# 2 CHLA_0002   2020-12-18   2020-12-18   2020-12-19   2020-12-20      0:30:00      1:03:00     18:15:00      4:18:00    cefepime    vancomycin   cefepime vancomycin                                                                        IV      IV      IV      IV       75       23       60       24      mg      mg      mg      mg       5       4       6       4      8     13      8     12
# 3 CHLA_0003   2021-04-20   2021-04-21   2021-04-22         <NA>     15:17:00      2:20:00     14:16:00         <NA>   meropenem     meropenem      Other       <NA>                         sulfamethoxazole-trimethoprim        <NA>      IV      IV      IV    <NA>      200      400       50     <NA>      mg      mg      mg    <NA>       2      17       9    <NA>      1      8     12   <NA>

The column order is not what you requested, but it can be conformed a bit with this:

variables <- as.character(unique(csfmelt$variable))
sub(".*_", "", names(csfwide)[-(1:2)])
#  [1] "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4"
sub("_[^_]*$", "", names(csfwide)[-(1:2)])
#  [1] "proanbx_dt" "proanbx_dt" "proanbx_dt" "proanbx_tm" "proanbx_tm" "proanbx_tm" "proanbx_tm" "name"       "name"       "name"       "name"       "othername"  "othername"  "othername"  "othername"  "route"      "route"      "route"      "route"      "dosage"    
# [21] "dosage"     "dosage"     "dosage"     "units"      "units"      "units"      "units"      "doses"      "doses"      "doses"      "doses"      "freq"       "freq"       "freq"       "freq"      

nms <- names(csfwide)[-(1:2)]
newnms <- nms[order(sub(".*_", "", nms), match(nms, variables))]
csfwide2 <- subset(csfwide, select = c(names(csfwide)[1:2], newnms))
csfwide2
#    study_id proanbx_dt_1 proanbx_tm_1      name_1 othername_1 route_1 dosage_1 units_1 doses_1 freq_1 proanbx_dt_2 proanbx_tm_2        name_2 othername_2 route_2 dosage_2 units_2 doses_2 freq_2 proanbx_dt_3 proanbx_tm_3     name_3                   othername_3 route_3 dosage_3 units_3 doses_3 freq_3 proanbx_dt_4 proanbx_tm_4     name_4 othername_4 route_4 dosage_4 units_4 doses_4 freq_4
# 1 CHLA_0001   2021-07-22     20:01:00 ceftriaxone                  IV     1250      mg       4     13   2021-07-22     20:19:00 metronidazole                  IV      250      mg       5      9   2021-07-22     23:17:00 vancomycin                                    IV      350      mg       3      6   2021-08-09     19:34:00  cefazolin                  IV      738      mg       1      8
# 2 CHLA_0002   2020-12-18      0:30:00    cefepime                  IV       75      mg       5      8   2020-12-18      1:03:00    vancomycin                  IV       23      mg       4     13   2020-12-19     18:15:00   cefepime                                    IV       60      mg       6      8   2020-12-20      4:18:00 vancomycin                  IV       24      mg       4     12
# 3 CHLA_0003   2021-04-20     15:17:00   meropenem                  IV      200      mg       2      1   2021-04-21      2:20:00     meropenem                  IV      400      mg      17      8   2021-04-22     14:16:00      Other sulfamethoxazole-trimethoprim      IV       50      mg       9     12         <NA>         <NA>       <NA>        <NA>    <NA>     <NA>    <NA>    <NA>   <NA>

CodePudding user response:

@r2evans gave you a great answer, but I was thinking about your comments regarding dates and time. You didn't provide how you collected this data, so I can't tell you how to import it this way. However, I did convert these variables in the following code. That being said, adding dates isn't meaningful. I was thinking that the number of days and the amount of time passed might be more along the lines of what you were looking for for those particular variables. Unfortunately, I wasn't able to figure out how to make it work with reshape2. This uses dplyr, tidyselect and hms. Although, you would only have to call dplyr, because I've appended the packages for the applicable functions. (You need the packages installed, though.)

I didn't keep the name and othername because it's not multiple entries.

library(dplyr)

csf_antibio = read.table(header = T, sep = ",", text = "study_id,proanbx_dt,proanbx_tm,name,othername,route,dosage,units,doses,freq
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13
CHLA_0001,2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9
CHLA_0001,2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6
CHLA_0001,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8
CHLA_0002,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13
CHLA_0002,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8
CHLA_0002,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12
CHLA_0003,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1
CHLA_0003,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8
CHLA_0003,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12")  

Because the time is truly linked to the date, I wrote a function to process the time difference.

timer <- function(df1){
  maxtm = max(df1$proanbx_tm[df1$proanbx_dt == max(df1$proanbx_dt)]) %>% hms::as_hms()
  mintm = min(df1$proanbx_tm[df1$proanbx_dt == min(df1$proanbx_dt)]) %>% hms::as_hms()
  if(maxtm > mintm){
    tmr = (maxtm - mintm) %>% hms::as_hms() # captures mult entries in the same day
  } else if(mintm > maxtm) {
    tmr = (maxtm - mintm)   hms::as_hms('24:00:00')  # add a full day
  } else {      # only one entry or the time is identical in max/min
    tmr = hms::as_hms('0')
  }
  return(tmr)
}

I collected the column names to return the columns to the original order.

ordNames = names(csf_antibio) # collect names to return order to columns
#  [1] "study_id"   "proanbx_dt" "proanbx_tm" "name"       "othername"  "route"     
#  [7] "dosage"     "units"      "doses"      "freq"       

# names kept = ordNames[,c(1:3,6:10)]

Find the sums and differences in time

csf2 <- csf_antibio %>% 
  mutate(proanbx_dt = as.Date(proanbx_dt),         # convert to date
         proanbx_tm = hms::as_hms(proanbx_tm)) %>% # convert to time
  group_by(study_id) %>%                           # group by study
  summarise(proanbx_tm = timer(.data), # difference in time
            proanbx_dt = max(proanbx_dt) - min(proanbx_dt), # difference in days
            across(tidyselect:::where(is.integer), sum),
            units = "mg",
            route = "IV") %>% 
  select(ordNames[c(1:3,6:10)])
head(csf2)
# # A tibble: 3 × 8
#   study_id  proanbx_dt proanbx_tm route dosage units doses  freq
#   <chr>     <drtn>     <time>     <chr>  <int> <chr> <int> <int>
# 1 CHLA_0001 18 days    23:33      IV      2588 mg       13    36
# 2 CHLA_0002  2 days    03:48      IV       182 mg       19    41
# 3 CHLA_0003  2 days    22:59      IV       650 mg       28    21 
  • Related