Home > Software design >  How to create a combined dataframe from multiple text files and rename columns based on the file nam
How to create a combined dataframe from multiple text files and rename columns based on the file nam

Time:06-17

I have multiple .txt files (each file contains 4 columns; an identifier gene column, a raw_counts and other columns). I would like to merge those files into a combined dataframe using the common gene column. I was able to import multiple .txt files together, merge based on identifier column, however, after merging, the raw_counts column becomes un-identifiable (arising from which .txt file?). Is it possible to add a prefix/name of the part of the .txt file to this column and drop the rest of the columns (median_length_normalized and RPKM). I have provide an example which might be helpful for you to provide inputs.

File names: For instance
TARGET-00-BM3897-14A-01R.gene.quantification.txt
TARGET-00-BM3969-14A-01R.gene.quantification.txt

Add prefix names to the raw_counts column names as:
raw_counts_TARGET-00-BM3897-14A-01R
raw_counts_TARGET-00-BM3969-14A-01R

Load files and merging:

library(tidyverse)
all_files <- dir("/Target_Data/TARGET_FHCRC/")
file_names <- grep(all_files,pattern = "^T.*gene.quantification.txt$",value = TRUE)
Data_file <- map(file_names,read.delim, stringsAsFactors = FALSE, check.names = FALSE, row.names = NULL)
str(Data_file)
Merge_All_Samples <- Data_file %>% reduce(inner_join, by = "gene")
colnames(Merge_All_Samples)
## There could be more for instance; ".x", ".x.x", ".x.x.x", , similary ".y", ".y.y", ".y.y.y"
names(Merge_All_Samples) = gsub(names(Merge_All_Samples),pattern = ".x",replacement = "")
names(Merge_All_Samples) = gsub(names(Merge_All_Samples),pattern = ".y",replacement = "")
colnames(Merge_All_Samples)

dput(Merge_All_Samples)

structure(list(gene = c("ENSG00000000457", "ENSG00000000460", 
                        "ENSG00000000938", "ENSG00000000971", "ENSG00000001460"), raw_counts = c(1359L, 
                                                                                                 1042L, 24113L, 296L, 351L), median_length_normalized = c(26.5127, 
                                                                                                                                                          12.2998, 513.195, 2.7302, 3.0886), RPKM = c(5.2943, 2.4561, 102.4798, 
                                                                                                                                                                                                      0.5452, 0.6167), raw_counts = c(1497L, 1152L, 20619L, 251L, 314L
                                                                                                                                                                                                      ), median_length_normalized = c(29.2034, 13.5943, 438.824, 2.3087, 
                                                                                                                                                                                                                                      2.7697), RPKM = c(6.9478, 3.2342, 104.4015, 0.5493, 0.6589)), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                        -5L))
#>              gene raw_counts median_length_normalized     RPKM raw_counts
#> 1 ENSG00000000457       1359                  26.5127   5.2943       1497
#> 2 ENSG00000000460       1042                  12.2998   2.4561       1152
#> 3 ENSG00000000938      24113                 513.1950 102.4798      20619
#> 4 ENSG00000000971        296                   2.7302   0.5452        251
#> 5 ENSG00000001460        351                   3.0886   0.6167        314
#>   median_length_normalized     RPKM
#> 1                  29.2034   6.9478
#> 2                  13.5943   3.2342
#> 3                 438.8240 104.4015
#> 4                   2.3087   0.5493
#> 5                   2.7697   0.6589

## Drop columns "RPKM", "median_length_normalized" from all files (not necessary columns)
drop.cols <- c("RPKM", "median_length_normalized")
Merge_All_Samples <- Merge_All_Samples %>% select(-one_of(drop.cols))

Expected output:

dput(Output)
structure(list(gene = c("ENSG00000000457", "ENSG00000000460", 
                        "ENSG00000000938", "ENSG00000000971", "ENSG00000001460"), "raw_counts_TARGET-00-BM3897-14A-01R" = c(1359L, 
                                                                                                 1042L, 24113L, 296L, 351L), "raw_counts_TARGET-00-BM3969-14A-01R" = c(1497L, 1152L, 20619L, 
                                                                                                                                              251L, 314L)), class = "data.frame", row.names = c(NA, -5L))
#>              gene raw_counts_TARGET-00-BM3897-14A-01R
#> 1 ENSG00000000457                                1359
#> 2 ENSG00000000460                                1042
#> 3 ENSG00000000938                               24113
#> 4 ENSG00000000971                                 296
#> 5 ENSG00000001460                                 351
#>   raw_counts_TARGET-00-BM3969-14A-01R
#> 1                                1497
#> 2                                1152
#> 3                               20619
#> 4                                 251
#> 5                                 314

Thank you,

Mohammed

CodePudding user response:

You can try -

library(tidyverse)
#List all the files
all_files <- dir("/Target_Data/TARGET_FHCRC/")

#include the files with the specific pattern
file_names <- grep(all_files,
                   pattern = "^T.*gene.quantification.txt$",value = TRUE)

#Read the file in a list with the name of the file as list name
Data_file <- sapply(file_names,read.delim, stringsAsFactors = FALSE, 
                    check.names = FALSE, row.names = NULL, simplify = FALSE)
#Drop .txt from name of the list
names(Data_file) <- sub('\\.txt$', '', names(Data_file))

#For each dataframe in list of dataframe append name of the file
Merge_All_Samples <- Data_file %>% 
                      imap(function(x, y) x %>% 
                            #Drop the columns that you don't need
                             select(-median_length_normalized, -RPKM) %>% 
                             rename_with(~paste0(., y), -gene)) %>%
                      reduce(inner_join, by = "gene")

Merge_All_Samples
  • Related