I have multiple .txt files (each file contains 4 columns; an identifier gene column, a raw_counts and other columns). I would like to merge those files into a combined dataframe using the common gene column. I was able to import multiple .txt files together, merge based on identifier column, however, after merging, the raw_counts column becomes un-identifiable (arising from which .txt file?). Is it possible to add a prefix/name of the part of the .txt file to this column and drop the rest of the columns (median_length_normalized and RPKM). I have provide an example which might be helpful for you to provide inputs.
File names: For instance
TARGET-00-BM3897-14A-01R.gene.quantification.txt
TARGET-00-BM3969-14A-01R.gene.quantification.txt
Add prefix names to the raw_counts column names as:
raw_counts_TARGET-00-BM3897-14A-01R
raw_counts_TARGET-00-BM3969-14A-01R
Load files and merging:
library(tidyverse)
all_files <- dir("/Target_Data/TARGET_FHCRC/")
file_names <- grep(all_files,pattern = "^T.*gene.quantification.txt$",value = TRUE)
Data_file <- map(file_names,read.delim, stringsAsFactors = FALSE, check.names = FALSE, row.names = NULL)
str(Data_file)
Merge_All_Samples <- Data_file %>% reduce(inner_join, by = "gene")
colnames(Merge_All_Samples)
## There could be more for instance; ".x", ".x.x", ".x.x.x", , similary ".y", ".y.y", ".y.y.y"
names(Merge_All_Samples) = gsub(names(Merge_All_Samples),pattern = ".x",replacement = "")
names(Merge_All_Samples) = gsub(names(Merge_All_Samples),pattern = ".y",replacement = "")
colnames(Merge_All_Samples)
dput(Merge_All_Samples)
structure(list(gene = c("ENSG00000000457", "ENSG00000000460",
"ENSG00000000938", "ENSG00000000971", "ENSG00000001460"), raw_counts = c(1359L,
1042L, 24113L, 296L, 351L), median_length_normalized = c(26.5127,
12.2998, 513.195, 2.7302, 3.0886), RPKM = c(5.2943, 2.4561, 102.4798,
0.5452, 0.6167), raw_counts = c(1497L, 1152L, 20619L, 251L, 314L
), median_length_normalized = c(29.2034, 13.5943, 438.824, 2.3087,
2.7697), RPKM = c(6.9478, 3.2342, 104.4015, 0.5493, 0.6589)), class = "data.frame", row.names = c(NA,
-5L))
#> gene raw_counts median_length_normalized RPKM raw_counts
#> 1 ENSG00000000457 1359 26.5127 5.2943 1497
#> 2 ENSG00000000460 1042 12.2998 2.4561 1152
#> 3 ENSG00000000938 24113 513.1950 102.4798 20619
#> 4 ENSG00000000971 296 2.7302 0.5452 251
#> 5 ENSG00000001460 351 3.0886 0.6167 314
#> median_length_normalized RPKM
#> 1 29.2034 6.9478
#> 2 13.5943 3.2342
#> 3 438.8240 104.4015
#> 4 2.3087 0.5493
#> 5 2.7697 0.6589
## Drop columns "RPKM", "median_length_normalized" from all files (not necessary columns)
drop.cols <- c("RPKM", "median_length_normalized")
Merge_All_Samples <- Merge_All_Samples %>% select(-one_of(drop.cols))
Expected output:
dput(Output)
structure(list(gene = c("ENSG00000000457", "ENSG00000000460",
"ENSG00000000938", "ENSG00000000971", "ENSG00000001460"), "raw_counts_TARGET-00-BM3897-14A-01R" = c(1359L,
1042L, 24113L, 296L, 351L), "raw_counts_TARGET-00-BM3969-14A-01R" = c(1497L, 1152L, 20619L,
251L, 314L)), class = "data.frame", row.names = c(NA, -5L))
#> gene raw_counts_TARGET-00-BM3897-14A-01R
#> 1 ENSG00000000457 1359
#> 2 ENSG00000000460 1042
#> 3 ENSG00000000938 24113
#> 4 ENSG00000000971 296
#> 5 ENSG00000001460 351
#> raw_counts_TARGET-00-BM3969-14A-01R
#> 1 1497
#> 2 1152
#> 3 20619
#> 4 251
#> 5 314
Thank you,
Mohammed
CodePudding user response:
You can try -
library(tidyverse)
#List all the files
all_files <- dir("/Target_Data/TARGET_FHCRC/")
#include the files with the specific pattern
file_names <- grep(all_files,
pattern = "^T.*gene.quantification.txt$",value = TRUE)
#Read the file in a list with the name of the file as list name
Data_file <- sapply(file_names,read.delim, stringsAsFactors = FALSE,
check.names = FALSE, row.names = NULL, simplify = FALSE)
#Drop .txt from name of the list
names(Data_file) <- sub('\\.txt$', '', names(Data_file))
#For each dataframe in list of dataframe append name of the file
Merge_All_Samples <- Data_file %>%
imap(function(x, y) x %>%
#Drop the columns that you don't need
select(-median_length_normalized, -RPKM) %>%
rename_with(~paste0(., y), -gene)) %>%
reduce(inner_join, by = "gene")
Merge_All_Samples