Hi I have the following dataframe.
temp = structure(list(A = c(0, 0, 0, 3.72900887033786, 1.94860084749336,
0), C = c(0, 0, 0, 3.44095219802964, 2.35049724708413, 0.0285691521967709
), A = c(0, 0, 0, 3.29572302453997, 0.933572638261024, 0), D = c(0,
0, 0, 2.4905701304462, 1.54101915313356, 0), E = c(0, 0, 0, 4.23189316164533,
1.7311832415722, 0), E = c(0, 0, 0, 4.37851162325373, 2.50080205305716,
0), D = c(0, 0, 0, 3.68929916053589, 2.4905701304462, 0.189033824390017
), F = c(0, 2.27500704749987, 0, 3.68032435684402, 1.77820857639809,
0), A = c(0, 0, 0, 3.5668151540109, 1.72683121703249, 0.0285691521967709
), G = c(0, 0, 0, 5.6450098843911, 3.09929520433778, 0)), row.names = c("5_8S_rRNA",
"5S_rRNA", "7SK", "A1BG", "A1BG-AS1", "A1CF"), class = "data.frame")
it looks like this.
A C A D E E D F A G
5_8S_rRNA 0.000000 0.00000000 0.0000000 0.000000 0.000000 0.000000 0.0000000 0.000000 0.00000000 0.000000
5S_rRNA 0.000000 0.00000000 0.0000000 0.000000 0.000000 0.000000 0.0000000 2.275007 0.00000000 0.000000
7SK 0.000000 0.00000000 0.0000000 0.000000 0.000000 0.000000 0.0000000 0.000000 0.00000000 0.000000
A1BG 3.729009 3.44095220 3.2957230 2.490570 4.231893 4.378512 3.6892992 3.680324 3.56681515 5.645010
A1BG-AS1 1.948601 2.35049725 0.9335726 1.541019 1.731183 2.500802 2.4905701 1.778209 1.72683122 3.099295
A1CF 0.000000 0.02856915 0.0000000 0.000000 0.000000 0.000000 0.1890338 0.000000 0.02856915 0.000000
What I like to do is collapse any column that are duplicates by averaging duplicates but I want to do for each row.
The ideal dataframe would be contain the same amount of rows but would only contain columns A, C, D, E, F , G
is this possible? thank you.
CodePudding user response:
We could use split.default
to split by the column names and loop over the list
, apply the rowMeans
sapply(split.default(temp, names(temp)), rowMeans)
A C D E F G
5_8S_rRNA 0.000000000 0.00000000 0.00000000 0.000000 0.000000 0.000000
5S_rRNA 0.000000000 0.00000000 0.00000000 0.000000 2.275007 0.000000
7SK 0.000000000 0.00000000 0.00000000 0.000000 0.000000 0.000000
A1BG 3.530515683 3.44095220 3.08993465 4.305202 3.680324 5.645010
A1BG-AS1 1.536334901 2.35049725 2.01579464 2.115993 1.778209 3.099295
A1CF 0.009523051 0.02856915 0.09451691 0.000000 0.000000 0.000000
CodePudding user response:
Another base R solution with rowsum
:
t(rowsum(t(temp), names(temp)) / c(table(names(temp))))
A C D E F G
5_8S_rRNA 0.000000000 0.00000000 0.00000000 0.000000 0.000000 0.000000
5S_rRNA 0.000000000 0.00000000 0.00000000 0.000000 2.275007 0.000000
7SK 0.000000000 0.00000000 0.00000000 0.000000 0.000000 0.000000
A1BG 3.530515683 3.44095220 3.08993465 4.305202 3.680324 5.645010
A1BG-AS1 1.536334901 2.35049725 2.01579464 2.115993 1.778209 3.099295
A1CF 0.009523051 0.02856915 0.09451691 0.000000 0.000000 0.000000
CodePudding user response:
Is this what you're looking for?
library(tidyr)
library(dplyr)
output <- temp %>%
add_rownames(var = "ID") %>%
pivot_longer(-ID) %>%
mutate(name = gsub("\\.*\\d*", "", name)) %>%
group_by(ID, name) %>%
summarise(value = mean(value)) %>%
mutate(name = gsub("\\.*\\d*", "", name)) %>%
pivot_wider(names_from = "name", values_from = "value")
output
# A tibble: 6 × 7
# Groups: ID [6]
ID A C D E F G
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 5_8S_rRNA 0 0 0 0 0 0
2 5S_rRNA 0 0 0 0 2.28 0
3 7SK 0 0 0 0 0 0
4 A1BG 3.53 3.44 3.09 4.31 3.68 5.65
5 A1BG-AS1 1.54 2.35 2.02 2.12 1.78 3.10
6 A1CF 0.00952 0.0286 0.0945 0 0 0
CodePudding user response:
Here is a base R solution:
t(do.call(rbind, by(t(temp), row.names(t(temp)), FUN = colMeans)))
Or with tidyverse
Here is a tidyverse
option, where we can split on the unique names, then we set the names for each dataframe, then use rowMeans
. However, we have to use setNames
in the first map
statement to make the column names unique, as tidyverse
does not like duplicate column names. Then, since the rownames also get dropped, then we can add them back at the end.
library(tidyverse)
map(.x = unique(names(temp)), ~
select(setNames(temp, make.names(names(temp), unique = TRUE)), starts_with(.x))) %>%
set_names(unique(names(temp))) %>%
map_dfc(., rowMeans) %>%
as.data.frame() %>%
`rownames<-`(row.names(temp))
Or another base R solution:
temp2 <- t(temp)
t(tapply(temp2, list(row.names(temp2)[row(temp2)], colnames(temp2)[col(temp2)]), FUN = mean))
Output
A C D E F G
5_8S_rRNA 0.00000000 0.0000000 0.0000000 0.00000 0.00000 0.00000
5S_rRNA 0.00000000 0.0000000 0.0000000 0.00000 2.27501 0.00000
7SK 0.00000000 0.0000000 0.0000000 0.00000 0.00000 0.00000
A1BG 3.53051568 3.4409522 3.0899346 4.30520 3.68032 5.64501
A1BG-AS1 1.53633490 2.3504972 2.0157946 2.11599 1.77821 3.09930
A1CF 0.00952305 0.0285692 0.0945169 0.00000 0.00000 0.00000
CodePudding user response:
Using data.table
library('data.table')
d <- transpose(as.data.table(temp), keep.names = "rn")
d <- d[, by = rn, lapply(.SD, mean)]
d <- cbind(rn = row.names(temp), transpose(d, make.names = T))