My dataset looks like this:
Names Sample Init_QC Run_QC
A DN PASS PASS
A DN FAIL PASS
A RN FAIL FAIL
B DN PASS FAIL
B DN PASS PASS
B RN FAIL PASS
C DN PASS FAIL
C DN FAIL FAIL
C RN PASS PASS
I am looking to have things summarized by "Names" by counting number of occurrences of each category. Result would look like this:
Names DN RN Init_QC$PASS Init_QC$FAIL Run_QC$PASS Run_QC$FAIL
A 2 1 1 2 2 1
B 2 1 2 1 2 1
C 2 1 1 2 1 2
I was hoping to use "table" or "count" from dplyr but without success
Would anyone have an easy way to do this ? Thanks very much
CodePudding user response:
Here is a tidyverse
way.
x <- 'Names Sample Init_QC Run_QC
A DN PASS PASS
A DN FAIL PASS
A RN FAIL FAIL
B DN PASS FAIL
B DN PASS PASS
B RN FAIL PASS
C DN PASS FAIL
C DN FAIL FAIL
C RN PASS PASS'
df1 <- read.table(textConnection(x), header = TRUE)
suppressPackageStartupMessages({
library(dplyr)
library(tidyr)
})
df1 %>%
count(Names, Sample) %>%
pivot_wider(
names_from = Sample,
values_from = n
) %>%
left_join(
df1 %>%
select(-Sample) %>%
pivot_longer(
cols = ends_with("QC"),
names_to = "QC",
values_to = "value"
) %>%
count(Names, QC, value) %>%
pivot_wider(
names_from = c("QC", "value"),
values_from = n
),
by = "Names"
)
#> # A tibble: 3 × 7
#> Names DN RN Init_QC_FAIL Init_QC_PASS Run_QC_FAIL Run_QC_PASS
#> <chr> <int> <int> <int> <int> <int> <int>
#> 1 A 2 1 2 1 1 2
#> 2 B 2 1 1 2 1 2
#> 3 C 2 1 1 2 2 1
Created on 2022-06-20 by the reprex package (v2.0.1)
CodePudding user response:
in base R:
aggregate(.~Names, df, table)
Names Sample.DN Sample.RN Init_QC.FAIL Init_QC.PASS Run_QC.FAIL Run_QC.PASS
1 A 2 1 2 1 1 2
2 B 2 1 1 2 1 2
3 C 2 1 1 2 2 1
To make everything into columns, do:
do.call(data.frame, aggregate(.~Names, df, table))
Names Sample.DN Sample.RN Init_QC.FAIL Init_QC.PASS Run_QC.FAIL Run_QC.PASS
1 A 2 1 2 1 1 2
2 B 2 1 1 2 1 2
3 C 2 1 1 2 2 1
using reshape2
:
reshape2::recast(df, Names~variable value, fun.agg = length, id.var = 'Names')
Names Sample_DN Sample_RN Init_QC_FAIL Init_QC_PASS Run_QC_FAIL Run_QC_PASS
1 A 2 1 2 1 1 2
2 B 2 1 1 2 1 2
3 C 2 1 1 2 2 1
In tidyverse:
library(tidyverse)
df %>%
pivot_longer(-Names) %>%
count(Names, name, value) %>%
pivot_wider(Names, names_from = c(name, value), values_from = n)
# A tibble: 3 x 7
Names Init_QC_FAIL Init_QC_PASS Run_QC_FAIL Run_QC_PASS Sample_DN Sample_RN
<chr> <int> <int> <int> <int> <int> <int>
1 A 2 1 1 2 2 1
2 B 1 2 1 2 2 1
3 C 1 2 2 1 2 1
CodePudding user response:
This seems a bit more complex than it should be, but it does the job:
library(tidyverse)
tibble(Names = unique(df$Names)) %>%
bind_cols(table(df$Names, df$Sample), table(df$Names, df$Init_QC)) %>%
rename(Init_FAIL = FAIL, Init_PASS = PASS) %>%
bind_cols(table(df$Names, df$Run_QC)) %>%
rename(Run_FAIL = FAIL, Run_PASS = PASS)
#> # A tibble: 3 x 7
#> Names DN RN Init_FAIL Init_PASS Run_FAIL Run_PASS
#> <chr> <int> <int> <int> <int> <int> <int>
#> 1 A 2 1 2 1 1 2
#> 2 B 2 1 1 2 1 2
#> 3 C 2 1 1 2 2 1
CodePudding user response:
In principle I would do it this way. I have tried to shorten the code but failed:
The code should be self explained. First count
and split the df
by the desired columns and pivot_wider
. Finally bring them all together:
library(dplyr)
library(tidyr)
df1 <- df %>%
count(Names, Sample) %>%
pivot_wider(names_from=Sample, values_from=n, names_glue = )
df2 <- df %>%
count(Names, Init_QC) %>%
pivot_wider(names_from= Init_QC, values_from = n, names_glue = "Init_QC${Init_QC}")
df3 <- df %>%
count(Names, Run_QC) %>%
pivot_wider(names_from= Run_QC, values_from = n, names_glue = "Run_QC${Run_QC}")
df1 %>%
left_join(df2) %>%
left_join(df3)
Names DN RN `Init_QC$FAIL` `Init_QC$PASS` `Run_QC$FAIL` `Run_QC$PASS`
<chr> <int> <int> <int> <int> <int> <int>
1 A 2 1 2 1 1 2
2 B 2 1 1 2 1 2
3 C 2 1 1 2 2 1
CodePudding user response:
library(data.table)
setDT(df)
# summary for each column
x <- names(df)
y <- rbindlist(lapply(setdiff(x, 'id'), \(i) df[, .(rows=.N), c('id', i)][, (i) := paste0(i, '_', get(i))])
, F
)
# long to wide
dcast(y, paste('id', '~', x[2]))