I have a data frame that looks like the one shown below.
I want to create a new column called `labels` that marks the genes with the two largest positive log2FoldChange values (and small p-values) as well as the genes with the two most negative log2FoldChange values (and small p-values).
library(tidyverse)

# Example data: one row per gene, with its log2 fold change and p-value.
# Written row-wise so each gene's values can be read across a single line.
df <- tribble(
  ~gene,       ~log2FoldChange, ~pvalue,
  "AT1G23410",       14.468018,  0.001,
  "AT2G05530",       13.763999,  0.004,
  "AT2G26820",       13.708055,  0.005,
  "AT1G36240",      -13.480280,  0.0041,
  "AT5G05260",      -12.166529,  0.0034,
  "AT5G47260",      -11.468673,  0.004
)

# Print the tibble.
df
#> # A tibble: 6 × 3
#> gene log2FoldChange pvalue
#> <chr> <dbl> <dbl>
#> 1 AT1G23410 14.5 0.001
#> 2 AT2G05530 13.8 0.004
#> 3 AT2G26820 13.7 0.005
#> 4 AT1G36240 -13.5 0.0041
#> 5 AT5G05260 -12.2 0.0034
#> 6 AT5G47260 -11.5 0.004
Created on 2022-10-19 with reprex v2.0.2
I want my data to look like this.
#> gene log2FoldChange pvalue labels
#> <chr> <dbl> <dbl>
#> 1 AT1G23410 14.5 0.001 AT1G23410
#> 2 AT2G05530 13.8 0.004 AT2G05530
#> 3 AT2G26820 13.7 0.005
#> 4 AT1G36240 -13.5 0.0041 AT1G36240
#> 5 AT5G05260 -12.2 0.0034 AT5G05260
#> 6 AT5G47260 -11.5 0.004
EDIT:
By "small pvalue" I mean the smallest value in the dataset. In terms of `arrange()`, it would be `arrange(log2FoldChange, pvalue)` for the negative ones and `arrange(desc(log2FoldChange), pvalue)` for the positive ones. Does that make sense?
CodePudding user response:
You can use `arrange()` and `ifelse()`:
# Sort ascending by log2FoldChange, then tag the two rows at each end of the
# sorted table with their gene name; every other row gets an empty label.
df %>%
  arrange(log2FoldChange) %>%
  mutate(
    labels = ifelse(
      row_number() <= 2 | row_number() > n() - 2,
      gene,
      ""
    )
  )
output
# A tibble: 6 × 4
gene log2FoldChange pvalue labels
<chr> <dbl> <dbl> <chr>
1 AT1G36240 -13.5 0.0041 "AT1G36240"
2 AT5G05260 -12.2 0.0034 "AT5G05260"
3 AT5G47260 -11.5 0.004 ""
4 AT2G26820 13.7 0.005 ""
5 AT2G05530 13.8 0.004 "AT2G05530"
6 AT1G23410 14.5 0.001 "AT1G23410"
If pvalue should also be taken into account, you can do:
# Label the two most negative and the two most positive log2FoldChange rows,
# breaking ties on log2FoldChange by the smaller pvalue, without reordering df.
#
# Rows are selected by position (the indices returned by order()), not by
# matching log2FoldChange values: value matching would label every row that
# shares a tied log2FoldChange, which makes the pvalue tie-break meaningless,
# and it depends on exact floating-point equality.
df %>%
  mutate(
    labels = ifelse(
      row_number() %in% c(
        head(order(log2FoldChange, pvalue), 2),   # two most negative
        head(order(-log2FoldChange, pvalue), 2)   # two most positive
      ),
      gene,
      ""
    )
  )
CodePudding user response:
Here's an option:
library(dplyr)
library(purrr)

# Example data: one row per gene, with its log2 fold change and p-value.
df <- tibble(
  gene = c(
    "AT1G23410", "AT2G05530", "AT2G26820",
    "AT1G36240", "AT5G05260", "AT5G47260"
  ),
  log2FoldChange = c(
    14.468018, 13.763999, 13.708055,
    -13.480280, -12.166529, -11.468673
  ),
  pvalue = c(0.001, 0.004, 0.005, 0.0041, 0.0034, 0.004)
)

# Split the genes by sign of log2FoldChange, keep the two rows with the
# largest absolute change in each half, and join the resulting gene -> labels
# lookup back onto the full table. Genes without a match end up with NA.
# The join key is left implicit, so dplyr joins on the shared column.
df %>%
  left_join(
    df %>%
      mutate(positive = ifelse(log2FoldChange >= 0, "y", "n")) %>%
      split(.$positive) %>%
      map(function(half) {
        half %>%
          arrange(desc(abs(log2FoldChange))) %>%
          slice_head(n = 2) %>%
          select(gene) %>%
          mutate(labels = gene)
      }) %>%
      bind_rows()
  )
#> Joining, by = "gene"
#> # A tibble: 6 × 4
#> gene log2FoldChange pvalue labels
#> <chr> <dbl> <dbl> <chr>
#> 1 AT1G23410 14.5 0.001 AT1G23410
#> 2 AT2G05530 13.8 0.004 AT2G05530
#> 3 AT2G26820 13.7 0.005 <NA>
#> 4 AT1G36240 -13.5 0.0041 AT1G36240
#> 5 AT5G05260 -12.2 0.0034 AT5G05260
#> 6 AT5G47260 -11.5 0.004 <NA>
CodePudding user response:
You can `arrange()` the data and label the first and last 2 rows.
library(dplyr)

# Sort descending by log2FoldChange and put the gene name in `label` for the
# first two and last two rows of the sorted table; all other rows get "".
df %>%
  arrange(desc(log2FoldChange)) %>%
  mutate(
    label = case_when(
      row_number() <= 2 | row_number() >= n() - 1 ~ gene,
      TRUE ~ ""
    )
  )
# gene log2FoldChange pvalue label
# <chr> <dbl> <dbl> <chr>
#1 AT1G23410 14.5 0.001 "AT1G23410"
#2 AT2G05530 13.8 0.004 "AT2G05530"
#3 AT2G26820 13.7 0.005 ""
#4 AT5G47260 -11.5 0.004 ""
#5 AT5G05260 -12.2 0.0034 "AT5G05260"
#6 AT1G36240 -13.5 0.0041 "AT1G36240"
Note: it is not clear to me what you mean by a small p-value. This answer only considers the log2FoldChange variable, which seems to match your expected output.
CodePudding user response:
A base R approach that relies only on `order()` and some indexing:
# Start with an all-missing character column so the column type is correct
# from the outset instead of relying on coercion from a logical NA.
df$labels <- NA_character_

# Row positions of the two most negative and two most positive
# log2FoldChange values; ties are broken by the smaller pvalue.
# head() returns fewer indices when df has fewer than two rows, so short
# inputs do not index past the end of the data.
top_neg <- head(order(df$log2FoldChange, df$pvalue), 2)
top_pos <- head(order(-df$log2FoldChange, df$pvalue), 2)
keep <- c(top_neg, top_pos)

# Copy the gene names into labels for the selected rows only; the row order
# of df is left untouched.
df$labels[keep] <- df$gene[keep]
df
# A tibble: 6 × 4
gene log2FoldChange pvalue labels
<chr> <dbl> <dbl> <chr>
1 AT1G23410 14.5 0.001 AT1G23410
2 AT2G05530 13.8 0.004 AT2G05530
3 AT2G26820 13.7 0.005 NA
4 AT1G36240 -13.5 0.0041 AT1G36240
5 AT5G05260 -12.2 0.0034 AT5G05260
6 AT5G47260 -11.5 0.004 NA