How can I write a loop/function to calculate the % difference between different rows/factors?-CodePudding

I'm looking to do something that's out of my coding league. I have included 2 snippets of real data below.

Basically, I want to add a column with mutate (or create a new data frame, if that is easier) that calculates the % difference of the measurement "median.fsc" between rows that share the same "strain" and the same experiment "run" but differ in "starvation" state.

So for example

I want the % difference of median.fsc of 381-starved / 381-enriched for run 1

I presume I would be using dplyr, but I have no clue where to start!

There is some further manipulation that I also want to do, but I'm guessing I can adapt whatever code I end up using:

A strain comparison of median.fsc expressed as a %, i.e. 381.dwecA (enriched) / 381 (enriched) in run 1.

# Sample tibble for the enriched condition (looks like dput() output): 9 rows,
# all with starvation = "enriched" (level code 2), live.dead = "live" and
# run = "1". The strain factor declares 10 levels, but level 9
# ("381.dE.dG.cE.cG.c1") has no row in this sample.
# The expression is not assigned to a name here.
structure(list(strain = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 
        8L, 10L), .Label = c("381", "381.dwecA", "381.dwecA.cwecA", "381.dwecE", 
        "381.dwecE.cwecE", "381.dwaaG", "381.dwaaG.cwaaG", "381.dE.dG", 
        "381.dE.dG.cE.cG.c1", "381.dE.dG.cE.cG.c2"), class = "factor"), 
            starvation = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
            2L), .Label = c("starved", "enriched"), class = "factor"), 
            live.dead = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
            ), .Label = c("live", "mixed"), class = "factor"), geomean.fsc = c(9283.6, 
            8012.8, 8030.1, 9464.1, 8286, 7903.6, 10600.7, 8539.2, 8935.1
            ), mean.fsc = c(11002.9, 9551.4, 9606.6, 10747.4, 9824, 9277, 
            11874.9, 9645.4, 10537.6), median.fsc = c(10581.7, 8420.3, 
            8393.4, 10050.1, 8771.6, 7979.3, 11526, 8371.3, 9701.5), 
            std.dv = c(5808.4, 5482.6, 5533, 5264, 5481.1, 5250, 5202.3, 
            4816.8, 5654.1), run = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
            1L, 1L, 1L), .Label = c("1", "2", "3"), class = "factor")), row.names = c(NA, 
        -9L), class = c("tbl_df", "tbl", "data.frame"))

    # Sample tibble for the starved condition (looks like dput() output): 9 rows,
    # all with starvation = "starved" (level code 1), live.dead = "live" and
    # run = "1". The strain factor declares 10 levels, but level 9
    # ("381.dE.dG.cE.cG.c1") has no row in this sample.
    # std.dv is NA for every row. The expression is not assigned to a name here.
    structure(list(strain = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 
    8L, 10L), .Label = c("381", "381.dwecA", "381.dwecA.cwecA", "381.dwecE", 
    "381.dwecE.cwecE", "381.dwaaG", "381.dwaaG.cwaaG", "381.dE.dG", 
    "381.dE.dG.cE.cG.c1", "381.dE.dG.cE.cG.c2"), class = "factor"), 
        starvation = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L), .Label = c("starved", "enriched"), class = "factor"), 
        live.dead = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
        ), .Label = c("live", "mixed"), class = "factor"), geomean.fsc = c(2123.8, 
        2426.4, 2056.6, 2556.5, 2452, 2703.3, 2065, 3875, 2362.6), 
        mean.fsc = c(2482.7, 2867, 2321, 2757.6, 2651.4, 3107.2, 
        2368.9, 4308.2, 2571.1), median.fsc = c(1924.9, 2177, 1896, 
        2512.2, 2378.8, 2442.8, 1883.6, 3882.8, 2263.4), std.dv = c(NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
        NA_real_, NA_real_), run = structure(c(1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L), .Label = c("1", "2", "3"), class = "factor")), row.names = c(NA, 
    -9L), class = c("tbl_df", "tbl", "data.frame"))

CodePudding user response：

Let's assume your datasets are called df_starved and df_enriched:

library(dplyr)

# Build a per-strain, per-run comparison table from the starved and enriched
# measurements.
#
# The two tables are matched on their keys (strain, run) instead of by row
# position, so the result stays correct when the rows are ordered differently
# or when a strain is missing from one of the tables (inner_join drops strains
# that lack a partner).
# NOTE(review): if a strain can have several rows per run (e.g. one per
# live.dead state), add that column to the join keys -- confirm with the data.
#
# Resulting columns:
#   diff_median          starved median.fsc as a % of the enriched median.fsc
#   strain_comp_enriched enriched median.fsc as a % of the enriched median.fsc
#                        of the reference strain "381" in the same run
df_new <- df_starved %>%
    select(strain, run, median_starved = median.fsc) %>%
    inner_join(
        select(df_enriched, strain, run, median_enriched = median.fsc),
        by = c("strain", "run")
    ) %>%
    group_by(run) %>%
    mutate(
        diff_median = median_starved / median_enriched * 100,
        # match() always yields exactly one index: the reference strain is
        # looked up by name rather than assumed to sit in row 1, and a run
        # without a "381" row gives NA instead of an error.
        strain_comp_enriched =
            median_enriched / median_enriched[match("381", strain)] * 100
    ) %>%
    ungroup() %>%
    select(strain, run, diff_median, strain_comp_enriched)
df_new

CodePudding user response：

Would you consider joining the two tables on matching columns and then deploying calculations within the obtained structure?

library("tidyverse")

# Convert the join keys to plain types so both tables join cleanly.
# In the sample data `run` is a factor; as.integer() on a factor returns the
# internal level codes rather than the labels, so go through as.character()
# first to obtain the actual run numbers.
prepare_keys <- function(df) {
    mutate(
        df,
        strain = as.character(strain),
        run = as.integer(as.character(run))
    )
}

# Join the two tables on strain and run, keep only the pairs whose starvation
# states differ, and compute a comparison of the median.fsc values.
# Columns that exist in both tables get the suffix "_dtaA" / "_dtaB".
full_join(
    prepare_keys(df_A),
    prepare_keys(df_B),
    by = c("strain", "run"),
    suffix = c("_dtaA", "_dtaB")
) %>%
    # Inspect the joined table before filtering
    glimpse() %>%
    # Ensure we are comparing unmatched starvation states; rows without a
    # partner in the other table have NA here and are dropped by filter()
    filter(starvation_dtaA != starvation_dtaB) %>%
    # Placeholder calculation: replace with whatever comparison is needed,
    # e.g. median.fsc_dtaB / median.fsc_dtaA * 100 for a percentage
    mutate(any_calcs = median.fsc_dtaA - median.fsc_dtaB) %>%
    select(starts_with("strain"), starts_with("run"), contains("any_calcs"),
           starts_with("starvation")) %>%
    glimpse()

Results

# Rows: 9
# Columns: 5
# $ strain          <chr> "381", "381.dwecA", "381.dwecA.cwecA", "381.dwecE", "381.dwecE.cwecE", "381.dwaa…
# $ run             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1
# $ any_calcs       <dbl> 8656.8, 6243.3, 6497.4, 7537.9, 6392.8, 5536.5, 9642.4, 4488.5, 7438.1
# $ starvation_dtaA <fct> enriched, enriched, enriched, enriched, enriched, enriched, enriched, enriched, …
# $ starvation_dtaB <fct> starved, starved, starved, starved, starved, starved, starved, starved, starved