Sum unique occurences in a string based on associated number-CodePudding

I work with single sequence read classification and want to filter based on the quality of classification. However, the output format needs to be changed in order to do this. I have a classification statistics (a score) like below for each read, which represents ["taxonomy":"kmers assigned to that taxonomy" "taxonomy":"kmers assigned to that taxonomy" etc.], and each taxonomy can occur multiple times.

classification_stats<-c("3:1 7:4 0:34 3:7 0:27", 
"0:110 561:19 0:37",
"0:3 562:5 0:7 543:55 0:47")

read_ID<-c("read1", "read2", "read3")

df<-data.frame(read_ID, classification_stats)

> df
  read_ID classification_stats
1   read1 3:1 7:4 0:34 3:7 0:27
2   read2 0:110 561:19 0:37
3   read3 0:3 562:5 0:7 543:55 0:47

For each read (each row) I want to count the total number of kmers assigned to a taxonomy (in classification_stats), but since each taxonomy occurs multiple times non-consecutively, this becomes more difficult. What it means is that for e.g. read1 taxonomy 3 has 1 7 kmers, taxonomy 7 has 4 kmers and taxonomy 0 has 34 27 kmers.

My desired output looks like this, preferably sorted so that tax1 is the taxonomy with the highest number of kmers.

  read_ID     classification_stats        tax1 kmer1 tax2 kmer2 tax3 kmer3
  read1       3:1 7:4 0:34 3:7 0:27       0    61    3    8     7    4
  read2       0:110 561:19 0:37           0    147   561  19    NA   NA
  read3       0:3 562:5 0:7 543:55 0:47   0    57    543  55    562  5

Either R or bash solutions are interesting.

CodePudding user response：

Here's another solution in R.

First group_by(read_ID) so that everything is done on the read_ID layer.
Then break the classification_stats column into separate rows with a white space as the delimiter.
Break the resulting column into two, separated by a colon ":".
Then create a unique_tax column, which contains the number of tax in each read_ID. This will be used as part of the column name in pivot_wider (telling us how many pairs of kmer and tax should be generated).

Since we will expect duplicated values in pivot_wider and the way we deal with it are different, I'll use two separate pivot_wider and bind them together later.

First pivot without tax, where we would have duplicated values for kmer, and we want to sum them.
Second, pivot without kmer, where we would have duplicated values for tax, and we want to have a unique value of it (do not sum it).
left_join the necessary information together. Note that the first dataset in left_join is another left_join, merging the pivot_wider result with the original df that contains the classification_stats column
Finally, use select to reorder the columns to the place you like.

library(tidyverse)

df_long <- df %>% 
  group_by(read_ID) %>% 
  separate_rows(!read_ID, sep = " ") %>% 
  separate(classification_stats, into = c("tax", "kmer")) %>%
  mutate(unique_tax = as.numeric(factor(tax)),
         across(c(tax, kmer), ~as.numeric(.x)))

left_join(
  left_join(pivot_wider(df_long, !tax,  names_from = "unique_tax", values_from = "kmer", values_fn = sum, names_prefix = "kmer"),
            df,
            by = "read_ID"),
  pivot_wider(df_long, !kmer,  names_from = "unique_tax", values_from = "tax", values_fn = unique, names_prefix = "tax"),
  by = "read_ID"
  ) %>% 
  select(read_ID, classification_stats, tax1, kmer1, tax2, kmer2, tax3, kmer3)

# A tibble: 3 × 8
# Groups:   read_ID [3]
  read_ID classification_stats       tax1 kmer1  tax2 kmer2  tax3 kmer3
  <chr>   <chr>                     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 read1   3:1 7:4 0:34 3:7 0:27         0    61     3     8     7     4
2 read2   0:110 561:19 0:37             0   147   561    19    NA    NA
3 read3   0:3 562:5 0:7 543:55 0:47     0    57   543    55   562     5

CodePudding user response：

library(tidyverse)

classification_stats <- c(
  "3:1 7:4 0:34 3:7 0:27",
  "0:110 561:19 0:37",
  "0:3 562:5 0:7 543:55 0:47"
)

read_ID <- c("read1", "read2", "read3")

df <- tibble(read_ID, classification_stats)

df %>%
  separate_rows(classification_stats, sep = " ") %>%
  separate(classification_stats, into = c("tax", "kmer")) %>%
  type_convert() %>%
  arrange(-kmer) %>%
  nest(tax) %>%
  mutate(id = row_number()) %>%
  unnest(data) %>%
  pivot_wider(names_from = id, values_from = c(kmer, tax))
#> Warning: All elements of `...` must be named.
#> Did you want `data = tax`?
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   read_ID = col_character(),
#>   tax = col_double(),
#>   kmer = col_double()
#> )
#> # A tibble: 3 × 27
#>   read_ID kmer_1 kmer_2 kmer_3 kmer_4 kmer_5 kmer_6 kmer_7 kmer_8 kmer_9 kmer_10
#>   <chr>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
#> 1 read2      110     NA     NA     37     NA     NA     19     NA     NA      NA
#> 2 read3       NA     55     47     NA     NA     NA     NA     NA      7       5
#> 3 read1       NA     NA     NA     NA     34     27     NA      7     NA      NA
#> # … with 16 more variables: kmer_11 <dbl>, kmer_12 <dbl>, kmer_13 <dbl>,
#> #   tax_1 <dbl>, tax_2 <dbl>, tax_3 <dbl>, tax_4 <dbl>, tax_5 <dbl>,
#> #   tax_6 <dbl>, tax_7 <dbl>, tax_8 <dbl>, tax_9 <dbl>, tax_10 <dbl>,
#> #   tax_11 <dbl>, tax_12 <dbl>, tax_13 <dbl>

^{Created on 2022-03-10 by the reprex package (v2.0.0)}

And this here counts how often a kmer or taxon occurred for each read:

df %>%
  separate_rows(classification_stats, sep = " ") %>%
  separate(classification_stats, into = c("tax", "kmer")) %>%
  type_convert() %>%
  arrange(-kmer) %>%
  nest(-tax) %>%
  mutate(id = row_number()) %>%
  unnest(data) %>%
  pivot_wider(names_from = id, values_from = c(kmer, tax), values_fn = length)
#> Warning: All elements of `...` must be named.
#> Did you want `data = -tax`?
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   read_ID = col_character(),
#>   tax = col_double(),
#>   kmer = col_double()
#> )
#> # A tibble: 3 × 13
#>   read_ID kmer_1 kmer_2 kmer_3 kmer_4 kmer_5 kmer_6 tax_1 tax_2 tax_3 tax_4
#>   <chr>    <int>  <int>  <int>  <int>  <int>  <int> <int> <int> <int> <int>
#> 1 read2        2     NA      1     NA     NA     NA     2    NA     1    NA
#> 2 read3        3      1     NA     NA      1     NA     3     1    NA    NA
#> 3 read1        2     NA     NA      2     NA      1     2    NA    NA     2
#> # … with 2 more variables: tax_5 <int>, tax_6 <int>

CodePudding user response：

solution using data.table

library(data.table)

setDT(df) # make the tibble a data.table

dt <- df[, .(tax_kmer = unlist(str_split(classification_stats, " "))), by = read_ID]
dt[, c("tax", "kmer") := tstrsplit(tax_kmer, ":")]
dt <- dt[, .(kmer = sum(as.numeric(kmer))), by = .(read_ID, tax)]
dt[, order := frankv(kmer, ties.method = "first", order = c(-1L)), by = read_ID]
dt <- dcast(dt, read_ID ~ order, value.var = c("kmer", "tax"))

results

dt
#    read_ID kmer_1 kmer_2 kmer_3 tax_1 tax_2 tax_3
# 1:   read1     61      8      4     0     3     7
# 2:   read2    147     19     NA     0   561  <NA>
# 3:   read3     57     55      5     0   543   562

data

classification_stats <- c(
  "3:1 7:4 0:34 3:7 0:27",
  "0:110 561:19 0:37",
  "0:3 562:5 0:7 543:55 0:47"
)
read_ID <- c("read1", "read2", "read3")
df <- tibble(read_ID, classification_stats)