How to create a count table in R?-CodePudding

I have a dataframe like below, which contains the body lengths and the observed counts of species A and B at each station

> set.seed(10)
> df <- data.frame(
    species = c(rep("A",4), rep("B",4)),
    station = rep(1:2, 4),
    length = round(rnorm(8, 15, 2)),
    count = round(rnorm(8, 5, 2))
  )
> df
  species station length count
1       A       1     15     2
2       A       2     15     4
3       A       1     12     7
4       A       2     14     7
5       B       1     16     5
6       B       2     16     7
7       B       1     13     6
8       B       2     14     5

What I want to do is to transform this into a table with 2cm bins like this. But how can I get this?

> cnt_table <- data.frame(
    species = c("A","A","B","B"),
    station = c(1,2,1,2),
    L11_12 = c(0,0,0,0),
    L13_14 = c(7,7,6,5),
    L15_16 = c(2,4,5,7),
    L17_18 = c(0,0,0,0),
    L19_20 = c(0,0,0,0)
  )
> cnt_table
  species station L11_12 L13_14 L15_16 L17_18 L19_20
1       A       1      0      7      2      0      0
2       A       2      0      7      4      0      0
3       B       1      0      6      5      0      0
4       B       2      0      5      7      0      0

CodePudding user response：

First, you need to use cut to bin the lengths. Then you can use complete to fill the missing counts with 0. Then, group_by species, station and bin and use summarize to add the counts per group. Last, use pivot_wider to make the bins column labels.

Note 1: The result differs from your expected output, but I think you have a typo.

Note 2: I don't know if teh grouping and summing is necessary. In your example it's not, but logically I would include it.

library(tidyverse)

set.seed(10)
df <- data.frame(
  species = c(rep("A",4), rep("B",4)),
  station = rep(1:2, 4),
  length = round(rnorm(8, 15, 2)),
  count = round(rnorm(8, 5, 2))
)
df


#---------------------
df %>% 
  mutate(length = cut(length, 
                      breaks = seq(10.5, 20.5, by = 2),
                      labels = c("L11_12", "L13_14", "L15_16", "L17_18", "L19_20"))) %>% 
  complete(species, station, length, fill = list(count = 0)) %>% 
  group_by(species, station, length) %>% 
  summarize(count = sum(count)) %>% 
  pivot_wider(names_from = length, values_from = count)


#---------------------
# A tibble: 4 x 7
# Groups:   species, station [4]
  species station L11_12 L13_14 L15_16 L17_18 L19_20
  <chr>     <int>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 A             1      7      0      2      0      0
2 A             2      0      7      4      0      0
3 B             1      0      6      5      0      0
4 B             2      0      5      7      0      0

CodePudding user response：

a data.table approach that should scale well

library(data.table)
# create alookup table with categories
lookup <- data.table(
  name = paste0("L", seq(11, 19, 2), "_", seq(12,20, 2)),
  from = seq(11, 19, 2),
  to = seq(12,20, 2)
)
lookup
# join with a non-equi join
setDT(df)[lookup, name := i.name, on = .(length >= from, length <= to)]
# spreak to wide
dcast(df, species   station ~ name, value.var = "count", fun.aggregate = sum)
#    species station L11_12 L13_14 L15_16
# 1:       A       1      7      0      2
# 2:       A       2      0      7      4
# 3:       B       1      0      6      5
# 4:       B       2      0      5      7

CodePudding user response：

You can do something like:


library(dplyr)
library(tidyr)

# Create the bins
df$bin  <- cut(df$length, breaks = seq(from = min(df$length-1), to = max(df$length) 1, by = 2))

# Get the data frame into shape
df  |> 
    mutate(
        bin_start = substr(bin, 2,3), 
        bin_end = as.numeric(substr(bin, 5,6))-1, 
        bin_name = paste0("L", bin_start, "_", bin_end))  |>
    group_by(species, station, bin_name)  |>
    summarise(n = n())  |>
    pivot_wider(
        names_from = bin_name, 
        values_from = n, 
        values_fill = list(n = 0)
   )

# A tibble: 4 x 5
# Groups:   species, station [4]
#   species station L11_12 L13_14 L15_16
#   <chr>     <int>  <int>  <int>  <int>
# 1 A             1      1      1      0
# 2 A             2      0      2      0
# 3 B             1      1      0      1
# 4 B             2      0      1      1

CodePudding user response：

Define your desired range r, i.e. 11:20 as shown in OP (length should be even!). Then match df$lengh with a helper-matrix m to get bins, calculate aggregated sums and reshape to wide format, e.g. using reshape2::dcast.

r <- 11:20; m <- matrix(r, ncol=2, byrow=TRUE)

transform(df, bin=factor(sapply(df$length, \(z) which(rowSums(z == m) > 0)),
                         levels=seq_along(x), labels=apply(m, 1, paste, collapse='_'))) |>
  aggregate(count ~ bin   species   station, sum) |>
  reshape2::dcast(species   station ~ bin, value.var='count', drop=FALSE, fill=0)
#   species station 11_12 13_14 15_16 17_18 19_20
# 1       A       1     7     0     2     0     0
# 2       A       2     0     7     4     0     0
# 3       B       1     0     6     5     0     0
# 4       B       2     0     5     7     0     0