Count column values that is in between (range) of two other column values-CodePudding

I have two dataframes (Droplets and Nucleus) with data from thousands ofobjects within images such as follow:

head(Droplets)
  class_name object_id centroid_y centroid_x
  <chr>          <dbl>      <dbl>      <dbl>
1 Droplet            1         47        621
2 Droplet            2        173        106
3 Droplet            3        158        949
4 Droplet            4        176        627
5 Droplet            5        619        154
6 Droplet            6        631       1361


 head(Nucleus)
  class_name object_id  area bbox_y_start bbox_x_start bbox_y_end bbox_x_end
  <chr>          <dbl> <dbl>        <dbl>        <dbl>      <dbl>      <dbl>
1 Nucleus            1  8973            0           95        102        213
2 Nucleus            2  1592            0          189         36        257
3 Nucleus            3  2980            0          256         43        348
4 Nucleus            4  4664            0          404         93        490
5 Nucleus            5  3973            0          486         79        560
6 Nucleus            6   737            0          564         16        635

Droplets are points that are inside a nucleus. All droplets are inside a nucleus, but a Nucleus could also have 0 droplets. I am trying to figure out a way to count how many droplets are inside a Nucleus based on their location. I believe Droplet is a point and Nucleus could be polygons. I read about point.in.polygon. I also tried to look if both centroid_y and centroid_x fall in the range of bbox. But I am not a R ninja and I am not sure how to start.The desired output would be something like this:

  class_name object_id Droplets_count
1    Nucleus         1              1
2    Nucleus         2              2
3    Nucleus         3              3
4    Nucleus         4              0
5    Nucleus         5              0
6    Nucleus         6              1

Is there any easy way to do it? Thanks!

CodePudding user response：

You could assign each droplet to a particular nucleus by checking its cetroid against the bounding box limits:

Droplets$Nucleus <- unlist(mapply(function(x, y) {
        result <- which(Nucleus$bbox_x_end >= x & 
                        Nucleus$bbox_x_start <= x & 
                        Nucleus$bbox_y_end >= y & 
                        Nucleus$bbox_y_start <= y)
        if(length(result) == 0) return(0)
        return(result[1])
       }, 
       x = Droplets$centroid_x, y = Droplets$centroid_y, SIMPLIFY = TRUE))

You can then count the number of droplets within each nucleus and assign that to a column in the Nucleus data frame like this:

Nucleus$Droplets <- sapply(seq(nrow(Nucleus)), function(i) {
  length(which(Droplets$Nucleus == i))})

Unfortunately, in the sample data you gave us, none of the droplets shown in Droplets fall within any of the bounding boxes in Nucleus. Therefore, I have modified the data frames a little bit to demonstrate this code in action:

Droplets
#>   class_name object_id centroid_y centroid_x
#> 1    Droplet         1         21        152
#> 2    Droplet         2          6        126
#> 3    Droplet         3         36        301
#> 4    Droplet         4         66        426
#> 5    Droplet         5          8        599
#> 6    Droplet         6         12        602

Nucleus
#>   class_name object_id area bbox_y_start bbox_x_start bbox_y_end bbox_x_end
#> 1    Nucleus         1 8973            0           95        102        213
#> 2    Nucleus         2 1592            0          189         36        257
#> 3    Nucleus         3 2980            0          256         43        348
#> 4    Nucleus         4 4664            0          404         93        490
#> 5    Nucleus         5 3973            0          486         79        560
#> 6    Nucleus         6  737            0          564         16        635

When we run the code above on these two data frames, they become:

Droplets
#>   class_name object_id centroid_y centroid_x Nucleus
#> 1    Droplet         1         21        152       1
#> 2    Droplet         2          6        126       1
#> 3    Droplet         3         36        301       3
#> 4    Droplet         4         66        426       4
#> 5    Droplet         5          8        599       6
#> 6    Droplet         6         12        602       6

Nucleus
#>   class_name object_id area bbox_y_start bbox_x_start bbox_y_end bbox_x_end Droplets
#> 1    Nucleus         1 8973            0           95        102        213        2
#> 2    Nucleus         2 1592            0          189         36        257        0
#> 3    Nucleus         3 2980            0          256         43        348        1
#> 4    Nucleus         4 4664            0          404         93        490        1
#> 5    Nucleus         5 3973            0          486         79        560        0
#> 6    Nucleus         6  737            0          564         16        635        2

Data used

Droplets <- structure(list(class_name = c("Droplet", "Droplet", "Droplet", 
                                          "Droplet", "Droplet", "Droplet"), 
                           object_id = 1:6, 
                           centroid_y = c(21L, 6L, 36L, 66L, 8L, 12L), 
                           centroid_x = c(152L, 126L, 301L, 426L, 599L, 602L)), 
                      class = "data.frame", row.names = c(NA, -6L))

Nucleus <- structure(list(class_name = c("Nucleus", "Nucleus", "Nucleus", 
                              "Nucleus", "Nucleus", "Nucleus"), 
                          object_id = 1:6, 
                          area = c(8973L, 1592L, 2980L, 4664L, 3973L, 737L), 
                          bbox_y_start = c(0L, 0L, 0L, 0L, 0L, 0L), 
                          bbox_x_start = c(95L, 189L, 256L, 404L, 486L, 564L), 
                          bbox_y_end = c(102L, 36L, 43L, 93L, 79L, 16L), 
                          bbox_x_end = c(213L, 257L, 348L, 490L, 560L, 635L)), 
                     class = "data.frame", row.names = c(NA, -6L))

CodePudding user response：

data.table approach

library(data.table)
# convert to data.table format using
#   setDT(Droplets); setDT(Nucleus)

# Perform non-equi left join
ans <- Droplets[Nucleus, on = .(centroid_y >= bbox_y_start,
                         centroid_y <= bbox_y_end,
                         centroid_x >= bbox_x_start,
                         centroid_x <= bbox_x_end)][]
# summarise
ans[, .(Droplets_count = uniqueN(object_id, na.rm = TRUE)), 
        by = .(Nucleus_id = i.object_id)]

   Nucleus_id Droplets_count
1:          1              2
2:          2              0
3:          3              1
4:          4              1
5:          5              0
6:          6              2

sample data used

library(data.table)
Droplets <- fread("class_name object_id centroid_y centroid_x
    Droplet         1         21        152
    Droplet         2          6        126
    Droplet         3         36        301
    Droplet         4         66        426
    Droplet         5          8        599
    Droplet         6         12        602")

Nucleus <- fread("class_name object_id area bbox_y_start bbox_x_start bbox_y_end bbox_x_end
    Nucleus         1 8973            0           95        102        213
    Nucleus         2 1592            0          189         36        257
    Nucleus         3 2980            0          256         43        348
    Nucleus         4 4664            0          404         93        490
    Nucleus         5 3973            0          486         79        560
    Nucleus         6  737            0          564         16        635")