I have a tibble with 27 columns of numeric values. I want to count the number of digits after the decimal point; specifically, I want to know the maximum number of decimal places that occurs in each column.

I tried converting the numeric values to strings, extracting the substring after the decimal point, counting the characters in that substring, and then taking the maximum.

# Find the maximum number of characters after the decimal point in `value_1`.
# Only `value_1` is analysed; the helper columns init, init2 and init3 are
# added per row, and init4 holds the overall maximum on every row.
afterdecimal_val1 <- data %>%
  mutate(
    # Replace every numeric column with its character representation.
    across(where(is.numeric), as.character),
  ) %>%
  # Work row by row so the split result can be indexed for a single value.
  rowwise() %>%
  mutate(
   # Split on a literal "." (a character class, so "." is not "any char").
   init = str_split(value_1,"[.]"),
   # Second piece = text after the decimal point; presumably NA when the
   # value contains no "." -- TODO confirm.
   init2 = init[2],
   # Number of characters in that piece.
   init3 = nchar(init2)
  ) %>%
  ungroup() %>%
  # Maximum over all rows; na.rm = TRUE skips rows with a missing count.
  mutate(init4 = max(init3, na.rm=TRUE))

This works, but only for the column "value_1", and I am pretty sure it is not the most elegant way to do it.

Do you know a better way? Or can you help me improve the code so that it works for all 27 variables and not only "value_1"?

I really like answers using dplyr!

Thanks for your help!

CodePudding user response：

Does this work:

df1 <- data.frame(
  c1 = c(1.11, 1.121, 1.1212),
  c2 = c(1.1, 1.121, 1.121),
  c3 = c(1.1111, 1.121, 1.12111)
)

# Count the characters that follow the decimal point in each value of `x`.
# Everything up to and including the first "." is removed from the character
# representation, so a value without a "." yields 0.
# Returns an integer vector the same length as `x`.
count_decimals <- function(x) {
  nchar(sub("^[^.]*\\.?", "", as.character(x)))
}

# Maximum number of decimals in each column => named integer vector.
# vapply() iterates over the columns directly instead of coercing the
# data frame to a matrix as apply() would.
vapply(df1, function(col) max(count_decimals(col)), integer(1))
c1 c2 c3 
 4  3  5

CodePudding user response：

dplyr solution, untested due to lack of an MRE (minimal reproducible example):

library(dplyr)
library(tidyr)

# Reshape all "value*" columns to long form, count the characters left after
# stripping everything up to the last ".", take the per-variable maximum and
# spread the result back to one column per variable.
data %>%
  pivot_longer(
    cols = starts_with("value"),
    names_to = "variable",
    values_to = "value"
  ) %>%
  group_by(variable) %>%
  summarise(
    max_N_digits = max(nchar(gsub(".*\\.", "", as.character(value))))
  ) %>%
  pivot_wider(names_from = variable, values_from = max_N_digits)

Edit

This should also work for numbers without decimal point (i.e. give 0):

data1 <- data.frame(
  value1 = c(1.11, 1.121, 1.1212),
  value2 = c(6666, 5, 5),
  value3 = c(1.1111, 1.121, 1.12111)
)
library(dplyr)
library(tidyr)

# Maximum number of digits after the decimal point for every "value*" column,
# returned as a one-row tibble with one column per variable.
data1 %>%
  pivot_longer(
    starts_with("value"),
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(
    # Remove either everything up to the last "." or, when the string has no
    # "." at all, the whole string -- so whole numbers count as 0 decimals.
    N_digits = nchar(gsub(".*\\.|^[^.]+$", "", as.character(value)))
  ) %>%
  group_by(variable) %>%
  summarise(max_N_digits = max(N_digits)) %>%
  pivot_wider(names_from = variable, values_from = max_N_digits)

Returns:

  value1 value2 value3
   <int>  <int>  <int>
1      4      0      5

CodePudding user response：

Tidyverse solutions:

library(tidyverse)
# Option 1 using `purrr::`:
# For every double column, split the character representation of each value
# on the literal "." and take the length of the piece after it; the column
# result is the maximum of those lengths.
df1 %>%
  summarise(
    across(
      where(is.double),
      ~ max(
        str_length(
          map_chr(
            str_split(as.character(.x), "\\."),
            # A value without a "." splits into a single piece; treat it as
            # having no decimals instead of failing on parts[[2]].
            function(parts) if (length(parts) >= 2L) parts[[2]] else ""
          )
        )
      )
    )
  )

# Option 2 using regex:
# For every double column, strip the integer part and the decimal point from
# each value's character representation and take the maximum remaining
# length.
df1 %>%
  summarise(
    across(
      where(is.double),
      function(x) {
        max(
          str_length(
            # Drop everything up to and including the first "."; a value
            # without a "." is emptied entirely and counts as 0 decimals.
            str_replace(as.character(x), "^[^.]*\\.?", "")
          )
        )
      }
    )
  )

Base solution:

# Option 1:
# Resolve the names of the double vectors: double_vecs => character vector
double_vecs <- names(df1)[vapply(df1, is.double, logical(1))]

# Calculate the max number of decimal places in each column:
# res => named integer vector
res <- vapply(
  # Single-bracket list indexing keeps a data frame even when only one
  # column is selected (df1[, double_vecs] would drop to a plain vector).
  df1[double_vecs],
  function(x) {
    # Drop everything up to and including the first "."; values without a
    # "." are emptied entirely and count as 0 decimals.
    max(nchar(sub("^[^.]*\\.?", "", as.character(x))))
  },
  integer(1)
)

# Option 2:
# Resolve the index of double vectors: col_idx => logical vector
col_idx <- vapply(df1, is.double, logical(1))

# Matrix holding values representing the number of characters
# after a decimal point (rows = observations, columns = variables):
# len_mat => integer matrix
len_mat <- do.call(
  cbind,
  lapply(
    # Single-bracket list indexing keeps a data frame even when only one
    # column is selected.
    df1[col_idx],
    function(x) {
      # Drop everything up to and including the first "."; values without
      # a "." are emptied entirely and count as 0 decimals.
      nchar(sub("^[^.]*\\.?", "", as.character(x)))
    }
  )
)

# Get the maximum of each column: res => named integer vector
res <- apply(len_mat, 2, max)