I have the following df, where two columns are labeled with the same name:
# Reproducible copy of the example data: the dput() call followed by its
# output. Every column is a character vector, and element 1 of each column
# holds the real header label. Columns X.9 and X.10 both carry the label
# "Nº pb cob. ? 15X", which is the duplicate the question is about.
dput(df_test)
structure(list(X = c("Gen", "ABCB1", "ABCG2", "CES1"), X.1 = c("Prioridad del gen",
"Candidato", "Candidato", "Candidato"), X.2 = c("Región codificante",
"2110", "1526", "3533"), X.3 = c("Categoría Reg. Codif.", "intron",
"intron", "intron"), X.4 = c("Alineamiento múltiple", "No", "No",
"No"), X.5 = c("Cromosoma", "7", "4", "16"), X.6 = c("Posición inicial",
"87153584", "89096060", "55855151"), X.7 = c("Posición final",
"87153585", "89096061", "55855151"), X.8 = c("Tamaño (pb)", "2",
"2", "1"), X.9 = c("Nº pb cob. ? 15X", "0", "1", "0"), X.10 = c("Nº pb cob. ? 15X",
"2", "1", "1"), X.11 = c("% pb cob. ? 15X", "0%", "50%", "0%"
), X.12 = c("Cobertura media", "3", "14,50", "0"), X.13 = c("Nº pb sin cubrir",
"0", "0", "1"), X.14 = c("Nº pb cob. [1-5]", "2", "0", "0"),
X.15 = c("Nº pb cob. [6-14]", "0", "1", "0"), X.16 = c("Nº pb cob. [15-29]",
"0", "1", "0"), X.17 = c("Nº pb cob. ? 30X", "0", "0", "0"
)), class = "data.frame", row.names = c(NA, -4L))
Because the first row is empty in the original file, the real header becomes part of the df instead of being used as the header. Hence, I use row_to_names to move up the row containing the names:
# Read the tab-delimited file, then promote the first data row to the column
# names with row_to_names(), because the real header is not on the first line
# of the file. TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned.
df1 <- read.delim("file", header = TRUE) %>%
  row_to_names(row_number = 1)
Now I need to rename the two columns labeled "Nº pb cob. ? 15X" as "Nº pb cob. ≥ 15X" and "Nº pb cob. ≤ 15X", respectively. I've tried:
clean_names() after row_to_names(), which didn't change anything.
rename_vars and rename_at didn't work either.
df1 <- rename_at(df1, 10, ~"Num pb cob. ≥ 15X")
Error in `combine_names()`:
! Can't rename duplicate variables to `{name}`.
Run `rlang::last_error()` to see where the error occurred.
Could anyone give me some advice?
Thanks!!
CodePudding user response:
You can do it manually using backticks:
library(tidyverse)
# One-column tibble whose name is non-syntactic (spaces, "?", "º"), so the
# name has to be wrapped in backticks wherever it is written in code.
row_ids <- seq_len(2)
df <- tibble(`Nº pb cob. ? 15X` = row_ids)
df
#> # A tibble: 2 x 1
#> `Nº pb cob. ? 15X`
#> <int>
#> 1 1
#> 2 2
# rename() takes new_name = old_name; both names are backtick-quoted.
df %>%
  rename(`Nº pb cob. ≤ 15X` = `Nº pb cob. ? 15X`)
#> # A tibble: 2 x 1
#> `Nº pb cob. ≤ 15X`
#> <int>
#> 1 1
#> 2 2
Created on 2022-02-22 by the reprex package (v2.0.0)
CodePudding user response:
I did not find a way to place those special characters in a data frame variable name, so I used a minor variation.
The idea was to create a function that cleans your data, that way you can apply this function to all your files.
library(stringr)
library(purrr)
# Example input: an all-character data frame with 4 rows and 18 columns whose
# first row holds the real header labels. Built with data.frame(); the
# explicit stringsAsFactors = FALSE keeps every column character.
test <- data.frame(
  X    = c("Gen", "ABCB1", "ABCG2", "CES1"),
  X.1  = c("Prioridad del gen", "Candidato", "Candidato", "Candidato"),
  X.2  = c("Región codificante", "2110", "1526", "3533"),
  X.3  = c("Categoría Reg. Codif.", "intron", "intron", "intron"),
  X.4  = c("Alineamiento múltiple", "No", "No", "No"),
  X.5  = c("Cromosoma", "7", "4", "16"),
  X.6  = c("Posición inicial", "87153584", "89096060", "55855151"),
  X.7  = c("Posición final", "87153585", "89096061", "55855151"),
  X.8  = c("Tamaño (pb)", "2", "2", "1"),
  X.9  = c("Nº pb cob. ? 15X", "0", "1", "0"),
  X.10 = c("Nº pb cob. ? 15X", "2", "1", "1"),
  X.11 = c("% pb cob. ? 15X", "0%", "50%", "0%"),
  X.12 = c("Cobertura media", "3", "14,50", "0"),
  X.13 = c("Nº pb sin cubrir", "0", "0", "1"),
  X.14 = c("Nº pb cob. [1-5]", "2", "0", "0"),
  X.15 = c("Nº pb cob. [6-14]", "0", "1", "0"),
  X.16 = c("Nº pb cob. [15-29]", "0", "1", "0"),
  X.17 = c("Nº pb cob. ? 30X", "0", "0", "0"),
  stringsAsFactors = FALSE
)
# Function to clean the names as you need
# Promote the first row of `df` to column names and disambiguate a header
# label that appears more than once.
#
# The k-th column whose header equals `target` has its first "?" replaced by
# `symbols[k]`. With the defaults, the two "Nº pb cob. ? 15X" columns become
# "Nº pb cob. >= 15X" and "Nº pb cob. <= 15X", in column order.
#
# Arguments:
#   df       A data frame with at least one row; row 1 holds the real header.
#   target   Header label to disambiguate.
#   symbols  Replacements for "?", one per occurrence of `target`, in column
#            order. Any strings work as data frame names, including
#            c("\u2265", "\u2264"); those may display poorly in a non-UTF-8
#            locale, which is why the ASCII forms are the default.
#
# Returns `df` without its first row, named with the cleaned header. Row
# names are not renumbered. If `target` does not occur, the header is only
# promoted. Occurrences beyond length(symbols) are left unchanged, with a
# warning.
clean_df_names <- function(df,
                           target = "Nº pb cob. ? 15X",
                           symbols = c(">=", "<=")) {
  stopifnot(is.data.frame(df), nrow(df) >= 1L)

  # Convert column by column so factor columns yield labels, not codes;
  # drop = FALSE keeps a one-column input as a data frame.
  header <- vapply(
    df[1, , drop = FALSE], as.character, character(1),
    USE.NAMES = FALSE
  )

  hits <- which(header == target)
  if (length(hits) > length(symbols)) {
    warning(
      "Found ", length(hits), " columns named '", target, "' but only ",
      length(symbols), " replacement symbols; the extra columns keep ",
      "their name.",
      call. = FALSE
    )
  }

  # Pair occurrences with symbols positionally; a loop over seq_len() is a
  # no-op when there is nothing to replace.
  for (k in seq_len(min(length(hits), length(symbols)))) {
    # fixed = TRUE: "?" is matched literally, not as a regex quantifier.
    header[hits[k]] <- sub("?", symbols[k], header[hits[k]], fixed = TRUE)
  }

  setNames(df[-1, , drop = FALSE], header)
}
# Apply the cleaner to the example data (overwriting `test`) and inspect the
# result: the header row is gone and the two duplicated labels now differ.
test <- clean_df_names(test)
str(test)
#> 'data.frame': 3 obs. of 18 variables:
#> $ Gen : chr "ABCB1" "ABCG2" "CES1"
#> $ Prioridad del gen : chr "Candidato" "Candidato" "Candidato"
#> $ Región codificante : chr "2110" "1526" "3533"
#> $ Categoría Reg. Codif.: chr "intron" "intron" "intron"
#> $ Alineamiento múltiple: chr "No" "No" "No"
#> $ Cromosoma : chr "7" "4" "16"
#> $ Posición inicial : chr "87153584" "89096060" "55855151"
#> $ Posición final : chr "87153585" "89096061" "55855151"
#> $ Tamaño (pb) : chr "2" "2" "1"
#> $ Nº pb cob. >= 15X : chr "0" "1" "0"
#> $ Nº pb cob. <= 15X : chr "2" "1" "1"
#> $ % pb cob. ? 15X : chr "0%" "50%" "0%"
#> $ Cobertura media : chr "3" "14,50" "0"
#> $ Nº pb sin cubrir : chr "0" "0" "1"
#> $ Nº pb cob. [1-5] : chr "2" "0" "0"
#> $ Nº pb cob. [6-14] : chr "0" "1" "0"
#> $ Nº pb cob. [15-29] : chr "0" "1" "0"
#> $ Nº pb cob. ? 30X : chr "0" "0" "0"
Created on 2022-02-22 by the reprex package (v2.0.1)