I've got the following data frame:
# dput() output of the data frame in question: 3 rows, 15 columns.
# Nombre.HIC.HGVS is the column the two numbers must be extracted from.
# NOTE(review): "c.1725 38G>A" in Nombre.cDNA looks like it lost a "+"
# (i.e. "c.1725+38G>A") when the text was pasted -- confirm against the data.
structure(list(Nombre.HIC.HGVS = c("NC_000007.13:g.87178626C>T",
"NC_000007.13:g.87278760G>A", "NC_000012.11:g.22063115A>G"),
Gen = c("ABCB1", "ABCB1", "ABCB1"), Isoforma = c("NM_000927.4",
"NM_000927.4", "NM_000927.4"), Nombre.genómico = c("g.87178626C>T",
"g.87278760G>A", "g.87133993T>C"), Nombre.cDNA = c("c.1725 38G>A",
"c.-330-48366C>T", "c.3637-228A>G"), rs = c("rs2235013",
"rs10267099", "rs1186746"), Zona..Inicial..Final. = c("Intrón: 15 / -",
"Intrón: 1 / -", "Intrón: 28 / -"), Count.... = c("78,947%",
"10,526%", "7,895%"), Count..total. = c("(30 / 38)", "(4 / 38)",
"(3 / 38)"), Count.HiC.... = c("77,652%", "7,961%", "3,544%"
), Profundidad.de.lectura..DP. = c(310L, 406L, 27L), Profundidad.de.lectura.corregida.por.calidad..DP.QUAL. = c(403L,
283L, 20L), Frecuencia.del.alelo.alternativo.en.las.lecturas..FREQ.ALT. = c(50.62,
99.65, 40), Calidad.en.la.identificación.de.la.variante..Qual. = c(255L,
255L, 95L), Cigosidad..AF1. = c("Heterocigosis", "Homocigosis",
"Heterocigosis")), row.names = c(NA, 3L), class = "data.frame")
I need to extract, into two different columns, the numbers I have put between [] and highlighted in bold. So for example:
NC_000007.13:g.87278760G>A -> from here I want to take: NC_00000[7].13:g.[87278760]G>A
NC_000012.11:g.22063115A>G -> from here I want to take: NC_0000[12].11:g.[22063115]A>G
So basically, I want to keep the last digit before the first dot (or the last two digits before the first dot, as long as the second-to-last one is not a 0), and all the digits after "g."
I've been using the stringr package but these conditions are too many for me to get it right.
Any ideas?
Thanks!
CodePudding user response:
Here is a potential solution:
library(tidyverse)
# Example data: 3 variants (rows) x 15 columns, rebuilt from dput() output.
# One column per line; values are exactly those of the original dump.
df <- structure(
  list(
    Nombre.HIC.HGVS = c(
      "NC_000007.13:g.87178626C>T",
      "NC_000007.13:g.87278760G>A",
      "NC_000013.11:g.22063115A>G"
    ),
    Gen = c("ABCB1", "ABCB1", "ABCB1"),
    Isoforma = c("NM_000927.4", "NM_000927.4", "NM_000927.4"),
    Nombre.genómico = c("g.87178626C>T", "g.87278760G>A", "g.87133993T>C"),
    Nombre.cDNA = c("c.1725 38G>A", "c.-330-48366C>T", "c.3637-228A>G"),
    rs = c("rs2235013", "rs10267099", "rs1186746"),
    Zona..Inicial..Final. = c(
      "Intrón: 15 / -",
      "Intrón: 1 / -",
      "Intrón: 28 / -"
    ),
    Count.... = c("78,947%", "10,526%", "7,895%"),
    Count..total. = c("(30 / 38)", "(4 / 38)", "(3 / 38)"),
    Count.HiC.... = c("77,652%", "7,961%", "3,544%"),
    Profundidad.de.lectura..DP. = c(310L, 406L, 27L),
    Profundidad.de.lectura.corregida.por.calidad..DP.QUAL. = c(
      403L, 283L, 20L
    ),
    Frecuencia.del.alelo.alternativo.en.las.lecturas..FREQ.ALT. = c(
      50.62, 99.65, 40
    ),
    Calidad.en.la.identificación.de.la.variante..Qual. = c(255L, 255L, 95L),
    Cigosidad..AF1. = c("Heterocigosis", "Homocigosis", "Heterocigosis")
  ),
  row.names = c(NA, 3L),
  class = "data.frame"
)
# Pull the two numbers of interest out of each HGVS name, e.g.
# "NC_000013.11:g.22063115A>G" -> first = "13", second = "22063115".
#   first:  the first run of digits in the string ("000013"), with its
#           leading zeros stripped.
#   second: the run of digits that directly follows the literal "g." and is
#           itself followed by a letter (the reference allele).
# Both new columns are character vectors; the result is printed, not assigned.
df %>%
  select(Nombre.HIC.HGVS) %>%
  mutate(
    # "\\d+" needs the "+" quantifier: without it only one digit can match.
    first = str_remove(str_extract(Nombre.HIC.HGVS, "\\d+"), "^0+"),
    # The dot is escaped so the look-behind matches "g." literally rather
    # than "g" followed by any character.
    second = str_extract(Nombre.HIC.HGVS, "(?<=g\\.)\\d+(?=[[:alpha:]])")
  )
#> Nombre.HIC.HGVS first second
#> 1 NC_000007.13:g.87178626C>T 7 87178626
#> 2 NC_000007.13:g.87278760G>A 7 87278760
#> 3 NC_000013.11:g.22063115A>G 13 22063115
Created on 2022-03-03 by the reprex package (v2.0.1)