getting different patterns within the same string-CodePudding

I've got the following data frame:

# Example data (originally dput() output), bound to `df` so it can be used
# directly. The first cDNA name is an intronic offset and is written with
# "+" (c.1725+38G>A); the "+" had been lost and replaced by a space.
df <- structure(
  list(
    Nombre.HIC.HGVS = c(
      "NC_000007.13:g.87178626C>T",
      "NC_000007.13:g.87278760G>A",
      "NC_000012.11:g.22063115A>G"
    ),
    Gen = c("ABCB1", "ABCB1", "ABCB1"),
    Isoforma = c("NM_000927.4", "NM_000927.4", "NM_000927.4"),
    Nombre.genómico = c("g.87178626C>T", "g.87278760G>A", "g.87133993T>C"),
    Nombre.cDNA = c("c.1725+38G>A", "c.-330-48366C>T", "c.3637-228A>G"),
    rs = c("rs2235013", "rs10267099", "rs1186746"),
    Zona..Inicial..Final. = c(
      "Intrón: 15 / -", "Intrón: 1 / -", "Intrón: 28 / -"
    ),
    Count.... = c("78,947%", "10,526%", "7,895%"),
    Count..total. = c("(30 / 38)", "(4 / 38)", "(3 / 38)"),
    Count.HiC.... = c("77,652%", "7,961%", "3,544%"),
    Profundidad.de.lectura..DP. = c(310L, 406L, 27L),
    Profundidad.de.lectura.corregida.por.calidad..DP.QUAL. = c(
      403L, 283L, 20L
    ),
    Frecuencia.del.alelo.alternativo.en.las.lecturas..FREQ.ALT. = c(
      50.62, 99.65, 40
    ),
    Calidad.en.la.identificación.de.la.variante..Qual. = c(255L, 255L, 95L),
    Cigosidad..AF1. = c("Heterocigosis", "Homocigosis", "Heterocigosis")
  ),
  row.names = c(NA, 3L),
  class = "data.frame"
)

I need to extract, into two different columns, the numbers I have put between [] (and highlighted in bold). So for example:

NC_000007.13:g.87278760G>A ->> from here I want to take: NC_00000[7].13:g.[87278760]G>A
NC_000012.11:g.22063115A>G ->> from here I want to take: NC_0000[12].11:g.[22063115]A>G

So basically, I want to keep the last digit before the first dot (or the last 2 digits before the first dot, as long as the second to last is not a 0), and all the digits after "g."

I've been using the stringr package but these conditions are too many for me to get it right.

Any ideas?

Thanks!

CodePudding user response：

Here is a potential solution:

library(tidyverse)

# Example data (dput() output). The first cDNA name is an intronic offset
# and is written with "+" (c.1725+38G>A); the "+" had been lost and replaced
# by a space.
df <- structure(
  list(
    Nombre.HIC.HGVS = c(
      "NC_000007.13:g.87178626C>T",
      "NC_000007.13:g.87278760G>A",
      "NC_000013.11:g.22063115A>G"
    ),
    Gen = c("ABCB1", "ABCB1", "ABCB1"),
    Isoforma = c("NM_000927.4", "NM_000927.4", "NM_000927.4"),
    Nombre.genómico = c("g.87178626C>T", "g.87278760G>A", "g.87133993T>C"),
    Nombre.cDNA = c("c.1725+38G>A", "c.-330-48366C>T", "c.3637-228A>G"),
    rs = c("rs2235013", "rs10267099", "rs1186746"),
    Zona..Inicial..Final. = c(
      "Intrón: 15 / -", "Intrón: 1 / -", "Intrón: 28 / -"
    ),
    Count.... = c("78,947%", "10,526%", "7,895%"),
    Count..total. = c("(30 / 38)", "(4 / 38)", "(3 / 38)"),
    Count.HiC.... = c("77,652%", "7,961%", "3,544%"),
    Profundidad.de.lectura..DP. = c(310L, 406L, 27L),
    Profundidad.de.lectura.corregida.por.calidad..DP.QUAL. = c(
      403L, 283L, 20L
    ),
    Frecuencia.del.alelo.alternativo.en.las.lecturas..FREQ.ALT. = c(
      50.62, 99.65, 40
    ),
    Calidad.en.la.identificación.de.la.variante..Qual. = c(255L, 255L, 95L),
    Cigosidad..AF1. = c("Heterocigosis", "Homocigosis", "Heterocigosis")
  ),
  row.names = c(NA, 3L),
  class = "data.frame"
)

# Pull two pieces out of each HGVS genomic name:
#   first  - the first run of digits in the string (the accession number
#            after "NC_"), with its leading zeros stripped
#            ("000007" -> "7", "000013" -> "13");
#   second - the run of digits that directly follows a literal "g." and is
#            itself followed by a letter (the genomic position).
# Both columns are character vectors; a name that does not match yields NA.
extracted <- df %>%
  select(Nombre.HIC.HGVS) %>%
  mutate(
    # "\\d+" needs the "+" quantifier: without it only a single digit (or,
    # with a stray space in the pattern, nothing at all) would be matched.
    first = str_remove(str_extract(Nombre.HIC.HGVS, "\\d+"), "^0+"),
    # The dot is escaped so the look-behind matches "g." literally rather
    # than "g" followed by any character.
    second = str_extract(Nombre.HIC.HGVS, "(?<=g\\.)\\d+(?=[[:alpha:]])")
  )
extracted
#>              Nombre.HIC.HGVS first   second
#> 1 NC_000007.13:g.87178626C>T     7 87178626
#> 2 NC_000007.13:g.87278760G>A     7 87278760
#> 3 NC_000013.11:g.22063115A>G    13 22063115

Created on 2022-03-03 by the reprex package (v2.0.1)