Home > Software design >  Sub in R pattern
Sub in R pattern

Time:10-02

I am trying to work out how `sub` works in R. I was trying to split each element of a list into two parts. For instance, I have this working for list_2, where var1 = 0_300 and var2 = minus5.

    # Example input: every element has the shape "area_<num>_<num>_<suffix>",
    # where <suffix> is either "minusN" or a plain digit.
    list_2 <- c(
      "area_0_300_minus5",
      "area_0_300_minus4",
      "area_0_300_minus3",
      "area_0_300_minus2",
      "area_0_300_minus1",
      "area_0_300_0",
      "area_0_300_1",
      "area_0_300_2",
      "area_0_300_3",
      "area_0_300_4",
      "area_0_300_5",
      "area_300_600_minus5",
      "area_300_600_minus4",
      "area_300_600_minus3",
      "area_300_600_minus2",
      "area_300_600_minus1",
      "area_300_600_0",
      "area_300_600_1",
      "area_300_600_2",
      "area_300_600_3",
      "area_300_600_4",
      "area_300_600_5",
      "area_600_800_minus5",
      "area_600_800_minus4",
      "area_600_800_minus3",
      "area_600_800_minus2",
      "area_600_800_minus1",
      "area_600_800_0",
      "area_600_800_1",
      "area_600_800_2",
      "area_600_800_3",
      "area_600_800_4",
      "area_600_800_5"
    )

    # var1: keep only the "<num>_<num>" part. The capture group (\\d+_\\d+)
    # must be preceded by "_" and followed by "_", and the whole string is
    # replaced by that group ("\\1"), e.g. "area_0_300_minus5" -> "0_300".
    var1_working <- sub("^.*_(\\d+_\\d+)_.*$", "\\1", list_2)

    # var2: the leading ".*" is greedy, so "_" here is the LAST underscore and
    # the group captures everything after it, e.g. "minus5" or "0".
    var2_working <- sub("^.*_(.*)$", "\\1", list_2)

    

But for list_1, I can't seem to extract 0_300 etc. as var1 and 'm5' / 'm4' / 'm3' / 'm2' / 'm1' / '0' / '1' etc. as var2.

list_1 <- c(
    "as.factor(radius_ring)0_300:as.factor(year_delta)0:units",
    "as.factor(radius_ring)300_600:as.factor(year_delta)0:units",
    "as.factor(radius_ring)600_800:as.factor(year_delta)0:units",
    "as.factor(radius_ring)800_1000:as.factor(year_delta)0:units",
    "as.factor(radius_ring)0_300:as.factor(year_delta)1:units",
    "as.factor(radius_ring)300_600:as.factor(year_delta)1:units",
    "as.factor(radius_ring)600_800:as.factor(year_delta)1:units",
    "as.factor(radius_ring)800_1000:as.factor(year_delta)1:units",
    "as.factor(radius_ring)0_300:as.factor(year_delta)2:units",
    "as.factor(radius_ring)300_600:as.factor(year_delta)2:units",
    "as.factor(radius_ring)600_800:as.factor(year_delta)2:units",
    "as.factor(radius_ring)800_1000:as.factor(year_delta)2:units",
    "as.factor(radius_ring)0_300:as.factor(year_delta)3:units",
    "as.factor(radius_ring)300_600:as.factor(year_delta)3:units",
    "as.factor(radius_ring)600_800:as.factor(year_delta)3:units",
    "as.factor(radius_ring)800_1000:as.factor(year_delta)3:units",
    "as.factor(radius_ring)0_300:as.factor(year_delta)4:units",
    "as.factor(radius_ring)300_600:as.factor(year_delta)4:units",
    "as.factor(radius_ring)600_800:as.factor(year_delta)4:units",
    "as.factor(radius_ring)800_1000:as.factor(year_delta)4:units",
    "as.factor(radius_ring)0_300:as.factor(year_delta)5:units",
    "as.factor(radius_ring)300_600:as.factor(year_delta)5:units",
    "as.factor(radius_ring)600_800:as.factor(year_delta)5:units",
    "as.factor(radius_ring)800_1000:as.factor(year_delta)5:units",
    "as.factor(radius_ring)0_300:as.factor(year_delta)m1:units",
    "as.factor(radius_ring)300_600:as.factor(year_delta)m1:units",
    "as.factor(radius_ring)600_800:as.factor(year_delta)m1:units",
    "as.factor(radius_ring)800_1000:as.factor(year_delta)m1:units",
    "as.factor(radius_ring)0_300:as.factor(year_delta)m2:units",
    "as.factor(radius_ring)300_600:as.factor(year_delta)m2:units",
    "as.factor(radius_ring)600_800:as.factor(year_delta)m2:units",
    "as.factor(radius_ring)800_1000:as.factor(year_delta)m2:units",
    "as.factor(radius_ring)0_300:as.factor(year_delta)m3:units",
    "as.factor(radius_ring)300_600:as.factor(year_delta)m3:units",
    "as.factor(radius_ring)600_800:as.factor(year_delta)m3:units",
    "as.factor(radius_ring)800_1000:as.factor(year_delta)m3:units",
    "as.factor(radius_ring)0_300:as.factor(year_delta)m4:units",
    "as.factor(radius_ring)300_600:as.factor(year_delta)m4:units",
    "as.factor(radius_ring)600_800:as.factor(year_delta)m4:units",
    "as.factor(radius_ring)800_1000:as.factor(year_delta)m4:units",
    "as.factor(radius_ring)0_300:as.factor(year_delta)m5:units",
    "as.factor(radius_ring)300_600:as.factor(year_delta)m5:units",
    "as.factor(radius_ring)600_800:as.factor(year_delta)m5:units",
    "as.factor(radius_ring)800_1000:as.factor(year_delta)m5:units")

    # The two patterns below are the ones that work for "area_<num>_<num>_<x>"
    # strings; they are kept as-is to show why they fail on list_1.

    # Needs "_<digits>_<digits>_" somewhere in the string. In list_1 the digit
    # pair is preceded by ")" and followed by ":", so nothing matches and sub()
    # returns every element unchanged.
    var1_nonworking <- sub("^.*_(\\d+_\\d+)_.*$", "\\1", list_1)

    # Captures everything after the LAST underscore, which here is the one
    # inside "year_delta", giving e.g. "delta)0:units" rather than "0".
    var2_nonworking <- sub("^.*_(.*)$", "\\1", list_1)
    

I am actually just a bit unsure how the pattern extraction works: "^.*_(\\d+_\\d+)_.*$", "\\1" means too little to me to be able to adapt it to list_1.

Hope this makes sense

CodePudding user response:

Two approaches:

  1. `strcapture` returns a data frame, one column per capture group.

    # Two capture groups, each taking the run of non-":" characters that
    # follows a ")"; proto names the resulting columns and fixes their type
    # (character). Only rows 1-3 and 42-44 are shown.
    strcapture(
      pattern = ".*\\)([^:]*).*\\)([^:]*):.*",
      x = list_1,
      proto = list(var1 = "", var2 = "")
    )[c(1:3, 42:44), ]
    #        var1 var2
    # 1     0_300    0
    # 2   300_600    0
    # 3   600_800    0
    # 42  300_600   m5
    # 43  600_800   m5
    # 44 800_1000   m5
    
  2. `gregexpr` (with `regmatches`) extracts zero or more matches per line of text.

    # Lookbehind "(?<=\\))" and lookahead "(?=:)" need PCRE, hence perl = TRUE.
    # Each match is a run of non-":" characters sitting between ")" and ":".
    gre <- gregexpr(
      pattern = "(?<=\\))([^:]*)(?=:)",
      text = list_1,
      perl = TRUE
    )
    # Pull out the matched substrings; show elements 1-3 and 42-44 only.
    regmatches(x = list_1, m = gre)[c(1:3, 42:44)]
    # [[1]]
    # [1] "0_300" "0"    
    # [[2]]
    # [1] "300_600" "0"      
    # [[3]]
    # [1] "600_800" "0"      
    # [[4]]
    # [1] "300_600" "m5"     
    # [[5]]
    # [1] "600_800" "m5"     
    # [[6]]
    # [1] "800_1000" "m5"      
    

CodePudding user response:

Using the input shown in the Note at the end (shortened to avoid excessive length), replace the last underscore with a space and then read the result using read.table. With this approach only relatively simple regular expressions are needed.

# "(.*)_" is greedy, so the "_" it consumes is the last underscore in each
# string; swapping it for a space yields two whitespace-separated fields
# that read.table() parses into columns V1 and V2.
read.table(
  text = sub(pattern = "(.*)_", replacement = "\\1 ", x = list_2)
)
##           V1     V2
## 1 area_0_300 minus5
## 2 area_0_300 minus4
## 3 area_0_300 minus3
## 4 area_0_300 minus2
## 5 area_0_300 minus1
## 6 area_0_300      0

Similarly, for list_1, replace all occurrences of : and ) with a space, read it in and extract columns 2 and 4.

# Turning every ":" and ")" into a space splits each string into five
# whitespace-separated fields; fields 2 and 4 hold the wanted values.
read.table(
  text = gsub(pattern = "[:)]", replacement = " ", x = list_1)
)[c(2, 4)]
##         V2 V4
## 1    0_300  0
## 2  300_600  0
## 3  600_800  0
## 4 800_1000  0
## 5    0_300  1
## 6  300_600  1

Note

# Shortened inputs: six elements per vector.
list_2 <- c(
  "area_0_300_minus5",
  "area_0_300_minus4",
  "area_0_300_minus3",
  "area_0_300_minus2",
  "area_0_300_minus1",
  "area_0_300_0"
)

list_1 <- c(
  "as.factor(radius_ring)0_300:as.factor(year_delta)0:units",
  "as.factor(radius_ring)300_600:as.factor(year_delta)0:units",
  "as.factor(radius_ring)600_800:as.factor(year_delta)0:units",
  "as.factor(radius_ring)800_1000:as.factor(year_delta)0:units",
  "as.factor(radius_ring)0_300:as.factor(year_delta)1:units",
  "as.factor(radius_ring)300_600:as.factor(year_delta)1:units"
)
  •  Tags:  
  • r
  • Related