I have a vector `tissue` which contains strings delimited by multiple characters. The constituent strings of the vector belong broadly to four categories:
1. Strings which consist only of term(s) (e.g. `Thymus`, `Thyroid`) separated by `,`
2. Strings which contain identifier(s) (e.g. `ECO:0000313|RefSeq:XP_014046664.1`) ending with `},` followed by term(s) separated by `,`
3. Strings which contain a term followed by identifier(s)
4. Strings which contain a term followed by an identifier and then term(s) separated by `,`
# Example input: one string per record, covering all four categories.
tissue <- c(
  "Head kidney,Thymus,Thyroid,",
  "Red blood cell,",
  "ECO:0000313|RefSeq:XP_014046664.1},Muscle,",
  "ECO:0000313|RefSeq:XP_016683349.1},ECO:0000313|RefSeq:XP_016683354.1},Leaf,",
  "ECO:0000313|RefSeq:XP_014023833.1},Head kidney,Muscle,White muscle,",
  "Blood,ECO:0000313|RefSeq:XP_017326031.1},",
  "Spleen,ECO:0000313|RefSeq:XP_010844217.1},ECO:0000313|RefSeq:XP_010844218.1},",
  "Brain,ECO:0000313|RefSeq:XP_014030244.1},Head kidney,Muscle,Spleen,White muscle,"
)
For strings belonging to category 1, I can split the terms using a simple `strsplit()` call:
unlist(strsplit("Head kidney,Thymus,Thyroid,", ","))
[1] "Head kidney" "Thymus" "Thyroid"
unlist(strsplit("Red blood cell,", ","))
[1] "Red blood cell"
For strings belonging to category 2, this is what I came up with and it works fine
unlist(strsplit(sub('.*\\},', "", "ECO:0000313|RefSeq:XP_014046664.1},Muscle,"), ","))
[1] "Muscle"
unlist(strsplit(sub('.*\\},', "", "ECO:0000313|RefSeq:XP_016683349.1},ECO:0000313|RefSeq:XP_016683354.1},Leaf,"), ","))
[1] "Leaf"
unlist(strsplit(sub('.*\\},', "", "ECO:0000313|RefSeq:XP_014023833.1},Head kidney,Muscle,White muscle,"), ","))
[1] "Head kidney" "Muscle" "White muscle"
For strings belonging to category 3, this worked for me
sub(',ECO:.*', "", "Blood,ECO:0000313|RefSeq:XP_017326031.1},")
[1] "Blood"
sub(',ECO:.*', "", "Spleen,ECO:0000313|RefSeq:XP_010844217.1},ECO:0000313|RefSeq:XP_010844218.1},")
[1] "Spleen"
For category 4, this is what I tried and it works fine
unlist(strsplit(sub(',ECO:.*},', ",", "Brain,ECO:0000313|RefSeq:XP_014030244.1},Head kidney,Muscle,Spleen,White muscle,"), ","))
[1] "Brain" "Head kidney" "Muscle" "Spleen" "White muscle"
I'm looking for a solution, a single regex if possible, which can handle all these conditions and can be used directly on the vector.
CodePudding user response:
We may remove some of the substring and then use strsplit
library(stringr)
# Strip every "ECO:...}" identifier, split what is left on commas, and drop
# the empty pieces produced by leading, trailing or doubled commas.
# The "+" after the character class is essential: it lets the pattern consume
# the whole identifier up to its closing brace. Without it nothing matches.
lapply(
  strsplit(str_remove_all(tissue, "ECO:[^\\}]+\\}"), ","),
  function(x) x[nzchar(x)]
)
-output
[[1]]
[1] "Head kidney" "Thymus" "Thyroid"
[[2]]
[1] "Red blood cell"
[[3]]
[1] "Muscle"
[[4]]
[1] "Leaf"
[[5]]
[1] "Head kidney" "Muscle" "White muscle"
[[6]]
[1] "Blood"
[[7]]
[1] "Spleen"
[[8]]
[1] "Brain" "Head kidney" "Muscle" "Spleen" "White muscle"
Or with a tidyverse work flow
library(dplyr)
library(tidyr)
# Needs stringr as well (str_remove_all, str_replace_all, str_c, str_count).
# Steps: remove the "ECO:...}" identifiers, trim leading/trailing commas,
# collapse runs of commas, then spread the terms over columns V1..Vn.
str_remove_all(tissue, "ECO:[^\\}]+\\}") %>%
  # trimws() appends "+" to `whitespace` itself, so a single "," strips any
  # run of commas at either end of the string.
  trimws(whitespace = ",") %>%
  str_replace_all(',{2,}', ",") %>%
  tibble(col1 = .) %>%
  tidyr::separate(
    col1,
    # The widest row has (number of commas + 1) terms.
    into = str_c('V', seq_len(max(str_count(.$col1, ",")) + 1)),
    sep = ",",
    fill = "right"
  )
-output
# A tibble: 8 × 5
V1 V2 V3 V4 V5
<chr> <chr> <chr> <chr> <chr>
1 Head kidney Thymus Thyroid <NA> <NA>
2 Red blood cell <NA> <NA> <NA> <NA>
3 Muscle <NA> <NA> <NA> <NA>
4 Leaf <NA> <NA> <NA> <NA>
5 Head kidney Muscle White muscle <NA> <NA>
6 Blood <NA> <NA> <NA> <NA>
7 Spleen <NA> <NA> <NA> <NA>
8 Brain Head kidney Muscle Spleen White muscle
Or using only base R
# Remove the "ECO:...}" identifiers, trim leading/trailing commas (trimws()
# appends "+" to `whitespace` itself) and collapse runs of commas.
cleaned <- gsub(
  ",{2,}", ",",
  trimws(gsub("ECO:[^\\}]+\\}", "", tissue), whitespace = ",")
)
# read.csv() infers the column count from the first five lines only, so a
# wider row further down would wrap onto extra rows. Pass explicit column
# names sized to the widest row instead.
n_col <- max(lengths(strsplit(cleaned, ",", fixed = TRUE)), 1L)
read.csv(
  text = cleaned, header = FALSE, fill = TRUE, sep = ",",
  col.names = paste0("V", seq_len(n_col))
)
CodePudding user response:
How about:
library(stringr)
# Lazily grab every run of characters that sits directly before a comma.
pieces <- unlist(str_match_all(tissue, '(.*?)(?=\\,)'))
# Blank out the identifier pieces, which all start with "ECO".
x <- str_remove(pieces, '^ECO.*')
# Keep each remaining non-empty term once, in order of first appearance.
unique(x[x != ""])
[1] "Head kidney" "Thymus" "Thyroid" "Red blood cell"
[5] "Muscle" "Leaf" "White muscle" "Blood"
[9] "Spleen" "Brain"