I have a dataset (`z`) with very long strings in `z$txt`. I also have a dictionary (`incd`) of keywords that need to be identified. In the column `z$inc.terms`, I need every keyword occurrence (the same keyword may be repeated n times in the same string, so I need this for each occurrence) together with the 5 characters before and after it (e.g. so I can see the "keyword in its context").
# Build the example dataset "z": a row id, a free-text column to search, and
# an empty placeholder column that will receive the keywords in context.
z <- data.frame(
  row = c(1, 2, 3),
  txt = c(
    "I like the sky when the sky is blu not when the sky is grey",
    "I like the mountains when the sky is blu not when the mountains are cloudy",
    "I like the sky when the sky is dark in the mountains"
  ),
  inc.terms = ""
)

# Inclusion dictionary: the keywords to look for in z$txt.
incd <- c("sky", "mountains")
This is what I have managed to achieve, but it only returns the first of the keywords and I need each of them (actually, this doesn't work here either, and I'm not sure why, but it works on my original data, which is more complex and cannot be shared for data-protection reasons).
# For every keyword and every row, append "<5 chars before> keyword <5 chars
# after>" to that row's inc.terms, separated by " /// ".
# Limitations of this approach:
#   * only columns 1 and 2 of the split are used, so only the FIRST occurrence
#     of each keyword in a row is captured;
#   * when a keyword is absent from a row the split has a single column, so
#     indexing column 2 fails with "subscript out of bounds".
for (keyword in incd) {
  for (row_id in z$row) {
    is_row <- z$row == row_id

    # Pieces of the text on either side of each keyword occurrence.
    pieces <- stringr::str_split(z$txt[is_row], keyword, simplify = TRUE)
    before <- stringr::str_sub(pieces[, 1], -5, -1)
    after <- stringr::str_sub(pieces[, 2], 1, 5)

    z$inc.terms[is_row] <- paste(
      z$inc.terms[is_row],
      paste(before, keyword, after),
      sep = " /// "
    )
  }
}
This is what I have been using, but it returns only the first occurrence of each keyword in each cell rather than every occurrence.
The result I would like in `z$inc.terms` is as follows:
z[1,3] " the sky when" /// " the sky is b" /// " the sky is g"
z[2,3] " the mountains when" /// " the sky is b" /// " the mountains are "
z[3,3] " the sky when" /// " the sky is d" /// " the mountains"
CodePudding user response:
Here's a tidy solution:
library(dplyr)
library(stringr)

# Example data: three sentences to search plus a placeholder result column.
z <- data.frame(matrix("", 3, 3))
names(z) <- c("row", "txt", "inc.terms")
z$row <- c(1, 2, 3)
z[1, 2] <- "I like the sky when the sky is blu not when the sky is grey"
z[2, 2] <- "I like the mountains when the sky is blu not when the mountains are cloudy"
z[3, 2] <- "I like the sky when the sky is dark in the mountains"
incd <- c("sky", "mountains")

# Alternation of all keywords ("sky|mountains"). Keywords are inserted into
# the regex verbatim, so escape any that contain regex metacharacters.
words <- paste(incd, collapse = "|")

# Extract every keyword occurrence with up to 5 characters of context on each
# side. `{0,5}` (rather than `{5}`) keeps keywords that sit fewer than 5
# characters from the start or end of the string, such as the trailing
# "mountains" in row 3. The result is a list column: one character vector
# of matches per row.
z <- z %>%
  mutate(inc.terms = str_extract_all(txt, paste0(".{0,5}(", words, ").{0,5}")))
z
#> row
#> 1 1
#> 2 2
#> 3 3
#> txt
#> 1 I like the sky when the sky is blu not when the sky is grey
#> 2 I like the mountains when the sky is blu not when the mountains are cloudy
#> 3 I like the sky when the sky is dark in the mountains
#> inc.terms
#> 1 the sky when, the sky is b, the sky is g
#> 2 the mountains when, the sky is b, the mountains are
#> 3 the sky when, the sky is d, the mountains
z$inc.terms
#> [[1]]
#> [1] " the sky when" " the sky is b" " the sky is g"
#>
#> [[2]]
#> [1] " the mountains when" " the sky is b" " the mountains are "
#>
#> [[3]]
#> [1] " the sky when" " the sky is d" " the mountains"
Created on 2022-05-06 by the reprex package (v2.0.1)
CodePudding user response:
You can try `regmatches` if you are using base R:
# Regex matching any keyword with up to 5 characters of context on each side.
keyword_pattern <- sprintf(".{0,5}(%s).{0,5}", paste(incd, collapse = "|"))

# Positions of every match in every text, then the matched substrings
# themselves (one character vector per row), stored as inc.terms.
match_positions <- gregexpr(keyword_pattern, z[["txt"]])
transform(z, inc.terms = regmatches(txt, match_positions))
which gives
row
1 1
2 2
3 3
txt
1 I like the sky when the sky is blu not when the sky is grey
2 I like the mountains when the sky is blu not when the mountains are cloudy
3 I like the sky when the sky is dark in the mountains
inc.terms
1 the sky when, the sky is b, the sky is g
2 the mountains when, the sky is b, the mountains are
3 the sky when, the sky is d, the mountains