I have created a dictionary in quanteda and prepared my corpus etc. I have wordclouds and textstat_frequency output, so I can see that the text data I'm trying to match is there.
However, when I build the dfm I get some hits, but with dfm_lookup I repeatedly get 100% sparsity. What am I doing wrong?
doc <- data.frame(symptoms = c("patient trouble breathing", "breathlessness", "heart palpitations",
"elevated troponin", "chest tightness, pericarditis", "patient drowsy, headaches,
low bloood pressure", "tummy pain, vomiting", "heart attack", "edema, achypnea"))
# create corpus from main text
corp <- corpus(doc, text_field = 'symptoms')
# preprocessing: tokenise, lowercase, remove stopwords and punctuation
corp2 <- corp %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(stopwords('en'))
# tokens_wordstem() # stemming won't suit our dictionary, e.g. "acute" becomes "acut"; might re-appraise
corp2
dtm <- dfm(corp2)
# construct lexicon in a dataframe
terms <- data.frame(
  keywords = c("myocarditis", "pericarditis", "heart", "heart-attack", "chest-pain",
               "tightness", "tummy", "irritability", "vomiting", "tachypnea",
               "achypnea", "tachypnoea", "lethargy", "drowsiness", "drowsy"),
  topic = c(rep("cardiac", 5), rep("non_specific", 7), rep("children", 3))
)
# convert to dictionary
lex <- split(terms$keywords, terms$topic)
lex <- dictionary(lex)
# tokenise and apply dict to prepared symptom corpus
head(dfm(tokens(corp2), dictionary = lex))
dtm <- dfm(tokens(corp2), dictionary = lex)
dtm_lu <- dfm_lookup(dtm, lex, valuetype = "glob") # this comes back 100% sparse, but the line above doesn't?
print(dtm_lu)
# show unmatched
dfm_lookup(dtm, lex, nomatch = "_UNMATCHED")
What is the difference between
dtm <- dfm(tokens(corp2), dictionary = lex)
and
dtm_lu <- dfm_lookup(dtm, lex, valuetype = "glob")
What is dfm() doing when you pass it both a tokens object and a dictionary? Is it just applying the lookup internally, and is that why some results come back?
CodePudding user response:
Use tokens_lookup() to apply the dictionary. dfm(x, dictionary = ...) is deprecated and should not be used. It is always better to use tokens_lookup(), because it matches multi-token values in the dictionary (although you had none here).
As to why you get 100% sparsity: dfm(tokens(corp2), dictionary = lex) applies the dictionary while building the dfm, so the features of your dtm are already the dictionary keys (cardiac, children, non_specific) rather than the original words. dfm_lookup(dtm, lex) then searches those keys for the dictionary values ("heart", "tummy", and so on), finds nothing, and returns an all-zero matrix. So the two calls do the same kind of lookup; your second call simply applies the dictionary a second time, to a dfm whose word features are already gone.
Also I simplified your code a bit.
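You can check this directly with featnames() (a minimal sketch, using the dtm and lex from your question; the keys come out alphabetically because split() sorts by the grouping values):
# the first dictionary pass leaves only the KEYS as features,
# so a second lookup for the VALUES has nothing left to match
featnames(dfm(tokens(corp2), dictionary = lex))
#> [1] "cardiac"      "children"     "non_specific"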
library("quanteda")
#> Package version: 3.2.3
#> Unicode version: 14.0
#> ICU version: 70.1
#> Parallel computing: 10 of 10 threads used.
#> See https://quanteda.io for tutorials and examples.
corp <- corpus(c("patient trouble breathing", "breathlessness", "heart palpitations",
"elevated troponin", "chest tightness, pericarditis", "patient drowsy, headaches,
low bloood pressure", "tummy pain, vomiting", "heart attack", "edema, achypnea"))
toks <- corp %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(stopwords("en"))
lex <- dictionary(list(
  cardiac = c("myocarditis", "pericarditis", "heart", "heart-attack", "chest-pain"),
  children = c("lethargy", "drowsiness", "drowsy"),
  non_specific = c("tightness", "tummy", "irritability", "vomiting",
                   "tachypnea", "achypnea", "tachypnoea")
))
Now the key difference, from a functional standpoint: apply the dictionary at the tokens stage, then construct the dfm.
dfmat <- toks %>%
tokens_lookup(dictionary = lex) %>%
dfm()
dfmat
#> Document-feature matrix of: 9 documents, 3 features (74.07% sparse) and 0 docvars.
#>        features
#> docs    cardiac children non_specific
#>   text1       0        0            0
#>   text2       0        0            0
#>   text3       1        0            0
#>   text4       0        0            0
#>   text5       1        0            1
#>   text6       0        1            0
#> [ reached max_ndoc ... 3 more documents ]
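For completeness, dfm_lookup() gives the same counts here if you apply it to the raw dfm, i.e. one whose features are still the original words; a minimal sketch (this route can only ever match single-token values, since a dfm no longer stores token sequences):
# apply the dictionary to the un-looked-up dfm instead of the tokens
dfm_lookup(dfm(toks), dictionary = lex)
But as soon as the dictionary gains a multi-word value (say, "heart attack"), only the tokens_lookup() route will match it.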