I need to find the accounts temp123 and TEMP456 by searching for either temp or TEMP.
Here is my index, which uses an ngram tokenizer, along with some sample docs.
# index
PUT /demo
{
  "settings": {
    "index": {
      "max_ngram_diff": "20",
      "analysis": {
        "tokenizer": {
          "account_tokenizer": {
            "type": "ngram",
            "min_gram": "1",
            "max_gram": "15",
            "token_chars": ["letter", "digit"]
          }
        },
        "analyzer": {
          "account_analyzer": {
            "tokenizer": "account_tokenizer"
          }
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "account": {
        "type": "text",
        "analyzer": "account_analyzer",
        "search_analyzer": "standard",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      }
    }
  }
}
# docs
PUT /demo/_doc/1
{
  "account": "temp123"
}
PUT /demo/_doc/2
{
  "account": "TEMP456"
}
With the following queries, I expect to get both docs back, but I only get doc 1. It seems I cannot match the doc whose account is in uppercase.
What should I do to get both docs back when searching for either temp or TEMP?
POST /demo/_search/
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "account": { "query": "temp", "fuzziness": "AUTO" }
          }
        }
      ]
    }
  }
}
POST /demo/_search/
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "account": { "query": "TEMP", "fuzziness": "AUTO" }
          }
        }
      ]
    }
  }
}
CodePudding user response:
You can use _analyze to check the tokens that your analyzer is generating.
GET demo/_analyze
{
  "analyzer": "account_analyzer",
  "text": ["TEMP123"]
}
"tokens" : [
{
"token" : "T",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "TE",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "TEM",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "TEMP",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 3
},
{
"token" : "TEMP1",
"start_offset" : 0,
"end_offset" : 5,
"type" : "word",
"position" : 4
},
{
"token" : "TEMP12",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 5
},
{
"token" : "TEMP123",
"start_offset" : 0,
"end_offset" : 7,
"type" : "word",
"position" : 6
},
{
"token" : "E",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 7
},
{
"token" : "EM",
"start_offset" : 1,
"end_offset" : 3,
"type" : "word",
"position" : 8
},
{
"token" : "EMP",
"start_offset" : 1,
"end_offset" : 4,
"type" : "word",
"position" : 9
},
{
"token" : "EMP1",
"start_offset" : 1,
"end_offset" : 5,
"type" : "word",
"position" : 10
},
{
"token" : "EMP12",
"start_offset" : 1,
"end_offset" : 6,
"type" : "word",
"position" : 11
},
{
"token" : "EMP123",
"start_offset" : 1,
"end_offset" : 7,
"type" : "word",
"position" : 12
},
{
"token" : "M",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 13
},
{
"token" : "MP",
"start_offset" : 2,
"end_offset" : 4,
"type" : "word",
"position" : 14
},
{
"token" : "MP1",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 15
},
{
"token" : "MP12",
"start_offset" : 2,
"end_offset" : 6,
"type" : "word",
"position" : 16
},
{
"token" : "MP123",
"start_offset" : 2,
"end_offset" : 7,
"type" : "word",
"position" : 17
},
{
"token" : "P",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 18
},
{
"token" : "P1",
"start_offset" : 3,
"end_offset" : 5,
"type" : "word",
"position" : 19
},
{
"token" : "P12",
"start_offset" : 3,
"end_offset" : 6,
"type" : "word",
"position" : 20
},
{
"token" : "P123",
"start_offset" : 3,
"end_offset" : 7,
"type" : "word",
"position" : 21
},
{
"token" : "1",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 22
},
{
"token" : "12",
"start_offset" : 4,
"end_offset" : 6,
"type" : "word",
"position" : 23
},
{
"token" : "123",
"start_offset" : 4,
"end_offset" : 7,
"type" : "word",
"position" : 24
},
{
"token" : "2",
"start_offset" : 5,
"end_offset" : 6,
"type" : "word",
"position" : 25
},
{
"token" : "23",
"start_offset" : 5,
"end_offset" : 7,
"type" : "word",
"position" : 26
},
{
"token" : "3",
"start_offset" : 6,
"end_offset" : 7,
"type" : "word",
"position" : 27
}
]
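You need to add a lowercase token filter to your analyzer so that all generated tokens are lowercase. The standard search analyzer already lowercases the query text, so both temp and TEMP are searched as temp, which can never match the uppercase tokens indexed for TEMP456. Analyzer changes only affect documents indexed afterwards, so recreate the index with the settings below and index the docs again.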
{
  "settings": {
    "index": {
      "max_ngram_diff": "20",
      "analysis": {
        "analyzer": {
          "account_analyzer": {
            "tokenizer": "account_tokenizer",
            "filter": [
              "lowercase"
            ]
          }
        },
        "tokenizer": {
          "account_tokenizer": {
            "token_chars": [
              "letter",
              "digit"
            ],
            "min_gram": "1",
            "type": "ngram",
            "max_gram": "15"
          }
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "account": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        },
        "analyzer": "account_analyzer",
        "search_analyzer": "standard"
      }
    }
  }
}