How to match uppercase words with an ngram tokenizer in Elasticsearch 7


I need to find the accounts temp123 and TEMP456 by searching for the word temp or TEMP.

Here is my index with an ngram tokenizer, plus some sample docs:

# index
PUT /demo
{
    "settings": {
        "index": {
            "max_ngram_diff": "20",
            "analysis": {
                "analyzer": {
                    "account_analyzer": {
                        "tokenizer": "account_tokenizer"
                    }
                },
                "tokenizer": {
                    "account_tokenizer": {
                        "token_chars": [
                            "letter",
                            "digit"
                        ],
                        "min_gram": "1",
                        "type": "ngram",
                        "max_gram": "15"
                    }
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "account": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                },
                "analyzer": "account_analyzer",
                "search_analyzer": "standard"
            }
        }
    }
}

# docs

PUT /demo/_doc/1
{
  "account": "temp123"
}

PUT /demo/_doc/2
{
  "account": "TEMP456"
}

With the following queries I expect to get both docs back, but I only get doc 1. It seems I cannot match the doc containing the uppercase word.

What should I do to get both docs back when searching for temp or TEMP?

POST /demo/_search/
{
  "query": {
      "bool": {
          "must": [
            {
              "match": {
                "account": {
                  "query": "temp",
                  "fuzziness": "AUTO"
                }
              }
             }
          ]
      }
  }
}


POST /demo/_search/
{
  "query": {
      "bool": {
          "must": [
            {
              "match": {
                "account": {
                  "query": "TEMP",
                  "fuzziness": "AUTO"
                }
              }
             }
          ]
      }
  }
}

CodePudding user response:

You can use the _analyze API to check the tokens that your analyzer generates:

GET demo/_analyze
{
  "analyzer": "account_analyzer",
  "text": ["TEMP123"]
}
"tokens" : [
    {
      "token" : "T",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "TE",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "TEM",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "TEMP",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "TEMP1",
      "start_offset" : 0,
      "end_offset" : 5,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "TEMP12",
      "start_offset" : 0,
      "end_offset" : 6,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "TEMP123",
      "start_offset" : 0,
      "end_offset" : 7,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "E",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "word",
      "position" : 7
    },
    {
      "token" : "EM",
      "start_offset" : 1,
      "end_offset" : 3,
      "type" : "word",
      "position" : 8
    },
    {
      "token" : "EMP",
      "start_offset" : 1,
      "end_offset" : 4,
      "type" : "word",
      "position" : 9
    },
    {
      "token" : "EMP1",
      "start_offset" : 1,
      "end_offset" : 5,
      "type" : "word",
      "position" : 10
    },
    {
      "token" : "EMP12",
      "start_offset" : 1,
      "end_offset" : 6,
      "type" : "word",
      "position" : 11
    },
    {
      "token" : "EMP123",
      "start_offset" : 1,
      "end_offset" : 7,
      "type" : "word",
      "position" : 12
    },
    {
      "token" : "M",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "word",
      "position" : 13
    },
    {
      "token" : "MP",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "word",
      "position" : 14
    },
    {
      "token" : "MP1",
      "start_offset" : 2,
      "end_offset" : 5,
      "type" : "word",
      "position" : 15
    },
    {
      "token" : "MP12",
      "start_offset" : 2,
      "end_offset" : 6,
      "type" : "word",
      "position" : 16
    },
    {
      "token" : "MP123",
      "start_offset" : 2,
      "end_offset" : 7,
      "type" : "word",
      "position" : 17
    },
    {
      "token" : "P",
      "start_offset" : 3,
      "end_offset" : 4,
      "type" : "word",
      "position" : 18
    },
    {
      "token" : "P1",
      "start_offset" : 3,
      "end_offset" : 5,
      "type" : "word",
      "position" : 19
    },
    {
      "token" : "P12",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "word",
      "position" : 20
    },
    {
      "token" : "P123",
      "start_offset" : 3,
      "end_offset" : 7,
      "type" : "word",
      "position" : 21
    },
    {
      "token" : "1",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "word",
      "position" : 22
    },
    {
      "token" : "12",
      "start_offset" : 4,
      "end_offset" : 6,
      "type" : "word",
      "position" : 23
    },
    {
      "token" : "123",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "word",
      "position" : 24
    },
    {
      "token" : "2",
      "start_offset" : 5,
      "end_offset" : 6,
      "type" : "word",
      "position" : 25
    },
    {
      "token" : "23",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "word",
      "position" : 26
    },
    {
      "token" : "3",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "word",
      "position" : 27
    }
  ]
}

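The ngrams are indexed with their original case, while the field's search_analyzer is standard, which lowercases the query term. You can see the mismatch by running the search analyzer over the query term itself (this request just illustrates the point, using the same index and term as the question):

GET demo/_analyze
{
  "analyzer": "standard",
  "text": ["TEMP"]
}

This produces the single lowercase token temp, which never matches the uppercase ngrams indexed for TEMP456.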
You need to add a lowercase token filter to your analyzer so that all of the generated tokens are lowercased. Analysis settings cannot be changed on an existing open index, so delete the index, recreate it with the updated settings, and reindex the documents:

# delete the old index, then recreate it with the lowercase filter
DELETE /demo

PUT /demo
{
  "settings": {
    "index": {
      "max_ngram_diff": "20",
      "analysis": {
        "analyzer": {
          "account_analyzer": {
            "tokenizer": "account_tokenizer",
            "filter": [                       ----> note
              "lowercase"
            ]
          }
        },
        "tokenizer": {
          "account_tokenizer": {
            "token_chars": [
              "letter",
              "digit"
            ],
            "min_gram": "1",
            "type": "ngram",
            "max_gram": "15"
          }
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "account": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        },
        "analyzer": "account_analyzer",
        "search_analyzer": "standard"
      }
    }
  }
}
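
After recreating the index, index the two sample documents again and re-run the search. Both documents should now match, since both the stored ngrams and the query term end up lowercased. A quick check of the whole flow, using the docs from the question and a simplified version of the query:

# reindex the sample docs
PUT /demo/_doc/1
{
  "account": "temp123"
}

PUT /demo/_doc/2
{
  "account": "TEMP456"
}

# this now returns both doc 1 and doc 2
POST /demo/_search
{
  "query": {
    "match": {
      "account": "TEMP"
    }
  }
}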