I want to build a search API for my blog. I store all of the content in Elasticsearch in HTML format so that I can run full-text searches on it as fast as possible, but the HTML tags get in the way when searching inside my content. After a lot of searching I found an answer explaining how to ignore the tags while searching, but I can't filter them out so that they don't show up in the results. Is there any way to do this?
Currently I search and get the results with the following request:
POST /test/_search HTTP/1.1
Content-Type: application/json
Content-Length: 68

{
  "query": {
    "match": {
      "html": "more"
    }
  }
}
Response:
{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 1,
      "relation": "eq"
    },
    "max_score": 0.2876821,
    "hits": [
      {
        "_index": "test",
        "_type": "_doc",
        "_id": "1",
        "_score": 0.2876821,
        "_source": {
          "html": "<html><body><h1 style=\"font-family: Arial\">Test</h1> <span>More test</span></body></html>"
        }
      }
    ]
  }
}
But I would like to get something like this instead, with the HTML tags removed from the returned content:
{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 1,
      "relation": "eq"
    },
    "max_score": 0.2876821,
    "hits": [
      {
        "_index": "test",
        "_type": "_doc",
        "_id": "1",
        "_score": 0.2876821,
        "_source": {
          "html": "Test More test"
        }
      }
    ]
  }
}
CodePudding user response:
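You need to use the HTML strip character filter (`html_strip`) in an analyzer on your mapping; it removes the HTML elements from the text while it is being analyzed. Note that an analyzer only changes what is indexed, not the stored `_source`, so in the example below the stripped text is returned through a script field that reads the `html.raw` sub-field instead of through `_source`. I used this post to try to get close to your result.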
PUT idx_test
{
  "settings": {
    "analysis": {
      "filter": {
        "my_pattern_replace_filter": {
          "type": "pattern_replace",
          "pattern": "\n",
          "replacement": ""
        }
      },
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "standard",
          "filter": ["lowercase"],
          "char_filter": ["html_strip"]
        },
        "parsed_analyzer": {
          "type": "custom",
          "tokenizer": "keyword",
          "char_filter": ["html_strip"],
          "filter": ["my_pattern_replace_filter"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "html": {
        "type": "text",
        "analyzer": "my_analyzer",
        "fields": {
          "raw": {
            "type": "text",
            "fielddata": true,
            "analyzer": "parsed_analyzer"
          }
        }
      }
    }
  }
}
POST idx_test/_doc
{
  "html": """<html><body><h1 style="font-family: Arial">Test</h1> <span>More test</span></body></html>"""
}
GET idx_test/_search
{
  "script_fields": {
    "html_raw": {
      "script": "doc['html.raw']"
    }
  },
  "query": {
    "match": {
      "html": "more"
    }
  }
}
Results:
"hits": [
{
"_index": "idx_test",
"_id": "0b-UqoMBCzQxtx05B-WH",
"_score": 0.2876821,
"fields": {
"html_raw": [
"Test More test"
]
}
}
]