How to include dotted words in Elasticsearch analyzer?

125 Views Asked by At

I am using this analyzer

"settings": {
    "analysis": {
        "char_filter": {
            "my_char_filter": {
                "type": "mapping",
                "mappings": [
                    "- => _",
                ]
            },
            "quote_filter": {
                "type": "mapping",
                "mappings": [
                    "\\u0091=>\\u0020",
                    "\\u0092=>\\u0020",
                ]
            }
        },
        "analyzer": {
            "my_analyzer": {
                "tokenizer": "standard",
                "char_filter": [
                    "my_char_filter", "quote_filter"
                ],
                "filter": [
                    "lowercase",
                ]
            }
        }
    }
}

within this mapping:

"mappings": {
    "properties": {
        "title": {
            "type": "text",
            "analyzer": "my_analyzer",
            "term_vector": "with_positions_offsets",
        },
        "description": {
            "type": "text",
            "analyzer": "my_analyzer",
            "term_vector": "with_positions_offsets",
            "fielddata": True
        },
    }
}

and everything works with simple keywords.

So, if I use this query

{
    "query": {
        "bool": {
            "must": [
                {
                    "query_string": {
                        "query": "\".net\" OR \".com\"",
                        "fields": ["title", "description"]
                    }
                }
            ]
        }
    },
    "highlight": {
        "pre_tags": ["<match>"],
        "post_tags": ["</match>"],
        "fields": {
            "title": {
                "type": "fvh",
                "number_of_fragments": 0
            },
            "description": {
                "type": "fvh",
                "number_of_fragments": 0
            }
        }
    }
}

to search for ".com" in the following description: "Google.com is an American multinational technology company (COM) that focuses on artificial intelligence, search engine technology, online advertising, cloud computing and computer software", it only matches "COM" (inside the parentheses) instead of ".com".

How can I solve this issue?

EDIT: I have found that this query:

"query_string" : {
    "query" : ".com OR .net OR Engine OR American" # by removing '\"'
    "fields": ["title","description"],
}

works partially: it matches "Engine" and "American", but I can't tell whether it matched ".com" or ".net" (a human eye obviously could), because the query response gives me:

matched_keywords: {'Engine', 'American', 'Google.com'}

So, how can I get something like

matched_keywords: {'Engine', 'American', '*.com'} 

?

1

There is 1 best solution below

3
rabbitbr On

This is because the closest token you have is "google.com". In your case, a wildcard query could solve it, but you would lose performance.

{
  "wildcard": {
    "description": {
      "value": "*.com"
    }
  }
}