Hashtag searches and Japanese full text search in elasticsearch

18 Views Asked by At

We are trying to support both hashtag searches and Japanese full-text search on our data. We were able to get each of them working separately, but when we combine the two configs into a single analyzer, it doesn't work as expected.

PS: Our data will be stored in multiple languages.

Below are the configs that we used.

Hashtag search:

    {
        "settings": {
            "index": {
                "analysis": {
                    "filter": {
                        "hashtag_filter": {
                            "type": "word_delimiter",
                            "type_table": [
                                "# => ALPHA"
                            ]
                        }
                    },
                    "analyzer": {
                        "hashtag_analyzer": {
                            "type": "custom",
                            "tokenizer": "whitespace",
                            "filter": [
                                "lowercase",
                                "hashtag_filter"
                            ]
                        }
                    }
                }
            }
        },
        "mappings": {
            "properties": {
                "caption": {
                    "type": "text",
                    "analyzer": "hashtag_analyzer"
                }
            }
        }
    }

Japanese full-text search (kuromoji):

    {
        "settings": {
            "index": {
                "analysis": {
                    "analyzer": {
                        "kuromoji_analyzer": {
                            "type": "custom",
                            "char_filter": [
                                "icu_normalizer"
                            ],
                            "tokenizer": "kuromoji_tokenizer",
                            "filter": [
                                "kuromoji_baseform",
                                "kuromoji_part_of_speech",
                                "cjk_width",
                                "ja_stop",
                                "kuromoji_stemmer",
                                "lowercase"
                            ]
                        }
                    }
                }
            }
        },
        "mappings": {
            "properties": {
                "caption": {
                    "type": "text",
                    "analyzer": "kuromoji_analyzer"
                }
            }
        }
    }

Here are the two configs combined into a single analyzer, which doesn't seem to work:

{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "kuromoji_hashtag_analyzer": {
            "type": "custom",
            "char_filter": ["icu_normalizer"],
            "tokenizer": "kuromoji_tokenizer",
            "filter": [
              "kuromoji_baseform",
              "kuromoji_part_of_speech",
              "cjk_width",
              "ja_stop",
              "kuromoji_stemmer",
              "lowercase",
              "hashtag_filter"
            ]
          }
        },
        "filter": {
          "hashtag_filter": {
            "type": "word_delimiter",
            "type_table": ["# => ALPHA"]
          }
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "caption": {
        "analyzer": "kuromoji_hashtag_analyzer",
        "type": "text"
      }
    }
  }
}


  [1]: https://discuss.elastic.co/t/how-can-i-correctly-index-screen-name-hashtag-and-url-in-japanese-text/147946
0

There are 0 best solutions below