Skip to content

Latest commit

 

History

History
258 lines (246 loc) · 6.86 KB

繁简.md

File metadata and controls

258 lines (246 loc) · 6.86 KB

背景

搜索支持繁简搜索

方案

镜像:

FROM docker.elastic.co/elasticsearch/elasticsearch:5.5.3
RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v5.5.3/elasticsearch-analysis-ik-5.5.3.zip
RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch https://artifacts.elastic.co/downloads/elasticsearch-plugins/analysis-smartcn/analysis-smartcn-5.5.3.zip
RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch https://github.com/medcl/elasticsearch-analysis-stconvert/releases/download/v5.5.3/elasticsearch-analysis-stconvert-5.5.3.zip
docker run --rm --network host -e "discovery.type=single-node" -e "xpack.security.enabled=false" yzs/myes

创建自定义的分词器,繁简和ik

curl --location --request PUT 'http://localhost:9200/ts_analysis_ik' \
--header 'Content-Type: application/json' \
--data-raw '{
    "settings": {
        "analysis": {
            "analyzer": {
                "custom_stconvert_analyzer": {
                    "type": "custom",
                    "tokenizer": "ik_max_word",
                    "filter": "tsconvert",
                    "char_filter": "tsconvert"
                }
            },
             "filter": {
               "tsconvert" : {
                     "type" : "stconvert",
                     "delimiter" : "#",
                     "keep_both" : false,
                     "convert_type" : "t2s"
                 }
             },
            "char_filter" : {
                "tsconvert" : {
                    "type" : "stconvert",
                    "convert_type" : "t2s"
                }
            }
        }
    },
    "mappings": {
        "data": {
            "properties": {
                "slogan": {
                    "type": "text",
                    "analyzer": "custom_stconvert_analyzer",
                    "fields": {
                        "raw": {
                            "type": "keyword"
                        }
                    }
                }
            }
        }
    }
}

分词结果:

curl --location --request GET 'http://localhost:9200/ts_analysis_ik/_analyze' \
--header 'Content-Type: application/json' \
--header 'Content-Type: text/plain' \
--data-raw '{
  "analyzer": "custom_stconvert_analyzer",
  "text":     "覺醒後的神關羽果然沒讓我失望!變得更帥更強了😍 #一招制敵"
}'

分词结果:

{
    "tokens": [
        {
            "token": "觉醒",
            "start_offset": 0,
            "end_offset": 2,
            "type": "CN_WORD",
            "position": 0
        },
        {
            "token": "",
            "start_offset": 2,
            "end_offset": 3,
            "type": "CN_CHAR",
            "position": 1
        },
        {
            "token": "",
            "start_offset": 3,
            "end_offset": 4,
            "type": "CN_CHAR",
            "position": 2
        },
        {
            "token": "",
            "start_offset": 4,
            "end_offset": 5,
            "type": "CN_CHAR",
            "position": 3
        },
        {
            "token": "关羽",
            "start_offset": 5,
            "end_offset": 7,
            "type": "CN_WORD",
            "position": 4
        },
        {
            "token": "果然",
            "start_offset": 7,
            "end_offset": 9,
            "type": "CN_WORD",
            "position": 5
        },
        {
            "token": "没让",
            "start_offset": 9,
            "end_offset": 11,
            "type": "CN_WORD",
            "position": 6
        },
        {
            "token": "",
            "start_offset": 11,
            "end_offset": 12,
            "type": "CN_CHAR",
            "position": 7
        },
        {
            "token": "失望",
            "start_offset": 12,
            "end_offset": 14,
            "type": "CN_WORD",
            "position": 8
        },
        {
            "token": "变得",
            "start_offset": 15,
            "end_offset": 17,
            "type": "CN_WORD",
            "position": 9
        },
        {
            "token": "",
            "start_offset": 17,
            "end_offset": 18,
            "type": "CN_CHAR",
            "position": 10
        },
        {
            "token": "",
            "start_offset": 18,
            "end_offset": 19,
            "type": "CN_CHAR",
            "position": 11
        },
        {
            "token": "更强",
            "start_offset": 19,
            "end_offset": 21,
            "type": "CN_WORD",
            "position": 12
        },
        {
            "token": "强了",
            "start_offset": 20,
            "end_offset": 22,
            "type": "CN_WORD",
            "position": 13
        },
        {
            "token": "一招",
            "start_offset": 26,
            "end_offset": 28,
            "type": "CN_WORD",
            "position": 14
        },
        {
            "token": "",
            "start_offset": 26,
            "end_offset": 27,
            "type": "TYPE_CNUM",
            "position": 15
        },
        {
            "token": "",
            "start_offset": 27,
            "end_offset": 28,
            "type": "COUNT",
            "position": 16
        },
        {
            "token": "",
            "start_offset": 28,
            "end_offset": 29,
            "type": "CN_CHAR",
            "position": 17
        },
        {
            "token": "",
            "start_offset": 29,
            "end_offset": 30,
            "type": "CN_CHAR",
            "position": 18
        }
    ]
}

创建文档

curl --location --request POST 'http://localhost:9200/ts_analysis_ik/data/1' \curl --location --request POST 'http://localhost:9200/ts_analysis_ik/data/1' \--header 'Content-Type: application/json' \--header 'Content-Type: text/plain' \--data-raw '{ "slogan": "覺醒後的神關羽果然沒讓我失望!變得更帥更強了 #一招制敵"}'

搜索

curl --location --request GET 'http://localhost:9200/ts_analysis_ik/data/_search' \ --header 'Content-Type: application/json' \ --header 'Content-Type: text/plain' \ --data-raw '{ "query": { "multi_match": { "query": "关羽", "fields": ["slogan"] } } }'

结果:

{
    "took": 2,
    "timed_out": false,
    "_shards": {
        "total": 5,
        "successful": 5,
        "failed": 0
    },
    "hits": {
        "total": 1,
        "max_score": 0.27638745,
        "hits": [
            {
                "_index": "ts_analysis_ik",
                "_type": "data",
                "_id": "1",
                "_score": 0.27638745,
                "_source": {
                    "slogan": "覺醒後的神關羽果然沒讓我失望!變得更帥更強了😍 #一招制敵"
                }
            }
        ]
    }
}