Using Plugins
The IK Analyzer
Testing the IK Analyzer
> curl -XPUT 'http://172.16.185.176:9200/index'
> curl -XPOST 'http://172.16.185.176:9200/index/_mapping?pretty' -H 'Content-Type: application/json' -d'
{
    "properties": {
        "content": {
            "type": "text",
            "analyzer": "ik_max_word",
            "search_analyzer": "ik_max_word",
            "boost": 8
        }
    }
}'
> curl -XPOST 'http://172.16.185.176:9200/index/_doc/1' -H 'Content-Type: application/json' -d'
{"content":"美国留给伊拉克的是个烂摊子吗"}'
> curl -XPOST 'http://172.16.185.176:9200/index/_doc/2' -H 'Content-Type: application/json' -d'
{"content":"公安部:各地校车将享最高路权"}'
> curl -XPOST 'http://172.16.185.176:9200/index/_doc/3' -H 'Content-Type: application/json' -d'
{"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}'
> curl -XPOST 'http://172.16.185.176:9200/index/_doc/4' -H 'Content-Type: application/json' -d'
{"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}'
# Highlighted search
> curl -XPOST 'http://172.16.185.176:9200/index/_search?pretty' -H 'Content-Type: application/json' -d'
{
    "query" : { "match" : { "content" : "中国" }},
    "highlight" : {
        "pre_tags" : ["<tag1>", "<tag2>"],
        "post_tags" : ["</tag1>", "</tag2>"],
        "fields" : {
            "content" : {}
        }
    }
}'
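The mapping above indexes with ik_max_word; the plugin also ships an ik_smart analyzer that produces a coarser split. A quick way to compare the two on any phrase is the standard _analyze API; a minimal sketch:
# ik_max_word emits exhaustive, overlapping terms; ik_smart emits the coarsest split
> curl -XPOST 'http://172.16.185.176:9200/index/_analyze?pretty' -H 'Content-Type: application/json' -d'
{"analyzer": "ik_max_word", "text": "中华人民共和国"}'
> curl -XPOST 'http://172.16.185.176:9200/index/_analyze?pretty' -H 'Content-Type: application/json' -d'
{"analyzer": "ik_smart", "text": "中华人民共和国"}'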
Customizing the IK Analyzer
- Import the t_es_ik_dic.sql file.
CREATE TABLE `t_es_ik_dic` (
    `id` int(11) unsigned NOT NULL AUTO_INCREMENT COMMENT 'auto-increment id',
    `ext_word` varchar(100) DEFAULT '' COMMENT 'extension word',
    `stop_word` varchar(100) DEFAULT '' COMMENT 'stop word',
    `synonym` varchar(100) DEFAULT '' COMMENT 'synonym',
    `dic_status` tinyint(4) DEFAULT '0' COMMENT 'status: 0 = not yet added, 1 = added',
    `deleted` tinyint(4) DEFAULT '0' COMMENT '0 = not deleted, 1 = deleted',
    `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'creation time',
    `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='IK analyzer extension dictionary';
- Modify the pom.xml file.
# Add the MySQL dependency
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.18</version>
</dependency>
# Rename <elasticsearch.plugin.classname>org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin</elasticsearch.plugin.classname> to <elasticsearch.plugin.classname>org.elasticsearch.plugin.analysis.ik.UpAnalysisIkPlugin</elasticsearch.plugin.classname>
- Copy org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin.java and rename the copy UpAnalysisIkPlugin.java.
- Copy org.elasticsearch.index.analysis.IkTokenizerFactory.java and rename the copy UpIkTokenizerFactory.java.
- Add a DBRunnable.java class to the org.elasticsearch.index.analysis package.
- Add the org.wltea.analyzer.db package with a DBHelper.java class.
- Add the org.wltea.analyzer.utils package with a StringUtils.java class.
- Add the following method to org.wltea.analyzer.dic.Dictionary.java.
public void addStopWords(Collection<String> words) {
    if (words != null) {
        for (String word : words) {
            if (word != null) {
                // Load each word into the in-memory stop-word dictionary
                singleton._StopWords.fillSegment(word.trim().toCharArray());
            }
        }
    }
}
- Remove the previously installed IK plugin, build the project with Maven, copy elasticsearch-analysis-ik-7.17.6.zip from target/release into /opt/elasticsearch-7.17.6/plugins/ik, and unzip it there.
- Copy mysql-connector-java-x.y.z.jar into the /opt/elasticsearch-7.17.6/plugins/ik directory as well (x.y.z is the version of the jar you are using).
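After restarting Elasticsearch, it is worth confirming that the rebuilt plugin actually loaded on every node; a minimal check using the standard _cat API:
# List the unpacked plugin files, then ask the cluster which plugins it loaded
> ls /opt/elasticsearch-7.17.6/plugins/ik
> curl -XGET '172.16.185.176:9200/_cat/plugins?v&pretty'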
Verifying the IK Analyzer
> curl -XPUT '172.16.185.176:9200/ps4?pretty'
> curl -XPOST '172.16.185.176:9200/ps4/_analyze?pretty' -H 'Content-Type: application/json' -d'
{
    "analyzer": "ik_max_word",
    "text": "自定义IK分词器实现Elasticsearch分词"
}'
# With the extension word "IK分词器" loaded from the database, the output shows the expected effect
"tokens" : [
    {
        "token" : "IK分词器",
        "start_offset" : 0,
        "end_offset" : 4,
        "type" : "CN_WORD",
        "position" : 0
    },
# If the dictionary was not loaded, the output looks like this instead
"tokens" : [
    {
        "token" : "IK",
        "start_offset" : 0,
        "end_offset" : 2,
        "type" : "CN_WORD",
        "position" : 0
    },
    {
        "token" : "分词器",
        "start_offset" : 2,
        "end_offset" : 4,
        "type" : "CN_WORD",
        "position" : 1
    },
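The verification above assumes the extension word "IK分词器" was present in t_es_ik_dic when the plugin reloaded its dictionary. A sketch of seeding that row, where the host, credentials, and database name are placeholders, and the dic_status value follows the column comment in the table definition:
# Seed the extension word; connection details and database name are placeholders
> mysql -h172.16.185.176 -uroot -p -D es -e "INSERT INTO t_es_ik_dic (ext_word, dic_status) VALUES ('IK分词器', 0);"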
The Pinyin Plugin
Installation
Download the source code of the elasticsearch-analysis-pinyin plugin and import it into IDEA as a Maven project.
Build the project with Maven, copy elasticsearch-analysis-pinyin.zip from target/release into /opt/elasticsearch-7.17.6/plugins/pinyin, and unzip it there.
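Alternatively, if you prefer not to unpack the zip by hand, the stock plugin CLI can install it from a local file; a sketch, assuming the zip was copied to /tmp:
# Install the plugin from a local zip instead of unpacking manually
> /opt/elasticsearch-7.17.6/bin/elasticsearch-plugin install file:///tmp/elasticsearch-analysis-pinyin.zip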
Verification
# Delete the old index
> curl -XDELETE '172.16.185.176:9200/index?pretty'
# Create the index with a custom pinyin analyzer
> curl -XPUT 'http://172.16.185.176:9200/index' -H 'Content-Type: application/json' -d'
{
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "pinyin_analyzer" : {
                    "tokenizer" : "my_pinyin"
                }
            },
            "tokenizer" : {
                "my_pinyin" : {
                    "type" : "pinyin",
                    "keep_separate_first_letter" : false,
                    "keep_full_pinyin" : true,
                    "keep_original" : true,
                    "limit_first_letter_length" : 16,
                    "lowercase" : true,
                    "remove_duplicated_term" : true
                }
            }
        }
    }
}'
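Before wiring pinyin_analyzer into a mapping, it helps to inspect what it emits; a minimal sketch using the name indexed below:
# Inspect the pinyin tokens produced for a sample name
> curl -XGET 'http://172.16.185.176:9200/index/_analyze?pretty' -H 'Content-Type: application/json' -d'
{"analyzer": "pinyin_analyzer", "text": "唐伯虎"}'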
# Create the mapping (search approach 1)
> curl -XPOST 'http://172.16.185.176:9200/index/_mapping' -H 'Content-Type: application/json' -d'
{
    "properties": {
        "name": {
            "type": "keyword",
            "fields": {
                "pinyin": {
                    "type": "text",
                    "store": false,
                    "term_vector": "with_offsets",
                    "analyzer": "pinyin_analyzer",
                    "boost": 10
                }
            }
        }
    }
}'
# Index a document
> curl -XPOST 'http://172.16.185.176:9200/index/_doc/andy?pretty' -H 'Content-Type: application/json' -d'{"name":"唐伯虎"}'
# Test
> curl -XGET 'http://172.16.185.176:9200/index/_search?pretty&q=name.pinyin:tang'
> curl -XGET 'http://172.16.185.176:9200/index/_search?pretty&q=name.pinyin:t'
> curl -XGET 'http://172.16.185.176:9200/index/_search?pretty&q=name.pinyin:bo+hu'
> curl -XGET 'http://172.16.185.176:9200/index/_search?pretty&q=name.pinyin:tbh'
> curl -XGET 'http://172.16.185.176:9200/index/_search?pretty&q=name.pinyin:tangbohu'
# Create the mapping (search approach 2)
# Note: an existing field cannot change type, so delete and recreate the index before applying this mapping
> curl -XPOST 'http://172.16.185.176:9200/index/_mapping' -H 'Content-Type: application/json' -d'
{
    "properties": {
        "name": {
            "type": "text",
            "analyzer": "pinyin_analyzer",
            "search_analyzer": "pinyin_analyzer"
        }
    }
}'
# Test
> curl -XGET 'http://172.16.185.176:9200/index/_search?pretty&q=name:tangbohu'
> curl -XGET 'http://172.16.185.176:9200/index/_search?pretty' -H 'Content-Type: application/json' -d'
{
    "size" : 10,
    "query": { "match": { "name" : "tangbohu" } }
}'
Pinyin + Synonyms
# Delete the old index
> curl -XDELETE '172.16.185.176:9200/index?pretty'
# Recreate it with a combined IK + pinyin + synonym analyzer
> curl -XPUT '172.16.185.176:9200/index' -H 'Content-Type: application/json' -d'
{
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "combin_analyzer" : {
                    "tokenizer" : "ik_max_word",
                    "filter" : ["pinyinFilter", "iksynonym"]
                }
            },
            "filter" : {
                "pinyinFilter" : {
                    "type" : "pinyin",
                    "keep_first_letter" : true,
                    "keep_full_pinyin" : true,
                    "keep_original" : true,
                    "limit_first_letter_length" : 16,
                    "lowercase" : true
                },
                "iksynonym" : {
                    "type" : "ik_synonym"
                }
            }
        }
    }
}'
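To confirm the tokenizer-plus-filter chain works end to end, you can run combin_analyzer directly; a minimal sketch, assuming "旅行" has been configured as a synonym of "旅游" in t_es_ik_dic, so the token stream should contain the original term, its pinyin forms, and the synonym:
# Inspect what the combined analyzer emits
> curl -XGET 'http://172.16.185.176:9200/index/_analyze?pretty' -H 'Content-Type: application/json' -d'
{"analyzer": "combin_analyzer", "text": "旅游"}'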
# Create the mapping
> curl -XPOST '172.16.185.176:9200/index/_mapping' -H 'Content-Type: application/json' -d'
{
    "properties": {
        "name": {
            "type": "text",
            "analyzer": "combin_analyzer",
            "search_analyzer": "combin_analyzer"
        }
    }
}'
# Index documents
> curl -XPOST '172.16.185.176:9200/index/_doc/1?pretty' -H 'Content-Type: application/json' -d'{"name":"旅游"}'
> curl -XPOST '172.16.185.176:9200/index/_doc/2?pretty' -H 'Content-Type: application/json' -d'{"name":"酒店"}'
# Test: "旅行" should match "旅游" and "宾馆" should match "酒店" via synonyms
> curl -XGET 'http://172.16.185.176:9200/index/_search?pretty' -H 'Content-Type: application/json' -d'
{
    "size" : 10,
    "query": { "match": { "name" : "旅行" } }
}'
> curl -XGET 'http://172.16.185.176:9200/index/_search?pretty' -H 'Content-Type: application/json' -d'
{
    "size" : 10,
    "query": { "match": { "name" : "宾馆" } }
}'
> curl -XGET 'http://172.16.185.176:9200/index/_search?pretty&q=name:ly'
> curl -XGET 'http://172.16.185.176:9200/index/_search?pretty&q=name:bg'
Putting It All Together
# Delete the old index
> curl -XDELETE '172.16.185.176:9200/product?pretty'
# Create the product index
> curl -XPUT '172.16.185.176:9200/product?pretty' -H 'Content-Type: application/json' -d'
{
    "settings" : {
        "index" : {
            "number_of_shards" : 3,
            "number_of_replicas" : 2
        },
        "analysis" : {
            "analyzer" : {
                "my_analyzer" : {
                    "tokenizer" : "my_tokenizer"
                },
                "combin_analyzer" : {
                    "tokenizer" : "ik_max_word",
                    "filter" : ["pinyinFilter", "iksynonym"]
                }
            },
            "tokenizer" : {
                "my_tokenizer" : {
                    "type" : "pattern",
                    "pattern" : "\\,"
                }
            },
            "filter" : {
                "pinyinFilter" : {
                    "type" : "pinyin",
                    "keep_first_letter" : true,
                    "keep_full_pinyin" : true,
                    "keep_original" : true,
                    "limit_first_letter_length" : 16,
                    "lowercase" : true
                },
                "iksynonym" : {
                    "type" : "ik_synonym"
                }
            }
        }
    }
}'
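my_analyzer exists so that the comma-separated areas field is split on commas instead of being segmented by IK; a quick sanity check, with an illustrative city list:
# The pattern tokenizer should split only on commas
> curl -XGET '172.16.185.176:9200/product/_analyze?pretty' -H 'Content-Type: application/json' -d'
{"analyzer": "my_analyzer", "text": "北京,上海,广州"}'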
# On text fields, add "index_options": "offsets" to enable the postings highlighter,
# or "term_vector": "with_positions_offsets" to enable the fvh highlighter
> curl -XPUT '172.16.185.176:9200/product/_mapping?pretty' -H 'Content-Type: application/json' -d'
{
    "properties": {
        "id" : { "type": "keyword" },
        "name" : { "type": "text", "analyzer": "combin_analyzer", "search_analyzer": "combin_analyzer" },
        "desc" : { "type": "text", "analyzer": "combin_analyzer", "search_analyzer": "combin_analyzer", "store": false },
        "areas" : { "type": "text", "analyzer": "my_analyzer", "search_analyzer": "my_analyzer", "store": true },
        "price" : { "type": "float" },
        "productid" : { "type": "keyword" },
        "productname" : { "type": "text", "analyzer": "combin_analyzer", "search_analyzer": "combin_analyzer" },
        "categorys" : {
            "type" : "nested",
            "properties" : {
                "id" : { "type": "keyword" },
                "name" : { "type": "text", "analyzer": "ik_max_word" },
                "parentid" : { "type": "keyword" }
            }
        }
    }
}'
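Because categorys is mapped as nested, its objects are stored as hidden sub-documents and must be searched with a nested query; a minimal sketch, with invented product values for illustration:
# Index a sample product (hypothetical values)
> curl -XPOST '172.16.185.176:9200/product/_doc/1?pretty' -H 'Content-Type: application/json' -d'
{
    "id": "1",
    "name": "三亚五日游",
    "desc": "海边度假酒店",
    "areas": "北京,上海,广州",
    "price": 2999.0,
    "productid": "p1",
    "productname": "三亚五日游",
    "categorys": [ { "id": "10", "name": "旅游", "parentid": "0" } ]
}'
# Query inside the nested objects
> curl -XGET '172.16.185.176:9200/product/_search?pretty' -H 'Content-Type: application/json' -d'
{
    "query": {
        "nested": {
            "path": "categorys",
            "query": { "match": { "categorys.name": "旅游" } }
        }
    }
}'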
Thanks for Your Support
For more content, see 《超级个体》.