可以看出,第一条记录(见gallery_id)的rate是75,满分是100,匹配度蛮高的。
说一下匹配度算法的原则,如果完全匹配就是100分,肯定就命中了。去除某些关键字后才匹配上的,就是90分。最后采用分词算法,按照满分100分打分,其中高于50分的可以算基本匹配,自动配置图片的时候,就可以当成匹配成功。总体原则就是匹配的词汇越多,分数越高。但是两个字的词汇和5个字的词汇,分数是不一样的。还有词性,专属词汇理论上应该比形容词分数高。详见下面的calculateWeight代码,自己体会。
/**
 * Looks up gallery rows for a search string, trying progressively looser
 * strategies and keeping the first one that yields a non-empty list:
 * a LIKE search on the raw string, a LIKE search after stripping whitespace
 * and promotional/packaging tokens (rate 90), a segmented-term search through
 * queryListTerm, and finally a category-only LIKE search (rate 10).
 *
 * NOTE(review): the signature and the first query statement are truncated in
 * this listing (method name, generic type arguments and the leading
 * parameters are missing) - restore them from the real source before compiling.
 */
public List
List segmentList) {
// Wrap the raw search string in wildcards for the LIKE query.
String name = "%" + searchstr + "%";
// Step 1: plain search first; per the original note a direct match is rated 100.
// NOTE(review): the statement below is truncated - presumably it declares
// `list` from queryList(name, pagenumber, pagesize, 100); confirm.
List
if (CommonUtils.isEmpty(list)) {
// Strip whitespace from the search string.
// NOTE(review): the pattern was presumably written with a doubled backslash
// before the listing lost its escapes - confirm.
searchstr = searchstr.replaceAll("\s", "");
// Remove promotional and packaging noise: the "special price" marker, slashes,
// parentheses, volume markers such as "500ml", "buy N get M" phrases, asterisks.
String regEx = "(特价)|(/)|(\()|(\))|(()|())|(\d+ml)|(买.送.)|(/)|(\*)";
searchstr = searchstr.replaceAll(regEx, "");
if (CommonUtils.isNotEmpty(searchstr)) {
name = "%" + searchstr + "%";
// Step 2: match after simple filtering is rated 90.
list = queryList(name, pagenumber, pagesize, 90);
}
// Step 3: nothing found yet - fall back to word segmentation with computed scores.
if (CommonUtils.isEmpty(list)) {
if (CommonUtils.isNotEmpty(segmentList)) {
list = queryListTerm(pagenumber, pagesize, segmentList, materialsortname);
}
// Only reached when there are no segments: if just the category name is
// available, search by it and assign a provisional rate of 10.
else if (CommonUtils.isNotEmpty(materialsortname))
list = queryList(materialsortname, pagenumber, pagesize, 10);
}
}
return list;
}
/**
 * Runs a single LIKE query against wj_tbl_gallery.material_allname, restricted
 * to rows with status = 0, and returns the requested page of rows. Every row
 * carries the caller-supplied rate as a constant "rate" column. Rows are
 * ordered by length(materialname).
 *
 * NOTE(review): the signature is truncated in this listing (the callers pass
 * name, pagenumber, pagesize, rate) and the closing brace is missing; the
 * "n" suffixes inside the SQL literals look like newline escapes that lost
 * their backslash - confirm against the real source.
 */
private List
// The rate value is concatenated into the SQL text; the search string and the
// paging values are bound as named parameters.
String sql = "SELECTn" + " a.gallery_id,n" + " a.fileid,a.material_allname,a.materialname n, " + rate
+ " rate FROMn" + " wj_tbl_gallery an" + "WHEREn"
+ " a.material_allname LIKE :searchstr and a.status = 0 order by length(materialname) LIMIT :pagenumber,:pagesize ";
Dto param = new BaseDto();
// LIMIT offset is pagenumber * pagesize; the page-number base is not visible
// here - presumably zero-based, confirm against callers.
param.put("searchstr", name).put("pagenumber", pagenumber * pagesize).put("pagesize", pagesize);
return namedParameterJdbcTemplate.queryForList(sql, param);
/**
 * Builds and runs the segmentation-based scoring query. For every segmented
 * term (and optionally the category name) it adds
 * if(LOCATE(term, material_allname), weight, 0) to a score expression and a
 * matching LIKE clause to an OR-ed WHERE filter. The outer query reports
 * rate = ROUND(score / totalWeight * 100, 0) and orders by score descending,
 * then materialname.
 *
 * NOTE(review): this listing is damaged - the signature is truncated, the two
 * StringBuffer identifiers are masked as "***" and "w***", the statement that
 * executes the query is cut off, and no return is visible on the fall-through
 * paths. Restore from the real source before compiling.
 * NOTE(review): term text is concatenated into the SQL after
 * escapeMysqlSpecialChar rather than bound as parameters - consider binding.
 */
private List
String materialsortname) {
Dto param = new BaseDto();
// Accumulates the score expression for the SELECT list.
StringBuffer *** = new StringBuffer();
// Accumulates the OR-ed LIKE filter; starts with the opening parenthesis.
StringBuffer w*** = new StringBuffer(" (");
// Total weight: sum of all term weights, used as the denominator of rate.
int tw = 0;
if (CommonUtils.isNotEmpty(segmentList)) {
for (int i = 0; i < segmentList.size(); i++) {
String str = segmentList.get(i).word;
int w = SegmentUtils.calculateWeight(segmentList.get(i));
// Escape the term because it is embedded directly in the SQL text.
str = StringUtils.escapeMysqlSpecialChar(str);
tw += w;
***.append("if(LOCATE('").append(str).append("', a.material_allname),").append(w).append(",0) ");
w***.append(" a.material_allname like '%").append(str).append("%' ");
// Separators go between terms only, not after the last one.
if (i < segmentList.size() - 1) {
***.append(" + ");
w***.append(" or ");
}
}
// The category is handled separately; its weight is currently low.
// Length threshold used to decide whether the score expression is still empty.
int emptylen = 3;
if (CommonUtils.isNotEmpty(materialsortname)) {
// Add separators only if term clauses were already appended.
if (***.length() > emptylen) {
***.append(" + ");
w***.append(" or ");
}
tw += SegmentUtils.DWEIGHT;
materialsortname = StringUtils.escapeMysqlSpecialChar(materialsortname);
***.append(" if(LOCATE('").append(materialsortname).append("', a.material_allname),")
.append(SegmentUtils.DWEIGHT).append(",0) ");
w***.append(" a.material_allname like '%").append(materialsortname)
.append("%' ");
}
// Run the query only when at least one scoring clause was built.
if (***.length() > emptylen) {
***.append(" as score ");
w***.append(") ");
String scoreSelect = ***.toString();
String scorewhere = w***.toString();
// Inner query computes the raw score per active row; outer query converts it
// to a 0-100 rate and applies ordering and paging.
String sql = "select gallery_id,fileid,materialname,material_allname,score,ROUND(score/" + tw
+ "*100, 0) rate from (SELECT " + " a.gallery_id, "
+ " a.fileid,materialname,material_allname, " + scoreSelect + " FROM "
+ " wj_tbl_gallery a " + "WHERE " + " a.status = 0 and " + scorewhere
+ " ) b order by score desc ,materialname LIMIT " + pagenumber * pagesize + "," + pagesize;
// The paging values are also placed in param, although the SQL text above
// already inlines them and contains no named placeholders.
param.put("pagenumber", pagenumber * pagesize).put("pagesize", pagesize);
logger.debug("商家搜索图库的SQL语句是{}", sql);
// NOTE(review): statement truncated - presumably declares `list` from
// executing sql with param; confirm.
List
if (CommonUtils.isNotEmpty(list)) {
return list;
}
}
}
/**
 * Computes the scoring weight of one segmented term.
 *
 * <p>Terms with fewer than three Chinese characters (as reported by
 * {@code countChinese}) get the default weight {@code DWEIGHT}. Terms with
 * three or more get {@code 2 + (n - 3) / 2} using integer division, so the
 * weight grows by one for every two additional characters. A proper-noun term
 * ({@code Nature.nz}) whose weight would not exceed {@code DWEIGHT} is raised
 * to {@code DWEIGHT + 1}.
 *
 * @author deng
 * @date 2019-06-21
 * @param term the segmented term; its {@code word} and {@code nature} are read
 * @return the weight assigned to the term
 */
public static int calculateWeight(Term term) {
final int chineseCount = countChinese(term.word);

// Longer terms carry more weight; short ones fall back to the default.
int weight;
if (chineseCount < 3) {
weight = DWEIGHT;
} else {
weight = 2 + (chineseCount - 3) / 2;
}

// Proper nouns must always score above the default weight.
final boolean properNoun = term.nature == Nature.nz;
if (properNoun && weight <= DWEIGHT) {
weight = DWEIGHT + 1;
}
return weight;
}
总结一下,本文介绍的商品图片推荐和自动匹配方法,可以看出来是相当简单的,本质上就是在MySQL的 LIKE '%…%' 查询基础上优化而来,依赖SQL语句和HanLP分词库。做法简单,但是能满足专门商品的匹配,适合小图库。效率自然比不上大公司搞的搜索引擎,仅供参考。