ik mysql热加载分词_Elasticsearch 之(25)重写IK分词器源码来基于mysql热更新词库...
熱更新
在上一節《IK分詞器配置文件講解以及自定義詞庫》中,我們自定義了詞庫,但每次都是在es的擴展詞典中手動添加新詞語,很坑:
(1)每次添加完,都要重啟es才能生效,非常麻煩
(2)es是分布式的,可能有數百個節點,你不能每次都一個一個節點上面去修改
我們希望es不停機:直接在外部某個地方添加新的詞語,es就能立即熱加載到這些新詞語
熱更新的方案
(1)修改ik分詞器源碼,手動支持每隔一定時間從mysql中自動加載新的詞庫
(2)基于ik分詞器原生支持的熱更新方案,部署一個web服務器,提供一個http接口,通過Last-Modified和ETag兩個http響應頭,來提供詞語的熱更新
這里采用第一種方案;第二種方案,ik git社區官方都不建議采用,覺得不太穩定
1、下載源碼
https://github.com/medcl/elasticsearch-analysis-ik/tree/v5.2.0
ik分詞器,是個標準的java maven工程,直接導入eclipse就可以看到源碼
2、修改源碼
Dictionary單例類的初始化方法initial,在這里需要創建一個我們自定義的線程,并且啟動它
/**
* 詞典初始化 由于IK Analyzer的詞典采用Dictionary類的靜態方法進行詞典初始化
* 只有當Dictionary類被實際調用時,才會開始載入詞典, 這將延長首次分詞操作的時間 該方法提供了一個在應用加載階段就初始化字典的手段
*
* @return Dictionary
*/
public static synchronized Dictionary initial(Configuration cfg) {
if (singleton == null) {
synchronized (Dictionary.class) {
if (singleton == null) {
singleton = new Dictionary(cfg);
singleton.loadMainDict();
singleton.loadSurnameDict();
singleton.loadQuantifierDict();
singleton.loadSuffixDict();
singleton.loadPrepDict();
singleton.loadStopWordDict();
new Thread(new HotDictReloadThread()).start();
if(cfg.isEnableRemoteDict()){
// 建立監控線程
for (String location : singleton.getRemoteExtDictionarys()) {
// 10 秒是初始延遲可以修改的 60是間隔時間 單位秒
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
}
for (String location : singleton.getRemoteExtStopWordDictionarys()) {
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
}
}
return singleton;
}
}
}
return singleton;
}HotDictReloadThread類:就是死循環,不斷調用Dictionary.getSingleton().reLoadMainDict(),去重新加載詞典
public class HotDictReloadThread implements Runnable {
private static final Logger logger = ESLoggerFactory.getLogger(HotDictReloadThread.class.getName());
@Override
public void run() {
while(true) {
logger.info("[==========]reload hot dict from mysql......");
Dictionary.getSingleton().reLoadMainDict();
}
}
}Dictionary類:更新詞典 this.loadMySQLExtDict()
/**
 * Loads the main dictionary and then the extension dictionaries.
 *
 * <p>Replaces {@code _MainDict} with a fresh segment, fills it from the main dictionary
 * file (one word per line, UTF-8, blank lines ignored), then loads the local extension
 * dictionary, the remote extension dictionary and finally the words stored in MySQL.
 *
 * <p>A missing or unreadable main dictionary file is logged and does not stop the
 * extension dictionaries from being loaded.
 */
private void loadMainDict() {
    // Start from an empty main dictionary
    _MainDict = new DictSegment((char) 0);

    // Read the main dictionary file. try-with-resources closes both the stream and the
    // reader, and a missing file now ends up in the catch clause instead of passing a
    // null stream to InputStreamReader (which raised an uncaught NullPointerException).
    Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
    try (InputStream is = new FileInputStream(file.toFile());
         BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512)) {
        String line;
        while ((line = br.readLine()) != null) {
            String word = line.trim();
            if (!word.isEmpty()) {
                _MainDict.fillSegment(word.toCharArray());
            }
        }
    } catch (FileNotFoundException e) {
        logger.error(e.getMessage(), e);
    } catch (IOException e) {
        logger.error("ik-analyzer", e);
    }

    // Load the extension dictionary
    this.loadExtDict();
    // Load the remote custom dictionary
    this.loadRemoteExtDict();
    // Load the hot-update words from MySQL
    this.loadMySQLExtDict();
}
/**
 * Loads the hot-update words from MySQL into the main dictionary.
 *
 * <p>Reads {@code jdbc-reload.properties} from the dictionary root, runs the query in
 * {@code jdbc.reload.sql} and adds the {@code word} column of every row to
 * {@code _MainDict}. Rows whose word is NULL or blank are skipped.
 *
 * <p>Before returning, the method sleeps for {@code jdbc.reload.interval} milliseconds.
 * The pause happens after the JDBC resources are closed and also when loading failed,
 * so a broken configuration or an unreachable database cannot turn a caller that
 * reloads in a loop into a busy loop. If the interval is missing, negative or not a
 * number, a fallback of one second is used.
 *
 * <p>Failures are logged and never propagated. If the thread is interrupted while
 * sleeping, its interrupt flag is restored.
 */
private void loadMySQLExtDict() {
    // Fallback pause, used when jdbc.reload.interval cannot be read
    long pauseMillis = 1000L;

    try {
        Path file = PathUtils.get(getDictRoot(), "jdbc-reload.properties");
        // Close the stream: this method runs on every reload, so a leaked descriptor
        // per call would eventually exhaust the process limit.
        try (FileInputStream in = new FileInputStream(file.toFile())) {
            prop.load(in);
        }

        logger.info("[==========]jdbc-reload.properties");
        for (Object key : prop.keySet()) {
            String name = String.valueOf(key);
            // Never write the database password to the log
            String shown = "jdbc.password".equals(name) ? "******" : prop.getProperty(name);
            logger.info("[==========]" + name + "=" + shown);
        }

        String interval = prop.getProperty("jdbc.reload.interval");
        if (interval != null) {
            try {
                long parsed = Long.parseLong(interval.trim());
                if (parsed >= 0) {
                    pauseMillis = parsed;
                }
            } catch (NumberFormatException e) {
                logger.error("[==========]invalid jdbc.reload.interval: " + interval, e);
            }
        }

        String sql = prop.getProperty("jdbc.reload.sql");
        logger.info("[==========]query hot dict from mysql, " + sql + "......");

        try (Connection conn = DriverManager.getConnection(
                     prop.getProperty("jdbc.url"),
                     prop.getProperty("jdbc.user"),
                     prop.getProperty("jdbc.password"));
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery(sql)) {
            while (rs.next()) {
                String theWord = rs.getString("word");
                // A NULL or blank value must not abort the remaining rows
                if (theWord == null || theWord.trim().isEmpty()) {
                    continue;
                }
                logger.info("[==========]hot word from mysql: " + theWord);
                _MainDict.fillSegment(theWord.trim().toCharArray());
            }
        }
    } catch (Exception e) {
        logger.error("[==========]failed to load hot words from mysql", e);
    }

    // Pause outside the try block: the connection is already released here, and the
    // pause is taken on the failure path as well.
    try {
        Thread.sleep(pauseMillis);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
Dictionary類:更新停用詞 this.loadMySQLStopwordDict();
/**
* 從mysql加載停用詞
*/
private void loadMySQLStopwordDict() {
Connection conn = null;
Statement stmt = null;
ResultSet rs = null;
try {
Path file = PathUtils.get(getDictRoot(), "jdbc-reload.properties");
prop.load(new FileInputStream(file.toFile()));
logger.info("[==========]jdbc-reload.properties");
for(Object key : prop.keySet()) {
logger.info("[==========]" + key + "=" + prop.getProperty(String.valueOf(key)));
}
logger.info("[==========]query hot stopword dict from mysql, " + prop.getProperty("jdbc.reload.stopword.sql") + "......");
conn = DriverManager.getConnection(
prop.getProperty("jdbc.url"),
prop.getProperty("jdbc.user"),
prop.getProperty("jdbc.password"));
stmt = conn.createStatement();
rs = stmt.executeQuery(prop.getProperty("jdbc.reload.stopword.sql"));
while(rs.next()) {
String theWord = rs.getString("word");
logger.info("[==========]hot stopword from mysql: " + theWord);
_StopWords.fillSegment(theWord.trim().toCharArray());
}
Thread.sleep(Integer.valueOf(String.valueOf(prop.get("jdbc.reload.interval"))));
} catch (Exception e) {
logger.error("erorr", e);
} finally {
if(rs != null) {
try {
rs.close();
} catch (SQLException e) {
logger.error("error", e);
}
}
if(stmt != null) {
try {
stmt.close();
} catch (SQLException e) {
logger.error("error", e);
}
}
if(conn != null) {
try {
conn.close();
} catch (SQLException e) {
logger.error("error", e);
}
}
}
}配置
jdbc.url=jdbc:mysql://localhost:3306/test?serverTimezone=GMT
jdbc.user=root
jdbc.password=root
jdbc.reload.sql=select word from hot_words
jdbc.reload.stopword.sql=select stopword as word from hot_stopwords
jdbc.reload.interval=1000
3、mvn package打包代碼
target\releases\elasticsearch-analysis-ik-5.2.0.zip
4、解壓縮ik壓縮包
將mysql驅動jar,放入ik的目錄下
5、重啟es
6、在mysql中添加詞庫與停用詞
7、kibana分詞驗證
GET /my_index/_analyze
{
"text": "一人飲酒醉",
"analyzer": "ik_max_word"
}
{
"tokens": [
{
"token": "一人飲酒醉",
"start_offset": 0,
"end_offset": 5,
"type": "CN_WORD",
"position": 0
},
{
"token": "一人",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 1
},
{
"token": "一",
"start_offset": 0,
"end_offset": 1,
"type": "TYPE_CNUM",
"position": 2
},
{
"token": "人",
"start_offset": 1,
"end_offset": 2,
"type": "COUNT",
"position": 3
},
{
"token": "飲酒",
"start_offset": 2,
"end_offset": 4,
"type": "CN_WORD",
"position": 4
},
{
"token": "飲",
"start_offset": 2,
"end_offset": 3,
"type": "CN_WORD",
"position": 5
},
{
"token": "酒醉",
"start_offset": 3,
"end_offset": 5,
"type": "CN_WORD",
"position": 6
},
{
"token": "酒",
"start_offset": 3,
"end_offset": 4,
"type": "CN_WORD",
"position": 7
},
{
"token": "醉",
"start_offset": 4,
"end_offset": 5,
"type": "CN_WORD",
"position": 8
}
]
}
總結
以上是生活随笔為你收集整理的ik mysql热加载分词_Elasticsearch 之(25)重写IK分词器源码来基于mysql热更新词库...的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: flash期末作业成品_基于“新工科”的
- 下一篇: mysql性能优化教程_mysql性能优