From 72768357cb5d47a331c1917f47d16d3a4f0eee09 Mon Sep 17 00:00:00 2001 From: lifulong Date: Tue, 31 Dec 2019 16:16:55 +0800 Subject: [PATCH 1/2] real time update words dict support --- .../huaban/analysis/jieba/JiebaSegmenter.java | 2 +- .../huaban/analysis/jieba/WordDictionary.java | 32 ++++++++++++++----- .../analysis/jieba/viterbi/FinalSeg.java | 2 +- .../analysis/jieba/JiebaSegmenterTest.java | 2 +- 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java b/src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java index 9010f1f..915c031 100644 --- a/src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java +++ b/src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java @@ -1,5 +1,6 @@ package com.huaban.analysis.jieba; +import java.nio.file.Path; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -28,7 +29,6 @@ public void initUserDict(Path path){ public void initUserDict(String[] paths){ wordDict.init(paths); - } private Map> createDAG(String sentence) { diff --git a/src/main/java/com/huaban/analysis/jieba/WordDictionary.java b/src/main/java/com/huaban/analysis/jieba/WordDictionary.java index da40753..8823584 100644 --- a/src/main/java/com/huaban/analysis/jieba/WordDictionary.java +++ b/src/main/java/com/huaban/analysis/jieba/WordDictionary.java @@ -9,12 +9,8 @@ import java.io.InputStreamReader; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Locale; -import java.util.Map; +import java.util.*; import java.util.Map.Entry; -import java.util.Set; public class WordDictionary { @@ -145,7 +141,7 @@ public void loadDict() { } - private String addWord(String word) { + public String addWord(String word) { if (null != word && !"".equals(word.trim())) { String key = word.trim().toLowerCase(Locale.getDefault()); _dict.fillSegment(key.toCharArray()); @@ -154,7 +150,27 @@ private String addWord(String word) { else return null; } - + + + public String addWord(String word, double freq) { + word = addWord(word); + freqs.put(word, freq); + return word; + } + + + public void addWords(List words) { + for(String word : words) + addWord(word); + } + + + public void addWords(Map words) { + for(Map.Entry entry : words.entrySet()) { + addWord(entry.getKey(), entry.getValue()); + } + } + public void loadUserDict(Path userDict) { loadUserDict(userDict, StandardCharsets.UTF_8); @@ -183,7 +199,7 @@ public void loadUserDict(Path userDict, Charset charset) { double freq = 3.0d; if (tokens.length == 2) freq = Double.valueOf(tokens[1]); - word = addWord(word); + word = addWord(word); freqs.put(word, Math.log(freq / total)); count++; } diff --git a/src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java b/src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java index 8a79eb5..5aa8b83 100644 --- a/src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java +++ b/src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java @@ -26,7 +26,7 @@ public class FinalSeg { private static Map start; private static Map> trans; private static Map prevStatus; - private static Double MIN_FLOAT = -3.14e100;; + private static Double MIN_FLOAT = -3.14e100; private FinalSeg() { diff --git a/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java b/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java index b31f98b..f7c9129 100644 --- a/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java +++ b/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java @@ -11,6 +11,7 @@ import junit.framework.TestCase; +import org.junit.Ignore; import org.junit.Test; import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; @@ -163,7 +164,6 @@ public void testBugSentence() { } } - @Test public void testSegmentSpeed() { long length = 0L; From 1d55902e2eb8f5b7831f1b51c19e95c0bb3a1f34 Mon Sep 17 00:00:00 2001 From: lifulong Date: Tue, 31 Dec 2019 16:19:29 +0800 Subject: [PATCH 2/2] unset unuseful update --- .../java/com/huaban/analysis/jieba/JiebaSegmenterTest.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java b/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java index f7c9129..4de6692 100644 --- a/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java +++ b/src/test/java/com/huaban/analysis/jieba/JiebaSegmenterTest.java @@ -11,7 +11,6 @@ import junit.framework.TestCase; -import org.junit.Ignore; import org.junit.Test; import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; @@ -179,8 +178,7 @@ public void testSegmentSpeed() { System.out.println(String.format(Locale.getDefault(), "time elapsed:%d, rate:%fkb/s, sentences:%.2f/s", elapsed, (length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f), wordCount * 1000.0f / (elapsed * 1.0))); } - - + @Test public void testLongTextSegmentSpeed() { long length = 0L;