diff --git a/src/refactor/base/AlphaBeta.java b/src/refactor/base/AlphaBeta.java new file mode 100644 index 0000000..06eb039 --- /dev/null +++ b/src/refactor/base/AlphaBeta.java @@ -0,0 +1,34 @@ +/** + * Created: May 10, 2013 2:18:41 PM + * Project: ThulacJava + * @author cxx + * @since JDK 1.6.0_13 + * filename: AlphaBeta.java + * description: + */ +package base; + + +//a structure for alphas and betas +public class AlphaBeta { + public int value; + public int nodeId; + public int labelId; + + public AlphaBeta() { + super(); + value = 0; + nodeId = -2; + labelId = 0; + } + + public AlphaBeta(int value, int nodeId, int labelId) { + super(); + this.value = value; + this.nodeId = nodeId; + this.labelId = labelId; + } + + + +} \ No newline at end of file diff --git a/src/refactor/base/Counter.java b/src/refactor/base/Counter.java new file mode 100644 index 0000000..45ea9c0 --- /dev/null +++ b/src/refactor/base/Counter.java @@ -0,0 +1,26 @@ +/** + * Created: May 9, 2013 2:32:10 PM + * Project: ThulacJava + * @author cxx + * @since JDK 1.6.0_13 + * filename: Counter.java + * description: + */ +package base; + +import java.util.HashMap; + +public class Counter extends HashMap { + /** + * + */ + private static final long serialVersionUID = 1L; + + public void update(KeyType key){ + Integer value = get(key); + if(value == null){ + value = 0; + } + put(key, value + 1); + } +} diff --git a/src/refactor/base/Dat.java b/src/refactor/base/Dat.java new file mode 100644 index 0000000..ee18e4b --- /dev/null +++ b/src/refactor/base/Dat.java @@ -0,0 +1,140 @@ +package base; + + +import java.io.*; +import java.util.Vector; + +public class Dat { + + public Vector dat; + public int datSize; + + public Dat(){ + dat = new Vector(); + datSize = 0; + } + public Dat(int datSize, Vector olddat){ + this.datSize = datSize; + dat= new Vector(); + for(int i = 0; i < datSize; i ++){ + dat.add(new Entry()); + dat.get(i).base = olddat.get(i).base; + dat.get(i).check = olddat.get(i).check; + } + } 
+ public Dat(String filename) throws IOException { + filename = Dat.class.getClassLoader().getResource(filename).getFile(); + File file = new File(filename); + datSize = (int)(file.length() / 8); + //System.out.println(datSize); + + FileInputStream in = new FileInputStream(file); + + byte[] tempbytes = new byte[8 * datSize]; + dat = new Vector(); + in.read(tempbytes); + for(int i = 0; i < datSize; i ++){ + Entry entry = new Entry(); + entry.base = bytesToInt(tempbytes, 8 * i); + + dat.add(entry); + dat.get(i).check = bytesToInt(tempbytes, 8 * i + 4); + } + + in.close(); + } + + public static int bytesToInt(byte[] bb, int index) { + return (int) (((((int)bb[index + 3] & 0xff) << 24) + | (((int)bb[index + 2] & 0xff) << 16) + | (((int)bb[index + 1] & 0xff) << 8) | (((int)bb[index + 0] & 0xff) << 0))); + } + + public static byte[] intToBytes(int n){ + byte[] b = new byte[4]; + for(int i = 0;i < 4;i++){ + b[i] = (byte)(n >> (8 * i)); + } + return b; + } + + public void save(String filename) throws IOException{ + FileOutputStream out = new FileOutputStream(filename); + for(Entry e : dat){ + out.write(intToBytes(e.base)); + } + out.flush(); + for(Entry e : dat){ + out.write(intToBytes(e.check)); + } + out.flush(); + out.close(); + } + + public boolean search(String sentence, Vector bs, Vector es){ + bs.clear(); + es.clear(); + boolean empty = true; + for(int offset = 0; offset < sentence.length(); offset ++){ + int preBase = 0; + int preInd = 0; + int ind = 0; + for(int i = offset; i < sentence.length(); i ++){ + ind = preBase + sentence.charAt(i); + if(ind < 0 || ind >= datSize || dat.get(ind).check != preInd)break; + preInd = ind; + preBase = dat.get(ind).base; + ind = preBase; + if(!(ind < 0 || ind >= datSize || dat.get(ind).check != preInd)){ + bs.add(offset); + es.add(i + 1); + if(empty){ + empty = false; + } + } + } + } + return !empty; + } + + public int match(String word){ + int ind = 0; + int base = 0; + for(int i = 0; i < word.length(); i ++){ + ind = 
dat.get(ind).base + word.charAt(i); + if((ind >= datSize) || (dat.get(ind).check != base)) return -1; + base = ind; + } + ind = dat.get(base).base; + if((ind < datSize) && (dat.get(ind).check == base)){ + return ind; + } + return -1; + } + + public void update(String word, int value){ + int base = match(word); + if(base >= 0){ + dat.get(base).base = value; + } + } + + public int getInfo(String prefix){ + int ind = 0; + int base = 0; + for(int i = 0; i < prefix.length(); i ++){ + ind = dat.get(ind).base + prefix.charAt(i); + if((ind >= datSize) || dat.get(ind).check != base) return i; + base = ind; + } + return -base; + } + + public int getDatSize(){ + return datSize; + } + + public Vector getDat(){ + return dat; + } +} diff --git a/src/refactor/base/DatMaker.java b/src/refactor/base/DatMaker.java new file mode 100644 index 0000000..ee93fad --- /dev/null +++ b/src/refactor/base/DatMaker.java @@ -0,0 +1,226 @@ +/** + * Created:May 5, 2013 3:45:34 PM + * Project:ThulacJava + * @author cxx + * @since JDK 1.6.0_13 + * filename:DatMaker.java + * description: + */ +package base; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.Collections; +import java.util.Comparator; +import java.util.Vector; + +public class DatMaker extends Dat { + + public final static Comparator compareWords = new Comparator() { + @Override + public int compare(KeyValue first, KeyValue second) { + // TODO Auto-generated method stub + String firstKey = first.key; + String secondKey = second.key; + + int minSize = (firstKey.length() < secondKey.length()) ? 
firstKey.length() : secondKey.length(); + for(int i = 0; i < minSize; i ++){ + if(firstKey.charAt(i) > secondKey.charAt(i)) return 1; + if(firstKey.charAt(i) < secondKey.charAt(i)) return -1; + } + if(firstKey.length() < secondKey.length()){ + return -1; + }else if(firstKey.length() > secondKey.length()){ + return 1; + }else{ + return 0; + } + } + }; + + private int head; + private int tail; + + public DatMaker() { + datSize = 1; + dat = new Vector(); + Entry entry = new Entry(); + entry.base = 1; + entry.check = -1; + dat.add(entry); + head = 0; + tail = 0; + } + + + /** + *

Title: use
 + * Description: use [ind] as an entry

+ * @param ind + */ + public void use(int ind){ + if(dat.get(ind).check >= 0) System.out.println("cell reused!!"); + if(dat.get(ind).base == 1){ + head = dat.get(ind).check; + }else{ + dat.get(-dat.get(ind).base).check = dat.get(ind).check; + } + if(dat.get(ind).check == -datSize){ + tail = dat.get(ind).base; + }else{ + dat.get(-dat.get(ind).check).base = dat.get(ind).base; + } + dat.get(ind).check = ind; + } + + public void extend(){ + int oldSize = datSize; + datSize *= 2; + for(int i = 0; i < oldSize; i ++){ + Entry entry = new Entry(); + entry.base = - (oldSize + i - 1); + entry.check = - (oldSize + i + 1); + dat.add(entry); + } + dat.get(oldSize).base = tail; + if(-tail > 0) dat.get(-tail).check = - oldSize; + tail = - (oldSize * 2 - 1); + } + + public void shrink(){ + int last = datSize - 1; + while(dat.get(last).check < 0){ + dat.remove(last); + last --; + } + datSize = last + 1; + } + + public int alloc(Vector offsets){ + int size = offsets.size(); + int base = - head; + while(true){ + if(base == datSize) extend(); + if(size != 0){ + while((base + offsets.get(size - 1)) >= datSize){ + extend(); + } + } + boolean flag = true; + if(dat.get(base).check >= 0){ + flag = false; + }else{ + for(int i = 0 ; i < size; i ++){ + if(dat.get(base + offsets.get(i)).check >= 0){// used + flag = false; + break; + } + } + } + if(flag){ + use(base); + for(int i = 0; i < size; i ++){ + use(base + offsets.get(i)); + } + return base;//got it and return it + } + if(dat.get(base).check == -datSize){ + extend(); + } + base = -dat.get(base).check; + } + } + + public void genChildren(Vector lexicon, int start, String prefix, Vector children){ + children.clear(); + int l = prefix.length(); + for(int ind = start; ind < lexicon.size(); ind ++){ + String word = lexicon.get(ind).key; + if(word.length() < l){ + return; + } + for(int i = 0; i < l; i ++){ + if(word.charAt(i) != prefix.charAt(i)){ + return; + } + } + if(word.length() > l){ + if(children.isEmpty() || (((int)word.charAt(l)) != 
children.lastElement())){ + children.add((int)word.charAt(l)); + } + } + } + } + + public int assign(int check, Vector offsets, boolean isWord){ + int base = alloc(offsets); + dat.get(base).base = 0; + if(isWord){ + dat.get(base).check = check; + }else{ + dat.get(base).check = base; + } + + for(int i = 0; i < offsets.size(); i ++){ + dat.get(base + offsets.get(i)).base = 0; + dat.get(base + offsets.get(i)).check = check; + } + dat.get(check).base = base; + + return base; + } + + public void makeDat(Vector lexicon){ + Collections.sort(lexicon, compareWords); + int size = lexicon.size(); + String prefix = ""; + Vector children = new Vector(); + genChildren(lexicon, 0, prefix, children); + int base = assign(0, children, true); + dat.get(0).base = base; + for(int i = 0; i < size; i ++){ + String word = lexicon.get(i).key; + int off = getInfo(word); + if(off <= 0){ + off = word.length(); + } + for(int offset = off; offset <= word.length(); offset ++){ + prefix = word.substring(0, offset); + int pBase = - getInfo(prefix); + genChildren(lexicon, i, prefix, children); + base = assign(pBase, children, (offset == word.length())); + } + off = -getInfo(word); + dat.get(dat.get(off).base).base = lexicon.get(i).value; + if((i != 0) && (i % 100000 == 0)){ + System.out.println(((double)i/(double)size)); + } + } + } + + public static void main(String[] args) throws IOException{ + DatMaker dm = new DatMaker(); + Vector lexicon = new Vector(); + + String filename = "res/pun.txt"; + BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(filename),"UTF8")); + String line = ""; + int id = 0; + while((line = in.readLine()) != null){ + if(line.equals("")){ + continue; + } + + lexicon.add(new KeyValue(line.trim(), id)); + id ++; + } + in.close(); + System.out.println(lexicon.size()+" words are loaded."); + dm.makeDat(lexicon); + dm.shrink(); + System.out.println("size of DAT "+dm.getDatSize()); + dm.save("res/javaPun.dat"); + } +} diff --git 
a/src/refactor/base/Entry.java b/src/refactor/base/Entry.java new file mode 100644 index 0000000..786e2c0 --- /dev/null +++ b/src/refactor/base/Entry.java @@ -0,0 +1,25 @@ +/** + * Created:May 9, 2013 1:30:33 PM + * Project:ThulacJava + * @author cxx + * @since JDK 1.6.0_13 + * filename:Entry.java + * description: + */ +package base; + +public class Entry { + public int base; + public int check; + + public Entry() { + base = 0; + check = 0; + } + + public Entry(int base, int check) { + super(); + this.base = base; + this.check = check; + } +} diff --git a/src/refactor/base/Indexer.java b/src/refactor/base/Indexer.java new file mode 100644 index 0000000..487580c --- /dev/null +++ b/src/refactor/base/Indexer.java @@ -0,0 +1,43 @@ +/** + * Created:May 9, 2013 2:08:05 PM + * Project:ThulacJava + * @author cxx + * @since JDK 1.6.0_13 + * filename:Indexer.java + * description: + */ +package base; + +import java.util.HashMap; +import java.util.Vector; + +public class Indexer { + private HashMap dict; + private Vector list; + + public Indexer() { + super(); + dict.clear(); + } + + public int getIndex(E key){ + Integer value = dict.get(key); + if(value == null){ + int id = dict.size(); + dict.put(key, value); + list.add(key); + return id; + }else{ + return value; + } + } + + public E getObject(int ind){ + if(ind < 0 || ind >= dict.size()) return null; + return list.get(ind); + } + + public void setObject(int ind, E key){ + list.set(ind, key); + } +} diff --git a/src/refactor/base/KeyValue.java b/src/refactor/base/KeyValue.java new file mode 100644 index 0000000..2795100 --- /dev/null +++ b/src/refactor/base/KeyValue.java @@ -0,0 +1,26 @@ +/** + * Created:May 8, 2013 4:43:30 PM + * Project:ThulacJava + * @author cxx + * @since JDK 1.6.0_13 + * filename:KeyValue.java + * description: + */ +package base; + +public class KeyValue { + public String key; + public int value; + + public KeyValue(){ + key = ""; + value = 0; + } + + public KeyValue(String key, int value) { + super(); 
+ this.key = key; + this.value = value; + } + +} diff --git a/src/refactor/base/Node.java b/src/refactor/base/Node.java new file mode 100644 index 0000000..8d927be --- /dev/null +++ b/src/refactor/base/Node.java @@ -0,0 +1,19 @@ +/** + * Created:May 10, 2013 2:16:51 PM + * Project:ThulacJava + * @author cxx + * @since JDK 1.6.0_13 + * filename:Node.java + * description: + */ +package base; + +/** + * topological information about a node + * type的定义: 默认0,如果是开始节点+1,如果是结尾节点+2 + */ +public class Node { + public int type; + public int[] predecessors;//ends with a -1 + public int[] successors;//ends with a -1 +} diff --git a/src/refactor/base/WordWithTag.java b/src/refactor/base/WordWithTag.java new file mode 100644 index 0000000..bff3eb5 --- /dev/null +++ b/src/refactor/base/WordWithTag.java @@ -0,0 +1,48 @@ +package base; + +import java.io.FileOutputStream; +import java.io.IOException; + +public class WordWithTag { + public String word = ""; + public String tag; + public char separator = '_'; + + public WordWithTag(){ + } + + public String getTag() { + return tag; + } + + public void setTag(String tag) { + this.tag = tag; + } + + public String getWord() { + return word; + } + + public void setWord(String word) { + this.word = word; + } + + public void print(FileOutputStream out) throws IOException + { + byte[] buff=new byte[]{}; + buff=word.getBytes("utf-8"); + try { + out.write(buff,0,buff.length); + out.write(separator); + buff=tag.getBytes("utf-8"); + out.write(buff,0,buff.length); + out.write(' '); + } catch (IOException e) { + e.printStackTrace(); + } + + } + public String toString() { + return "WordWithTag{" + "word=" + word + ", tag=" + tag + '}'; + } +} diff --git a/src/refactor/manage/Adjust.java b/src/refactor/manage/Adjust.java new file mode 100644 index 0000000..82f0503 --- /dev/null +++ b/src/refactor/manage/Adjust.java @@ -0,0 +1,135 @@ +package manage; + +import base.Dat; +import base.WordWithTag; + +import java.util.ArrayList; +import java.util.List; + 
+/** + * Created by amber on 16/11/25. + */ +public class Adjust { + static TimeWord timeWord = new TimeWord(); + + public List ajust(List taggedSentence) { + taggedSentence = nsIdiomAdjust(taggedSentence, "ns", Constants.MODEL_DAT_NS); + taggedSentence = nsIdiomAdjust(taggedSentence, "i", Constants.MODEL_DAT_IDIOM); + taggedSentence = puncAdjust(taggedSentence); + taggedSentence = timeWord.adjustDouble(taggedSentence); + taggedSentence = negAdjust(taggedSentence); + taggedSentence = verbAdjust(taggedSentence); + return taggedSentence; + } + + public static List verbAdjust(List taggedSentence) { + Dat vd = (Dat) WordDictionary.getInstance().getFile(Constants.MODEL_DAT_VD); + Dat vm = (Dat) WordDictionary.getInstance().getFile(Constants.MODEL_DAT_VM); + if((vm == null)||(vd == null))return taggedSentence; + for(int i = 0;i < taggedSentence.size()-1;i++){ + if((taggedSentence.get(i).tag == "v")&&(taggedSentence.get(i+1).tag == "v")){ + if(vm.match(taggedSentence.get(i).word)!=-1){ + taggedSentence.get(i).tag="vm"; + }else if(vd.match(taggedSentence.get(i+1).word)!=-1){ + taggedSentence.get(i+1).tag="vd"; + } + } + } + return taggedSentence; + } + + public static List negAdjust(List taggedSentence) { + Dat neg = (Dat) WordDictionary.getInstance().getFile(Constants.MODEL_DAT_NEG); + if((neg == null))return taggedSentence; + for(int i = taggedSentence.size()-1;i >= 0;i --){ + if(neg.match(taggedSentence.get(i).word) != -1){ + WordWithTag tmpWord = new WordWithTag(); + tmpWord.word = ""; + tmpWord.word += (taggedSentence.get(i).word.charAt(1)); + tmpWord.tag = "v"; + taggedSentence.add(i + 1, tmpWord); + int tmpInt = taggedSentence.get(i).word.charAt(0); + taggedSentence.get(i).word = ""; + taggedSentence.get(i).word += (char)tmpInt; + taggedSentence.get(i).tag = "d"; + } + } + return taggedSentence; + } + + public static List puncAdjust(List taggedSentence) { + Dat puncDic = (Dat) WordDictionary.getInstance().getFile(Constants.MODEL_DAT_SINGLEPUN); + 
if(puncDic==null)return taggedSentence; + List tmpVec= new ArrayList<>(); + boolean findMulti = false; + for(int i = 0 ; i < taggedSentence.size(); i ++){ + String tmp = taggedSentence.get(i).word; + if(puncDic.getInfo(tmp) >= 0) continue; + tmpVec.clear(); + int j; + for(j = i + 1; j < taggedSentence.size(); j ++){ + tmp += taggedSentence.get(j).word; + if(puncDic.getInfo(tmp) >= 0){ + break; + } + tmpVec.add(tmp); + } + int vecSize = tmpVec.size(); + findMulti = false; + for(int k = vecSize - 1; k >= 0; k--){ + tmp = tmpVec.get(k); + if(puncDic.match(tmp) != -1){ + for(j = i + 1; j < i + k + 2; j ++){ + taggedSentence.get(i).word += taggedSentence.get(j).word; + } + for(j = i + k + 1; j > i; j--){ + taggedSentence.remove(j); + } + taggedSentence.get(i).tag = "w"; + findMulti = true; + break; + } + } + if(!findMulti){ + if(puncDic.match(taggedSentence.get(i).word) != -1){ + taggedSentence.get(i).tag = "w"; + } + } + } + return taggedSentence; + } + + public static List nsIdiomAdjust(List taggedSentence, String tag, String fileName) { + Dat nsDic = (Dat) WordDictionary.getInstance().getFile(fileName); + if(nsDic == null)return taggedSentence; + List tmpVec =new ArrayList<>(); + for(int i = 0 ; i < taggedSentence.size(); i ++){ + String tmp = taggedSentence.get(i).word; + if(nsDic.getInfo(tmp) >= 0) continue; + int j; + for(j = i + 1; j < taggedSentence.size(); j ++){ + tmp += taggedSentence.get(j).word; + if(nsDic.getInfo(tmp) >= 0){ + break; + } + tmpVec.add(tmp); + } + int vecSize = tmpVec.size(); + + for(int k = vecSize - 1; k >= 0; k--){ + tmp = tmpVec.get(k); + if(nsDic.match(tmp) != -1){ + for(j = i + 1; j < i + k + 2; j ++){ + taggedSentence.get(i).word += taggedSentence.get(j).word; + } + for(j = i + k + 1; j > i; j--){ + taggedSentence.remove(j); + } + taggedSentence.get(i).tag = tag; + break; + } + } + } + return taggedSentence; + } +} diff --git a/src/refactor/manage/CBModel.java b/src/refactor/manage/CBModel.java new file mode 100644 index 
0000000..0cad564 --- /dev/null +++ b/src/refactor/manage/CBModel.java @@ -0,0 +1,28 @@ +/** + * Created: May 9, 2013 12:22:21 PM + * Project: ThulacJava + * @author cxx + * @since JDK 1.6.0_13 + * filename: CBModel.java + * description: + */ +package manage; + +public class CBModel { + + private static int DEC = 1000; + + public int l_size; //size of the labels + public int f_size; //size of the features + + public int[] ll_weights; // weights of (label, label) + public int[] fl_weights; // weights of (feature, label) + + public double[] ave_ll_weights; + public double[] ave_fl_weights; + + public CBModel(int l, int f){ + l_size = l; + f_size = f; + } +} diff --git a/src/refactor/manage/Calculation.java b/src/refactor/manage/Calculation.java new file mode 100644 index 0000000..ed2e6d7 --- /dev/null +++ b/src/refactor/manage/Calculation.java @@ -0,0 +1,434 @@ +package manage; + +import base.*; + +import java.util.ArrayList; +import java.util.List; +import java.util.Vector; + +/** + * Created by amber on 16/11/22. 
+ */ +public class Calculation { + private static WordDictionary wordDict = WordDictionary.getInstance(); + static int lsize = wordDict.cbModellSize; + static int[] flWeights = wordDict.flWeights; + static int[] llWeights = wordDict.llWeights; + private static int MAX_LENGTH = 10000; + private static int SENTENCE_BOUNDARY='#'; + private static int SEPERATOR = ' '; + + public static List getTagList(String text, List graph, List taggedList) { + String labelInfo[] = wordDict.labelInfo; + int length = text.length(); + int offset = 0; + int[] result = getResult(text, graph); + + for (int i = 0;i < text.length(); i ++) { + if ((i == length - 1) || (labelInfo[result[i]].charAt(0) == '2') || (labelInfo[result[i]].charAt(0) == '3')) { + taggedList.add(new WordWithTag()); + for(int j = offset;j < i + 1; j ++) { + taggedList.get(taggedList.size() - 1).word += (text.charAt(j)); + } + offset = i + 1; + if((labelInfo[result[i]]+1) != null){//输出标签(如果有的话) + taggedList.get(taggedList.size() - 1).tag = labelInfo[result[i]].substring(1); + } + } + } + Adjust ajust = new Adjust(); + taggedList = ajust.ajust(taggedList); + return taggedList; + } + + public static int[] getResult(String text, List graph) { + int length = text.length(); + int[][] allowedLabelLists = allowedLabelLists(length, graph); + int[] values = getValues(text); + int[] result = new int[length * lsize]; + Node[] nodes = getNodes(length); + AlphaBeta[] alphas = new AlphaBeta[length * lsize]; + int[][] preLabel = getLabelPre(); + result = dbDecode(lsize, llWeights, length, nodes, values, alphas, result, preLabel, allowedLabelLists); + return result; + } + + public static int[] getValues(String text) { + int length = text.length(); + if (length > MAX_LENGTH) { + return new int[0]; + } + int size = lsize * length; + int values[] = new int[2*size]; + values = putValues(values, text, lsize); + return values; + } + + public static int[][] getLabelPre() { + String labelInfo[] = wordDict.labelInfo; + List> preLabels = new 
ArrayList>(); + List> postLabels = new ArrayList<>(); + for(int i = 0; i < lsize; i ++){ + preLabels.add(new Vector()); + postLabels.add(new Vector()); + } + for(int i = 0; i < lsize; i ++){ + for(int j = 0; j < lsize; j ++){ + int ni = labelInfo[i].charAt(0) - '0'; + int nj = labelInfo[j].charAt(0) - '0'; + boolean iIsEnd = ((ni == 2) || (ni == 3)); + boolean jIsBegin = ((nj == 0) || (nj == 3)); + boolean sameTag = labelInfo[i].substring(1).equals(labelInfo[j].substring(1)); + if(sameTag){ + if((ni == 0 && nj == 1) || + (ni == 0 && nj == 2) || + (ni == 1 && nj == 2) || + (ni == 1 && nj == 1) || + (ni == 2 && nj == 0) || + (ni == 2 && nj == 3) || + (ni == 3 && nj == 3) || + (ni == 3 && nj == 0)){ + preLabels.get(j).add(i); + postLabels.get(i).add(j); + } + }else{ + if(iIsEnd && jIsBegin){ + preLabels.get(j).add(i); + postLabels.get(i).add(j); + } + } + } + } + int[][] labelTransPre = new int[lsize][]; + for(int i = 0 ; i < lsize; i ++){ + labelTransPre[i] = new int[preLabels.get(i).size() + 1]; + for(int j = 0; j < preLabels.get(i).size(); j++){ + labelTransPre[i][j] = preLabels.get(i).get(j); + } + labelTransPre[i][preLabels.get(i).size()] = -1; + } + return labelTransPre; + } + + public static int[] putValues(int[] values, String text, int size) { + Dat dat = (Dat) wordDict.getFile(Constants.MODEL_BIN_DAT); + int length = text.length(); + if(text.length() >= MAX_LENGTH){ + System.err.println("The text is too long..."); + return new int[0]; + } + List result = findBases(dat, SENTENCE_BOUNDARY, SENTENCE_BOUNDARY); + int[] uniBases = new int[length + 2]; + int[] biBases = new int[length + 4]; + uniBases[0] = result.get(0); + biBases[0] = result.get(1); + result = findBases(dat, SENTENCE_BOUNDARY,text.charAt(0)); + uniBases[0] = result.get(0); + biBases[1] = result.get(1); + + for(int i = 0 ; i + 1 < text.length(); i ++){ + result = findBases(dat, text.charAt(i), text.charAt(i+1)); + uniBases[i + 1] = result.get(0); + biBases[i + 2] = result.get(1); + } + + result = 
findBases(dat, (int)text.charAt(length - 1), SENTENCE_BOUNDARY); + uniBases[length] = result.get(0); + biBases[length + 1] = result.get(1); + + result = findBases(dat, SENTENCE_BOUNDARY, SENTENCE_BOUNDARY); + uniBases[length + 1] = result.get(0); + biBases[length + 2] = result.get(1); + + int base = 0; + for(int i = 0; i < length; i ++){ + int valueOffset = i * size; + if((base = uniBases[i + 1]) != -1){ + values = addValues(valueOffset, base, 49, null, values); + } + if((base = uniBases[i]) != -1){ + values = addValues(valueOffset, base, 50, null, values); + } + if((base = uniBases[i + 2]) != -1){ + values = addValues(valueOffset, base, 51, null, values); + } + if((base = biBases[i + 1]) != -1){ + values = addValues(valueOffset, base, 49, null, values); + } + if((base = biBases[i + 2]) != -1){ + values = addValues(valueOffset, base, 50, null, values); + } + if((base = biBases[i]) != -1){ + values = addValues(valueOffset, base, 51, null, values); + } + if((base = biBases[i + 3]) != -1){ + values = addValues(valueOffset, base, 52, null, values); + } + } + return values; + } + + private static List findBases(Dat dat, int ch1, int ch2){ + List result = new ArrayList<>(); + int datSize = dat.getDatSize(); + List vdat = dat.getDat(); + int uniBase; + int biBase; + if(ch1 > 32 && ch1 < 128) ch1+=65248; + if(ch2 > 32 && ch2 < 128) ch2+=65248; + if(ch1 >= datSize || vdat.get(ch1).check != 0){ + uniBase = -1; + biBase = -1; + result.clear(); + result.add(uniBase); + result.add(biBase); + return result; + } + uniBase = vdat.get(ch1).base + SEPERATOR; + int ind = vdat.get(ch1).base + ch2; + if(ind >= datSize || vdat.get(ind).check != ch1){ + biBase = -1; + result.clear(); + result.add(uniBase); + result.add(biBase); + return result; + } + biBase = vdat.get(ind).base + SEPERATOR; + result.clear(); + result.add(uniBase); + result.add(biBase); + return result; + } + + private static int[] addValues(int valueOffset, int base, int del, int[] pAllowedLable, int[] values){ + Dat dat 
= (Dat) wordDict.getFile(Constants.MODEL_BIN_DAT); + List vdat = dat.getDat(); + int ind = vdat.get(base).base + del; + int datSize = dat.getDatSize(); + if(ind >= datSize || vdat.get(ind).check != base){ + return values; + } + int offset = vdat.get(ind).base; + int weightOffset = offset * lsize; + int allowedLabel; + if(lsize == 4){ + values[valueOffset] += flWeights[weightOffset]; + values[valueOffset + 1] += flWeights[weightOffset + 1]; + values[valueOffset + 2] += flWeights[weightOffset + 2]; + values[valueOffset + 3] += flWeights[weightOffset + 3]; + }else{ + if(pAllowedLable != null){ + for(int i = 0; i < pAllowedLable.length; i ++){ + allowedLabel = pAllowedLable[i]; + values[valueOffset + allowedLabel] += flWeights[weightOffset + allowedLabel]; + } + }else{ + for(int i = 0; i < lsize; i ++){ + values[valueOffset + i] += flWeights[weightOffset + i]; + } + } + } + return values; + } + + public static int[][] allowedLabelLists(int length, List graph) { + int [][] allowedLabelLists = new int[length][]; + int[][] pocsToTags = wordDict.pocsToTags; + for (int i = 0; i < length; i ++) { + allowedLabelLists[i] = null; + } + for(int i = 0; i < length;i++){ + int pocs = graph.get(i); + if(pocs != 0){ + allowedLabelLists[i] = pocsToTags[pocs]; + }else{ + allowedLabelLists[i] = pocsToTags[15]; + } + } + return allowedLabelLists; + } + + public static Node[] getNodes(int length) { + if(length == 0) return new Node[0]; + Node[] nodes = new Node[length]; + for(int i = 0; i < length; i ++){ + nodes[i] = new Node(); + nodes[i].type = 0; + int[] pre = new int[2]; + pre[0] = i - 1; + pre[1] = -1; + nodes[i].predecessors = pre; + + pre = new int[2]; + pre[0] = i + 1; + pre[1] = -1; + nodes[i].successors = pre; + } + nodes[0].type += 1; + nodes[length-1].type += 2; + return nodes; + } + + public static int[] dbDecode(int l_size, int[] llWeights, int nodeCount, Node[] nodes, int[] values, AlphaBeta[] alphas, + int[] result, int[][] preLabels, int[][] allowedLabelLists){ + int 
nodeId; + int[] pNodeId; + int[] pPreLabel; + int[] pAllowedLabel; + int k; + int j; + AlphaBeta tmp; + AlphaBeta best = new AlphaBeta(); + best.nodeId = -1; + AlphaBeta preAlpha; + + int score; + int index = 0; + int index2 = 0; + int index3 = 0; + + for(int i = 0; i < nodeCount * l_size; i ++) + { + alphas[i]=new AlphaBeta(); + alphas[i].nodeId = -2; + } + for(int i = 0; i < nodeCount; i ++){ + pAllowedLabel = allowedLabelLists != null ? allowedLabelLists[i] : null; + j = -1; + int maxValue = 0; + boolean hasMaxValue = false; + if(pAllowedLabel != null){ + index = 0; + while((j = pAllowedLabel[index]) != -1){ + index ++; + if(!hasMaxValue || (maxValue < values[i*l_size +j])){ + hasMaxValue = true; + maxValue = values[i*l_size + j]; + } + } + index = 0; + j = -1; + while((j = pAllowedLabel[index]) != -1){ + index ++; + tmp = alphas[i*l_size + j]; + tmp.value = 0; + pNodeId = nodes[i].predecessors; + pPreLabel = preLabels != null ? preLabels[j] : null; + index2 = 0; + while((nodeId = pNodeId[index2]) >= 0){ + index2 ++; + k = -1; + if(pPreLabel != null){ + index3 = 0; + while((k = pPreLabel[index3]) != -1){ + index3 ++; + preAlpha = alphas[nodeId * l_size + k]; + if(preAlpha.nodeId == -2) continue; + score = preAlpha.value + llWeights[k*l_size + j]; + if((tmp.nodeId<0) || (score > tmp.value)){ + tmp.value = score; + tmp.nodeId = nodeId; + tmp.labelId = k; + } + } + }else{ + k ++; + while(k != l_size){ + preAlpha = alphas[nodeId * l_size + k]; + if(preAlpha.nodeId == -2) continue; + score = preAlpha.value + llWeights[k*l_size + j]; + if((tmp.nodeId<0) || (score > tmp.value)){ + tmp.value = score; + tmp.nodeId = nodeId; + tmp.labelId = k; + } + k ++; + } + } + } + tmp.value += values[i*l_size + j]; + if((nodes[i].type == 1) || (nodes[i].type == 3)){ + tmp.nodeId = -1; + } + if(nodes[i].type >= 2){ + if((best.nodeId == -1) || best.value < tmp.value){ + best.value = tmp.value; + best.nodeId = i; + best.labelId = j; + } + } + } + + }else{ + j ++; + while(j != l_size){ + 
if(!hasMaxValue || (maxValue < values[i*l_size +j])){ + hasMaxValue = true; + maxValue = values[i*l_size + j]; + } + j ++; + } + j = 0; + while(j != l_size){ + tmp = alphas[i*l_size + j]; + tmp.value = 0; + pNodeId = nodes[i].predecessors; + pPreLabel = preLabels != null ? preLabels[j] : null; + index2 = 0; + while((nodeId = pNodeId[index2]) >= 0){ + index2 ++; + k = -1; + if(pPreLabel != null){ + index3 = 0; + while((k = pPreLabel[index3]) != -1){ + index3 ++; + preAlpha = alphas[nodeId * l_size + k]; + if(preAlpha.nodeId == -2) continue; + score = preAlpha.value + llWeights[k*l_size + j]; + if((tmp.nodeId<0) || (score > tmp.value)){ + tmp.value = score; + tmp.nodeId = nodeId; + tmp.labelId = k; + } + + } + }else{ + k ++; + while(k != l_size){ + preAlpha = alphas[nodeId * l_size + k]; + if(preAlpha.nodeId == -2) continue; + score = preAlpha.value + llWeights[k*l_size + j]; + if((tmp.nodeId<0) || (score > tmp.value)){ + tmp.value = score; + tmp.nodeId = nodeId; + tmp.labelId = k; + } + k ++; + } + } + } + tmp.value += values[i*l_size + j]; + if((nodes[i].type == 1) || (nodes[i].type == 3)){ + tmp.nodeId = -1; + } + if(nodes[i].type >= 2){ + if((best.nodeId == -1) || best.value < tmp.value){ + best.value = tmp.value; + best.nodeId = i; + best.labelId = j; + } + } +// System.out.println(""+tmp.value+" "+tmp.nodeId+" "+tmp.labelId); + j ++; + } + + } + } + tmp = best; + while(tmp.nodeId >= 0){ + result[tmp.nodeId] = tmp.labelId; + tmp = alphas[tmp.nodeId * l_size + tmp.labelId]; + } + return result; + } +} diff --git a/src/refactor/manage/Constants.java b/src/refactor/manage/Constants.java new file mode 100644 index 0000000..bc91d6e --- /dev/null +++ b/src/refactor/manage/Constants.java @@ -0,0 +1,18 @@ +package manage; + +/** + * Created by amber on 16/11/22. 
+ */ +public class Constants { + /** 加载词性标注所需模型文件 **/ + public static final String MODEL_BIN_MODEL = "model_c_model.bin"; + public static final String MODEL_BIN_DAT = "model_c_dat.bin"; + public static final String MODEL_TXT_LABEL = "model_c_label.txt"; + public static final String MODEL_DAT_T2S = "t2s.dat"; + public static final String MODEL_DAT_NS = "ns.dat"; + public static final String MODEL_DAT_IDIOM = "idiom.dat"; + public static final String MODEL_DAT_SINGLEPUN = "singlepun.dat"; + public static final String MODEL_DAT_NEG = "neg.dat"; + public static final String MODEL_DAT_VM = "vM.dat"; + public static final String MODEL_DAT_VD = "vD.dat"; +} diff --git a/src/refactor/manage/Preprocesser.java b/src/refactor/manage/Preprocesser.java new file mode 100644 index 0000000..ee6e104 --- /dev/null +++ b/src/refactor/manage/Preprocesser.java @@ -0,0 +1,318 @@ +package manage; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + +public class Preprocesser { + static HashSet otherSet = new HashSet(); // store all the number, digit and punctuation + static HashSet singlePunSet = new HashSet(); // store all the punctuation that need to split + static HashSet httpSet = new HashSet(); // store all the number, digit and punctuation that may be a charactor in a url + + static { + init(); + } + + public Preprocesser(){} + + private static void init() { + for(int i = 65; i < 91; i ++){ + otherSet.add(i); + httpSet.add(i); + } + for(int i = 97; i < 123; i ++){ + otherSet.add(i); + httpSet.add(i); + } + for(int i = 48; i < 58; i ++){ + otherSet.add(i); + httpSet.add(i); + } + int other[] = {65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217, 8220, 8221, 12304, 12305, + 12289, 12298, 12299, 126, 183, 64, 124, 35, 65509, 37, 8230, 38, 42, 65288, + 65289, 8212, 45, 43, 61, 44, 46, 60, 62, 63, 47, 33, 59, 58, 39, 34, 123, 125, + 91, 93, 92, 124, 35, 36, 37, 94, 38, 42, 40, 41, 95, 45, 43, 61, 9700, 9734, 9733}; + int len = 63; + for(int i = 0; i < 
len; i ++){
        otherSet.add(other[i]);
    }

    // Punctuation that should always be split off as its own token.
    // NOTE(review): the value 1230 below looks like a truncated code point
    // (otherSet contains no 1230; 12300 '「' would fit the bracket group) —
    // confirm against the upstream C++ source.
    // singlePun correspond to (see otherSet)
    int singlePun[] = {65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217, 8220, 8221, 1230, 12304,
            12305, 12289, 12298, 12299, 64, 35, 65288, 65289, 34, 91, 93, 126, 47, 44, 58,
            63, 9700, 9734, 9733, 8230, 39, 33, 42, 43, 62, 40, 41};
    len = 39;
    for(int i = 0 ; i < len; i ++){
        singlePunSet.add(singlePun[i]);
    }

    // Characters (besides letters/digits, added above) that may occur inside a URL.
    char httpChar[] = {'/', '.', ':', '#', '"', '_', '-', '=', '+', '&', '$', ';'};
    len = 12;
    for(int i = 0; i < len; i ++){
        httpSet.add((int)httpChar[i]);
    }
}

/** Returns true iff {@code c} is a letter, digit or punctuation code point (otherSet). */
public boolean isOther(int c){
    if(otherSet.contains(c)){
        return true;
    }else{
        return false;
    }
}

/** Returns true iff {@code c} is punctuation that must become its own token. */
public boolean isSinglePun(int c){
    if(singlePunSet.contains(c)){
        return true;
    }else{
        return false;
    }
}

/** Returns true iff {@code c} may be a character of a URL. */
public boolean isHttp(int c){
    if(httpSet.contains(c)){
        return true;
    }else{
        return false;
    }
}

/**
 * Normalizes {@code sentence} (collapsing runs of spaces, code points 32 and
 * 12288) and fills {@code graph} with one integer per kept character.
 *
 * <p>The emitted values (1, 2, 4, 7, 8, 9, 12, 15, and the {@code &12} masking)
 * appear to be 4-bit masks of candidate word-position tags
 * (begin/middle/end/single) consumed downstream as "pocCands" —
 * TODO(review): confirm against the tagger that reads them.
 *
 * <p>Detected spans — URLs starting with "http", "@name" mentions, and
 * 《titles》 — are re-marked after the scan as begin(1)/middle(2)/end(4).
 *
 * <p>NOTE(review): the diff this file came from stripped generic type
 * parameters ({@code List>} is not valid Java); the element types are clearly
 * Integer (e.g. {@code List<List<Integer>>} for httpVec) — restore when
 * rebuilding.
 *
 * <p>NOTE(review): {@code tmpRaw} and {@code npRaw} are added to
 * {@code httpVec}/{@code npVec} and later {@code clear()}ed and reused, so the
 * stored entries alias one mutable list — probably should add copies; confirm.
 *
 * @param sentence raw input text
 * @param graph    output: per-character tag-candidate values (cleared first)
 * @return the cleaned sentence, character-aligned with {@code graph}
 */
public String clean(String sentence, List graph){
    String senClean= new String();
    graph.clear();
    boolean hasSpace = false; //use to check whether the char is a space
    boolean hasOther = false; //use to check whether isOther(char);
    boolean hasSinglePun = false; //use to check whether isSinglePun(char);
    boolean hasHttp = false; //use to check whether isHttp(char);
    boolean hasAt = false; //use to check whether the char is @
    boolean hasTitle = false; //use to check whether the sentence has 《》
    List httpStartVec =new ArrayList<>();
    int httpStart = -1;
    List> httpVec =new ArrayList<>();
    int c = -1;
    List tmpRaw =new ArrayList();
    List npRaw =new ArrayList();
    int npStart = -1;
    List npStartVec = new ArrayList();
    List> npVec =new ArrayList>();
    List titleRaw =new ArrayList();
    int titleStart = -1;
    List titleStartVec =new ArrayList();
    List> titleVec =new ArrayList>();
    for(int i = 0; i < sentence.length(); i++){
        c = sentence.charAt(i);
        if(c == 32 || c == 12288){
            // space (ASCII or ideographic): dropped from output; mark the
            // previous kept char as a possible word end (mask &12)
            hasOther = false;
            if(hasSpace){
                continue; // collapse runs of spaces
            }else{
                if(graph.size()>0){
                    int o=graph.get(graph.size() - 1)&12;
                    graph.set(graph.size()-1, o);
                }
                hasSpace=true;
            }

            // a space terminates a pending @mention
            if(hasAt){
                npVec.add(npRaw);
                npStartVec.add(npStart);
                hasAt = false;
            }
        }else if(isOther(c)){
            // letter / digit / punctuation
            if(hasSpace){
                senClean+=sentence.charAt(i);
                if(isSinglePun(c)){
                    graph.add(8);
                    hasSinglePun = true;
                }else{
                    graph.add(9);
                    hasSinglePun = false;
                }
                hasSpace = false;
            }else if(hasOther){
                if(isSinglePun(c)){
                    // split-off punctuation: close the previous char (&12)
                    if(graph.size()>0){
                        int o=graph.get(graph.size() - 1)&12;
                        graph.set(graph.size()-1, o);
                    }
                    senClean+=sentence.charAt(i);
                    graph.add(8);
                    hasSinglePun = true;

                }else{
                    if(hasSinglePun){
                        senClean+=sentence.charAt(i);
                        graph.add(9);
                    }else{
                        // run of "other" chars: previous becomes non-end(7),
                        // current continues the token(2)
                        if (graph.get(graph.size() - 1) == 0) {
                            graph.set(graph.size()-1, 7);
                        }
                        senClean+=sentence.charAt(i);
                        graph.add(2);
                    }
                    hasSinglePun = false;
                }
            }else{
                senClean+=sentence.charAt(i);
                graph.add(9);
                if(isSinglePun(c)){
                    hasSinglePun = true;
                }else{
                    hasSinglePun = false;
                }
            }
            // closing parenthesis ends a pending @mention
            if(c == 41 || c == 65289){
                if(hasAt){
                    npVec.add(npRaw);
                    npStartVec.add(npStart);
                    hasAt = false;
                }
            }
            // 》 closes a pending title span
            if(c == 12299){
                if(hasTitle){
                    titleVec.add(titleRaw);
                    titleStartVec.add(titleStart);
                    hasTitle = false;
                }
            }
            hasOther = true;
        }else{
            // ordinary (CJK) character
            if(hasSpace){
                senClean+=sentence.charAt(i);
                graph.add(9);
            }else if(hasOther){
                int o=graph.get(graph.size() - 1)&12;
                graph.set(graph.size()-1, o);
                if(hasSinglePun){
                    senClean+=sentence.charAt(i);
                    graph.add(9);
                    hasSinglePun = false;
                }else{
                    senClean+=sentence.charAt(i);
                    graph.add(15);
                }
            }else{
                senClean+=sentence.charAt(i);
                graph.add(15); // all four positions possible
            }
            hasSpace = false;
            hasOther = false;
        }

        // accumulate a possible URL: only starts at 'h'
        if(isHttp(c)){
            if(!hasHttp){
                if(c == 'h'){
                    httpStart = graph.size() - 1;
                    tmpRaw.clear();
                    tmpRaw.add(c);
                    hasHttp = true;
                }
            }else{
                tmpRaw.add(c);
            }
        }else{
            if(hasHttp){
                httpVec.add(tmpRaw); // NOTE(review): aliases tmpRaw (see javadoc)
                httpStartVec.add(httpStart);
                hasHttp = false;
            }
        }

        // '@' starts (or restarts) a mention span
        if(c == 64){
            if(hasAt){
                npVec.add(npRaw); // NOTE(review): aliases npRaw, cleared below
                npStartVec.add(npStart);
                npRaw.clear();
            }
            hasAt = true;
            npStart = graph.size() - 1;
            npRaw.clear();
        }else if(hasAt){
            npRaw.add(c);
        }

        // 《 starts a title span
        if(c == 12298){
            hasTitle = true;
            titleStart = graph.size() - 1;
            titleRaw.clear();
        }else if(hasTitle){
            titleRaw.add(c);
        }
    }
    // flush spans still open at end of input
    if(tmpRaw.size() != 0){
        httpVec.add(tmpRaw);
        httpStartVec.add(httpStart);
    }
    if(npRaw.size() != 0){
        npVec.add(npRaw);
        npStartVec.add(npStart);
    }

    // re-mark URL spans as begin(1)/middle(2)/end(4)
    String str;
    for(int i = 0 ; i < httpVec.size(); i ++){
        str=httpVec.get(i).toString();
        int found = str.indexOf("http");
        if(found != -1){
            int start = httpStartVec.get(i);
            // NOTE(review): this is the length of List.toString()
            // ("[104, 116, ...]"), not the number of characters — the marked
            // span looks far too long; confirm intended behavior.
            int size = str.length();
            graph.set(start, 1);
            for(int j = start + 1; j < start + size - 1; j ++){
                graph.set(j, 2);
            }
            graph.set(start + size - 1, 4);

        }
    }

    // re-mark short @mentions (< 15 chars) as one token
    for(int i = 0; i < npVec.size(); i ++){
        npRaw = npVec.get(i);
        if(npRaw.size() < 15 && npRaw.size() > 0){
            int start = npStartVec.get(i);
            int size = npRaw.size();
            graph.set(start, 1);
            for(int j = start + 1; j < start + size - 1; j ++){
                graph.set(j, 2);
            }
            graph.set(start + size - 1, 4);

        }
    }

    // re-mark plausible 《titles》 as one token
    for(int i = 0; i < titleVec.size(); i ++){
        titleRaw = titleVec.get(i);
        if(isPossibleTitle(titleRaw)){
            int start = titleStartVec.get(i);
            int size = titleRaw.size();
            graph.set(start, 1);
            for(int j = start + 1; j < start + size - 1; j ++){
                graph.set(j, 2);
            }
            graph.set(start + size - 1, 4);
        }
    }

    // boundary conditions: first char cannot be a middle/end, last cannot start
    if(graph.size()!=0){
        graph.set(0, graph.get(0)); // no-op, kept from the original
        graph.set(graph.size()-1, graph.get(graph.size() - 1)&12);

        if(graph.get(0)==0) graph.set(0,9);

        if(graph.size()-1==0) graph.set(graph.size()-1, 12); // single-char input
    }

    return senClean;
}

/**
 * A 《bracketed》 span is treated as a title iff it is 1–10 code points long
 * and contains no letter/digit/punctuation (isOther).
 */
boolean isPossibleTitle(List titleRaw){
    if(titleRaw.size() > 10 || titleRaw.size() == 0){
        return false;
    }else{
        for(int i = 0; i < titleRaw.size(); i ++){
            if(isOther(titleRaw.get(i))){
                return false;
            }
        }
        return true;
    }
}
}
b/src/refactor/manage/Thulac.java new file mode 100644 index 0000000..b8b7bd2 --- /dev/null +++ b/src/refactor/manage/Thulac.java @@ -0,0 +1,52 @@ +package manage; + +import base.WordWithTag; + +import java.util.*; + +/** + * Created by amber on 16/11/14. + */ +public class Thulac { + public Thulac() {} + + /** + * 获得词性标注的结果 + * @param text + * @return + */ + public List getLabelResult(String text) { + Preprocesser prepro = new Preprocesser(); + List pocCands = new ArrayList(); + text = prepro.clean(text, pocCands); + List resultList = new ArrayList(); + + List taggedSentence = getTagged(pocCands, text); + if(taggedSentence == null || taggedSentence.size() == 0) { + return resultList; + } + for (int i = 0; i < taggedSentence.size(); i ++) { + resultList.add(taggedSentence.get(i)); + } + return resultList; + } + + public List getTagged(List pocCands, String text) { + if (text == null || text.length() == 0) { + return new ArrayList<>(); + } + List taggedList = new ArrayList<>(); + Calculation cal = new Calculation(); + taggedList = cal.getTagList(text, pocCands, taggedList); + return taggedList; + } + + public static void main(String args[]) { + Thulac thulac = new Thulac(); + String text = "今天高高兴兴去上学"; + List result = thulac.getLabelResult(text); + for (WordWithTag word : result) { + System.out.println(word.toString()); + } + } +} diff --git a/src/refactor/manage/TimeWord.java b/src/refactor/manage/TimeWord.java new file mode 100644 index 0000000..e04574e --- /dev/null +++ b/src/refactor/manage/TimeWord.java @@ -0,0 +1,168 @@ +package manage; + +import base.WordWithTag; + +import java.util.HashSet; +import java.util.List; + +public class TimeWord { + HashSet arabicNumSet; + //std::set chineseNumSet; + HashSet timeWordSet; + HashSet otherSet; + public TimeWord() + { + arabicNumSet =new HashSet(); + timeWordSet =new HashSet(); + otherSet =new HashSet(); + + for(int i = 48; i < 58; i ++){ + arabicNumSet.add(i); + } + for(int i = 65296; i < 65306; i ++){ + 
arabicNumSet.add(i); + } + /* + int chineseNums[] = {12295,19968,20108,19977,22235,20116,20845,19971,20843,20061}; + for(int i = 0; i < 10; i ++){ + chineseNumSet.insert(chineseNums[i]); + } + */ + + //��:24180 ��:26376 ��:26085 ��:21495 ʱ:26102 ��:28857 ��:20998 ��:31186 + int timeWord[] = {24180, 26376, 26085, 21495, 26102, 28857, 20998, 31186}; + int len = 8; + for(int i = 0; i < len; i ++){ + timeWordSet.add(timeWord[i]); + } + + for(int i = 65; i < 91; i ++){ + otherSet.add(i); + } + for(int i = 97; i < 123; i ++){ + otherSet.add(i); + } + for(int i = 48; i < 58; i ++){ + otherSet.add(i); + } + + int other[] = {65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217, 8220, 8221, 12304, 12305, + 12289, 12298, 12299, 126, 183, 64, 124, 35, 65509, 37, 8230, 38, 42, 65288, + 65289, 8212, 45, 43, 61, 44, 46, 60, 62, 63, 47, 33, 59, 58, 39, 34, 123, 125, + 91, 93, 92, 124, 35, 36, 37, 94, 38, 42, 40, 41, 95, 45, 43, 61, 9700, 9734, 9733}; + len = 63; + for(int i = 0; i < len; i ++){ + otherSet.add(other[i]); + } + }; + + public boolean isArabicNum(String word){ + boolean allArabic = true; + for(int i = 0; i < word.length(); i ++){ + if(arabicNumSet.contains(Integer.valueOf(word.charAt(i)))){ + allArabic = false; + break; + } + } + return allArabic; + } + + public boolean isTimeWord(String word){ + if(word.length() == 0 || word.length() > 1){ + return false; + } + if(!timeWordSet.contains((int)word.charAt(0))){ + return false; + }else{ + return true; + } + } + + public boolean isDoubleWord(String word, String postWord) + { + if(word.length() != 1 || postWord.length() != 1){ + return false; + }else + { + int wordInt = word.charAt(0); + int postWordInt = postWord.charAt(0); + if(wordInt == postWordInt){ + if(!otherSet.contains((int)wordInt)){ + return true; + }else{ + return false; + } + }else{ + return false; + } + } + } + + boolean isHttpWord(String word){ + if(word.length() < 5){ + return false; + }else{ + if(word.charAt(0) == 'h' && word.charAt(1) == 't' && 
word.charAt(2) == 't' && word.charAt(3) == 'p' ){ + return true; + }else{ + return false; + } + } + } + + public List adjustDouble(List sentence){ + int size = sentence.size(); + String word; + boolean hasTimeWord = false; + + for(int i = size - 1; i >= 0; i --){ + word = sentence.get(i).word; + if(isTimeWord(word)){ + hasTimeWord = true; + }else{ + if(hasTimeWord){ + //if(isArabicNum(word) || isChineseNum(word)){ + if(isArabicNum(word)){ + sentence.get(i).word += sentence.get(i+1).word; + sentence.remove(i + 1); + sentence.get(i).tag = "t"; + } + } + hasTimeWord = false; + } + } + + size = sentence.size(); + String postWord; + for(int i = size - 2; i >= 0; i --){ + word = sentence.get(i).word; + postWord = sentence.get(i + 1).word; + if(isDoubleWord(word, postWord)){ + sentence.get(i).word += sentence.get(i+1).word; + sentence.remove(i + 1); + } + } + + size = sentence.size(); + for(int i = 0; i < size; i ++){ + word = sentence.get(i).word; + if(isHttpWord(word)){ + sentence.get(i).tag = "x"; + } + } + + size = sentence.size(); + String preWord; + for(int i = 1; i < size; i ++){ + preWord = sentence.get(i-1).word; + word = sentence.get(i).word; + if(preWord.length() == 1 && preWord.charAt(0) == 64){ + if((word.length() != 1) || (word.charAt(0) != 64)){ + sentence.get(i).tag = "np"; + } + } + } + return sentence; + } + +} diff --git a/src/refactor/manage/WordDictionary.java b/src/refactor/manage/WordDictionary.java new file mode 100644 index 0000000..8c7e5ea --- /dev/null +++ b/src/refactor/manage/WordDictionary.java @@ -0,0 +1,216 @@ +package manage; + +import base.Dat; +import base.Entry; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.util.HashMap; +import java.util.Map; +import java.util.Vector; + +/** + * Created by amber on 16/11/25. 
+ */ +public class WordDictionary { + static Logger logger = LoggerFactory.getLogger(WordDictionary.class); + private static refactor.WordDictionary singleton; + public Map wordsMap = new HashMap(); + private static final String PATH_PREFIX = "models/"; + + /** 加载词性标注所需模型文件 **/ + public static final String MODEL_BIN_MODEL = PATH_PREFIX + "model_c_model.bin"; + public static final String MODEL_BIN_DAT = PATH_PREFIX + "model_c_dat.bin"; + public static final String MODEL_TXT_LABEL = PATH_PREFIX + "model_c_label.txt"; + public static final String MODEL_DAT_T2S = PATH_PREFIX + "t2s.dat"; + public static final String MODEL_DAT_NS = PATH_PREFIX + "ns.dat"; + public static final String MODEL_DAT_IDIOM = PATH_PREFIX + "idiom.dat"; + public static final String MODEL_DAT_SINGLEPUN = PATH_PREFIX + "singlepun.dat"; + public static final String MODEL_DAT_NEG = PATH_PREFIX + "neg.dat"; + public static final String MODEL_DAT_VM = PATH_PREFIX + "vM.dat"; + public static final String MODEL_DAT_VD = PATH_PREFIX + "vD.dat"; + + public static final HashMap t2s = new HashMap(); + public static final HashMap s2t = new HashMap(); + public static String[] labelInfo = new String[10000]; + public static int[][] pocsToTags = new int[16][]; + public int cbModellSize = 0; + public int cbModelfSize = 0; + public int[] llWeights; + public int[] flWeights; + + public static refactor.WordDictionary getInstance(){ + if (singleton == null) { + synchronized (refactor.WordDictionary.class) { + if (singleton == null) { + singleton = new refactor.WordDictionary(); + return singleton; + } + } + } + return singleton; + } + + private WordDictionary(){ + try { + this.loadDict(); + } catch (IOException e) { + throw new RuntimeException("WordDictionary error occurs while init " ); + } + } + + public void loadDict() throws IOException { + long start = System.currentTimeMillis(); + setT2SMap(); + loadLabelTxt(); + wordsMap.put(Constants.MODEL_DAT_NS, loadDatDic(MODEL_DAT_NS)); + 
wordsMap.put(Constants.MODEL_DAT_IDIOM, loadDatDic(MODEL_DAT_IDIOM)); + wordsMap.put(Constants.MODEL_DAT_SINGLEPUN, loadDatDic(MODEL_DAT_SINGLEPUN)); + wordsMap.put(Constants.MODEL_DAT_NEG, loadDatDic(MODEL_DAT_NEG)); + wordsMap.put(Constants.MODEL_DAT_VM, loadDatDic(MODEL_DAT_VM)); + wordsMap.put(Constants.MODEL_DAT_VD, loadDatDic(MODEL_DAT_VD)); + wordsMap.put(Constants.MODEL_BIN_MODEL, loadCBModelDic()); + wordsMap.put(Constants.MODEL_BIN_DAT, loadDatDic(MODEL_BIN_DAT)); + long end = System.currentTimeMillis(); + logger.info("-------------- loadDict finished " + (end - start) + "ms -------------"); + } + + public Object getFile(String fileName) { + return wordsMap.get(fileName); + } + + public Dat loadDatDic(String filename){ + try { + File file = new File(refactor.WordDictionary.class.getClassLoader().getResource(filename).getFile()); +// File file = new File(filename); + int datSize = (int)(file.length() / 8); + FileInputStream in = new FileInputStream(file); + byte[] tempbytes = new byte[8 * datSize]; + Vector dat = new Vector(); + in.read(tempbytes); + for(int i = 0; i < datSize; i ++){ + Entry entry = new Entry(); + entry.base = bytesToInt(tempbytes, 8 * i); + dat.add(entry); + dat.get(i).check = bytesToInt(tempbytes, 8 * i + 4); + } + Dat newDat = new Dat(datSize, dat); + return newDat; + } catch (Exception e) { + throw new RuntimeException("WordDictionary Could not find file: " + filename); + } + } + + public CBModel loadCBModelDic() { + InputStream in = refactor.WordDictionary.class.getClassLoader().getResourceAsStream(MODEL_BIN_MODEL); +// File file = new File(MODEL_BIN_MODEL); +// FileInputStream in = null; +// try { +// in = new FileInputStream(file); +// } catch (FileNotFoundException e) { +// e.printStackTrace(); +// } + CBModel cbModel = null; + + byte[] tempbytes = new byte[4]; + try { + in.read(tempbytes); + cbModellSize = bytesToInt(tempbytes, 0); + in.read(tempbytes); + cbModelfSize = bytesToInt(tempbytes, 0); + + cbModel = new 
CBModel(cbModellSize, cbModelfSize); + llWeights = new int[cbModellSize * cbModellSize]; + tempbytes = new byte[4 * llWeights.length]; + in.read(tempbytes); + for(int i = 0; i < llWeights.length; i ++){ + llWeights[i] = bytesToInt(tempbytes, 4 * i); + } + flWeights = new int[cbModelfSize * cbModellSize]; + tempbytes = new byte[4 * flWeights.length]; + in.read(tempbytes); + for(int i = 0; i < flWeights.length; i ++){ + flWeights[i] = bytesToInt(tempbytes, 4 * i); + } + + cbModel.ll_weights = llWeights; + cbModel.fl_weights = flWeights; + in.close(); + } catch (IOException e) { + e.printStackTrace(); + } + return cbModel; + } + + public void setT2SMap() { + File file = new File(refactor.WordDictionary.class.getClassLoader().getResource(MODEL_DAT_T2S).getFile()); + int datSize = (int)(file.length() / 8); + FileInputStream in = null; + try { + in = new FileInputStream(file); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + int[] tra = new int[datSize]; + int[] sim = new int[datSize]; + byte[] tempbytes = new byte[4 * datSize]; + try { + in.read(tempbytes); + for(int i=0;i> pocTags = new Vector>(); + for(int i = 0; i < 16; i ++){ + pocTags.add(new Vector()); + } + try { + BufferedReader in = new BufferedReader(new InputStreamReader(refactor.WordDictionary.class.getClassLoader().getResourceAsStream(MODEL_TXT_LABEL),"utf-8")); + String raw = ""; + int ind = 0; + while((raw = in.readLine()) != null){ + labelInfo[ind] = raw; + int segInd = raw.charAt(0) - '0'; + for(int j = 0; j < 16; j ++){ + if(((1<