forked from sudar/Yahoo_LDA
-
Notifications
You must be signed in to change notification settings - Fork 31
/
Tokenizer.java
56 lines (51 loc) · 1.95 KB
/
Tokenizer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Line-oriented tokenizer.
 *
 * <p>Each input line is expected to hold three space-separated parts: a first
 * field (called {@code url} in the code), a second field ({@code label}) and the
 * remaining free text. The free text is lower-cased, stripped of punctuation and
 * reduced to its runs of ASCII letters; the two leading fields are passed through
 * unchanged.
 */
public class Tokenizer {
    /** Matches one maximal run of letters; every match becomes one output token. */
    public static final Pattern LEX_ALPHA = Pattern.compile("\\p{Alpha}+");

    /** A single punctuation character; compiled once instead of on every call. */
    private static final Pattern PUNCTUATION = Pattern.compile("\\p{Punct}");

    /** One or more whitespace characters; compiled once instead of on every call. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\p{Space}+");

    /** Tokens of this length or longer are dropped from the output. */
    private static final int TOKEN_LENGTH_LIMIT = 100;

    /**
     * Normalizes a piece of text: every punctuation character becomes a space,
     * whitespace runs collapse to a single space, and the result is lower-cased
     * and trimmed.
     *
     * @param text_line the raw text; must not be {@code null}
     * @return the normalized text, possibly empty
     */
    public static String clean(String text_line) {
        String spaced = PUNCTUATION.matcher(text_line).replaceAll(" ");
        String collapsed = WHITESPACE_RUN.matcher(spaced).replaceAll(" ");
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. under a Turkish locale 'I' would lower-case to a dotless 'ı',
        // which LEX_ALPHA does not match).
        return collapsed.toLowerCase(Locale.ROOT).trim();
    }

    /**
     * Reads lines from the file named by {@code args[0]}, or from standard input
     * when no argument is given, and prints the tokenized form of every
     * well-formed line to standard output. Lines that do not contain two leading
     * space-separated fields followed by at least one more character are skipped
     * silently.
     *
     * @param args optional single element: path of the input file
     * @throws IOException if the input cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources guarantees the reader is closed even if reading fails.
        try (BufferedReader br = new BufferedReader(
                args.length == 0 ? new InputStreamReader(System.in) : new FileReader(args[0]))) {
            String line;
            while ((line = br.readLine()) != null) {
                String tokenized = tokenizeLine(line);
                if (tokenized != null) {
                    System.out.println(tokenized);
                }
            }
        }
    }

    /**
     * Converts one input line to its output form: the first two fields verbatim,
     * followed by the letter-only tokens of the cleaned remainder, all separated
     * by single spaces.
     *
     * @param line one raw input line
     * @return the output line, or {@code null} if the line is malformed
     */
    private static String tokenizeLine(String line) {
        int lastIndex = line.length() - 1;

        // The first space must exist and must not be the final character.
        int firstSpace = line.indexOf(' ');
        if (firstSpace < 0 || firstSpace >= lastIndex) {
            return null;
        }
        // Same requirement for the space that ends the second field.
        int secondSpace = line.indexOf(' ', firstSpace + 1);
        if (secondSpace < 0 || secondSpace >= lastIndex) {
            return null;
        }

        String url = line.substring(0, firstSpace);
        String label = line.substring(firstSpace + 1, secondSpace);
        String content = clean(line.substring(secondSpace + 1));

        StringBuilder out = new StringBuilder(url).append(' ').append(label);
        Matcher matcher = LEX_ALPHA.matcher(content);
        while (matcher.find()) {
            String token = matcher.group();
            if (token.length() >= TOKEN_LENGTH_LIMIT) {
                continue;
            }
            // Prefixing the separator avoids having to trim a trailing space.
            out.append(' ').append(token);
        }
        return out.toString();
    }
}