-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwordCountObject.pde
237 lines (191 loc) · 6.55 KB
/
wordCountObject.pde
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
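/*
WordCount object: one per url. The url list is created using the SearchObject class.
  url       - the url string
  title     - the title that goes with the url
  wordCount - Map<String, Integer> dictionary holding the count of each word found at the url
*/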
/**
 * Word statistics for a single url.
 *
 * Holds the url, its title and a map from word (or pair of consecutive words)
 * to the number of times it was counted. The map starts out empty and is
 * filled either by calculate() or by one of the setWordCount() overloads.
 */
public class WordCount{
  // word -> count; replaced wholesale by calculate() and the setters
  private Map<String, Integer> wordCount;
  // public fields, kept public because other sketch code may read them directly
  public String url;
  public String title;

  /**
   * Creates an entry with an empty word map.
   *
   * @param url   url string as stored by the caller; calculate() passes it
   *              through getUrlFixed() before loading
   * @param title title associated with the url
   */
  public WordCount(String url, String title){
    this.wordCount = new TreeMap<String, Integer>();
    this.url = url;
    this.title = title;
  }

  /** @return the url exactly as given to the constructor */
  public String getUrl(){
    return this.url;
  }

  /** @return the current title */
  public String getTitle(){
    return this.title;
  }

  /** Replaces the title. */
  public void setTitle(String s){
    this.title = s;
  }

  /** @return number of entries currently in the word map */
  public Integer size(){
    return this.wordCount.size();
  }

  /** @return the internal word map itself (not a copy) */
  public Map<String, Integer> getWordCount(){
    return this.wordCount;
  }

  /**
   * Adopts the word map of another WordCount.
   * The map is shared, not copied: later changes through either object are
   * visible in both.
   */
  public void setWordCount(WordCount wordCount){
    this.wordCount = wordCount.getWordCount();
  }

  /** Adopts the given map as the word map (stored by reference, not copied). */
  public void setWordCount(Map<String, Integer> wordCountMap){
    this.wordCount = wordCountMap;
  }

  /**
   * Downloads the url, counts its words and stores the filtered result,
   * ordered by descending count, in the word map.
   *
   * This is best effort: if the page cannot be loaded or parsed, a message is
   * printed and the existing word map is left untouched.
   */
  public void calculate(){
    // collects every word and word pair found on the page
    List<String> wordList = new ArrayList<String>();
    try {
      // Loading happens inside the try so that a malformed url is handled
      // like any other failure instead of escaping to the caller.
      String[] source = loadStrings(getUrlFixed(this.url));
      if (source == null) {
        // loadStrings() signals failure by returning null
        System.out.println("WordCount: could not load " + this.url);
        return;
      }
      for (int i = 0; i < source.length; i++) {
        // parse each line on its own and strip the html tags with text()
        Document doc = Jsoup.parse(source[i]);
        String text = doc.text();
        // skip lines with 15 characters of text or fewer
        if (text.length() > 15) {
          getWordlist(wordList, text);
        }
      }
    } catch (Exception e){
      System.out.println(e);
      return;
    }

    // tally the occurrences with a single lookup per word
    Map<String, Integer> wordCountmp = new TreeMap<String, Integer>();
    for (String word : wordList) {
      Integer seen = wordCountmp.get(word);
      wordCountmp.put(word, seen == null ? 1 : seen + 1);
    }

    // highest count, used as the reference for the relative thresholds
    float maxval = maxVal(wordCountmp);
    // drop words whose count is below 12% or above 62% of the highest count
    deleteByThreshold(wordCountmp, maxval, 0.12f, 0.62f);
    // title words are (re)inserted with the highest count
    titleWordsAdd(wordCountmp, maxval, this.title);
    // keep the result ordered by descending count
    wordCount = sortByValues(wordCountmp);
  }
}
// SKETCH-LEVEL HELPER METHODS (only sortByValues is static)
// sort a Map by its values, largest first - used by WordCount.calculate()
/**
 * Returns a copy of the map whose iteration order is by descending value.
 *
 * Works for any mutually comparable value type, not only Integer. The sort is
 * stable, so entries with equal values keep the relative order they had in
 * the input map's iteration order. The input map is not modified.
 *
 * @param map map to sort
 * @return a new LinkedHashMap holding the same entries, largest value first
 */
public static <K extends Comparable,V extends Comparable> Map<K,V> sortByValues(Map<K,V> map){
  List<Map.Entry<K,V>> entries = new ArrayList<Map.Entry<K,V>>(map.entrySet());
  Collections.sort(entries, new Comparator<Map.Entry<K,V>>() {
    public int compare(Map.Entry<K,V> e1, Map.Entry<K,V> e2) {
      // arguments swapped on purpose: larger values sort first
      return e2.getValue().compareTo(e1.getValue());
    }
  });
  // LinkedHashMap iterates in insertion order, which is now descending by value
  Map<K,V> sortedMap = new LinkedHashMap<K,V>();
  for (Map.Entry<K,V> entry : entries) {
    sortedMap.put(entry.getKey(), entry.getValue());
  }
  return sortedMap;
}
//remove all the useless words from the stringList - pass as reference
/**
 * Removes, in place, every word that is shorter than 3 or longer than 12
 * characters.
 *
 * An iterator is used so that removing one element never causes the next one
 * to be skipped, which happens when elements are removed while an index keeps
 * advancing.
 *
 * @param lsStr mutable list of words; modified directly
 */
public void removeUselessWords(List<String> lsStr){
  Iterator<String> it = lsStr.iterator();
  while (it.hasNext()) {
    int len = it.next().length();
    if (len > 12 || len < 3) {
      it.remove();
    }
  }
}
// remove all the string except a-z, A-Z and single whitespaces. make em all lower case
// Compiled once instead of on every call. '+' gives the same result as the
// former '*' quantifier without producing empty matches.
private static final Pattern NON_LETTER_RUN = Pattern.compile("[^a-zA-Z\\s]+");
private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

/**
 * Reduces a string to lower-case ASCII letters separated by single spaces.
 *
 * Every character that is neither a-z, A-Z nor whitespace is deleted, each
 * run of whitespace is collapsed into one space, and the result is
 * lower-cased. Leading or trailing whitespace is collapsed to a single space,
 * not trimmed.
 *
 * @param str text to clean
 * @return the cleaned, lower-case text
 */
public String getWordsOnly(String str){
  String lettersOnly = NON_LETTER_RUN.matcher(str).replaceAll("");
  String singleSpaced = WHITESPACE_RUN.matcher(lettersOnly).replaceAll(" ");
  // Locale.ROOT keeps the result ASCII under every default locale
  // (a Turkish default locale would otherwise turn 'I' into dotless 'ı').
  return singleSpaced.toLowerCase(java.util.Locale.ROOT);
}
// get url string fixed
/**
 * Strips the first and the last character from the stored url string.
 *
 * @param s url string with one extra character at each end
 * @return s without its first and last character
 * @throws StringIndexOutOfBoundsException if s has fewer than two characters
 */
public String getUrlFixed(String s){
  final int firstKept = 1;
  final int endExclusive = s.length() - 1;
  return s.substring(firstKept, endExclusive);
}
// find maxval from amp
/**
 * Finds the largest count stored in the map.
 *
 * @param map word -> count map
 * @return the highest value in the map, or 0 when the map is empty
 */
public Integer maxVal(Map<String, Integer> map){
  if (map.size() == 0){
    return 0;
  }
  // start from the first value, then keep whichever value is strictly larger
  Iterator<Integer> counts = map.values().iterator();
  Integer best = counts.next();
  while (counts.hasNext()){
    Integer candidate = counts.next();
    if (candidate > best){
      best = candidate;
    }
  }
  return best;
}
// delete too repetitive or unique words
/**
 * Removes, in place, every entry whose count relative to maxval falls outside
 * the band [minthres, maxthres].
 *
 * @param map      word -> count map; modified directly
 * @param maxval   reference count each value is divided by
 * @param minthres entries with value/maxval below this are removed
 * @param maxthres entries with value/maxval above this are removed
 */
public void deleteByThreshold(Map<String, Integer> map, float maxval, float minthres, float maxthres){
  // removal goes through the iterator so the map can shrink while being walked
  Iterator<Map.Entry<String, Integer>> walker = map.entrySet().iterator();
  while (walker.hasNext()) {
    Integer count = walker.next().getValue();
    float ratio = count / maxval;
    boolean tooFrequent = ratio > maxthres;
    boolean tooRare = ratio < minthres;
    if (tooFrequent || tooRare) {
      walker.remove();
    }
  }
}
/**
 * Extracts the usable words of a text and appends them to wordList, followed
 * by every pair of consecutive usable words joined with a single space.
 *
 * The text is first cleaned with getWordsOnly(), split on whitespace and
 * filtered with removeUselessWords(). Pairs are built from the filtered list,
 * so a pair can join two words that had a removed word between them.
 *
 * @param wordList list that receives the words and word pairs
 * @param text     raw text to extract words from
 */
public void getWordlist(List<String> wordList, String text){
  String cleaned = getWordsOnly(text);
  // copy into a mutable list: Arrays.asList alone cannot have elements removed
  List<String> kept = new ArrayList<String>(Arrays.asList(cleaned.split("\\s")));
  removeUselessWords(kept);
  // 1. every single word
  wordList.addAll(kept);
  // 2. every word together with its predecessor
  for (int next = 1; next < kept.size(); next++) {
    wordList.add(String.format("%s %s", kept.get(next - 1), kept.get(next)));
  }
}
/**
 * Puts every word and word pair of the title into the map with the count
 * maxval (truncated to int), overwriting any count already stored for it.
 *
 * @param map    word -> count map; modified directly
 * @param maxval count assigned to each title word
 * @param text   the title text
 */
public void titleWordsAdd(Map<String, Integer> map, float maxval, String text){
  List<String> titleTerms = new ArrayList<String>();
  getWordlist(titleTerms, getWordsOnly(text));
  int weight = (int) maxval;
  for (int i = 0; i < titleTerms.size(); i++) {
    map.put(titleTerms.get(i), weight);
  }
}