diff --git a/.gitignore b/.gitignore index 7f5d84f..d5d4f8d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ mistermeow/.project mistermeow/.settings/org.eclipse.buildship.core.prefs mistermeow/app/.settings mistermeow/app/src/meowapp/node_modules +.vscode/* diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index c5f3f6b..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "java.configuration.updateBuildConfiguration": "interactive" -} \ No newline at end of file diff --git a/mistermeow/app/src/main/java/meowEngine/QueryEngineController.java b/mistermeow/app/src/main/java/meowEngine/QueryEngineController.java index c27af2e..988aa3e 100644 --- a/mistermeow/app/src/main/java/meowEngine/QueryEngineController.java +++ b/mistermeow/app/src/main/java/meowEngine/QueryEngineController.java @@ -5,7 +5,6 @@ import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; -import meowdbmanager.DBManager; import org.bson.Document; import org.bson.types.ObjectId; import org.jsoup.Jsoup; @@ -13,9 +12,8 @@ import meowdbmanager.DBManager; import meowindexer.Tokenizer; -import meowranker.PhraseRanker; +import meowranker.*; -//TODO: normal queries with ranking //TODO: bold in snippts @CrossOrigin(origins = "*", allowedHeaders = "*") @RestController @@ -23,13 +21,13 @@ public class QueryEngineController { private DBManager dbManager; private Tokenizer tokenizer; - private PhraseRanker phraseRanker; + private Ranker ranker, phraseRanker; private List docs; private String currentQuery; private boolean isPhraseMatching, isFirstTime; private String[] phrases; private int[] operators; // 0: None, 1: AND, 2: OR, 3: NOT - private List tokens, tags, suggestions; + private List tokens, suggestions; private int resultCount; private final int numOfDocsInPage = 20, windowCharSize = 100; @@ -37,13 +35,13 @@ public QueryEngineController() { dbManager = new DBManager(); tokenizer = new Tokenizer(); phraseRanker = new PhraseRanker(); + ranker = new QueryRanker(); docs = new ArrayList<>(); currentQuery = ""; isPhraseMatching = false; isFirstTime = true; phrases = new String[3]; operators = new int[2]; - tags = new ArrayList<>(); tokens = new ArrayList<>(); suggestions = new ArrayList<>(); resultCount = 0; @@ -72,7 +70,6 @@ public Document searchQuery(@RequestParam("query") String query, parse(currentQuery); dbManager.insertSuggestion(currentQuery); tokens = tokenizer.tokenizeString(currentQuery, false); - tags = tokenizer.tokenizeString(currentQuery, false); docs = rankDocs(); isFirstTime = false; resultCount = docs.size(); @@ -129,11 +126,10 @@ private Document getResults(List docs) { availableCount--; } - System.out.println("Results: " + results); Document data = new Document("results", results) .append("count", resultCount) .append("availableCount", availableCount) - .append("tags", tags) + .append("tags", tokens) .append("suggestions", suggestions); return data; @@ -147,6 +143,8 @@ public String getSnippet(String doc) { Matcher stringMatch = Pattern.compile("\\b" + string + "\\b").matcher(textContent); if (stringMatch.find()) { int index = stringMatch.start(); + textContent = textContent.substring(0, index) + "*" + string + "*" + + textContent.substring(index + string.length()); int start = Math.max(0, index - windowCharSize); int end = Math.min(textContent.length(), index + windowCharSize); return textContent.substring(start, end); @@ -163,10 +161,9 @@ private List rankDocs() { useOperator(docIDs, operators[0], 1); if (phrases[2] != null) useOperator(docIDs, operators[1], 2); - System.out.println("DocIDs: " + docIDs); return docIDs; } - return dbManager.getDocIDs(tags); + return ranker.rank(currentQuery); } private void useOperator(List docIDs, int operator, int phraseIndex) { diff --git a/mistermeow/app/src/main/java/meowdbmanager/DBManager.java b/mistermeow/app/src/main/java/meowdbmanager/DBManager.java index 0a19942..9510bc0 100644 --- a/mistermeow/app/src/main/java/meowdbmanager/DBManager.java +++ b/mistermeow/app/src/main/java/meowdbmanager/DBManager.java @@ -414,14 +414,13 @@ public Document getDocument(String docID) { public List getDocuments(List docIDs) { try { List pipeline = new ArrayList<>(); - pipeline.add(new Document( - "$match", new Document("_id", new Document("$in", docIDs)))); + pipeline.add(new Document("$match", new Document("_id", new Document("$in", docIDs)))); pipeline.add(new Document("$project", new Document() .append("host", 1) .append("URL", 1) .append("title", 1) - .append("content", 1)) - .append("ranker_id" , 1)); + .append("content", 1) + .append("ranker_id", 1))); List results = docCollection.aggregate(pipeline).into(new ArrayList<>()); @@ -473,12 +472,12 @@ public double getDocumentFromInverted(String token, ObjectId docID) { } } - public String getPoisitionFromInverted(String token, ObjectId docID) { + public String getPositionFromInverted(String token, ObjectId docID) { try { Document query = new Document("token", token).append("docs._id", docID); Document result = invertedCollection.find(query) - .projection(new Document("docs.$", 1)) + .projection(new Document("docs", 1)) .first(); if (result != null) diff --git a/mistermeow/app/src/main/java/meowranker/Main.java b/mistermeow/app/src/main/java/meowranker/Main.java index c33ed47..53d4124 100644 --- a/mistermeow/app/src/main/java/meowranker/Main.java +++ b/mistermeow/app/src/main/java/meowranker/Main.java @@ -1,28 +1,25 @@ package meowranker; -import java.util.*; -import org.bson.types.ObjectId; - public class Main { public static void main(String[] argv) { - Ranker ranker = new PhraseRanker(); - - String query = "The dfl;akjf;asd Free Encyclopedia"; // tests searching for unfound token - ranker.rank(query); - - query = "The Free Encyclopedia"; - ranker.rank(query); - - // query = "Wikipedia"; + // Ranker ranker = new PhraseRanker(); + // + // String query = "The dfl;akjf;asd Free Encyclopedia"; // tests searching for + // unfound token + // ranker.rank(query); + // + // query = "The Free Encyclopedia"; + // ranker.rank(query); + // + // // query = "Wikipedia"; + // // ranker.rank(query); + // + // query = "I love you"; // ranker.rank(query); - query = "I love you"; - ranker.rank(query); - - - ranker = new QueryRanker(); - ranker.rank("The Free Encyclopedia"); + Ranker ranker = new QueryRanker(); + ranker.rank("cats"); // QueryRanker Qr = new QueryRanker(); // Qr.rank("Wkiipedia the free encyclopedia"); diff --git a/mistermeow/app/src/main/java/meowranker/Ranker.java b/mistermeow/app/src/main/java/meowranker/Ranker.java index 89baa99..5274edd 100644 --- a/mistermeow/app/src/main/java/meowranker/Ranker.java +++ b/mistermeow/app/src/main/java/meowranker/Ranker.java @@ -1,14 +1,12 @@ package meowranker; -import com.google.common.collect.Table; import java.lang.Math; import java.util.*; import meowdbmanager.DBManager; import meowindexer.Tokenizer; import org.bson.Document; import org.bson.types.ObjectId; -import org.springframework.data.mongodb.core.query.Query; public abstract class Ranker { @@ -49,19 +47,21 @@ public List rank(String query) { finalRank.sort((e1, e2) -> e2.getValue().compareTo(e1.getValue())); - System.out.println("======================================"); - System.out.println("=========== Final Result ============="); - System.out.println("======================================"); + // System.out.println("======================================"); + // System.out.println("=========== Final Result ============="); + // System.out.println("======================================"); List SortedList = new ArrayList<>(); for (Map.Entry e : finalRank) { SortedList.add(e.getKey()); - System.out - .println("URL: " + db.getDocument(e.getKey().toString()).getString("URL") + " || Rank = " + e.getValue()); + // System.out + // .println("URL: " + db.getDocument(e.getKey().toString()).getString("URL") + " + // || Rank = " + e.getValue()); // The previous printing is time costly, comment it if you're not testing of // debugging } + System.out.println("Ranking finished!"); return SortedList; } @@ -151,7 +151,7 @@ public static double[][] constructUrlsGraph() { } public double[] getPopularityArr() { - int numberOfUrls = db.getUrlsCount(); + // int numberOfUrls = db.getUrlsCount(); // double[] popularityArr = new double[numberOfUrls]; // for (int i = 0; i < numberOfUrls; i++) { @@ -239,10 +239,10 @@ public List calculateRelevance(List docIds, double val = 0; for (String token : searchTokens) { // summation(tf-idf) - String position = db.getPoisitionFromInverted(token, docIds.get(i)); + String position = db.getPositionFromInverted(token, docIds.get(i)); val += db.getDocumentFromInverted(token, docIds.get(i)) * getIDF(token); - if (!position.equals("other")) + if (position != null && !position.equals("other")) val += boost; // NOTE: uncomment when testing // System.out.println( @@ -255,8 +255,8 @@ public List calculateRelevance(List docIds, relevance.add(val); } - if(this instanceof QueryRanker){ - QueryRanker ranker = (QueryRanker)this; + if (this instanceof QueryRanker) { + QueryRanker ranker = (QueryRanker) this; relevance = ranker.addQueryDocRel(relevance); } @@ -267,7 +267,7 @@ public double getIDF(String token) { double df; Document invertedInd = db.getInvertedIndex(token); - if (invertedInd == null) // Handling tokens that are not in any documnets + if (invertedInd == null) // Handling tokens that are not in any documents return 0; df = (double) db.getInvertedIndex(token).getInteger("DF");