diff --git a/mistermeow/app/src/main/java/meowdbmanager/DBManager.java b/mistermeow/app/src/main/java/meowdbmanager/DBManager.java index 35f53e6..e2c0e40 100644 --- a/mistermeow/app/src/main/java/meowdbmanager/DBManager.java +++ b/mistermeow/app/src/main/java/meowdbmanager/DBManager.java @@ -420,7 +420,8 @@ public List getDocuments(List docIDs) { .append("host", 1) .append("URL", 1) .append("title", 1) - .append("content", 1))); + .append("content", 1)) + .append("ranker_id" , 1)); List results = docCollection.aggregate(pipeline).into(new ArrayList<>()); diff --git a/mistermeow/app/src/main/java/meowindexer/Main.java b/mistermeow/app/src/main/java/meowindexer/Main.java index 16d7989..f54ea95 100644 --- a/mistermeow/app/src/main/java/meowindexer/Main.java +++ b/mistermeow/app/src/main/java/meowindexer/Main.java @@ -6,6 +6,7 @@ import java.util.concurrent.Executors; import meowdbmanager.DBManager; import meowindexer.Tokenizer.Token; +import meowranker.QueryRanker; import meowranker.Ranker; import org.bson.Document; import org.jsoup.Jsoup; @@ -89,7 +90,7 @@ private static void calculatePopularity() { System.out.println("===================================="); System.out.println("|| Calculating popularity ||"); System.out.println("===================================="); - Ranker ranker = new Ranker(); + Ranker ranker = new QueryRanker(); ranker.calculatePopularity(); System.out.println("===================================="); System.out.println("|| Popularity calculated! ||"); diff --git a/mistermeow/app/src/main/java/meowranker/Main.java b/mistermeow/app/src/main/java/meowranker/Main.java index 4f099dd..c33ed47 100644 --- a/mistermeow/app/src/main/java/meowranker/Main.java +++ b/mistermeow/app/src/main/java/meowranker/Main.java @@ -6,16 +6,25 @@ public class Main { public static void main(String[] argv) { - PhraseRanker phRanker = new PhraseRanker(); + Ranker ranker = new PhraseRanker(); String query = "The dfl;akjf;asd Free Encyclopedia"; // tests searching for unfound token - phRanker.rank(query); + ranker.rank(query); query = "The Free Encyclopedia"; - phRanker.rank(query); + ranker.rank(query); + + // query = "Wikipedia"; + // ranker.rank(query); query = "I love you"; - phRanker.rank(query); + ranker.rank(query); + + + ranker = new QueryRanker(); + ranker.rank("The Free Encyclopedia"); + // QueryRanker Qr = new QueryRanker(); + // Qr.rank("Wkiipedia the free encyclopedia"); } } diff --git a/mistermeow/app/src/main/java/meowranker/PhraseRanker.java b/mistermeow/app/src/main/java/meowranker/PhraseRanker.java index f5efd99..08e1d14 100644 --- a/mistermeow/app/src/main/java/meowranker/PhraseRanker.java +++ b/mistermeow/app/src/main/java/meowranker/PhraseRanker.java @@ -13,59 +13,14 @@ public PhraseRanker() { super(); } - // TODO: change function return type - public List rank(String query) { - - double[] popularity = getPopularityArr(); - - // Tokenizing query - List searchTokens = tokenizer.tokenizeString(query, true); - System.out.println(searchTokens); - - // getting docs common in all tokens & matches the query phrase - List matchedDocs = getMatchingDocs(searchTokens, query); - - System.out.println(matchedDocs.size() + " || " + ProcessedDocs.size()); - // for (Document doc : ProcessedDocs) { - // System.out.println(doc.getString("URL")); - // System.out.println(doc.) - // } - - // calculating relevance for each document - List relevance = this.calculateRelevance(matchedDocs, searchTokens, popularity); - - // for (Double val: relevance){ - // System.out.println(val); - // } - - List> finalRank = combineRelWithPop(matchedDocs, relevance, popularity); - - finalRank.sort((e1, e2) -> e2.getValue().compareTo(e1.getValue())); - - System.out.println("======================================"); - System.out.println("=========== Final Result ============="); - System.out.println("======================================"); - - List SortedList = new ArrayList<>(); - for (Map.Entry e : finalRank) { - SortedList.add(e.getKey()); - // System.out.println("URL: "+ - // db.getDocument(e.getKey().toString()).getString("URL") + "|| Rank = " + - // e.getValue()); The previous printing is time costly, comment it if - // you're not testing of debugging - } - - return SortedList; - } - - private List getMatchingDocs(List searchTokens, - String query) { + protected List getMatchingDocs(List searchTokens, String query) { List docs = getCommonDocs(searchTokens); List finalDocs = new ArrayList<>(); - this.ProcessedDocs = new ArrayList<>(); + ProcessedDocs = new ArrayList<>(); + for (ObjectId id : docs) { Document currDoc = db.getDocument(id.toString()); // getting the document by id @@ -79,15 +34,14 @@ private List getMatchingDocs(List searchTokens, boolean flag = matcher.find(); if (flag) { - finalDocs.add(currDoc.getObjectId( - "_id")); // adding the documents that matches the query - ProcessedDocs.add(db.getDocument( - id.toString())); // Saving documents to access db only once - } + finalDocs.add(currDoc.getObjectId("_id")); // adding the documents that matches the query + ProcessedDocs.add(db.getDocument(id.toString())); // Saving documents to access db only once + } + } return finalDocs; - } + } private List getCommonDocs(List searchTokens) { diff --git a/mistermeow/app/src/main/java/meowranker/QueryRanker.java b/mistermeow/app/src/main/java/meowranker/QueryRanker.java new file mode 100644 index 0000000..d587bbe --- /dev/null +++ b/mistermeow/app/src/main/java/meowranker/QueryRanker.java @@ -0,0 +1,56 @@ +package meowranker; + +import java.util.*; +import org.bson.Document; +import org.bson.types.ObjectId; +import org.jsoup.Jsoup; + +public class QueryRanker extends Ranker{ + + public List QueryDocRel; + + public QueryRanker(){ + super(); + QueryDocRel = new ArrayList<>(); + } + + protected List getMatchingDocs(List searchTokens, String query){ + List docsUnion = db.getDocIDs(searchTokens); + + Map docRepetition = new HashMap<>(); + + List distinctDocs = new ArrayList<>(); + + ProcessedDocs = new ArrayList<>(); + for(ObjectId docId : docsUnion){ + if(docRepetition.containsKey(docId)){ + // System.out.println(docId); + int currRep = docRepetition.get(docId); + docRepetition.remove(docId); + docRepetition.put(docId , currRep+1); + } + else{ + distinctDocs.add(docId); + docRepetition.put(docId ,1 ); + ProcessedDocs.add(db.getDocument(docId.toString())); + } + } + + for(ObjectId docId :distinctDocs){ + QueryDocRel.add((double)docRepetition.get(docId)/(double)searchTokens.size()); + } + + return distinctDocs; + } + + protected List addQueryDocRel(List relevance){ + int ind = 0; + for(Double val:relevance){ + val*=QueryDocRel.get(ind); + ind++; + } + + + return relevance; + } +} diff --git a/mistermeow/app/src/main/java/meowranker/Ranker.java b/mistermeow/app/src/main/java/meowranker/Ranker.java index 86e3075..ece0adb 100644 --- a/mistermeow/app/src/main/java/meowranker/Ranker.java +++ b/mistermeow/app/src/main/java/meowranker/Ranker.java @@ -8,22 +8,65 @@ import meowindexer.Tokenizer; import org.bson.Document; import org.bson.types.ObjectId; +import org.springframework.data.mongodb.core.query.Query; -public class Ranker { +public abstract class Ranker { public static DBManager db; public static Tokenizer tokenizer; - public static int counter = 1; public List ProcessedDocs; // used to access db only once while crawling public Ranker() { db = new DBManager(); tokenizer = new Tokenizer(); + } + + public List rank(String query) { + + double[] popularity = getPopularityArr(); + + // Tokenizing query + List searchTokens = tokenizer.tokenizeString(query, true); + System.out.println(searchTokens); + + // getting docs common in all tokens & matches the query phrase + List matchedDocs = this.getMatchingDocs(searchTokens, query); + + System.out.println(matchedDocs.size() + " || " + ProcessedDocs.size()); + // for (Document doc : ProcessedDocs) { + // System.out.println(doc.getString("URL")); + // System.out.println(doc.) + // } + + // calculating relevance for each document + List relevance = this.calculateRelevance(matchedDocs, searchTokens, popularity); + + // for (Double val: relevance){ + // System.out.println(val); + // } + + List> finalRank = combineRelWithPop(matchedDocs, relevance, popularity); + + finalRank.sort((e1, e2) -> e2.getValue().compareTo(e1.getValue())); - if (counter == 0) - calculatePopularity(); + System.out.println("======================================"); + System.out.println("=========== Final Result ============="); + System.out.println("======================================"); + + List SortedList = new ArrayList<>(); + for (Map.Entry e : finalRank) { + SortedList.add(e.getKey()); + System.out + .println("URL: " + db.getDocument(e.getKey().toString()).getString("URL") + " || Rank = " + e.getValue()); + // The previous printing is time costly, comment it if you're not testing of + // debugging + } + + return SortedList; } + protected abstract List getMatchingDocs(List searchTokens, String query); + // The function takes graph of links between documents // where edge from doc1 to doc2 is representing by adding // 1/(outgoing urls from doc1) in the cell M[doc2][doc1] @@ -200,6 +243,11 @@ public List calculateRelevance(List docIds, relevance.add(val); } + if(this instanceof QueryRanker){ + QueryRanker ranker = (QueryRanker)this; + relevance = ranker.addQueryDocRel(relevance); + } + return relevance; }