Skip to content

Commit

Permalink
feat: QueryRanker is done
Browse files Browse the repository at this point in the history
  • Loading branch information
AbdelruhmanSamy authored and AhmedHamed3699 committed May 14, 2024
1 parent 9fefa19 commit 5144016
Show file tree
Hide file tree
Showing 6 changed files with 133 additions and 64 deletions.
3 changes: 2 additions & 1 deletion mistermeow/app/src/main/java/meowdbmanager/DBManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,8 @@ public List<Document> getDocuments(List<ObjectId> docIDs) {
.append("host", 1)
.append("URL", 1)
.append("title", 1)
.append("content", 1)));
.append("content", 1))
.append("ranker_id" , 1));

List<Document> results = docCollection.aggregate(pipeline).into(new ArrayList<>());

Expand Down
3 changes: 2 additions & 1 deletion mistermeow/app/src/main/java/meowindexer/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import java.util.concurrent.Executors;
import meowdbmanager.DBManager;
import meowindexer.Tokenizer.Token;
import meowranker.QueryRanker;
import meowranker.Ranker;
import org.bson.Document;
import org.jsoup.Jsoup;
Expand Down Expand Up @@ -89,7 +90,7 @@ private static void calculatePopularity() {
System.out.println("====================================");
System.out.println("|| Calculating popularity ||");
System.out.println("====================================");
Ranker ranker = new Ranker();
Ranker ranker = new QueryRanker();
ranker.calculatePopularity();
System.out.println("====================================");
System.out.println("|| Popularity calculated! ||");
Expand Down
17 changes: 13 additions & 4 deletions mistermeow/app/src/main/java/meowranker/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,25 @@
/**
 * Manual smoke test for the rankers.
 *
 * <p>Runs a few phrase queries through {@link PhraseRanker} and then one query
 * through {@link QueryRanker}. Each query is ranked exactly once; the results
 * are printed by {@code rank} itself, so the return values are ignored here.
 */
public class Main {
    public static void main(String[] argv) {

        // Work against the Ranker base type so both implementations are driven the same way.
        Ranker ranker = new PhraseRanker();

        String[] phraseQueries = {
            "The dfl;akjf;asd Free Encyclopedia", // tests searching for unfound token
            "The Free Encyclopedia",
            "I love you"
        };
        for (String query : phraseQueries) {
            ranker.rank(query);
        }

        ranker = new QueryRanker();
        ranker.rank("The Free Encyclopedia");
    }
}
62 changes: 8 additions & 54 deletions mistermeow/app/src/main/java/meowranker/PhraseRanker.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,59 +13,14 @@ public PhraseRanker() {
super();
}

// TODO: change function return type
/**
 * Ranks the documents that match {@code query}, highest combined score first.
 *
 * <p>The query is tokenized, candidate documents are collected through
 * {@code getMatchingDocs}, a relevance score is computed per candidate and
 * merged with the popularity array, and the candidates are returned in
 * descending score order. Progress information is printed to standard output.
 *
 * @param query the raw search string
 * @return ids of the matching documents, best match first
 */
public List<ObjectId> rank(String query) {
    final double[] pagePopularity = getPopularityArr();

    // Split the raw query into search tokens.
    final List<String> queryTokens = tokenizer.tokenizeString(query, true);
    System.out.println(queryTokens);

    // Candidate documents for this query; the call also refreshes ProcessedDocs.
    final List<ObjectId> candidates = getMatchingDocs(queryTokens, query);
    System.out.println(candidates.size() + " || " + ProcessedDocs.size());

    final List<Double> relevanceScores = this.calculateRelevance(candidates, queryTokens, pagePopularity);
    final List<Map.Entry<ObjectId, Double>> scored = combineRelWithPop(candidates, relevanceScores, pagePopularity);

    // Descending by combined score.
    scored.sort((left, right) -> right.getValue().compareTo(left.getValue()));

    System.out.println("======================================");
    System.out.println("=========== Final Result =============");
    System.out.println("======================================");

    // Per-result URL printing is left out on purpose: it costs one DB lookup per result.
    final List<ObjectId> ordered = new ArrayList<>();
    scored.forEach(entry -> ordered.add(entry.getKey()));
    return ordered;
}

private List<ObjectId> getMatchingDocs(List<String> searchTokens,
String query) {
protected List<ObjectId> getMatchingDocs(List<String> searchTokens, String query) {

List<ObjectId> docs = getCommonDocs(searchTokens);

List<ObjectId> finalDocs = new ArrayList<>();

this.ProcessedDocs = new ArrayList<>();
ProcessedDocs = new ArrayList<>();

for (ObjectId id : docs) {
Document currDoc = db.getDocument(id.toString()); // getting the document by id

Expand All @@ -79,15 +34,14 @@ private List<ObjectId> getMatchingDocs(List<String> searchTokens,
boolean flag = matcher.find();

if (flag) {
finalDocs.add(currDoc.getObjectId(
"_id")); // adding the documents that matches the query
ProcessedDocs.add(db.getDocument(
id.toString())); // Saving documents to access db only once
}
finalDocs.add(currDoc.getObjectId("_id")); // adding the documents that matches the query
ProcessedDocs.add(db.getDocument(id.toString())); // Saving documents to access db only once
}

}

return finalDocs;
}
}

private List<ObjectId> getCommonDocs(List<String> searchTokens) {

Expand Down
56 changes: 56 additions & 0 deletions mistermeow/app/src/main/java/meowranker/QueryRanker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package meowranker;

import java.util.*;
import org.bson.Document;
import org.bson.types.ObjectId;
import org.jsoup.Jsoup;

/**
 * Ranker for free-text queries.
 *
 * <p>A document is a candidate when it is returned by {@code db.getDocIDs} for
 * the query tokens. Each candidate is weighted by how many times it occurs in
 * that result relative to the number of query tokens, and
 * {@link #addQueryDocRel(List)} applies that weight to the relevance scores.
 */
public class QueryRanker extends Ranker{

    /**
     * Per-document weight (occurrences in the token lookup / number of query
     * tokens), index-aligned with the list returned by the most recent
     * {@link #getMatchingDocs(List, String)} call.
     */
    public List<Double> QueryDocRel;

    public QueryRanker(){
        super();
        QueryDocRel = new ArrayList<>();
    }

    /**
     * Collects the distinct documents returned for the query tokens and
     * recomputes {@link #QueryDocRel} and {@code ProcessedDocs} for them.
     *
     * @param searchTokens tokens of the query
     * @param query the raw query string (not used by this implementation)
     * @return distinct document ids, in order of first occurrence
     */
    protected List<ObjectId> getMatchingDocs(List<String> searchTokens, String query){
        List<ObjectId> docsUnion = db.getDocIDs(searchTokens);

        Map<ObjectId, Integer> docRepetition = new HashMap<>();
        List<ObjectId> distinctDocs = new ArrayList<>();

        ProcessedDocs = new ArrayList<>();
        // Start from a fresh list on every query; otherwise weights from earlier
        // queries stay at the front and get applied to the wrong documents.
        QueryDocRel = new ArrayList<>();

        for (ObjectId docId : docsUnion) {
            // merge returns the updated count; 1 means this is the first occurrence.
            if (docRepetition.merge(docId, 1, Integer::sum) == 1) {
                distinctDocs.add(docId);
                ProcessedDocs.add(db.getDocument(docId.toString()));
            }
        }

        double tokenCount = searchTokens.size();
        for (ObjectId docId : distinctDocs) {
            QueryDocRel.add(docRepetition.get(docId) / tokenCount);
        }

        return distinctDocs;
    }

    /**
     * Multiplies every relevance score by the matching {@link #QueryDocRel} weight.
     *
     * <p>The list is updated in place through {@code set}; reassigning the
     * enhanced-for loop variable would only change a local copy and leave the
     * scores untouched.
     *
     * @param relevance relevance scores, index-aligned with the documents
     *        returned by the last {@link #getMatchingDocs(List, String)} call
     * @return the same list instance, with the weights applied
     */
    protected List<Double> addQueryDocRel(List<Double> relevance){
        for (int i = 0; i < relevance.size(); i++) {
            relevance.set(i, relevance.get(i) * QueryDocRel.get(i));
        }
        return relevance;
    }
}
56 changes: 52 additions & 4 deletions mistermeow/app/src/main/java/meowranker/Ranker.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,65 @@
import meowindexer.Tokenizer;
import org.bson.Document;
import org.bson.types.ObjectId;
import org.springframework.data.mongodb.core.query.Query;

public class Ranker {
public abstract class Ranker {

public static DBManager db;
public static Tokenizer tokenizer;
public static int counter = 1;
public List<Document> ProcessedDocs; // used to access db only once while crawling

/**
 * Creates a ranker and assigns fresh instances to the static {@code db} and
 * {@code tokenizer} fields. Because both fields are static, the new instances
 * replace the ones every existing ranker sees.
 */
public Ranker() {
    Ranker.db = new DBManager();
    Ranker.tokenizer = new Tokenizer();
}

/**
 * Ranks the documents selected for {@code query}, highest combined score first.
 *
 * <p>Template for all rankers: tokenizes the query, asks the subclass for the
 * candidate documents via {@code getMatchingDocs}, scores them with
 * {@code calculateRelevance}, merges the scores with the popularity array and
 * sorts in descending order. Every result is printed with its URL and score,
 * which costs one database lookup per result.
 *
 * @param query the raw search string
 * @return ids of the candidate documents, best match first
 */
public List<ObjectId> rank(String query) {
    final double[] pagePopularity = getPopularityArr();

    // Split the raw query into search tokens.
    final List<String> queryTokens = tokenizer.tokenizeString(query, true);
    System.out.println(queryTokens);

    // Candidate selection is subclass-specific; the call also refreshes ProcessedDocs.
    final List<ObjectId> candidates = this.getMatchingDocs(queryTokens, query);
    System.out.println(candidates.size() + " || " + ProcessedDocs.size());

    final List<Double> relevanceScores = this.calculateRelevance(candidates, queryTokens, pagePopularity);
    final List<Map.Entry<ObjectId, Double>> scored = combineRelWithPop(candidates, relevanceScores, pagePopularity);

    // Descending by combined score.
    scored.sort((left, right) -> right.getValue().compareTo(left.getValue()));

    if (counter == 0) {
        calculatePopularity();
    }

    System.out.println("======================================");
    System.out.println("=========== Final Result =============");
    System.out.println("======================================");

    final List<ObjectId> ordered = new ArrayList<>();
    for (Map.Entry<ObjectId, Double> entry : scored) {
        final ObjectId docId = entry.getKey();
        ordered.add(docId);
        // Debug output: one DB lookup per result, so disable when not testing.
        final String url = db.getDocument(docId.toString()).getString("URL");
        System.out.println("URL: " + url + " || Rank = " + entry.getValue());
    }

    return ordered;
}

/**
 * Selects the documents that {@link #rank(String)} will score for a query.
 *
 * <p>{@code rank} reads {@code ProcessedDocs.size()} immediately after this
 * call, so an implementation must leave {@code ProcessedDocs} non-null.
 *
 * @param searchTokens tokens obtained by tokenizing {@code query}
 * @param query the raw query string as passed to {@code rank}
 * @return ids of the candidate documents to be scored
 */
protected abstract List<ObjectId> getMatchingDocs(List<String> searchTokens, String query);

// The function takes a graph of links between documents,
// where an edge from doc1 to doc2 is represented by adding
// 1/(number of outgoing URLs from doc1) in the cell M[doc2][doc1]
Expand Down Expand Up @@ -212,6 +255,11 @@ public List<Double> calculateRelevance(List<ObjectId> docIds,
relevance.add(val);
}

if(this instanceof QueryRanker){
QueryRanker ranker = (QueryRanker)this;
relevance = ranker.addQueryDocRel(relevance);
}

return relevance;
}

Expand Down

0 comments on commit 5144016

Please sign in to comment.