Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: some bugs related to connection between ranker and engine #55

Merged
merged 1 commit into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ mistermeow/.project
mistermeow/.settings/org.eclipse.buildship.core.prefs
mistermeow/app/.settings
mistermeow/app/src/meowapp/node_modules
.vscode/*
3 changes: 0 additions & 3 deletions .vscode/settings.json

This file was deleted.

19 changes: 8 additions & 11 deletions mistermeow/app/src/main/java/meowEngine/QueryEngineController.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,45 +5,43 @@
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import meowdbmanager.DBManager;
import org.bson.Document;
import org.bson.types.ObjectId;
import org.jsoup.Jsoup;
import org.springframework.web.bind.annotation.*;

import meowdbmanager.DBManager;
import meowindexer.Tokenizer;
import meowranker.PhraseRanker;
import meowranker.*;

//TODO: normal queries with ranking
//TODO: bold in snippets
@CrossOrigin(origins = "*", allowedHeaders = "*")
@RestController
@RequestMapping("/")
public class QueryEngineController {
private DBManager dbManager;
private Tokenizer tokenizer;
private PhraseRanker phraseRanker;
private Ranker ranker, phraseRanker;
private List<ObjectId> docs;
private String currentQuery;
private boolean isPhraseMatching, isFirstTime;
private String[] phrases;
private int[] operators; // 0: None, 1: AND, 2: OR, 3: NOT
private List<String> tokens, tags, suggestions;
private List<String> tokens, suggestions;
private int resultCount;
private final int numOfDocsInPage = 20, windowCharSize = 100;

public QueryEngineController() {
dbManager = new DBManager();
tokenizer = new Tokenizer();
phraseRanker = new PhraseRanker();
ranker = new QueryRanker();
docs = new ArrayList<>();
currentQuery = "";
isPhraseMatching = false;
isFirstTime = true;
phrases = new String[3];
operators = new int[2];
tags = new ArrayList<>();
tokens = new ArrayList<>();
suggestions = new ArrayList<>();
resultCount = 0;
Expand Down Expand Up @@ -72,7 +70,6 @@ public Document searchQuery(@RequestParam("query") String query,
parse(currentQuery);
dbManager.insertSuggestion(currentQuery);
tokens = tokenizer.tokenizeString(currentQuery, false);
tags = tokenizer.tokenizeString(currentQuery, false);
docs = rankDocs();
isFirstTime = false;
resultCount = docs.size();
Expand Down Expand Up @@ -129,11 +126,10 @@ private Document getResults(List<ObjectId> docs) {
availableCount--;
}

System.out.println("Results: " + results);
Document data = new Document("results", results)
.append("count", resultCount)
.append("availableCount", availableCount)
.append("tags", tags)
.append("tags", tokens)
.append("suggestions", suggestions);

return data;
Expand All @@ -147,6 +143,8 @@ public String getSnippet(String doc) {
Matcher stringMatch = Pattern.compile("\\b" + string + "\\b").matcher(textContent);
if (stringMatch.find()) {
int index = stringMatch.start();
textContent = textContent.substring(0, index) + "*" + string + "*"
+ textContent.substring(index + string.length());
int start = Math.max(0, index - windowCharSize);
int end = Math.min(textContent.length(), index + windowCharSize);
return textContent.substring(start, end);
Expand All @@ -163,10 +161,9 @@ private List<ObjectId> rankDocs() {
useOperator(docIDs, operators[0], 1);
if (phrases[2] != null)
useOperator(docIDs, operators[1], 2);
System.out.println("DocIDs: " + docIDs);
return docIDs;
}
return dbManager.getDocIDs(tags);
return ranker.rank(currentQuery);
}

private void useOperator(List<ObjectId> docIDs, int operator, int phraseIndex) {
Expand Down
11 changes: 5 additions & 6 deletions mistermeow/app/src/main/java/meowdbmanager/DBManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -414,14 +414,13 @@ public Document getDocument(String docID) {
public List<Document> getDocuments(List<ObjectId> docIDs) {
try {
List<Document> pipeline = new ArrayList<>();
pipeline.add(new Document(
"$match", new Document("_id", new Document("$in", docIDs))));
pipeline.add(new Document("$match", new Document("_id", new Document("$in", docIDs))));
pipeline.add(new Document("$project", new Document()
.append("host", 1)
.append("URL", 1)
.append("title", 1)
.append("content", 1))
.append("ranker_id" , 1));
.append("content", 1)
.append("ranker_id", 1)));

List<Document> results = docCollection.aggregate(pipeline).into(new ArrayList<>());

Expand Down Expand Up @@ -473,12 +472,12 @@ public double getDocumentFromInverted(String token, ObjectId docID) {
}
}

public String getPoisitionFromInverted(String token, ObjectId docID) {
public String getPositionFromInverted(String token, ObjectId docID) {
try {
Document query = new Document("token", token).append("docs._id", docID);

Document result = invertedCollection.find(query)
.projection(new Document("docs.$", 1))
.projection(new Document("docs", 1))
.first();

if (result != null)
Expand Down
33 changes: 15 additions & 18 deletions mistermeow/app/src/main/java/meowranker/Main.java
Original file line number Diff line number Diff line change
@@ -1,28 +1,25 @@
package meowranker;

import java.util.*;
import org.bson.types.ObjectId;

public class Main {
public static void main(String[] argv) {

Ranker ranker = new PhraseRanker();

String query = "The dfl;akjf;asd Free Encyclopedia"; // tests searching for unfound token
ranker.rank(query);

query = "The Free Encyclopedia";
ranker.rank(query);

// query = "Wikipedia";
// Ranker ranker = new PhraseRanker();
//
// String query = "The dfl;akjf;asd Free Encyclopedia"; // tests searching for
// unfound token
// ranker.rank(query);
//
// query = "The Free Encyclopedia";
// ranker.rank(query);
//
// // query = "Wikipedia";
// // ranker.rank(query);
//
// query = "I love you";
// ranker.rank(query);

query = "I love you";
ranker.rank(query);


ranker = new QueryRanker();
ranker.rank("The Free Encyclopedia");
Ranker ranker = new QueryRanker();
ranker.rank("cats");

// QueryRanker Qr = new QueryRanker();
// Qr.rank("Wkiipedia the free encyclopedia");
Expand Down
26 changes: 13 additions & 13 deletions mistermeow/app/src/main/java/meowranker/Ranker.java
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@

package meowranker;

import com.google.common.collect.Table;
import java.lang.Math;
import java.util.*;
import meowdbmanager.DBManager;
import meowindexer.Tokenizer;
import org.bson.Document;
import org.bson.types.ObjectId;
import org.springframework.data.mongodb.core.query.Query;

public abstract class Ranker {

Expand Down Expand Up @@ -49,19 +47,21 @@ public List<ObjectId> rank(String query) {

finalRank.sort((e1, e2) -> e2.getValue().compareTo(e1.getValue()));

System.out.println("======================================");
System.out.println("=========== Final Result =============");
System.out.println("======================================");
// System.out.println("======================================");
// System.out.println("=========== Final Result =============");
// System.out.println("======================================");

List<ObjectId> SortedList = new ArrayList<>();
for (Map.Entry<ObjectId, Double> e : finalRank) {
SortedList.add(e.getKey());
System.out
.println("URL: " + db.getDocument(e.getKey().toString()).getString("URL") + " || Rank = " + e.getValue());
// System.out
// .println("URL: " + db.getDocument(e.getKey().toString()).getString("URL") + "
// || Rank = " + e.getValue());
// The previous printing is time-costly; comment it out if you're not testing or
// debugging
}

System.out.println("Ranking finished!");
return SortedList;
}

Expand Down Expand Up @@ -151,7 +151,7 @@ public static double[][] constructUrlsGraph() {
}

public double[] getPopularityArr() {
int numberOfUrls = db.getUrlsCount();
// int numberOfUrls = db.getUrlsCount();
// double[] popularityArr = new double[numberOfUrls];

// for (int i = 0; i < numberOfUrls; i++) {
Expand Down Expand Up @@ -239,10 +239,10 @@ public List<Double> calculateRelevance(List<ObjectId> docIds,
double val = 0;
for (String token : searchTokens) {
// summation(tf-idf)
String position = db.getPoisitionFromInverted(token, docIds.get(i));
String position = db.getPositionFromInverted(token, docIds.get(i));

val += db.getDocumentFromInverted(token, docIds.get(i)) * getIDF(token);
if (!position.equals("other"))
if (position != null && !position.equals("other"))
val += boost;
// NOTE: uncomment when testing
// System.out.println(
Expand All @@ -255,8 +255,8 @@ public List<Double> calculateRelevance(List<ObjectId> docIds,
relevance.add(val);
}

if(this instanceof QueryRanker){
QueryRanker ranker = (QueryRanker)this;
if (this instanceof QueryRanker) {
QueryRanker ranker = (QueryRanker) this;
relevance = ranker.addQueryDocRel(relevance);
}

Expand All @@ -267,7 +267,7 @@ public double getIDF(String token) {
double df;
Document invertedInd = db.getInvertedIndex(token);

if (invertedInd == null) // Handling tokens that are not in any documnets
if (invertedInd == null) // Handling tokens that are not in any documents
return 0;

df = (double) db.getInvertedIndex(token).getInteger("DF");
Expand Down