Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Engine continue #46

Merged
merged 3 commits into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 50 additions & 25 deletions mistermeow/app/src/main/java/meowEngine/QueryEngineController.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package meowEngine;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand All @@ -12,26 +11,37 @@
import org.springframework.web.bind.annotation.*;

import meowdbmanager.DBManager;
import meowindexer.Tokenizer;
import meowranker.PhraseRanker;

@RestController
@RequestMapping("/")
public class QueryEngineController {
// Collaborators, instantiated once in the constructor.
private DBManager dbManager;
private Tokenizer tokenizer;
private PhraseRanker phraseRanker;

// State of the most recent search; searchQuery pages through it on later calls.
// Ranked document ids produced by rankDocs for the current query.
private List<ObjectId> docs;
private String currentQuery;
private boolean isPhraseMatching, isFirstTime;
// Quoted phrases and the operators between them, filled by parse.
private String[] phrases;
private int[] operators; // 0: None, 1: AND, 2: OR, 3: NOT
// tags: tokens of the current query; suggestions: last result of dbManager.getSuggestions.
private List<String> tags, suggestions;
// Size of docs at the time the current query was ranked.
private int resultCount;
// Page size used by searchQuery and the number of characters kept on each side of a snippet match.
private final int numOfDocsInPage = 20, windowCharSize = 100;

/**
 * Creates the controller with its collaborators and an empty search state.
 */
public QueryEngineController() {
    // collaborators
    this.dbManager = new DBManager();
    this.tokenizer = new Tokenizer();
    this.phraseRanker = new PhraseRanker();

    // no query has been served yet
    this.currentQuery = "";
    this.isFirstTime = true;
    this.isPhraseMatching = false;
    this.resultCount = 0;

    // empty containers for results, query tokens and suggestions
    this.docs = new ArrayList<>();
    this.tags = new ArrayList<>();
    this.suggestions = new ArrayList<>();

    // room for three phrases joined by two operators
    this.phrases = new String[3];
    this.operators = new int[2];
}

@GetMapping("/")
Expand All @@ -41,7 +51,8 @@ public String sayHello() {

/**
 * Handles GET /suggestions.
 *
 * Looks up suggestions for the given query through the DB manager, remembers
 * them in the {@code suggestions} field and returns them under the "data" key.
 *
 * @param query text to find suggestions for
 * @return a document of the form {"data": [suggestions]}
 */
@GetMapping("/suggestions")
public Document getSuggestions(@RequestParam("query") String query) {
    // 10 is passed straight to DBManager.getSuggestions — presumably the maximum
    // number of suggestions; confirm against DBManager.
    suggestions = dbManager.getSuggestions(query, 10);
    return new Document("data", suggestions);
}

@GetMapping("/search")
Expand All @@ -58,12 +69,15 @@ public Document searchQuery(
docs = rankDocs(query.toLowerCase().split("\\s+"));
isFirstTime = false;
currentQuery = query;
resultCount = docs.size();
tags = tokenizer.tokenizeString(currentQuery);
suggestions = dbManager.getSuggestions(query, 10);
}

int startIndex = page * numOfDocsInPage;
int startIndex = (page - 1) * numOfDocsInPage;
int endIndex = Math.min(startIndex + numOfDocsInPage, docs.size());
List<String> subList = docs.subList(startIndex, endIndex);
return new Document("data", subList);
List<ObjectId> subList = docs.subList(startIndex, endIndex);
return new Document("data", getResults(subList));
}

private void parse(String query) {
Expand All @@ -77,14 +91,12 @@ private void parse(String query) {
int i = 0;
while (phraseMatch.find()) {
String phrase = phraseMatch.group().replaceAll("^\"|\"$", "").trim();
System.out.println(phrase);
phrases[i++] = phrase;
}

i = 0;
while (operatorMatch.find()) {
String operator = operatorMatch.group().replaceAll("^\"|\"$", "").trim();
System.out.println(operator);
operators[i++] = operator.equals("AND") ? 1
: operator.equals("OR") ? 2
: 3;
Expand All @@ -94,28 +106,37 @@ private void parse(String query) {
operators[0] = operators[1] = 0;
}

private List<Document> getResults(List<ObjectId> docs) {
getResultsMetadata();
getResultsInfo();
return null;
}
private Document getResults(List<ObjectId> docs) {

private String getResultsMetadata() {
return "Results Metadata";
}
List<Document> results = dbManager.getDocuments(docs);
for (Document result : results) {
String doc = result.getString("content");
String snippet = isPhraseMatching ? getSnippet(doc, phrases[0])
: getSnippet(doc, tags);
result.remove("content");
result.remove("_id");
result.append("snippet", snippet);
}

System.out.println(results);
Document data = new Document("results", results)
.append("count", resultCount)
.append("tags", tags)
.append("suggestions", suggestions);

private String getResultsInfo() {
return "Results Info";
return data;
}

public String getSnippet(String doc, HashSet<String> tokens) {
public String getSnippet(String doc, List<String> tokens) {
String textContent = Jsoup.parse(doc).text();

for (String token : tokens) {
token = " " + token + " ";
if (textContent.contains(token)) {
int index = textContent.indexOf(token);
return textContent.substring(index - windowCharSize,
index + windowCharSize);
int start = Math.max(0, index - windowCharSize);
int end = Math.min(textContent.length(), index + windowCharSize);
return textContent.substring(start, end);
}
}

Expand All @@ -125,16 +146,20 @@ public String getSnippet(String doc, HashSet<String> tokens) {
/**
 * Extracts a snippet around the first occurrence of a phrase in a document.
 *
 * @param doc    HTML of the document; it is reduced to plain text with Jsoup
 * @param phrase phrase to look for
 * @return up to {@code windowCharSize} characters on each side of the start of
 *         the match, or "No Snippet Found" when the phrase does not occur
 */
public String getSnippet(String doc, String phrase) {
    String textContent = Jsoup.parse(doc).text();

    // pad with spaces so only occurrences surrounded by spaces match
    phrase = " " + phrase + " ";
    int index = textContent.indexOf(phrase);
    if (index < 0) {
        return "No Snippet Found";
    }

    // clamp the window so a match near either end of the text cannot run out of bounds
    int start = Math.max(0, index - windowCharSize);
    int end = Math.min(textContent.length(), index + windowCharSize);
    return textContent.substring(start, end);
}

private List<String> rankDocs(String[] tokens) {
return dbManager.getDocs(tokens);
private List<ObjectId> rankDocs(String[] tokens) {
if (isPhraseMatching)
return phraseRanker.rank(phrases[0]);
return dbManager.getDocIDs(tokens);
}
}
29 changes: 25 additions & 4 deletions mistermeow/app/src/main/java/meowdbmanager/DBManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ public int getUrlsCount() {
System.out.println("Error while getting urls count: " + e.getMessage());
return -1;
}
}
}

/**
* getParentsArr - returns an array of parents for a certain url.
Expand Down Expand Up @@ -329,6 +329,27 @@ public Document getDocument(String docID) {
}
}

/**
 * getDocuments - fetches the documents with the given ids.
 *
 * Only the host, URL, title and content fields are requested (plus _id, which
 * the projection does not exclude). The order of the returned documents is
 * whatever the aggregation yields; it is not tied to the order of docIDs.
 *
 * @param docIDs ids of the documents to fetch
 * @return the matching documents; an empty list when docIDs is null or empty
 *         or when the database reports an error
 */
public List<Document> getDocuments(List<ObjectId> docIDs) {
    if (docIDs == null || docIDs.isEmpty()) {
        // nothing to look up, so skip the round trip to the database
        return new ArrayList<>();
    }

    try {
        Document projection = new Document()
                .append("host", 1)
                .append("URL", 1)
                .append("title", 1)
                .append("content", 1);

        List<Document> pipeline = new ArrayList<>();
        pipeline.add(new Document("$match", new Document("_id", new Document("$in", docIDs))));
        pipeline.add(new Document("$project", projection));

        return docCollection.aggregate(pipeline).into(new ArrayList<>());
    } catch (MongoException e) {
        System.out.println("Error occurred while getting docs: " +
                e.getMessage());
        // an empty, mutable list keeps callers that iterate or sort the result working
        return new ArrayList<>();
    }
}

public Document getInvertedIndex(String token) {
try {
Document indices = invertedCollection.find(new Document("token", token)).first();
Expand Down Expand Up @@ -367,8 +388,8 @@ public double getDocumentFromInverted(String token, ObjectId docID) {
}
}

public List<String> getDocs(String[] tokens) {
List<String> docIds = new ArrayList<>();
public List<ObjectId> getDocIDs(String[] tokens) {
List<ObjectId> docIds = new ArrayList<>();
List<String> tokenList = Arrays.asList(tokens);

try {
Expand All @@ -379,7 +400,7 @@ public List<String> getDocs(String[] tokens) {

List<Document> aggregationResult = invertedCollection.aggregate(pipeline).into(new ArrayList<>());
for (Document doc : aggregationResult) {
docIds.add(doc.getObjectId("_id").toString());
docIds.add(new ObjectId(doc.getObjectId("_id").toString()));
}

return docIds;
Expand Down
24 changes: 12 additions & 12 deletions mistermeow/app/src/main/java/meowranker/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@

public class Main {
public static void main(String[] argv) {

PhraseRanker phRanker = new PhraseRanker();

String query = "The dfl;akjf;asd Free Encyclopedia"; // tests searching for unfound token
phRanker.rank(query);
query = "The Free Encyclopedia";
phRanker.rank(query);
query = "domestic cat";
phRanker.rank(query);
}
}
String query = "The dfl;akjf;asd Free Encyclopedia"; // tests searching for unfound token
phRanker.rank(query);

query = "The Free Encyclopedia";
phRanker.rank(query);

query = "I love you";
phRanker.rank(query);

}
}
59 changes: 27 additions & 32 deletions mistermeow/app/src/main/java/meowranker/PhraseRanker.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,56 +14,52 @@ public PhraseRanker() {
}

// TODO: change function return type
public List<Document> rank(String query) {

// applying PR algorithm
double [][] M = this.constructUrlsGraph();
double [] popularity = this.getPopularity(M , M.length);
public List<ObjectId> rank(String query) {

// Tokenizing query
// applying PR algorithm
double[][] M = this.constructUrlsGraph();
double[] popularity = this.getPopularity(M, M.length);

// Tokenizing query
List<String> searchTokens = tokenizer.tokenizeString(query);
System.out.println(searchTokens);


// getting docs common in all tokens & matches the query phrase
// getting docs common in all tokens & matches the query phrase
List<Document> matchedDocs = getMatchingDocs(searchTokens, query);

// System.out.println(matchedDocs.size());
// for (Document doc : matchedDocs) {
// System.out.println(doc.getString("URL"));
// System.out.println(doc.getString("URL"));
// }

// calculating relevance for each document
List<Double> relevance = this.calculateRelevance(matchedDocs , searchTokens , popularity);
// calculating relevance for each document
List<Double> relevance = this.calculateRelevance(matchedDocs, searchTokens, popularity);

// for (Double val: relevance){
// System.out.println(val);
// System.out.println(val);
// }

List<Map.Entry<Document , Double>> finalRank = this.combineRelWithPop(matchedDocs , relevance , popularity);
List<Map.Entry<Document, Double>> finalRank = this.combineRelWithPop(matchedDocs, relevance, popularity);

finalRank.sort(Map.Entry.comparingByValue());

System.out.println("======================================");
System.out.println("=========== Final Result =============");
System.out.println("======================================");

List<Document> SortedList = new ArrayList<>();
for(Map.Entry<Document , Double> e:finalRank){
SortedList.add(e.getKey());
List<ObjectId> SortedList = new ArrayList<>();
for (Map.Entry<Document, Double> e : finalRank) {
SortedList.add(e.getKey().getObjectId("_id"));
System.out.println(e.getKey().getString("URL") + " " + e.getValue());
}



// TODO: call function sort by higher rank
return SortedList;
}

private List<Document> getMatchingDocs(List<String> searchTokens , String query) {
private List<Document> getMatchingDocs(List<String> searchTokens, String query) {

List<Document> docs = getCommonDocs(searchTokens);

List<Document> finalDocs = new ArrayList<>();

for (Document doc : docs) {
Expand Down Expand Up @@ -92,18 +88,17 @@ private List<Document> getCommonDocs(List<String> searchTokens) {
return new ArrayList<>();

// getting the first token present in db
int ind = 0;
int ind = 0;
Document invertedInd = db.getInvertedIndex(searchTokens.get(0));
ind++;
while(invertedInd == null && ind< searchTokens.size()){

while (invertedInd == null && ind < searchTokens.size()) {
invertedInd = db.getInvertedIndex(searchTokens.get(ind));
ind++;
}

if(invertedInd == null)
return new ArrayList<>();

if (invertedInd == null)
return new ArrayList<>();

List<Document> docs = invertedInd.getList("docs", Document.class);
List<ObjectId> docsId = new ArrayList<>();
Expand All @@ -113,7 +108,7 @@ private List<Document> getCommonDocs(List<String> searchTokens) {
}

for (int i = ind; i < searchTokens.size(); i++) {

invertedInd = db.getInvertedIndex(searchTokens.get(i));
if (invertedInd != null) {

Expand All @@ -135,4 +130,4 @@ private List<Document> getCommonDocs(List<String> searchTokens) {

return commonDocs;
}
}
}