Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: phrase matching done #36

Merged
merged 1 commit into from
May 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions mistermeow/app/src/main/java/meowdbmanager/DBManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,27 @@ public Document getInvertedIndex(String token) {
}
}

/**
 * Returns the posting entries that all of the given tokens have in common.
 *
 * <p>The inverted-index entries of every search token are fetched in a single query and their
 * {@code docs} arrays are intersected by {@code _id}. The previous implementation wrapped the
 * token list in a second list ({@code Arrays.asList(searchTokens)}) and nested {@code $exists} /
 * {@code $all} under an embedded-document equality match on {@code token}, so it could never
 * match anything.
 *
 * <p>NOTE(review): assumes each inverted-index entry has a {@code token} field and a {@code docs}
 * array of sub-documents carrying an {@code _id} - confirm against the indexer's schema.
 *
 * @param searchTokens tokens whose postings are intersected; may be {@code null} or empty
 * @return the entries of the first token's {@code docs} array whose {@code _id} also occurs under
 *     every other token; an empty list when there are no tokens or when some token has no index
 *     entry; {@code null} if the database query fails
 */
public List<Document> getCommonDocs(List<String> searchTokens) {
  if (searchTokens == null || searchTokens.isEmpty()) {
    return new ArrayList<>();
  }

  try {
    // One round trip for all tokens: { token: { $in: [t1, t2, ...] } }.
    Document query = new Document("token", new Document("$in", searchTokens));
    List<Document> entries = invertedCollection.find(query).into(new ArrayList<>());

    List<Document> common = null;
    for (String token : searchTokens) {
      // Locate the index entry that belongs to this token.
      List<Document> postings = null;
      for (Document entry : entries) {
        if (token != null && token.equals(entry.get("token"))) {
          postings = entry.getList("docs", Document.class);
          break;
        }
      }

      // A token without postings cannot share a document with the others.
      if (postings == null) {
        return new ArrayList<>();
      }

      if (common == null) {
        // Copy so the list owned by the fetched entry is never mutated.
        common = new ArrayList<>(postings);
        continue;
      }

      // Fully-qualified names: this file's import block is not known to include Set/HashSet.
      java.util.Set<Object> ids = new java.util.HashSet<>();
      for (Document posting : postings) {
        ids.add(posting.get("_id"));
      }
      common.removeIf(posting -> !ids.contains(posting.get("_id")));
    }
    return common;

  } catch (MongoException e) {
    System.out.println("Error occurred while getting docs: " + e.getMessage());
    return null;
  }
}

@Override
protected void finalize() throws Throwable {
mongoClient.close();
Expand Down
2 changes: 1 addition & 1 deletion mistermeow/app/src/main/java/meowindexer/Tokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ private void loadStopWords(String filename) {
* @param text: String to tokenize
* @return List of tokens
*/
private List<String> tokenizeString(String text) {
public List<String> tokenizeString(String text) {
List<String> tokens = new ArrayList<String>();
PorterStemmer stemmer = new PorterStemmer();

Expand Down
23 changes: 7 additions & 16 deletions mistermeow/app/src/main/java/meowranker/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,11 @@ public static void main(String[] argv) {
// testing phrase matching
PhraseRanker r = new PhraseRanker();

// list of tokens after tokenization (without stemming)
List<String> tokens = new ArrayList<>();
tokens.add("github"); // [ 661b0e19c22b5f3c3cc84f54, 661b0e25c22b5f3c3cc84f5e,
// 661b0e26c22b5f3c3cc84f60, 661b0e27c22b5f3c3cc84f63,
// 661b0e33c22b5f3c3cc84f7a, 661b0e34c22b5f3c3cc84f83 ]

tokens.add("git"); // [ 661b0e27c22b5f3c3cc84f63, 661b0e2dc22b5f3c3cc84f68,
// 661b0e33c22b5f3c3cc84f72, 661b0e34c22b5f3c3cc84f83,
// 661b0e34c22b5f3c3cc84f8b ]

List<ObjectId> matchedDocs = r.rank(tokens); // expected output: [661b0e27c22b5f3c3cc84f63,
// 661b0e34c22b5f3c3cc84f83]

for (ObjectId id : matchedDocs)
System.out.println(id);
}
String query = "The Free Encyclopedia";
r.rank(query);

query = "domestic cat";
r.rank(query);

}
}
88 changes: 71 additions & 17 deletions mistermeow/app/src/main/java/meowranker/PhraseRanker.java
Original file line number Diff line number Diff line change
@@ -1,42 +1,96 @@
package meowranker;

import java.util.*;

import org.bson.Document;
import org.bson.types.ObjectId;
import org.jsoup.Jsoup;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PhraseRanker extends Ranker {

public class PhraseRanker extends Ranker{

public PhraseRanker(){
public PhraseRanker() {
super();
}

//type to be changed later
List<ObjectId> rank(List<String> tokens){
List<ObjectId> matchedDocs = getMatchingDocs(tokens);

// TODO: change function return type
/**
 * Runs a phrase search for {@code query}, prints the number of matches followed by the
 * {@code URL} field of each matched document, and returns the matches unranked.
 *
 * @param query the phrase to search for
 * @return the documents returned by {@code getMatchingDocs(query)}, in that order
 */
public List<Document> rank(String query) {
  // TODO: build the M matrix needed to calculate popularity
  // TODO: call getPopularity()

  final List<Document> results = getMatchingDocs(query);

  System.out.println(results.size());
  results.forEach(hit -> System.out.println(hit.getString("URL")));

  // TODO: sort the results so the highest rank comes first
  return results;
}

private List<ObjectId> getMatchingDocs(List<String> searchTokens){
/**
 * Finds the documents whose visible text contains {@code query} as a literal, case-insensitive
 * phrase.
 *
 * <p>Candidates are the documents returned by {@code getCommonDocs} for the tokenized query.
 * Each candidate is loaded by id, its {@code content} HTML is reduced to plain text with Jsoup,
 * and the text is searched for the phrase.
 *
 * @param query the raw phrase to look for
 * @return the matching documents in candidate order; never {@code null}
 */
private List<Document> getMatchingDocs(String query) {
  List<String> searchTokens = tokenizer.tokenizeString(query);

  // Quote the phrase so regex metacharacters in the query ("c++", "what?", "a(b") are matched
  // literally instead of being interpreted or throwing PatternSyntaxException. The pattern does
  // not depend on the document, so it is compiled once rather than once per candidate.
  Pattern phrase = Pattern.compile(Pattern.quote(query), Pattern.CASE_INSENSITIVE);

  List<Document> finalDocs = new ArrayList<>();
  for (Document doc : getCommonDocs(searchTokens)) {
    if (doc == null) {
      continue; // candidate could not be loaded
    }

    // NOTE(review): getCommonDocs appears to return full documents already, so this second
    // lookup by id looks redundant - confirm before removing it.
    ObjectId id = doc.getObjectId("_id");
    Document currDoc = db.getDocument(id.toString());
    if (currDoc == null) {
      continue;
    }

    String content = currDoc.getString("content");
    if (content == null) {
      continue; // nothing to search; Jsoup.parse rejects null input
    }

    String text = Jsoup.parse(content).text(); // strip the markup, keep the visible text
    if (phrase.matcher(text).find()) {
      finalDocs.add(currDoc);
    }
  }

  return finalDocs;
}

/**
 * Intersects the "docs" lists of the inverted-index entries of all search tokens by "_id".
 *
 * NOTE(review): this span comes from a diff view whose +/- markers are missing, so statements
 * that look like the previous and the new version of the method appear together (duplicate loop
 * headers, both `return docsId;` and `return commonDocs;`). Read it as a diff, not as one
 * compilable method body.
 */
private List<Document> getCommonDocs(List<String> searchTokens) {

// No tokens: nothing can be in common.
if (searchTokens.size() == 0)
return new ArrayList<>();

// NOTE(review): the entry for the first token is dereferenced without a null check, while the
// loop below guards `invertedInd != null` - presumably getInvertedIndex can return null; verify.
List<Document> docs = db.getInvertedIndex(searchTokens.get(0)).getList("docs", Document.class);
List<ObjectId> docsId = new ArrayList<>();

// Seed the running intersection with the ids listed for the first token.
for(Document doc :docs){
for (Document doc : docs) {
docsId.add(doc.getObjectId("_id"));
}

for(int i = 1; i<searchTokens.size() ; i++){
List<Document> currDocs = db.getInvertedIndex(searchTokens.get(i)).getList("docs", Document.class);
List<ObjectId> currDocsId = new ArrayList<>();
for (int i = 1; i < searchTokens.size(); i++) {
Document invertedInd = db.getInvertedIndex(searchTokens.get(i));
// NOTE(review): a token without an index entry is skipped here, so it does not narrow the
// result - confirm that this is intended rather than yielding an empty intersection.
if (invertedInd != null) {

for(Document doc :currDocs){
currDocsId.add(doc.getObjectId("_id"));
List<Document> currDocs = invertedInd.getList("docs", Document.class);
List<ObjectId> currDocsId = new ArrayList<>();

for (Document doc : currDocs) {
currDocsId.add(doc.getObjectId("_id"));
}

// Keep only the ids that are also listed for token i.
docsId.retainAll(currDocsId);
}
}

docsId.retainAll(currDocsId);
// Load one document per surviving id.
List<Document> commonDocs = new ArrayList<>();
for (ObjectId id : docsId) {
commonDocs.add(db.getDocument(id.toString()));
}

return docsId;
return commonDocs;
}
}
7 changes: 5 additions & 2 deletions mistermeow/app/src/main/java/meowranker/Ranker.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
import java.util.*;
import java.lang.Math;
import meowdbmanager.DBManager;
import meowindexer.Tokenizer;

public class Ranker {

public DBManager db;

public Ranker() {
public Tokenizer tokenizer;

public Ranker(){
db = new DBManager();
tokenizer = new Tokenizer();
}

// The function takes graph of links between documents
Expand Down