From e820ecaabf737e7d4ba95d6cac5c4c0dee5172e2 Mon Sep 17 00:00:00 2001 From: AhmedHamed3699 Date: Mon, 13 May 2024 18:46:36 +0300 Subject: [PATCH] feat: front and back are connected --- .../app/src/main/java/meowEngine/Main.java | 2 + .../meowEngine/QueryEngineController.java | 39 ++-- .../main/java/meowdbmanager/DBManager.java | 5 +- .../src/main/java/meowindexer/Tokenizer.java | 41 ++-- .../main/java/meowranker/PhraseRanker.java | 2 +- mistermeow/app/src/meowapp/src/SRP.tsx | 47 ++--- .../app/src/meowapp/src/utils/results-api.tsx | 176 +++++++++--------- 7 files changed, 162 insertions(+), 150 deletions(-) diff --git a/mistermeow/app/src/main/java/meowEngine/Main.java b/mistermeow/app/src/main/java/meowEngine/Main.java index 2610981..3bd540b 100644 --- a/mistermeow/app/src/main/java/meowEngine/Main.java +++ b/mistermeow/app/src/main/java/meowEngine/Main.java @@ -3,6 +3,8 @@ import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; +import meowindexer.Tokenizer; + @SpringBootApplication public class Main { public static void main(String[] args) { diff --git a/mistermeow/app/src/main/java/meowEngine/QueryEngineController.java b/mistermeow/app/src/main/java/meowEngine/QueryEngineController.java index 6f4a240..531cf10 100644 --- a/mistermeow/app/src/main/java/meowEngine/QueryEngineController.java +++ b/mistermeow/app/src/main/java/meowEngine/QueryEngineController.java @@ -1,6 +1,7 @@ package meowEngine; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -26,7 +27,7 @@ public class QueryEngineController { private boolean isPhraseMatching, isFirstTime; private String[] phrases; private int[] operators; // 0: None, 1: AND, 2: OR, 3: NOT - private List tags, suggestions; + private List tokens, tags, suggestions; private int resultCount; private final int numOfDocsInPage = 20, windowCharSize = 100; @@ -41,6 +42,7 @@ 
public QueryEngineController() { phrases = new String[3]; operators = new int[2]; tags = new ArrayList<>(); + tokens = new ArrayList<>(); suggestions = new ArrayList<>(); resultCount = 0; } @@ -65,13 +67,14 @@ public Document searchQuery( isFirstTime = true; if (isFirstTime) { - parse(query); - dbManager.insertSuggestion(query); - docs = rankDocs(query.toLowerCase().split("\\s+")); - isFirstTime = false; currentQuery = query; + parse(currentQuery); + dbManager.insertSuggestion(currentQuery); + tokens = tokenizer.tokenizeString(currentQuery, false); + tags = tokenizer.tokenizeString(currentQuery, false); + docs = rankDocs(); + isFirstTime = false; resultCount = docs.size(); - tags = tokenizer.tokenizeString(currentQuery); suggestions = dbManager.getSuggestions(query, 10); } @@ -113,10 +116,10 @@ private Document getResults(List docs) { for (Document result : results) { String doc = result.getString("content"); String snippet = isPhraseMatching ? getSnippet(doc, phrases[0]) - : getSnippet(doc, tags); + : getSnippet(doc, tokens); result.remove("content"); result.remove("_id"); - result.append("snippet", snippet); + result.append("snippets", snippet); } System.out.println(results); @@ -132,35 +135,35 @@ public String getSnippet(String doc, List tokens) { String textContent = Jsoup.parse(doc).text(); for (String token : tokens) { - token = " " + token + " "; - if (textContent.contains(token)) { - int index = textContent.indexOf(token); + Matcher tokenMatch = Pattern.compile("\\b" + token + "\\b").matcher(textContent); + if (tokenMatch.find()) { + int index = tokenMatch.start(); int start = Math.max(0, index - windowCharSize); int end = Math.min(textContent.length(), index + windowCharSize); return textContent.substring(start, end); } } - return "No Snippet Found"; + return null; } public String getSnippet(String doc, String phrase) { String textContent = Jsoup.parse(doc).text(); - phrase = " " + phrase + " "; - if (textContent.contains(phrase)) { - int index = 
textContent.indexOf(phrase); + Matcher phraseMatch = Pattern.compile("\\b" + phrase + "\\b").matcher(textContent); + if (phraseMatch.find()) { + int index = phraseMatch.start(); int start = Math.max(0, index - windowCharSize); int end = Math.min(textContent.length(), index + windowCharSize); return textContent.substring(start, end); } - return "No Snippet Found"; + return null; } - private List rankDocs(String[] tokens) { + private List rankDocs() { if (isPhraseMatching) return phraseRanker.rank(phrases[0]); - return dbManager.getDocIDs(tokens); + return dbManager.getDocIDs(tags); } } diff --git a/mistermeow/app/src/main/java/meowdbmanager/DBManager.java b/mistermeow/app/src/main/java/meowdbmanager/DBManager.java index 469e4be..93bdea2 100644 --- a/mistermeow/app/src/main/java/meowdbmanager/DBManager.java +++ b/mistermeow/app/src/main/java/meowdbmanager/DBManager.java @@ -388,13 +388,12 @@ public double getDocumentFromInverted(String token, ObjectId docID) { } } - public List getDocIDs(String[] tokens) { + public List getDocIDs(List tokens) { List docIds = new ArrayList<>(); - List tokenList = Arrays.asList(tokens); try { List pipeline = new ArrayList<>(); - pipeline.add(new Document("$match", new Document("token", new Document("$in", tokenList)))); + pipeline.add(new Document("$match", new Document("token", new Document("$in", tokens)))); pipeline.add(new Document("$unwind", "$docs")); pipeline.add(new Document("$project", new Document("_id", "$docs._id"))); diff --git a/mistermeow/app/src/main/java/meowindexer/Tokenizer.java b/mistermeow/app/src/main/java/meowindexer/Tokenizer.java index 34b5d95..1fd5cc4 100644 --- a/mistermeow/app/src/main/java/meowindexer/Tokenizer.java +++ b/mistermeow/app/src/main/java/meowindexer/Tokenizer.java @@ -89,7 +89,7 @@ public static HashSet loadStopWords(String filename) { */ public HashMap tokenize(Document doc) { String text = doc.text(); - List tokens = tokenizeString(text); + List tokens = tokenizeString(text, true); HashMap 
tokenMap = new HashMap(); for (String token : tokens) { @@ -121,16 +121,16 @@ public HashMap tokenize(Document doc) { * @param text: String to tokenize * @return List of tokens */ - public List tokenizeString(String text) { + public List tokenizeString(String text, boolean stem) { List tokens = new ArrayList(); PorterStemmer stemmer = new PorterStemmer(); - String cleanText = text.toLowerCase().replaceAll("[^a-zA-Z ]", ""); + String cleanText = text.toLowerCase().replaceAll("[^a-z ]", ""); String[] words = cleanText.split("\\s+"); for (String word : words) { if (word.length() > 1 && !stopWords.contains(word)) { - String stemmedWord = stemmer.stem(word); + String stemmedWord = stem ? stemmer.stem(word) : word; if (!stemmedWord.equals(word)) tokens.add(stemmer.stem(word)); @@ -149,9 +149,9 @@ public List tokenizeString(String text) { * @param doc: Document to search for token positions */ private void fillPosistions(HashMap tokens, Document doc) { - List titleTokens = tokenizeString(doc.title()); - List h1Tokens = tokenizeString(doc.select("h1").text()); - List h2Tokens = tokenizeString(doc.select("h2").text()); + List titleTokens = tokenizeString(doc.title(), true); + List h1Tokens = tokenizeString(doc.select("h1").text(), true); + List h2Tokens = tokenizeString(doc.select("h2").text(), true); for (String token : tokens.keySet()) { Token t = tokens.get(token); @@ -178,24 +178,25 @@ public void test() { System.out.println(ANSI_RESET); Document doc = null; - final String url = "https://en.wikipedia.org/wiki/Cat"; + final String url = "https://www.imdb.com/chart/top/"; try { doc = Jsoup.connect(url).get(); } catch (IOException e) { e.printStackTrace(); } - System.out.println("tokenizing: " + url + " : " + doc.title()); - HashMap tokens = tokenize(doc); - // print sorted by count - System.out.println("Sorted by count:"); - final String ANSI_YELLOW = "\u001B[33m"; - final String ANSI_RESET2 = "\u001B[0m"; - tokens.entrySet().stream().forEach( - e -> 
System.out.println(ANSI_YELLOW + "{ " - + "word: " + e.getKey() + ", " - + "count: " + e.getValue().count + ", " - + "position: " + e.getValue().position + " }" + - ANSI_RESET2)); + System.out.println(doc); + // System.out.println("tokenizing: " + url + " : " + doc.title()); + // HashMap tokens = tokenize(doc); + // // print sorted by count + // System.out.println("Sorted by count:"); + // final String ANSI_YELLOW = "\u001B[33m"; + // final String ANSI_RESET2 = "\u001B[0m"; + // tokens.entrySet().stream().forEach( + // e -> System.out.println(ANSI_YELLOW + "{ " + // + "word: " + e.getKey() + ", " + // + "count: " + e.getValue().count + ", " + // + "position: " + e.getValue().position + " }" + + // ANSI_RESET2)); } } diff --git a/mistermeow/app/src/main/java/meowranker/PhraseRanker.java b/mistermeow/app/src/main/java/meowranker/PhraseRanker.java index 888c673..3a6483d 100644 --- a/mistermeow/app/src/main/java/meowranker/PhraseRanker.java +++ b/mistermeow/app/src/main/java/meowranker/PhraseRanker.java @@ -21,7 +21,7 @@ public List rank(String query) { double[] popularity = this.getPopularity(M, M.length); // Tokenizing query - List searchTokens = tokenizer.tokenizeString(query); + List searchTokens = tokenizer.tokenizeString(query, true); System.out.println(searchTokens); // getting docs common in all tokens & matches the query phrase diff --git a/mistermeow/app/src/meowapp/src/SRP.tsx b/mistermeow/app/src/meowapp/src/SRP.tsx index d4a3bf6..5f82f4e 100644 --- a/mistermeow/app/src/meowapp/src/SRP.tsx +++ b/mistermeow/app/src/meowapp/src/SRP.tsx @@ -93,29 +93,32 @@ function SRP() {