Skip to content

Commit

Permalink
feat: front and back are connected
Browse files Browse the repository at this point in the history
  • Loading branch information
AhmedHamed3699 committed May 13, 2024
1 parent eba4cb7 commit e820eca
Show file tree
Hide file tree
Showing 7 changed files with 162 additions and 150 deletions.
2 changes: 2 additions & 0 deletions mistermeow/app/src/main/java/meowEngine/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

import meowindexer.Tokenizer;

@SpringBootApplication
public class Main {
public static void main(String[] args) {
Expand Down
39 changes: 21 additions & 18 deletions mistermeow/app/src/main/java/meowEngine/QueryEngineController.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package meowEngine;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand All @@ -26,7 +27,7 @@ public class QueryEngineController {
private boolean isPhraseMatching, isFirstTime;
private String[] phrases;
private int[] operators; // 0: None, 1: AND, 2: OR, 3: NOT
private List<String> tags, suggestions;
private List<String> tokens, tags, suggestions;
private int resultCount;
private final int numOfDocsInPage = 20, windowCharSize = 100;

Expand All @@ -41,6 +42,7 @@ public QueryEngineController() {
phrases = new String[3];
operators = new int[2];
tags = new ArrayList<>();
tokens = new ArrayList<>();
suggestions = new ArrayList<>();
resultCount = 0;
}
Expand All @@ -65,13 +67,14 @@ public Document searchQuery(
isFirstTime = true;

if (isFirstTime) {
parse(query);
dbManager.insertSuggestion(query);
docs = rankDocs(query.toLowerCase().split("\\s+"));
isFirstTime = false;
currentQuery = query;
parse(currentQuery);
dbManager.insertSuggestion(currentQuery);
tokens = tokenizer.tokenizeString(currentQuery, false);
tags = tokenizer.tokenizeString(currentQuery, false);
docs = rankDocs();
isFirstTime = false;
resultCount = docs.size();
tags = tokenizer.tokenizeString(currentQuery);
suggestions = dbManager.getSuggestions(query, 10);
}

Expand Down Expand Up @@ -113,10 +116,10 @@ private Document getResults(List<ObjectId> docs) {
for (Document result : results) {
String doc = result.getString("content");
String snippet = isPhraseMatching ? getSnippet(doc, phrases[0])
: getSnippet(doc, tags);
: getSnippet(doc, tokens);
result.remove("content");
result.remove("_id");
result.append("snippet", snippet);
result.append("snippets", snippet);
}

System.out.println(results);
Expand All @@ -132,35 +135,35 @@ public String getSnippet(String doc, List<String> tokens) {
String textContent = Jsoup.parse(doc).text();

for (String token : tokens) {
token = " " + token + " ";
if (textContent.contains(token)) {
int index = textContent.indexOf(token);
Matcher tokenMatch = Pattern.compile("\\b" + token + "\\b").matcher(textContent);
if (tokenMatch.find()) {
int index = tokenMatch.start();
int start = Math.max(0, index - windowCharSize);
int end = Math.min(textContent.length(), index + windowCharSize);
return textContent.substring(start, end);
}
}

return "No Snippet Found";
return null;
}

public String getSnippet(String doc, String phrase) {
String textContent = Jsoup.parse(doc).text();

phrase = " " + phrase + " ";
if (textContent.contains(phrase)) {
int index = textContent.indexOf(phrase);
Matcher phraseMatch = Pattern.compile("//b" + phrase + "//b").matcher(textContent);
if (phraseMatch.find()) {
int index = phraseMatch.start();
int start = Math.max(0, index - windowCharSize);
int end = Math.min(textContent.length(), index + windowCharSize);
return textContent.substring(start, end);
}

return "No Snippet Found";
return null;
}

private List<ObjectId> rankDocs(String[] tokens) {
private List<ObjectId> rankDocs() {
if (isPhraseMatching)
return phraseRanker.rank(phrases[0]);
return dbManager.getDocIDs(tokens);
return dbManager.getDocIDs(tags);
}
}
5 changes: 2 additions & 3 deletions mistermeow/app/src/main/java/meowdbmanager/DBManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -388,13 +388,12 @@ public double getDocumentFromInverted(String token, ObjectId docID) {
}
}

public List<ObjectId> getDocIDs(String[] tokens) {
public List<ObjectId> getDocIDs(List<String> tokens) {
List<ObjectId> docIds = new ArrayList<>();
List<String> tokenList = Arrays.asList(tokens);

try {
List<Document> pipeline = new ArrayList<>();
pipeline.add(new Document("$match", new Document("token", new Document("$in", tokenList))));
pipeline.add(new Document("$match", new Document("token", new Document("$in", tokens))));
pipeline.add(new Document("$unwind", "$docs"));
pipeline.add(new Document("$project", new Document("_id", "$docs._id")));

Expand Down
41 changes: 21 additions & 20 deletions mistermeow/app/src/main/java/meowindexer/Tokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ public static HashSet<String> loadStopWords(String filename) {
*/
public HashMap<String, Token> tokenize(Document doc) {
String text = doc.text();
List<String> tokens = tokenizeString(text);
List<String> tokens = tokenizeString(text, true);
HashMap<String, Token> tokenMap = new HashMap<String, Token>();

for (String token : tokens) {
Expand Down Expand Up @@ -121,16 +121,16 @@ public HashMap<String, Token> tokenize(Document doc) {
* @param text: String to tokenize
* @return List of tokens
*/
public List<String> tokenizeString(String text) {
public List<String> tokenizeString(String text, boolean stem) {
List<String> tokens = new ArrayList<String>();
PorterStemmer stemmer = new PorterStemmer();

String cleanText = text.toLowerCase().replaceAll("[^a-zA-Z ]", "");
String cleanText = text.toLowerCase().replaceAll("[^a-z ]", "");
String[] words = cleanText.split("\\s+");

for (String word : words) {
if (word.length() > 1 && !stopWords.contains(word)) {
String stemmedWord = stemmer.stem(word);
String stemmedWord = stem ? stemmer.stem(word) : word;

if (!stemmedWord.equals(word))
tokens.add(stemmer.stem(word));
Expand All @@ -149,9 +149,9 @@ public List<String> tokenizeString(String text) {
* @param doc: Document to search for token positions
*/
private void fillPosistions(HashMap<String, Token> tokens, Document doc) {
List<String> titleTokens = tokenizeString(doc.title());
List<String> h1Tokens = tokenizeString(doc.select("h1").text());
List<String> h2Tokens = tokenizeString(doc.select("h2").text());
List<String> titleTokens = tokenizeString(doc.title(), true);
List<String> h1Tokens = tokenizeString(doc.select("h1").text(), true);
List<String> h2Tokens = tokenizeString(doc.select("h2").text(), true);

for (String token : tokens.keySet()) {
Token t = tokens.get(token);
Expand All @@ -178,24 +178,25 @@ public void test() {
System.out.println(ANSI_RESET);

Document doc = null;
final String url = "https://en.wikipedia.org/wiki/Cat";
final String url = "https://www.imdb.com/chart/top/";
try {
doc = Jsoup.connect(url).get();
} catch (IOException e) {
e.printStackTrace();
}

System.out.println("tokenizing: " + url + " : " + doc.title());
HashMap<String, Token> tokens = tokenize(doc);
// print sorted by count
System.out.println("Sorted by count:");
final String ANSI_YELLOW = "\u001B[33m";
final String ANSI_RESET2 = "\u001B[0m";
tokens.entrySet().stream().forEach(
e -> System.out.println(ANSI_YELLOW + "{ "
+ "word: " + e.getKey() + ", "
+ "count: " + e.getValue().count + ", "
+ "position: " + e.getValue().position + " }" +
ANSI_RESET2));
System.out.println(doc);
// System.out.println("tokenizing: " + url + " : " + doc.title());
// HashMap<String, Token> tokens = tokenize(doc);
// // print sorted by count
// System.out.println("Sorted by count:");
// final String ANSI_YELLOW = "\u001B[33m";
// final String ANSI_RESET2 = "\u001B[0m";
// tokens.entrySet().stream().forEach(
// e -> System.out.println(ANSI_YELLOW + "{ "
// + "word: " + e.getKey() + ", "
// + "count: " + e.getValue().count + ", "
// + "position: " + e.getValue().position + " }" +
// ANSI_RESET2));
}
}
2 changes: 1 addition & 1 deletion mistermeow/app/src/main/java/meowranker/PhraseRanker.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public List<ObjectId> rank(String query) {
double[] popularity = this.getPopularity(M, M.length);

// Tokenizing query
List<String> searchTokens = tokenizer.tokenizeString(query);
List<String> searchTokens = tokenizer.tokenizeString(query, true);
System.out.println(searchTokens);

// getting docs common in all tokens & matches the query phrase
Expand Down
47 changes: 25 additions & 22 deletions mistermeow/app/src/meowapp/src/SRP.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -93,29 +93,32 @@ function SRP() {
</div>
<div className="grid grid-cols-1 md:grid-cols-3 gap-4 mt-4">
<div className="col-span-2">
{data.results.map((result) => (
<div className="mb-4" key={result.url}>
<a href={result.url} target="_blank" rel="noreferrer">
<div className="flex gap-2">
<CatIcon />
<div className="flex flex-col ">
<span className="text-sm text-sr-host leading-tight">
{result.host}
</span>
<span className="text-sm text-sr-url leading-tight hover:underline">
{result.url}
</span>
</div>
{data.results.map(
(result) =>
result.snippets && (
<div className="mb-4" key={result.URL}>
<a href={result.URL} target="_blank">
<div className="flex gap-2">
<CatIcon />
<div className="flex flex-col ">
<span className="text-sm text-sr-host leading-tight">
{result.host}
</span>
<span className="text-sm text-sr-url leading-tight hover:underline">
{result.URL}
</span>
</div>
</div>
<h3 className="text-sr-title text-xl hover:underline">
{result.title}
</h3>
<p className="text-sr-snippet leading-tight">
{result.snippets}
</p>
</a>
</div>
<h3 className="text-sr-title text-xl hover:underline">
{result.title}
</h3>
<p className="text-sr-snippet leading-tight">
{result.snippets}
</p>
</a>
</div>
))}
),
)}
<SRPPagination
page={Number(page)}
query={query}
Expand Down
Loading

0 comments on commit e820eca

Please sign in to comment.