amir-kedis · akramhany · May 13, 2024 · May 13, 2024 · May 13, 2024 · May 13, 2024
diff --git a/mistermeow/app/src/main/java/meowEngine/QueryEngineController.java b/mistermeow/app/src/main/java/meowEngine/QueryEngineController.java
@@ -5,7 +5,7 @@
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-
+import meowdbmanager.DBManager;
 import org.bson.Document;
 import org.bson.types.ObjectId;
 import org.jsoup.Jsoup;
@@ -59,9 +59,8 @@ public Document getSuggestions(@RequestParam("query") String query) {
   }
 
   @GetMapping("/search")
-  public Document searchQuery(
-      @RequestParam("query") String query,
-      @RequestParam("page") int page) {
+  public Document searchQuery(@RequestParam("query") String query,
+                              @RequestParam("page") int page) {
 
     if (!query.equals(currentQuery))
       isFirstTime = true;
@@ -90,7 +89,8 @@ private void parse(String query) {
     phrases[0] = phrases[1] = phrases[2] = null;
 
     Matcher phraseMatch = Pattern.compile("\"[^\"]+\"").matcher(query);
-    Matcher operatorMatch = Pattern.compile("\"\\s*(AND|OR|NOT)\\s*\"").matcher(query);
+    Matcher operatorMatch =
+        Pattern.compile("\"\\s*(AND|OR|NOT)\\s*\"").matcher(query);
 
     int i = 0;
     while (phraseMatch.find()) {

diff --git a/mistermeow/app/src/main/java/meowcrawler/Crawler.java b/mistermeow/app/src/main/java/meowcrawler/Crawler.java
@@ -1,8 +1,11 @@
 package meowcrawler;
 
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
 import java.util.stream.Collectors;
 import meowdbmanager.DBManager;
 import org.bson.Document;
@@ -47,7 +50,8 @@ private boolean handleHashingURL(Url nUrl, int parent_id) {
    * @param parent_id - parent url of the current url.
    * @return void.
    */
-  private void handleHashingDoc(Url nUrl, org.jsoup.nodes.Document doc, int parent_id) {
+  private void handleHashingDoc(Url nUrl, org.jsoup.nodes.Document doc,
+      int parent_id) {
     // Make sure that we are crawling english websites only.
     String docLang = doc.select("html").attr("lang");
     boolean insertOrNot = false;
@@ -72,7 +76,8 @@ private void handleHashingDoc(Url nUrl, org.jsoup.nodes.Document doc, int parent
       }
     }
 
-    this.handleInsertionIntoDB(insertOrNot, nUrl, outerDoc, doc.title(), parent_id);
+    this.handleInsertionIntoDB(insertOrNot, nUrl, outerDoc, doc.title(),
+        parent_id);
     outerDoc = null;
   }
 
@@ -100,13 +105,15 @@ private void handleInsertionIntoDB(boolean insertOrNot, Url nUrl, String doc,
 
       synchronized (db) {
         db.insertDocument(nUrl.getUrlString(), title, nUrl.getDomainName(), doc,
-            nUrl.getHashedURL(), nUrl.getHashedDoc(), rankerIndex, parents);
+            nUrl.getHashedURL(), nUrl.getHashedDoc(), rankerIndex,
+            parents);
 
         nUrl.setRankerId(rankerIndex);
 
         System.out.println(ANSI_CYAN + "|| Inserted " + nUrl.getUrlString() +
             " into the database"
-            + " Count: " + ++countOfDocumentsCrawled + " ||" + " RankerId: " + rankerIndex++ + " ||");
+            + " Count: " + ++countOfDocumentsCrawled + " ||"
+            + " RankerId: " + rankerIndex++ + " ||");
       }
     } else {
       synchronized (db) {
@@ -162,16 +169,18 @@ public void run() {
           url = qM.pop();
           System.out.println("|| Popped " + url.getUrlString() + " ||");
         } catch (Exception e) {
-          System.out.println("Queue is empty");
+          // System.out.println("Queue is empty");
           continue;
         }
       }
 
       synchronized (db) {
-        if (!db.updateInQueue(url.getUrlString(), false)) {
-          System.out.println("Couldn't update the inQueue field of the url");
+        try {
+          doc = Jsoup.parse(db.getUrlDocument(url.getUrlString()));
+        } catch (Exception e) {
+          System.out.println(e.getMessage());
+          continue;
         }
-        doc = Jsoup.parse(db.getUrlDocument(url.getUrlString()));
       }
 
       // Extract Urls and handle them, hash and check that they was not crawled
@@ -181,6 +190,12 @@ public void run() {
 
       HandleHashing(extractedUrls, url.getRankerId());
 
+      synchronized (db) {
+        if (!db.updateInQueue(url.getUrlString(), false)) {
+          System.out.println("Couldn't update the inQueue field of the url");
+        }
+      }
+
       doc = null;
     }
   }
@@ -249,7 +264,21 @@ static public void loadQueueData() {
    */
   static public void ProvideSeed(List<Url> urls) {
     Set<String> seeds = urls.stream().map(Url::getUrlString).collect(Collectors.toSet());
-    Crawler c = new Crawler();
-    c.HandleHashing(seeds, -1);
+    int numThreads = seeds.size();
+    ExecutorService executor = Executors.newFixedThreadPool(numThreads);
+
+    for (String seed : seeds) {
+      Crawler c = new Crawler();
+      Set<String> seedsSet = new HashSet<>();
+      seedsSet.add(seed);
+      executor.submit(() -> {
+        c.HandleHashing(seedsSet, -1);
+      });
+    }
+
+    executor.shutdown();
+    // Wait until all threads are finish
+    while (!executor.isTerminated()) {
+    }
   }
-}
+}