Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ranker popularity #32

Merged
merged 2 commits into from
May 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 41 additions & 23 deletions mistermeow/app/src/main/java/meowcrawler/Crawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,24 @@ public class Crawler implements Runnable {
static private QueueManager qM = new QueueManager();
static private DBManager db = new DBManager();
static private int countOfDocumentsCrawled = 0;
static private int rankerIndex = 0;

/**
 * handleHashingURL - takes a Url, hashes it, and checks whether it was crawled before.
*
* @param nUrl - the Url to handle.
* @param nUrl - the Url to handle.
 * @param parent_id - database id of the parent url of the current url (-1 if the url has no parent).
* @return boolean - true if url is new (not crawled before), else return
* false.
*/
private boolean handleHashingURL(Url nUrl) {
private boolean handleHashingURL(Url nUrl, int parent_id) {
// Hash and check if the url was not crawled (store it if it wasn't)
synchronized (hM) {
String hashedUrl = hM.HashAndCheckURL(nUrl);
if (hashedUrl == null) {
synchronized (db) {
db.incrementPopularity("URL", nUrl.getUrlString());
if (parent_id != -1)
db.updateParents("hashedURL", nUrl.getHashedURL(), parent_id);
}
return false;
}
Expand All @@ -39,11 +42,12 @@ private boolean handleHashingURL(Url nUrl) {
* handleHashingDoc - takes a nUrl and handles its document fetching, hashing
* and insertion into DB.
*
* @param nUrl - the given url.
* @param doc - the html document of the url.
* @param nUrl - the given url.
* @param doc - the html document of the url.
 * @param parent_id - database id of the parent url of the current url (-1 if the url has no parent).
* @return void.
*/
private void handleHashingDoc(Url nUrl, org.jsoup.nodes.Document doc) {
private void handleHashingDoc(Url nUrl, org.jsoup.nodes.Document doc, int parent_id) {
// Make sure that we are crawling english websites only.
String docLang = doc.select("html").attr("lang");
boolean insertOrNot = false;
Expand All @@ -68,7 +72,7 @@ private void handleHashingDoc(Url nUrl, org.jsoup.nodes.Document doc) {
}
}

this.handleInsertionIntoDB(insertOrNot, nUrl, outerDoc, doc.title());
this.handleInsertionIntoDB(insertOrNot, nUrl, outerDoc, doc.title(), parent_id);
outerDoc = null;
}

Expand All @@ -81,24 +85,32 @@ private void handleHashingDoc(Url nUrl, org.jsoup.nodes.Document doc) {
* @param nUrl - the nUrl from which we would get its related data.
* @param doc - outerHtml document of the url.
* @param title - of the document.
 * @param parent_id - database id of the parent url of the current url (-1 if the url has no parent).
* @return void.
*/
private void handleInsertionIntoDB(boolean insertOrNot, Url nUrl, String doc,
String title) {
String title, int parent_id) {
final String ANSI_CYAN = "\u001B[36m";

// check if the url & its doc needs to be put into the database.
if (insertOrNot) {

List<Integer> parents = new ArrayList<>();
parents.add(parent_id);

synchronized (db) {
db.insertDocument(nUrl.getUrlString(), title, nUrl.getDomainName(), doc,
nUrl.getHashedURL(), nUrl.getHashedDoc());
nUrl.getHashedURL(), nUrl.getHashedDoc(), rankerIndex, parents);

nUrl.setRankerId(rankerIndex);

System.out.println(ANSI_CYAN + "|| Inserted " + nUrl.getUrlString() +
" into the database"
+ " Count: " + ++countOfDocumentsCrawled + " ||");
" into the database"
+ " Count: " + ++countOfDocumentsCrawled + " ||" + " RankerId: " + rankerIndex++ + " ||");
}
} else {
synchronized (db) {
db.incrementPopularity("hashedDoc", nUrl.getHashedDoc());
db.updateParents("hashedDoc", nUrl.getHashedDoc(), parent_id);
}
}

Expand All @@ -112,21 +124,21 @@ private void handleInsertionIntoDB(boolean insertOrNot, Url nUrl, String doc,
* @param urls - the set of urls extracted from the html document.
* @return void.
*/
public void HandleHashing(Set<String> urls) {
public void HandleHashing(Set<String> urls, int parent_id) {
for (String url : urls) {
// Create a Url object for the url string.
Url nUrl = new Url(url, 1);
org.jsoup.nodes.Document doc = null;

if (!this.handleHashingURL(nUrl)) {
if (!this.handleHashingURL(nUrl, parent_id)) {
continue;
}

// Fetch the document of the url, then hash and check it.
doc = nUrl.fetchDocument();

if (doc != null) {
this.handleHashingDoc(nUrl, doc);
this.handleHashingDoc(nUrl, doc, parent_id);
}

doc = null;
Expand Down Expand Up @@ -167,7 +179,7 @@ public void run() {
URLsHandler urlH = new URLsHandler();
Set<String> extractedUrls = urlH.HandleURLs(doc, url.getUrlString());

HandleHashing(extractedUrls);
HandleHashing(extractedUrls, url.getRankerId());

doc = null;
}
Expand All @@ -182,12 +194,16 @@ public void run() {
static public void loadHashedData() {
List<Document> urlsData = null;

synchronized (db) { urlsData = db.retrieveHashedDataOfUrls(); }
synchronized (db) {
urlsData = db.retrieveHashedDataOfUrls();
}

synchronized (hM) {
hM.fillHashedURLs(urlsData);
hM.fillHashedDocs(urlsData);
}

rankerIndex = urlsData.size();
}

/**
Expand All @@ -200,17 +216,20 @@ static public void loadQueueData() {
List<Document> data = null;
int count = 0;

synchronized (db) { data = db.retrieveUrlsInQueue(); }
synchronized (db) {
data = db.retrieveUrlsInQueue();
}

System.out.println("size of retrieved urls that was in queue: " +
data.size());
data.size());

for (Document urlData : data) {
try {
Url url = new Url(urlData.getString("URL"), 1);

url.setHashedDoc(urlData.getString("hashedDoc"));
url.setHashedURL(urlData.getString("hashedURL"));
url.setRankerId(urlData.getInteger("ranker_id"));

synchronized (qM) {
qM.push(url);
Expand All @@ -229,9 +248,8 @@ static public void loadQueueData() {
* A static function that provides initial seed for the queueManager.
*/
static public void ProvideSeed(List<Url> urls) {
Set<String> seeds =
urls.stream().map(Url::getUrlString).collect(Collectors.toSet());
Set<String> seeds = urls.stream().map(Url::getUrlString).collect(Collectors.toSet());
Crawler c = new Crawler();
c.HandleHashing(seeds);
c.HandleHashing(seeds, -1);
}
}
}
44 changes: 38 additions & 6 deletions mistermeow/app/src/main/java/meowcrawler/Url.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ public class Url {
private static final int minPriority = 3;
private String hashedURL;
private String hashedDoc;
private int rankerId;

public Url(String s, int p) {
this.urlString = s;
Expand All @@ -18,35 +19,62 @@ public Url(String s, int p) {
this.hashedDoc = null;
}

/**
* Setter for the rankerId data member.
*
* @param id - the value to set with.
* @return void.
*/
public void setRankerId(int id) {
this.rankerId = id;
}

/**
* Getter for the rankerId data member.
*
 * @return the rankerId of this url.
*/
public int getRankerId() {
return this.rankerId;
}

/**
* Setter for the hashedURL data member.
*
* @param value - the value to set with.
* @return void.
*/
public void setHashedURL(String value) { this.hashedURL = value; }
public void setHashedURL(String value) {
this.hashedURL = value;
}

/**
* Setter for the hashedDoc data member.
*
* @param value - the value to set with.
* @return void.
*/
public void setHashedDoc(String value) { this.hashedDoc = value; }
public void setHashedDoc(String value) {
this.hashedDoc = value;
}

/**
* Getter for the hashedURL data member.
*
* @return hashedURL.
*/
public String getHashedURL() { return this.hashedURL; }
public String getHashedURL() {
return this.hashedURL;
}

/**
* Getter for the hashedDoc data member.
*
* @return hashedDoc.
*/
public String getHashedDoc() { return this.hashedDoc; }
public String getHashedDoc() {
return this.hashedDoc;
}

/**
* Fetches the htmlDoc of the URL with the urlString datamember and returns
Expand All @@ -70,7 +98,9 @@ public String toString() {
return "{ " + this.urlString + ", " + this.priority + " }";
}

public int getPriority() { return priority; }
public int getPriority() {
return priority;
}

public boolean decrementPriority() {
if (priority > minPriority) {
Expand All @@ -80,7 +110,9 @@ public boolean decrementPriority() {
return false;
}

public String getUrlString() { return urlString; }
public String getUrlString() {
return urlString;
}

public String getDomainName() {
return this.urlString.split("/")[2];
Expand Down
Loading