Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ranker popularity #32

Merged
merged 2 commits into from
May 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 41 additions & 23 deletions mistermeow/app/src/main/java/meowcrawler/Crawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,24 @@ public class Crawler implements Runnable {
static private QueueManager qM = new QueueManager();
static private DBManager db = new DBManager();
static private int countOfDocumentsCrawled = 0;
static private int rankerIndex = 0;

/**
 * handleHashingURL - takes a Url, hashes it, and checks whether it was crawled before.
*
* @param nUrl - the Url to handle.
* @param nUrl - the Url to handle.
 * @param parent_id - database id of the parent url of the current url (-1 if the url has no parent).
* @return boolean - true if url is new (not crawled before), else return
* false.
*/
private boolean handleHashingURL(Url nUrl) {
private boolean handleHashingURL(Url nUrl, int parent_id) {
// Hash and check if the url was not crawled (store it if it wasn't)
synchronized (hM) {
String hashedUrl = hM.HashAndCheckURL(nUrl);
if (hashedUrl == null) {
synchronized (db) {
db.incrementPopularity("URL", nUrl.getUrlString());
if (parent_id != -1)
db.updateParents("hashedURL", nUrl.getHashedURL(), parent_id);
}
return false;
}
Expand All @@ -39,11 +42,12 @@ private boolean handleHashingURL(Url nUrl) {
* handleHashingDoc - takes a nUrl and handles its document fetching, hashing
* and insertion into DB.
*
* @param nUrl - the given url.
* @param doc - the html document of the url.
* @param nUrl - the given url.
* @param doc - the html document of the url.
 * @param parent_id - database id of the parent url of the current url (-1 if the url has no parent).
* @return void.
*/
private void handleHashingDoc(Url nUrl, org.jsoup.nodes.Document doc) {
private void handleHashingDoc(Url nUrl, org.jsoup.nodes.Document doc, int parent_id) {
// Make sure that we are crawling english websites only.
String docLang = doc.select("html").attr("lang");
boolean insertOrNot = false;
Expand All @@ -68,7 +72,7 @@ private void handleHashingDoc(Url nUrl, org.jsoup.nodes.Document doc) {
}
}

this.handleInsertionIntoDB(insertOrNot, nUrl, outerDoc, doc.title());
this.handleInsertionIntoDB(insertOrNot, nUrl, outerDoc, doc.title(), parent_id);
outerDoc = null;
}

Expand All @@ -81,24 +85,32 @@ private void handleHashingDoc(Url nUrl, org.jsoup.nodes.Document doc) {
* @param nUrl - the nUrl from which we would get its related data.
* @param doc - outerHtml document of the url.
* @param title - of the document.
 * @param parent_id - database id of the parent url of the current url (-1 if the url has no parent).
* @return void.
*/
private void handleInsertionIntoDB(boolean insertOrNot, Url nUrl, String doc,
String title) {
String title, int parent_id) {
final String ANSI_CYAN = "\u001B[36m";

// check if the url & its doc needs to be put into the database.
if (insertOrNot) {

List<Integer> parents = new ArrayList<>();
parents.add(parent_id);

synchronized (db) {
db.insertDocument(nUrl.getUrlString(), title, nUrl.getDomainName(), doc,
nUrl.getHashedURL(), nUrl.getHashedDoc());
nUrl.getHashedURL(), nUrl.getHashedDoc(), rankerIndex, parents);

nUrl.setRankerId(rankerIndex);

System.out.println(ANSI_CYAN + "|| Inserted " + nUrl.getUrlString() +
" into the database"
+ " Count: " + ++countOfDocumentsCrawled + " ||");
" into the database"
+ " Count: " + ++countOfDocumentsCrawled + " ||" + " RankerId: " + rankerIndex++ + " ||");
}
} else {
synchronized (db) {
db.incrementPopularity("hashedDoc", nUrl.getHashedDoc());
db.updateParents("hashedDoc", nUrl.getHashedDoc(), parent_id);
}
}

Expand All @@ -112,21 +124,21 @@ private void handleInsertionIntoDB(boolean insertOrNot, Url nUrl, String doc,
* @param urls - the set of urls extracted from the html document.
* @return void.
*/
public void HandleHashing(Set<String> urls) {
public void HandleHashing(Set<String> urls, int parent_id) {
for (String url : urls) {
// Create a Url object for the url string.
Url nUrl = new Url(url, 1);
org.jsoup.nodes.Document doc = null;

if (!this.handleHashingURL(nUrl)) {
if (!this.handleHashingURL(nUrl, parent_id)) {
continue;
}

// Fetch the document of the url, then hash and check it.
doc = nUrl.fetchDocument();

if (doc != null) {
this.handleHashingDoc(nUrl, doc);
this.handleHashingDoc(nUrl, doc, parent_id);
}

doc = null;
Expand Down Expand Up @@ -167,7 +179,7 @@ public void run() {
URLsHandler urlH = new URLsHandler();
Set<String> extractedUrls = urlH.HandleURLs(doc, url.getUrlString());

HandleHashing(extractedUrls);
HandleHashing(extractedUrls, url.getRankerId());

doc = null;
}
Expand All @@ -182,12 +194,16 @@ public void run() {
static public void loadHashedData() {
List<Document> urlsData = null;

synchronized (db) { urlsData = db.retrieveHashedDataOfUrls(); }
synchronized (db) {
urlsData = db.retrieveHashedDataOfUrls();
}

synchronized (hM) {
hM.fillHashedURLs(urlsData);
hM.fillHashedDocs(urlsData);
}

rankerIndex = urlsData.size();
}

/**
Expand All @@ -200,17 +216,20 @@ static public void loadQueueData() {
List<Document> data = null;
int count = 0;

synchronized (db) { data = db.retrieveUrlsInQueue(); }
synchronized (db) {
data = db.retrieveUrlsInQueue();
}

System.out.println("size of retrieved urls that was in queue: " +
data.size());
data.size());

for (Document urlData : data) {
try {
Url url = new Url(urlData.getString("URL"), 1);

url.setHashedDoc(urlData.getString("hashedDoc"));
url.setHashedURL(urlData.getString("hashedURL"));
url.setRankerId(urlData.getInteger("ranker_id"));

synchronized (qM) {
qM.push(url);
Expand All @@ -229,9 +248,8 @@ static public void loadQueueData() {
* A static function that provides initial seed for the queueManager.
*/
static public void ProvideSeed(List<Url> urls) {
Set<String> seeds =
urls.stream().map(Url::getUrlString).collect(Collectors.toSet());
Set<String> seeds = urls.stream().map(Url::getUrlString).collect(Collectors.toSet());
Crawler c = new Crawler();
c.HandleHashing(seeds);
c.HandleHashing(seeds, -1);
}
}
}
44 changes: 38 additions & 6 deletions mistermeow/app/src/main/java/meowcrawler/Url.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ public class Url {
private static final int minPriority = 3;
private String hashedURL;
private String hashedDoc;
private int rankerId;

public Url(String s, int p) {
this.urlString = s;
Expand All @@ -18,35 +19,62 @@ public Url(String s, int p) {
this.hashedDoc = null;
}

/**
* Setter for the rankerId data member.
*
* @param id - the value to set with.
* @return void.
*/
public void setRankerId(int id) {
this.rankerId = id;
}

/**
* Getter for the rankerId data member.
*
 * @return the rankerId of this url.
*/
public int getRankerId() {
return this.rankerId;
}

/**
* Setter for the hashedURL data member.
*
* @param value - the value to set with.
* @return void.
*/
public void setHashedURL(String value) { this.hashedURL = value; }
public void setHashedURL(String value) {
this.hashedURL = value;
}

/**
* Setter for the hashedDoc data member.
*
* @param value - the value to set with.
* @return void.
*/
public void setHashedDoc(String value) { this.hashedDoc = value; }
public void setHashedDoc(String value) {
this.hashedDoc = value;
}

/**
* Getter for the hashedURL data member.
*
* @return hashedURL.
*/
public String getHashedURL() { return this.hashedURL; }
public String getHashedURL() {
return this.hashedURL;
}

/**
* Getter for the hashedDoc data member.
*
* @return hashedDoc.
*/
public String getHashedDoc() { return this.hashedDoc; }
public String getHashedDoc() {
return this.hashedDoc;
}

/**
* Fetches the htmlDoc of the URL with the urlString datamember and returns
Expand All @@ -70,7 +98,9 @@ public String toString() {
return "{ " + this.urlString + ", " + this.priority + " }";
}

public int getPriority() { return priority; }
public int getPriority() {
return priority;
}

public boolean decrementPriority() {
if (priority > minPriority) {
Expand All @@ -80,7 +110,9 @@ public boolean decrementPriority() {
return false;
}

public String getUrlString() { return urlString; }
public String getUrlString() {
return urlString;
}

public String getDomainName() {
return this.urlString.split("/")[2];
Expand Down
Loading