Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: construct graph in the ranker #35

Merged
merged 1 commit into from
May 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion mistermeow/app/src/main/java/meowdbmanager/DBManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,49 @@ public boolean updateParents(String key, String value, int parent_id) {
return updateResult.getModifiedCount() == 1;

} catch (MongoException e) {
System.out.println("Error while updating popularity: " + e.getMessage());
System.out.println("Error while updating Parents array: " + e.getMessage());
return false;
}
}

/**
 * getUrlsCount - Counts the documents stored in docCollection.
 *
 * @return int - the document count narrowed to an int, or -1 when the
 *         driver raises a MongoException.
 */
public int getUrlsCount() {
  int urlsCount;

  try {
    long total = docCollection.countDocuments();
    urlsCount = (int) total;
  } catch (MongoException e) {
    // Report the failure and hand the caller the -1 sentinel.
    System.out.println("Error while getting urls count: " + e.getMessage());
    urlsCount = -1;
  }

  return urlsCount;
}

/**
 * getParentsArr - returns the parents array stored for a certain url.
 *
 * @param ranker_id - the url ranker id.
 * @return List<Integer> - list of parents ids; an empty list when no
 *         document has this ranker_id, when the document has no "parents"
 *         field, or when a MongoException occurs.
 */
public List<Integer> getParentsArr(int ranker_id) {
  try {

    // Filter the document with the requested ranker_id.
    Document filter = new Document("ranker_id", ranker_id);

    // first() yields null when nothing matches the filter, so the result
    // has to be checked before it is dereferenced.
    Document urlDoc = docCollection.find(filter).first();
    if (urlDoc == null) {
      return new ArrayList<Integer>();
    }

    // The three-argument getList falls back to the supplied default instead
    // of returning null when the key is absent.
    return urlDoc.getList("parents", Integer.class, new ArrayList<Integer>());

  } catch (MongoException e) {
    System.out.println("Error while getting parents array: " + e.getMessage());
    return new ArrayList<Integer>();
  }
}

public String insertDocument(String url, String title, String host,
String content, String hashedUrl,
String hashedDoc, int ranker_id, List<Integer> parents) {
Expand Down
33 changes: 17 additions & 16 deletions mistermeow/app/src/main/java/meowranker/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import org.bson.types.ObjectId;

public class Main {
public static void main(String[] argv){
public static void main(String[] argv) {

// int UrlCount = 4;
// double[][] M = new double[UrlCount][];

Expand All @@ -19,24 +19,25 @@ public static void main(String[] argv){
// double[] r = ranker.getPopularity(M , UrlCount);

// for(int i=0 ; i<UrlCount; i++)
// System.out.print(r[i]+ " ");
// System.out.print(r[i]+ " ");

// testing phrase matching
PhraseRanker r = new PhraseRanker();

//list of tokens after tokenization (without stemming)
// list of tokens after tokenization (without stemming)
List<String> tokens = new ArrayList<>();
tokens.add("github"); // [ 661b0e19c22b5f3c3cc84f54, 661b0e25c22b5f3c3cc84f5e,
// 661b0e26c22b5f3c3cc84f60, 661b0e27c22b5f3c3cc84f63,
// 661b0e33c22b5f3c3cc84f7a, 661b0e34c22b5f3c3cc84f83 ]

tokens.add("git"); // [ 661b0e27c22b5f3c3cc84f63, 661b0e2dc22b5f3c3cc84f68,
// 661b0e33c22b5f3c3cc84f72, 661b0e34c22b5f3c3cc84f83,
// 661b0e34c22b5f3c3cc84f8b ]

List<ObjectId> matchedDocs = r.rank(tokens); // expected output: [661b0e27c22b5f3c3cc84f63, 661b0e34c22b5f3c3cc84f83]

for(ObjectId id: matchedDocs)
tokens.add("github"); // [ 661b0e19c22b5f3c3cc84f54, 661b0e25c22b5f3c3cc84f5e,
// 661b0e26c22b5f3c3cc84f60, 661b0e27c22b5f3c3cc84f63,
// 661b0e33c22b5f3c3cc84f7a, 661b0e34c22b5f3c3cc84f83 ]

tokens.add("git"); // [ 661b0e27c22b5f3c3cc84f63, 661b0e2dc22b5f3c3cc84f68,
// 661b0e33c22b5f3c3cc84f72, 661b0e34c22b5f3c3cc84f83,
// 661b0e34c22b5f3c3cc84f8b ]

List<ObjectId> matchedDocs = r.rank(tokens); // expected output: [661b0e27c22b5f3c3cc84f63,
// 661b0e34c22b5f3c3cc84f83]

for (ObjectId id : matchedDocs)
System.out.println(id);
}
}
}
132 changes: 103 additions & 29 deletions mistermeow/app/src/main/java/meowranker/Ranker.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,68 +9,142 @@ public class Ranker {

public DBManager db;

/**
 * Ranker - creates a ranker and gives it a freshly constructed DBManager,
 * stored in the db field.
 */
public Ranker() {
  db = new DBManager();
}

// The function takes graph of links between documents
// where edge from doc1 to doc2 is representing by adding
// 1/(outgoing urls from doc1) in the cell M[doc2][doc1]
// resulting in matrix with sum of it's columns always = 1
public double[] getPopularity(double[][] M , int UrlsCount){
public double[] getPopularity(double[][] M, int UrlsCount) {

double d =0.85;
double d = 0.85;
double[][] M_hat = new double[UrlsCount][UrlsCount];
for(int i =0 ; i<UrlsCount ; i++){
for(int j=0;j<UrlsCount ; j++){
M_hat[i][j] = d*M[i][j];
for (int i = 0; i < UrlsCount; i++) {
for (int j = 0; j < UrlsCount; j++) {
M_hat[i][j] = d * M[i][j];
}
}

double[] prevRank = new double[UrlsCount];
double[] currRank ;
double[] currRank;

for (int i = 0; i < UrlsCount; i++)
prevRank[i] = 1.0 / (double) UrlsCount;

for(int i=0 ; i<UrlsCount ; i++)
prevRank[i] = 1.0/(double)UrlsCount;

currRank = calculateCurrRank(prevRank, UrlsCount, d, M_hat);
while(Norm(currRank , UrlsCount) - Norm(prevRank , UrlsCount)> 1e-10){
prevRank=currRank;

while (Norm(currRank, UrlsCount) - Norm(prevRank, UrlsCount) > 1e-10) {
prevRank = currRank;
currRank = calculateCurrRank(prevRank, UrlsCount, d, M_hat);
}

//normalizing the final array
double norm = Norm(currRank , UrlsCount);

for(int i = 0 ; i<UrlsCount; i++){
currRank[i]/=norm;
// normalizing the final array
double norm = Norm(currRank, UrlsCount);

for (int i = 0; i < UrlsCount; i++) {
currRank[i] /= norm;
}
return currRank;
}
}

public double[] calculateCurrRank(double[] prevRank, int UrlsCount, double d, double[][] M_hat) {

public double[] calculateCurrRank(double[] prevRank , int UrlsCount , double d , double[][] M_hat){

double[] currRank = new double[UrlsCount];
for(int i = 0 ; i<UrlsCount ; i++){

for (int i = 0; i < UrlsCount; i++) {
double val = 0;
for(int j=0 ; j<UrlsCount ; j++){
val+=M_hat[i][j]*prevRank[j];
for (int j = 0; j < UrlsCount; j++) {
val += M_hat[i][j] * prevRank[j];
}
currRank[i] = val+(1-d);
currRank[i] = val + (1 - d);
}

return currRank;
}

public double Norm(double[] vector , int size){
public double Norm(double[] vector, int size) {
double norm = 0;

for(int i =0 ; i<size ; i++){
norm +=vector[i]*vector[i];
for (int i = 0; i < size; i++) {
norm += vector[i] * vector[i];
}

return Math.sqrt(norm);
}

/**
 * constructUrlsGraph - returns a constructed graph from the Urls data in the
 * database.
 *
 * @return double[][] - a matrix of doubles, with size N * N,
 *         where N equal to number of urls in the database; an empty
 *         (0 * 0) matrix when the count could not be read.
 */
public double[][] constructUrlsGraph() {
  // Number of nodes in graph is number of urls in database.
  // A negative count (the failure value of getUrlsCount) is clamped to 0 so
  // the allocation below cannot throw NegativeArraySizeException.
  int nodesNum = Math.max(db.getUrlsCount(), 0);

  // Create a 2D array filled with 0s initially.
  double[][] graph = new double[nodesNum][nodesNum];

  // Construct graph with parents arrays.
  constructMatrix(graph, nodesNum);

  // Scale each column so that its entries sum to 1.
  scaleMatrix(graph, nodesNum);

  return graph;
}

/**
 * constructMatrix - takes a 2D matrix and fills it with edges from the
 * parents array of each node: every accepted parent p of node i adds 1 to
 * graph[i][p].
 *
 * @param graph    - the 2D matrix to fill.
 * @param nodesNum - the number of nodes and the size of the graph.
 */
public void constructMatrix(double[][] graph, int nodesNum) {

  // Loop over each node (url), fetch its parents array and record the edges.
  for (int i = 0; i < nodesNum; i++) {
    List<Integer> nodeParents = db.getParentsArr(i);

    // A node without a stored parents list contributes no edges.
    if (nodeParents == null) {
      continue;
    }

    // Add 1 for each edge from parent to node.
    for (Integer parentId : nodeParents) {
      // Skip null entries instead of failing on auto-unboxing.
      if (parentId == null) {
        continue;
      }

      // NOTE(review): ids are iterated from 0 above, yet a parent id of 0 is
      // rejected here. Confirm whether 0 is a "no parent" sentinel or this
      // bound should be parentId >= 0.
      if (parentId > 0 && parentId < nodesNum) {
        graph[i][parentId]++;
      }
    }
  }

}

/**
 * scaleMatrix - divides every entry of a column by that column's total, in
 * place; columns whose total is 0 are left untouched.
 *
 * @param graph    - the 2D matrix to scale.
 * @param nodesNum - the number of rows and columns to process.
 */
public void scaleMatrix(double[][] graph, int nodesNum) {

  for (int col = 0; col < nodesNum; col++) {

    // Total of the col'th column, accumulated from the first row down.
    double columnTotal = 0;
    int row = 0;
    while (row < nodesNum) {
      columnTotal += graph[row][col];
      row++;
    }

    // Only a column with a non-zero total is rescaled.
    if (columnTotal != 0) {
      for (row = 0; row < nodesNum; row++) {
        graph[row][col] = graph[row][col] / columnTotal;
      }
    }
  }
}

}