diff --git a/Rscript/aapp_dsyn.R b/Rscript/aapp_dsyn.R new file mode 100644 index 0000000..76125b5 --- /dev/null +++ b/Rscript/aapp_dsyn.R @@ -0,0 +1,110 @@ +# ---- Cleanup everything before start ---- +rm(list = ls()) +gc() + +### Find battle commander pairs + +# ---- GBSERVER API ---- +source("./experimentAPI.R") + +# ---- INPUT and CONFIGURATIONS ---- + +EDGE_TYPE_FILE = "../data/semmed_unique.edgetypes" # Example : "../data/lobbyist.edgetypes" +INPUT_FILE = "../facts/semmed/aapp_cause_dsyn_clean_id.csv" # Example : "../facts/lobbyist/firm_payee.csv" col 1 and 2 are ids and 3 is label +CLUSTER_SIZE = 48 # Number of workers in gbserver +FALSE_PER_TRUE = 5 +DISCARD_REL = 5 +ASSOCIATE_REL = c(22,28,25,17,39,20,37) +OUTPUT_PREFIX = "aapp_dsyn/aapp_dsyn" # "result/" OUTPUT_PREFIX + +# ---- Load edge type file ---- + +mapfile <- read.csv(EDGE_TYPE_FILE, sep="\t", header=F) +mapfile$V1 <- as.numeric(mapfile$V1) +mapfile$V2 <- as.character(mapfile$V2) + +# ---- Init workers ---- + +cl <- makeCluster(CLUSTER_SIZE) +clusterExport(cl = cl, varlist=c("adamic_adar", "semantic_proximity", "ppagerank", + "preferential_attachment", "katz", "pcrw", + "multidimensional_adamic_adar", "heterogeneous_adamic_adar", + "connectedby", "rel_path", "truelabeled", "falselabeled", "str_split", + "as.numeric", "request","DISCARD_REL"), envir = environment()) + +# ---- Load input data ---- +dat.true <- unique(read.csv(INPUT_FILE)) + + +if (ncol(dat.true) < 3) + dat.true$label <- T + +# ---- Construct false labeled data ----- +set.seed(233) + +# TODO: reformat this so it is universal and file independent +dat.false <- rbind.fill(apply(dat.true, 1, function(x){ + candidates <- unique(dat.true[which(dat.true[,1] != x[1]), 2]) + candidates <- unlist(lapply(candidates, function(y){ + if(length(which(dat.true[,1] == x[1] & dat.true[,2] == y) != 0)) { + return(NULL) + } + return(y) + })) + return(data.frame(src=x[1], + dst=sample(candidates, FALSE_PER_TRUE), + label=F)) +})) + +colnames(dat.true) <- 
c("src","dst","label") +dat <- rbind(dat.true,dat.false) +dat <- unique(dat) + +## Adamic Adar + +experiment.aa <- eval.aa(dat, DISCARD_REL) +write.csv(experiment.aa$raw, paste("../result/", OUTPUT_PREFIX ,".aa.csv",sep=""), row.names=F) + +## Semantic Proximity + +experiment.sp <- eval.sp(dat, DISCARD_REL) +write.csv(experiment.sp$raw, paste("../result/", OUTPUT_PREFIX ,".sp.csv",sep=""), row.names=F) + +## Personalized PageRank + +experiment.ppr <- eval.ppr(dat, DISCARD_REL) +write.csv(experiment.ppr$raw, paste("../result/", OUTPUT_PREFIX ,".ppr.csv",sep=""), row.names=F) + +## Preferential Attachment + +experiment.pa <- eval.pa(dat, DISCARD_REL) +write.csv(experiment.pa$raw, paste("../result/", OUTPUT_PREFIX ,".pa.csv",sep=""), row.names=F) + +## Katz + +experiment.katz <- eval.katz(dat, DISCARD_REL) +write.csv(experiment.katz$raw, paste("../result/", OUTPUT_PREFIX ,".katz.csv",sep=""), row.names=F) + +## AMIE + +experiment.amie <- eval.amie(dat, ASSOCIATE_REL) +write.csv(experiment.amie$raw, paste("../result/", OUTPUT_PREFIX ,".amie.csv",sep=""), row.names=F) + +## Test Method + +#experiment.test <- eval.test(dat, DISCARD_REL) +#write.csv(experiment.test$raw, paste("../result/", OUTPUT_PREFIX ,".test.csv",sep=""), row.names=F) + +experiment.pcrwamie <- eval.pcrw(dat, c(22,28,25,17,39,20,37)) +write.csv(experiment.pcrwamie$raw, paste("../result/", OUTPUT_PREFIX ,".pcrwamie.csv",sep=""), row.names=F) + + +stopCluster(cl) + +experiment.simrank <- read.csv("../facts/aapp_dsyn.simrank.csv", header=F) +colnames(experiment.simrank) <- c("src", "dst", "score") +experiment.simrank <- merge(experiment.simrank, dat)[, c("label","score")] +experiment.simrank <- eval.df(experiment.simrank) +write.csv(experiment.simrank$raw, paste("../result/", OUTPUT_PREFIX ,".simrank.csv",sep=""), row.names=F) + + diff --git a/Rscript/battle_commander.R b/Rscript/battle_commander.R new file mode 100644 index 0000000..45e0864 --- /dev/null +++ b/Rscript/battle_commander.R @@ -0,0 +1,151 @@ 
+# ---- Cleanup everything before start ---- +rm(list = ls()) +gc() + +### Find battle commander pairs + +# ---- GBSERVER API ---- +source("./experimentAPI.R") + +# ---- INPUT and CONFIGURATIONS ---- + +EDGE_TYPE_FILE = "../data/infobox.edgetypes" # Example : "../data/lobbyist.edgetypes" +INPUT_FILE = "../facts/civil_war//battle_commander_id.tsv" # Example : "../facts/lobbyist/firm_payee.csv" col 1 and 2 are ids and 3 is label +CLUSTER_SIZE = 48 # Number of workers in gbserver +FALSE_PER_TRUE = 5 +DISCARD_REL = 78 +ASSOCIATE_REL = c(144) + +# ---- Load edge type file ---- + +mapfile <- read.csv(EDGE_TYPE_FILE, sep="\t", header=F) +mapfile$V1 <- as.numeric(mapfile$V1) +mapfile$V2 <- as.character(mapfile$V2) + +# ---- Init workers ---- + +cl <- makeCluster(CLUSTER_SIZE) +clusterExport(cl = cl, varlist=c("adamic_adar", "semantic_proximity", "ppagerank", "heter_path", + "preferential_attachment", "katz", "pcrw", "heter_full_path", "meta_path", + "multidimensional_adamic_adar", "heterogeneous_adamic_adar", + "connectedby", "rel_path", "truelabeled", "falselabeled", "str_split", + "as.numeric", "request","DISCARD_REL"), envir = environment()) + + + +# ---- Load input data ---- +dat.true <- unique(read.csv(INPUT_FILE)) + + +if (ncol(dat.true) < 3) + dat.true$label <- T + +# ---- Construct false labeled data ----- +set.seed(233) + +# TODO: reformat this so it is universal and file independent +dat.false <- rbind.fill(apply(dat.true, 1, function(x){ + candidates <- unique(dat.true[which(dat.true[,1] != x[1]), 2]) + candidates <- unlist(lapply(candidates, function(y){ + if(length(which(dat.true[,1] == x[1] & dat.true[,2] == y) != 0)) { + return(NULL) + } + return(y) + })) + return(data.frame(src=x[1], + dst=sample(candidates, FALSE_PER_TRUE), + label=F)) +})) + +colnames(dat.true) <- c("src","dst","label") +dat <- rbind(dat.true, dat.false) +dat <- unique(dat) + +elapsed.time <- data.frame() + +## Adamic Adar + +experiment.aa <- eval.aa(dat, DISCARD_REL) 
+write.csv(experiment.aa$raw, "../result/civil_war/battle_commander.aa.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="aa", + elapsed = experiment.aa$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + +## Semantic Proximity + +experiment.sp <- eval.sp(dat, DISCARD_REL) +write.csv(experiment.sp$raw, "../result/civil_war/battle_commander.sp.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="sp", + elapsed = experiment.sp$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + +## Personalized PageRank + +experiment.ppr <- eval.ppr(dat, DISCARD_REL) +write.csv(experiment.ppr$raw, "../result/civil_war/battle_commander.ppr.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="ppr", + elapsed = experiment.ppr$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + +## Preferential Attachment + +experiment.pa <- eval.pa(dat, DISCARD_REL) +write.csv(experiment.pa$raw, "../result/civil_war/battle_commander.pa.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="pa", + elapsed = experiment.pa$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + +## Katz + +experiment.katz <- eval.katz(dat, DISCARD_REL) +write.csv(experiment.katz$raw, "../result/civil_war/battle_commander.katz.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="katz", + elapsed = experiment.katz$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## AMIE + +experiment.amie <- eval.amie(dat, ASSOCIATE_REL) +write.csv(experiment.amie$raw, "../result/civil_war/battle_commander.amie.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="amie", + elapsed = experiment.amie$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + +## Test Method + +experiment.fullpath.test <- eval.fullpath.test(dat, DISCARD_REL) +write.csv(experiment.fullpath.test$raw, "../result/civil_war/battle_commander.fullpath.test.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="fullpath.test", + elapsed = 
experiment.fullpath.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + +experiment.test <- eval.test(dat, DISCARD_REL) +write.csv(experiment.test$raw, "../result/civil_war/battle_commander.test.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="test", + elapsed = experiment.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + +experiment.pcrwamie <- eval.pcrw(dat, c(144)) +write.csv(experiment.pcrwamie$raw, "../result/civil_war/battle_commander.pcrwamie.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="pcrw", + elapsed = experiment.pcrwamie$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + +write.csv(elapsed.time, paste("../result/civil_war/battle_commander.elapsed.csv",sep=""), row.names=F) + + +stopCluster(cl) + +experiment.simrank <- read.csv("../facts/civil_war.simrank.csv", header=F) +colnames(experiment.simrank) <- c("src", "dst", "score") +experiment.simrank <- merge(experiment.simrank, dat)[, c("label","score")] +experiment.simrank <- eval.df(experiment.simrank) +write.csv(experiment.simrank$raw, "../result/civil_war/civil_war.simrank.csv", row.names=F) diff --git a/Rscript/capital_state.R b/Rscript/capital_state.R new file mode 100644 index 0000000..2150d59 --- /dev/null +++ b/Rscript/capital_state.R @@ -0,0 +1,144 @@ +# ---- Cleanup everything before start ---- +rm(list = ls()) +gc() + +### Find true capital-state pairs from all possible capital-state pairs + +# ---- GBSERVER API ---- +source("./experimentAPI.R") + +# ---- INPUT and CONFIGURATIONS ---- + +EDGE_TYPE_FILE = "../data/infobox.edgetypes" # Example : "../data/lobbyist.edgetypes" +INPUT_FILE = "../facts/state_capital.csv" # Example : "../facts/lobbyist/firm_payee.csv" col 1 and 2 are ids and 3 is label +CLUSTER_SIZE = 48 # Number of workers in gbserver +FALSE_PER_TRUE = 5 +DISCARD_REL = 191 +ASSOCIATE_REL = c(404) + +# ---- Load edge type file ---- + +mapfile <- read.csv(EDGE_TYPE_FILE, sep="\t", header=F) +mapfile$V1 <- as.numeric(mapfile$V1) +mapfile$V2 
<- as.character(mapfile$V2) + +# ---- Init workers ---- + +cl <- makeCluster(CLUSTER_SIZE) +clusterExport(cl = cl, varlist=c("adamic_adar", "semantic_proximity", "ppagerank", "heter_path", "max_depth", + "preferential_attachment", "katz", "pcrw", "heter_full_path", "meta_path", + "multidimensional_adamic_adar", "heterogeneous_adamic_adar", + "connectedby", "rel_path", "truelabeled", "falselabeled", "str_split", + "as.numeric", "request","DISCARD_REL"), envir = environment()) + +# ---- Load input data ---- +dat.true <- read.csv(INPUT_FILE) + +if (ncol(dat.true) < 3) + dat.true$label <- T + +# ---- Construct false labeled data ----- +set.seed(233) + +# TODO: reformat this so it is universal and file independent +dat.false <- rbind.fill(apply(dat.true, 1, function(x){ + candidates <- unique(dat.true[which(dat.true[,1] != x[1]), 2]) + candidates <- unlist(lapply(candidates, function(y){ + if(length(which(dat.true[,1] == x[1] & dat.true[,2] == y) != 0)) { + return(NULL) + } + return(y) + })) + return(data.frame(src=x[1], + dst=sample(candidates, FALSE_PER_TRUE), + label=F)) +})) + +colnames(dat.true) <- c("src","dst","label") +dat <- rbind(dat.true, dat.false) + +elapsed.time <- data.frame() + +## Adamic Adar + +experiment.aa <- eval.aa(dat, DISCARD_REL) +write.csv(experiment.aa$raw, "../result/city/capital_state_all.aa.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="aa", + elapsed = experiment.aa$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + +## Semantic Proximity + +experiment.sp <- eval.sp(dat, DISCARD_REL) +write.csv(experiment.sp$raw, "../result/city/capital_state_all.sp.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="sp", + elapsed = experiment.sp$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + +## Personalized PageRank + +experiment.ppr <- eval.ppr(dat, DISCARD_REL) +write.csv(experiment.ppr$raw, "../result/city/capital_state_all.ppr.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="ppr", 
+ elapsed = experiment.ppr$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + + +## Preferential Attachment + +experiment.pa <- eval.pa(dat, DISCARD_REL) +write.csv(experiment.pa$raw, "../result/city/capital_state_all.pa.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="pa", + elapsed = experiment.pa$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + + + +## Katz + +experiment.katz <- eval.katz(dat, DISCARD_REL) +write.csv(experiment.katz$raw, "../result/city/capital_state_all.katz.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="katz", + elapsed = experiment.katz$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + + + +## AMIE + +experiment.amie <- eval.amie(dat, ASSOCIATE_REL) +write.csv(experiment.amie$raw, "../result/city/capital_state_all.amie.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="amie", + elapsed = experiment.amie$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + +## Test Method + +experiment.fullpath.test <- eval.fullpath.test(dat, DISCARD_REL) +write.csv(experiment.fullpath.test$raw, "../result/city/capital_state_all.fullpath.test.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="fullpath.test", + elapsed = experiment.fullpath.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + +experiment.test <- eval.test(dat, DISCARD_REL) +write.csv(experiment.test$raw, "../result/city/capital_state_all.test.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="test", + elapsed = experiment.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + +experiment.pcrwamie <- eval.pcrw(dat, c(404)) +write.csv(experiment.pcrwamie$raw, "../result/city/capital_state_all.pcrwamie.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="pcrw", + elapsed = experiment.pcrwamie$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + +write.csv(elapsed.time, paste("../result/city/capital_state_all.elapsed.csv",sep=""), row.names=F) + +stopCluster(cl) + +experiment.simrank <- 
read.csv("../facts/state_capital.simrank.csv", header=F) +colnames(experiment.simrank) <- c("src", "dst", "score") +experiment.simrank <- merge(experiment.simrank, dat)[, c("label","score")] +experiment.simrank <- eval.df(experiment.simrank) +write.csv(experiment.simrank$raw, "../result/city/state_capital_all.simrank.csv", row.names=F) + diff --git a/Rscript/city_capital.R b/Rscript/city_capital.R new file mode 100644 index 0000000..9ced79c --- /dev/null +++ b/Rscript/city_capital.R @@ -0,0 +1,127 @@ +# ---- Cleanup everything before start ---- +rm(list = ls()) +gc() + +### Find capital among most populous cities + +# ---- GBSERVER API ---- +source("./experimentAPI.R") + +# ---- INPUT and CONFIGURATIONS ---- + +EDGE_TYPE_FILE = "../data/infobox.edgetypes" # Example : "../data/lobbyist.edgetypes" +INPUT_FILE = "../facts/city_capital.csv" # Example : "../facts/lobbyist/firm_payee.csv" col 1 and 2 are ids and 3 is label +CLUSTER_SIZE = 48 # Number of workers in gbserver +FALSE_PER_TRUE = 0 +DISCARD_REL = 191 +ASSOCIATE_REL = c(404) + +# ---- Load edge type file ---- + +mapfile <- read.csv(EDGE_TYPE_FILE, sep="\t", header=F) +mapfile$V1 <- as.numeric(mapfile$V1) +mapfile$V2 <- as.character(mapfile$V2) + +# ---- Init workers ---- + +cl <- makeCluster(CLUSTER_SIZE) +clusterExport(cl = cl, varlist=c("adamic_adar", "semantic_proximity", "ppagerank", "heter_path", + "preferential_attachment", "katz", "pcrw", "heter_full_path", "meta_path", + "multidimensional_adamic_adar", "heterogeneous_adamic_adar", + "connectedby", "rel_path", "truelabeled", "falselabeled", "str_split", + "as.numeric", "request","DISCARD_REL"), envir = environment()) + + +# ---- Load input data ---- +dat <- read.csv(INPUT_FILE) + +elapsed.time <- data.frame() + +## Adamic Adar + +experiment.aa <- eval.aa(dat, DISCARD_REL) +write.csv(experiment.aa$raw, "../result/city/city_capital_all.aa.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="aa", + elapsed = experiment.aa$elapsed[3] 
* CLUSTER_SIZE / nrow(dat))); + +## Semantic Proximity + +experiment.sp <- eval.sp(dat, DISCARD_REL) +write.csv(experiment.sp$raw, "../result/city/city_capital_all.sp.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="sp", + elapsed = experiment.sp$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## Personalized PageRank + +experiment.ppr <- eval.ppr(dat, DISCARD_REL) +write.csv(experiment.ppr$raw, "../result/city/city_capital_all.ppr.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="ppr", + elapsed = experiment.ppr$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + +## Preferential Attachment + +experiment.pa <- eval.pa(dat, DISCARD_REL) +write.csv(experiment.pa$raw, "../result/city/city_capital_all.pa.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="pa", + elapsed = experiment.pa$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + +## Katz + +experiment.katz <- eval.katz(dat, DISCARD_REL) +write.csv(experiment.katz$raw, "../result/city/city_capital_all.katz.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="katz", + elapsed = experiment.katz$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + +## AMIE + +experiment.amie <- eval.amie(dat, ASSOCIATE_REL) +write.csv(experiment.amie$raw, "../result/city/city_capital_all.amie.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="amie", + elapsed = experiment.amie$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## Test Method + +experiment.fullpath.test <- eval.fullpath.test(dat, DISCARD_REL) +write.csv(experiment.fullpath.test$raw, "../result/city/city_capital_all.fullpath.test.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="fullpath.test", + elapsed = experiment.fullpath.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + + + +experiment.test <- eval.test(dat, DISCARD_REL) +write.csv(experiment.test$raw, "../result/city/city_capital_all.test.csv", row.names=F) + +elapsed.time <- 
rbind(elapsed.time, data.frame(method="test", + elapsed = experiment.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +experiment.pcrwamie <- eval.pcrw(dat, c(404)) +write.csv(experiment.pcrwamie$raw, "../result/city/city_capital_all.pcrwamie.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="pcrw", + elapsed = experiment.pcrwamie$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + +write.csv(elapsed.time, "../result/city/city_capital_all.elapsed.csv", row.names=F) + +stopCluster(cl) + +## Simrank + +experiment.simrank <- read.csv("../facts/city_capital.simrank.csv", header=F) +colnames(experiment.simrank) <- c("state_id", "city_id", "simrank") +experiment.simrank <- merge(experiment.simrank, dat)[, c("isCapital","simrank")] +colnames(experiment.simrank) <- c("label", "simrank") +experiment.simrank <- eval.df(experiment.simrank) +write.csv(experiment.simrank$raw, "../result/city/city_capital_all.simrank.csv", row.names=F) diff --git a/utils/experimentAPI.R b/Rscript/experimentAPI.R similarity index 81% rename from utils/experimentAPI.R rename to Rscript/experimentAPI.R index 40ab6e9..2cc623f 100644 --- a/utils/experimentAPI.R +++ b/Rscript/experimentAPI.R @@ -19,7 +19,10 @@ eval.df <- function(df) { ## Basic experiment helper eval.helper <- function(df, discard_rel, gen_func) { res <- list() + ptm <- proc.time() res[["raw"]] <- gen_func(df, discard_rel) + write.csv(colnames(res[["raw"]]), "/data/bshi/dbpedia/gngm_celf.fullpath.csv") + res[["elapsed"]] <- proc.time() - ptm res[["model"]] <- Logistic(label~.,res[["raw"]]) res[["eval"]] <- evaluate_Weka_classifier(res[["model"]], numFolds = 10, complexity = T, class = T, seed = 233) return(res) @@ -108,11 +111,30 @@ eval.amie <- function(df, associated_rel) { return(eval.helper(df, associated_rel, func.amie)) } +## Test method with entire path +eval.fullpath.test <- function(df, discard_rel, max_depth = 3) { + func.test <- function(df, discard_rel) { + tmp.paths <- rbind.fill(parApply(cl, dat, 1, 
function(x) { + tmp_paths <- heter_full_path(as.numeric(x[1]), as.numeric(x[2]), DISCARD_REL, max_depth) + if(length(tmp_paths) == 0) { + return(data.frame(label = as.logical(x[3]))) + } + rtn <- as.data.frame(t(tmp_paths$Freq)) + colnames(rtn) <- tmp_paths$res + rtn <- cbind(label = as.logical(x[3]), rtn) + return(rtn) + })) + tmp.paths[is.na(tmp.paths)] <- 0 + return(tmp.paths) + } + return(eval.helper(df, discard_rel, func.test)) +} + ## Test method -eval.test <- function(df, discard_rel) { +eval.test <- function(df, discard_rel, max_depth = 3) { func.test <- function(df, discard_rel) { tmp.paths <- rbind.fill(parApply(cl, dat, 1, function(x) { - tmp_paths <- rel_path(as.numeric(x[1]), as.numeric(x[2]), 3,F, DISCARD_REL) + tmp_paths <- rel_path(as.numeric(x[1]), as.numeric(x[2]), max_depth, F, DISCARD_REL) if(length(tmp_paths) == 0) { return(data.frame(label = as.logical(x[3]))) } diff --git a/utils/experiment_template.R b/Rscript/experiment_template.R similarity index 100% rename from utils/experiment_template.R rename to Rscript/experiment_template.R diff --git a/Rscript/feature_selection.R b/Rscript/feature_selection.R new file mode 100644 index 0000000..fc6f83e --- /dev/null +++ b/Rscript/feature_selection.R @@ -0,0 +1,66 @@ +library(FSelector) +library(ggplot2) +library(bear) +library(RWeka) +source("experimentAPI.R") + +fm_val <- function(resdf) { + return(as.numeric(str_split(str_split(resdf$eval$string,"\n")[[1]][26], " +")[[1]][9])) +} + +best_val <- function(dat, weights) { + cl <- makeCluster(50) + clusterExport(cl = cl, varlist=c("dat", "weights", "eval.df", "cutoff.k", "fm_val","str_split", + "as.numeric"), envir = environment()) + res <- rbind.fill(parLapply(cl, seq(1,ncol(dat)-1,by = 1), function(x){ + library(FSelector) + library(RWeka) + wekares<-eval.df(dat[,c("label", cutoff.k(weights, x))]) + return(data.frame(f1=fm_val(wekares),nfeature=x)) + })) + return(res[which.max(res$f1),]) + stopCluster(cl) +} + +feature_selection_validation <- 
function(filepath, label) { + dat <- read.csv(filepath[1]) + weights <- information.gain(label~., dat) + weights <- weights[order(weights[,1], decreasing = T),,drop=F] + + res <- NULL + + mpath_res <- eval.df(dat) + + res <- rbind(res,data.frame(test=label, type="Meta Path", nfeature = ncol(dat) - 1, roc=fm_val(mpath_res))) + print(res) + mpath_res <- best_val(dat, weights) + + res <- rbind(res,data.frame(test=label, type="Meta Path Subset", nfeature = mpath_res$nfeature, roc=mpath_res$f1)) + print(res) + dat <- read.csv(filepath[2]) + weights <- information.gain(label~., dat) + weights <- weights[order(weights[,1], decreasing = T),,drop=F] + + ppath_res <- eval.df(dat) + + res <- rbind(res,data.frame(test=label, type="Predicate Path", nfeature = ncol(dat) - 1, roc=fm_val(ppath_res))) + print(res) + mpath_res <- best_val(dat, weights) + + res <- rbind(res,data.frame(test=label, type="Predicate Path Subset", nfeature = mpath_res$nfeature, roc=mpath_res$f1)) + print(res) + return(res) + +} + + +res <- rbind(feature_selection_validation("../result/city/city_capital_all.test.csv", "city_capital"), + feature_selection_validation("../result/city/capital_state_all.test.csv", "capital_state"), + feature_selection_validation("../result/civil_war/battle_commander.test.csv", "civil war"), + feature_selection_validation("../result/company/president_company_no_keyperson_all.test.csv", "company"), + feature_selection_validation("../result/president/president.test.csv", "president-vice"), + feature_selection_validation("../result/best_seller//best_seller.test.csv","best-seller")) + +res.summary <- summarySE(res, measurevar = "value", groupvars = c("type")) +res.score.summary <- summarySE(res, measurevar = "score", groupvars = c("type")) +ggplot(res.summary, aes(x=type,y=value)) + geom_point() + geom_line() + geom_errorbar(aes(ymax=value+ci, ymin=value-ci), width=.3) \ No newline at end of file diff --git a/Rscript/fiction_author.R b/Rscript/fiction_author.R new file mode 100644 
index 0000000..acc3c66 --- /dev/null +++ b/Rscript/fiction_author.R @@ -0,0 +1,154 @@ +# ---- Cleanup everything before start ---- +rm(list = ls()) +gc() + +### Find which company/organization lobby SF mayor + +# ---- GBSERVER API ---- +source("./experimentAPI.R") + +# ---- INPUT and CONFIGURATIONS ---- + +EDGE_TYPE_FILE = "../data/infobox.edgetypes" # Example : "../data/lobbyist.edgetypes" +INPUT_FILE = "../facts/new_york_best_seller//best_seller.csv" # Example : "../facts/lobbyist/firm_payee.csv" col 1 and 2 are ids and 3 is label +CLUSTER_SIZE = 48 # Number of workers in gbserver +FALSE_PER_TRUE = 5 +DISCARD_REL = 27 +ASSOCIATE_REL = c(164,313,215,151,130,20) + +# ---- Load edge type file ---- + +mapfile <- read.csv(EDGE_TYPE_FILE, sep="\t", header=F) +mapfile$V1 <- as.numeric(mapfile$V1) +mapfile$V2 <- as.character(mapfile$V2) + +# ---- Init workers ---- + +cl <- makeCluster(CLUSTER_SIZE) +clusterExport(cl = cl, varlist=c("adamic_adar", "semantic_proximity", "ppagerank", "heter_path", + "preferential_attachment", "katz", "pcrw", "heter_full_path", "meta_path", + "multidimensional_adamic_adar", "heterogeneous_adamic_adar", + "connectedby", "rel_path", "truelabeled", "falselabeled", "str_split", + "as.numeric", "request","DISCARD_REL"), envir = environment()) + + +# ---- Load input data ---- + dat.true <- unique(read.csv(INPUT_FILE)) + + +if (ncol(dat.true) < 3) + dat.true$label <- T + +# ---- Construct false labeled data ----- +set.seed(233) + +# TODO: reformat this so it is universal and file independent +dat.false <- rbind.fill(apply(dat.true, 1, function(x){ + candidates <- unique(dat.true[which(dat.true[,1] != x[1]), 2]) + candidates <- unlist(lapply(candidates, function(y){ + if(length(which(dat.true[,1] == x[1] & dat.true[,2] == y) != 0)) { + return(NULL) + } + return(y) + })) + return(data.frame(src=x[1], + dst=sample(candidates, FALSE_PER_TRUE), + label=F)) +})) + +colnames(dat.true) <- c("src","dst","label") +dat <- rbind(dat.true, dat.false) +dat <- 
unique(dat) + +elapsed.time <- data.frame() + +## Adamic Adar + +experiment.aa <- eval.aa(dat, DISCARD_REL) +write.csv(experiment.aa$raw, "../result/best_seller/best_seller.aa.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="aa", + elapsed = experiment.aa$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + +## Semantic Proximity + +experiment.sp <- eval.sp(dat, DISCARD_REL) +write.csv(experiment.sp$raw, "../result/best_seller/best_seller.sp.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="sp", + elapsed = experiment.sp$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## Personalized PageRank + +experiment.ppr <- eval.ppr(dat, DISCARD_REL) +write.csv(experiment.ppr$raw, "../result/best_seller/best_seller.ppr.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="ppr", + elapsed = experiment.ppr$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + +## Preferential Attachment + +experiment.pa <- eval.pa(dat, DISCARD_REL) +write.csv(experiment.pa$raw, "../result/best_seller/best_seller.pa.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="pa", + elapsed = experiment.pa$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + +## Katz + +experiment.katz <- eval.katz(dat, DISCARD_REL) +write.csv(experiment.katz$raw, "../result/best_seller/best_seller.katz.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="katz", + elapsed = experiment.katz$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## AMIE + +experiment.amie <- eval.amie(dat, ASSOCIATE_REL) +write.csv(experiment.amie$raw, "../result/best_seller/best_seller.amie.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="amie", + elapsed = experiment.amie$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## Test Method + +experiment.fullpath.test <- eval.fullpath.test(dat, DISCARD_REL) +write.csv(experiment.fullpath.test$raw, "../result/best_seller/best_seller.fullpath.test.csv", row.names=F) + 
+elapsed.time <- rbind(elapsed.time, data.frame(method="fullpath.test", + elapsed = experiment.fullpath.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + +experiment.test <- eval.test(dat, DISCARD_REL) +write.csv(experiment.test$raw, "../result/best_seller/best_seller.test.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="test", + elapsed = experiment.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +experiment.pcrwamie <- eval.pcrw(dat, c(164,313,215,151,130,20)) +write.csv(experiment.pcrwamie$raw, "../result/best_seller/best_seller.pcrwamie.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="pcrw", + elapsed = experiment.pcrwamie$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + +write.csv(elapsed.time, paste("../result/best_seller/best_seller.elapsed.csv",sep=""), row.names=F) + + + +stopCluster(cl) + +experiment.simrank <- read.csv("../facts/best_seller.simrank.csv", header=F) +colnames(experiment.simrank) <- c("src", "dst", "score") +experiment.simrank <- merge(experiment.simrank, dat)[, c("label","score")] +experiment.simrank <- eval.df(experiment.simrank) +write.csv(experiment.simrank$raw, "../result/best_seller/best_seller.simrank.csv", row.names=F) diff --git a/utils/gbserverAPI.R b/Rscript/gbserverAPI.R similarity index 84% rename from utils/gbserverAPI.R rename to Rscript/gbserverAPI.R index 86eb0a6..c38fb60 100644 --- a/utils/gbserverAPI.R +++ b/Rscript/gbserverAPI.R @@ -25,11 +25,39 @@ pcrw <- function(src, dst, metapath) { return(as.numeric(request(command))) } -heter_path <- function(id1, id2, discard_rel, max_depth = 3) { +meta_path <- function(id1, id2, discard_rel, max_depth = 3) { + command <- paste("metapath", id1, id2, discard_rel, max_depth, "F", "F", sep=" ") + paths <- request(command) + return(unlist(str_split(paths[-1], "\n"))) +} + +heter_path <- function(id1, id2, discard_rel, max_depth = 3, .raw = F) { command <- paste("hpath", id1, id2, discard_rel, max_depth, "F", "F", sep=" ") paths <- 
request(command) - return(lapply(unlist(str_split(paths[-1], "\n")), hpath_parser)) + if (!.raw) { + return(lapply(unlist(str_split(paths[-1], "\n")), hpath_parser)) + } else { + return(unlist(str_split(paths[-1],"\\n"))) + } +} + +heter_node_path <- function(id1, id2, discard_rel, max_depth = 3) { + res <- heter_path(id1, id2, discard_rel, max_depth) + + res <- lapply(res, function(x){ + return(paste(x$nodes, collapse = ",")) + }) + + res <- as.data.frame(table(unlist(res))) + + return(res[order(-res$Freq),]) +} + +heter_full_path <- function(id1, id2, discard_rel, max_depth = 3) { + res <- meta_path(id1, id2, discard_rel, max_depth) + res <- as.data.frame(table(res)) + return(res[order(-res$Freq),]) } get_path_by_rel <- function(id1, id2, discard_rel, max_depth = 3, target_path=NULL) { @@ -46,7 +74,7 @@ get_path_by_rel <- function(id1, id2, discard_rel, max_depth = 3, target_path=NU return(tmp_res[!sapply(tmp_res, is.null)]) } -rel_path <- function(id1, id2, max_depth = 3, is_directed = F, discard_rel, mapfile = NA, .raw=F) { +rel_path <- function(id1, id2, max_depth = 3, is_directed = F, discard_rel, mapfile = NA, .raw=F, .discard_direction = F) { library(utils) library(stringr) @@ -82,7 +110,7 @@ rel_path <- function(id1, id2, max_depth = 3, is_directed = F, discard_rel, mapf return(paste(ifelse(x < 0, "(-1)", "-"), mapfile[which(mapfile$V1 == abs(x)), "V2"], sep="")) })), collapse=",") })) - } else { + } else if (.discard_direction) { paths$paths <- unlist(lapply(as.character(paths$paths), function(x){str_replace_all(x,"-","")})) } @@ -96,7 +124,7 @@ rel_path <- function(id1, id2, max_depth = 3, is_directed = F, discard_rel, mapf return(paste(ifelse(x < 0, "(-1)", "-"), mapfile[which(mapfile$V1 == abs(x)), "V2"], sep="")) })), collapse=",") })) - } else { + } else if(.discard_direction) { paths$paths <- unlist(lapply(as.character(paths$paths), function(x){str_replace_all(x,"-","")})) } return(paths) diff --git a/Rscript/gngm_celf.R b/Rscript/gngm_celf.R new 
file mode 100644 index 0000000..efbe8fc --- /dev/null +++ b/Rscript/gngm_celf.R @@ -0,0 +1,113 @@ +# ---- Cleanup everything before start ---- +rm(list = ls()) +gc() + +### Find battle commander pairs + +# ---- GBSERVER API ---- +source("./experimentAPI.R") + +# ---- INPUT and CONFIGURATIONS ---- + +EDGE_TYPE_FILE = "../data/semmed_unique.edgetypes" # Example : "../data/lobbyist.edgetypes" +INPUT_FILE = "../facts/semmed/gngm_cause_celf_id.csv" # Example : "../facts/lobbyist/firm_payee.csv" col 1 and 2 are ids and 3 is label +CLUSTER_SIZE = 49 # Number of workers in gbserver +FALSE_PER_TRUE = 5 +DISCARD_REL = 5 +ASSOCIATE_REL = c(22,28,25,17,39,20,37) +OUTPUT_PREFIX = "gngm_celf/gngm_celf" # "result/" OUTPUT_PREFIX + +# ---- Load edge type file ---- + +mapfile <- read.csv(EDGE_TYPE_FILE, sep="\t", header=F) +mapfile$V1 <- as.numeric(mapfile$V1) +mapfile$V2 <- as.character(mapfile$V2) + +# ---- Init workers ---- + +cl <- makeCluster(CLUSTER_SIZE) +clusterExport(cl = cl, varlist=c("adamic_adar", "semantic_proximity", "ppagerank", "heter_path", + "preferential_attachment", "katz", "pcrw", "heter_full_path", "meta_path", + "multidimensional_adamic_adar", "heterogeneous_adamic_adar", + "connectedby", "rel_path", "truelabeled", "falselabeled", "str_split", + "as.numeric", "request","DISCARD_REL"), envir = environment()) + +# ---- Load input data ---- +dat.true <- unique(read.csv(INPUT_FILE)) + + +if (ncol(dat.true) < 3) + dat.true$label <- T + +# ---- Construct false labeled data ----- +set.seed(233) + +# TODO: reformat this so it is universal and file independent +dat.false <- rbind.fill(apply(dat.true, 1, function(x){ + candidates <- unique(dat.true[which(dat.true[,1] != x[1]), 2]) + candidates <- unlist(lapply(candidates, function(y){ + if(length(which(dat.true[,1] == x[1] & dat.true[,2] == y) != 0)) { + return(NULL) + } + return(y) + })) + return(data.frame(src=x[1], + dst=sample(candidates, FALSE_PER_TRUE), + label=F)) +})) + +colnames(dat.true) <- 
c("src","dst","label") +dat <- rbind(dat.true,dat.false) +dat <- unique(dat) + +## Adamic Adar + +experiment.aa <- eval.aa(dat, DISCARD_REL) +write.csv(experiment.aa$raw, paste("../result/", OUTPUT_PREFIX ,".aa.csv",sep=""), row.names=F) + +## Semantic Proximity + +experiment.sp <- eval.sp(dat, DISCARD_REL) +write.csv(experiment.sp$raw, paste("../result/", OUTPUT_PREFIX ,".sp.csv",sep=""), row.names=F) + +## Personalized PageRank + +experiment.ppr <- eval.ppr(dat, DISCARD_REL) +write.csv(experiment.ppr$raw, paste("../result/", OUTPUT_PREFIX ,".ppr.csv",sep=""), row.names=F) + +## Preferential Attachment + +experiment.pa <- eval.pa(dat, DISCARD_REL) +write.csv(experiment.pa$raw, paste("../result/", OUTPUT_PREFIX ,".pa.csv",sep=""), row.names=F) + +## Katz + +experiment.katz <- eval.katz(dat, DISCARD_REL) +write.csv(experiment.katz$raw, paste("../result/", OUTPUT_PREFIX ,".katz.csv",sep=""), row.names=F) + +## AMIE + +experiment.amie <- eval.amie(dat, ASSOCIATE_REL) +write.csv(experiment.amie$raw, paste("../result/", OUTPUT_PREFIX ,".amie.csv",sep=""), row.names=F) + +## Test Method + + +experiment.fullpath.test <- eval.fullpath.test(dat, DISCARD_REL) +write.csv(experiment.fullpath.test$raw, paste("../result/", OUTPUT_PREFIX ,".fullpath.test.csv",sep=""), row.names=F) + +experiment.test <- eval.test(dat, DISCARD_REL) +write.csv(experiment.test$raw, paste("../result/", OUTPUT_PREFIX ,".test.csv",sep=""), row.names=F) + +## Test Method + PCRW + +experiment.amiepcrw <- eval.pcrw(dat, c(c(9,2),c(9,9),c(36,16),c(24,16),c(9,1),c(30,16))) +write.csv(experiment.amiepcrw$raw, paste("../result/", OUTPUT_PREFIX ,".pcrwamie.csv",sep=""), row.names=F) + +stopCluster(cl) + +experiment.simrank <- read.csv("../facts/gngm_celf.simrank.csv", header=F) +colnames(experiment.simrank) <- c("src", "dst", "score") +experiment.simrank <- merge(experiment.simrank, dat)[, c("label","score")] +experiment.simrank <- eval.df(experiment.simrank) +write.csv(experiment.simrank$raw, 
paste("../result/", OUTPUT_PREFIX ,".simrank.csv",sep=""), row.names=F) \ No newline at end of file diff --git a/Rscript/interpret.R b/Rscript/interpret.R new file mode 100644 index 0000000..a8e3de7 --- /dev/null +++ b/Rscript/interpret.R @@ -0,0 +1,16 @@ +dat <- read.csv("../result/city/city_capital_all.test.csv") +weights <- information.gain(label~.,dat) +weights <- data.frame(path = rownames(weights), importance=weights$attr_importance) +weights <- weights[order(weights$importance, decreasing = T),] +get_description <- function(threshold) { + true_labeled <- colSums(dat[which(dat$label==T),]) + true_labeled <- names(true_labeled[which(true_labeled > 0)]) + false_labeled <- colSums(dat[which(dat$label==F),]) + false_labeled <- names(false_labeled[which(false_labeled > threshold)]) + idx <- !is.element(true_labeled,false_labeled) + true_only <- true_labeled[idx] + true_only <- true_only[which(true_only != "label")] + + np <- weights[which((weights$path %in% true_only) & weights$importance > 0),] + return(np[order(np$importance, decreasing=T),]) +} diff --git a/Rscript/meatpath_predicate.R b/Rscript/meatpath_predicate.R new file mode 100644 index 0000000..5ffe3c8 --- /dev/null +++ b/Rscript/meatpath_predicate.R @@ -0,0 +1,40 @@ +library(ggplot2) +dat <- read.csv("../result/empirical_result_f1") +dat$test <- str_replace(dat$test, " ","\n") +g <- ggplot(dat, aes(x=nfeature, y=f1, shape=type, color=type)) + + geom_point(size=5) + + scale_shape_manual(values=c(15,0,16,1)) + + scale_x_log10(expand=c(0.1,0)) + + scale_y_continuous(expand=c(0.1,0.1), limits=c(0.6,1)) + + ylab("F-Measure") + + xlab("Number of Features") + + theme_classic() + + facet_grid(test~.) 
+ + theme(panel.background = element_rect(colour = "black", size=1), + legend.title=element_blank(), + legend.position="none", + legend.background = element_rect(fill = "transparent"), + legend.text = element_text(size = 8), + strip.text.y = element_text(size = 8)) + +ggsave("../result/dbpedia_metapath_predicate_fmeasure.eps", g, width = 5, height = 5) + +dat <- read.csv("../result/empirical_result_roc") +dat$test <- str_replace(dat$test, " ","\n") +g <- ggplot(dat, aes(x=nfeature, y=roc, shape=type, color=type)) + + geom_point(size=5) + + scale_shape_manual(values=c(15,0,16,1)) + + scale_x_log10(expand=c(0.1,0)) + + scale_y_continuous(expand=c(0.1,0.1), limits=c(0.6,1)) + + ylab("AUROC") + + xlab("Number of Features") + + theme_classic() + + facet_grid(test~.) + + theme(panel.background = element_rect(colour = "black", size=1), + legend.title=element_blank(), + legend.position="none", + legend.background = element_rect(fill = "transparent"), + legend.text = element_text(size = 8), + strip.text.y = element_text(size = 8)) +ggsave("../result/dbpedia_metapath_predicate_roc.eps", g, width = 5, height = 5) + diff --git a/Rscript/president_company.R b/Rscript/president_company.R new file mode 100644 index 0000000..81988c1 --- /dev/null +++ b/Rscript/president_company.R @@ -0,0 +1,152 @@ +# ---- Cleanup everything before start ---- +rm(list = ls()) +gc() + +### Find true capital-state pairs from all possible capital-state pairs + +# ---- GBSERVER API ---- +source("./experimentAPI.R") + +# ---- INPUT and CONFIGURATIONS ---- + +EDGE_TYPE_FILE = "../data/infobox.edgetypes" # Example : "../data/lobbyist.edgetypes" +INPUT_FILE = "../facts/ceo/ceo_id.csv" # Example : "../facts/lobbyist/firm_payee.csv" col 1 and 2 are ids and 3 is label +CLUSTER_SIZE = 48 # Number of workers in gbserver +FALSE_PER_TRUE = 5 +DISCARD_REL = 137 +ASSOCIATE_REL = c(169,20,21) + +# ---- Load edge type file ---- + +mapfile <- read.csv(EDGE_TYPE_FILE, sep="\t", header=F) +mapfile$V1 <- 
as.numeric(mapfile$V1) +mapfile$V2 <- as.character(mapfile$V2) + +# ---- Init workers ---- + +cl <- makeCluster(CLUSTER_SIZE) +clusterExport(cl = cl, varlist=c("adamic_adar", "semantic_proximity", "ppagerank", "heter_path", + "preferential_attachment", "katz", "pcrw", "heter_full_path", "meta_path", + "multidimensional_adamic_adar", "heterogeneous_adamic_adar", + "connectedby", "rel_path", "truelabeled", "falselabeled", "str_split", + "as.numeric", "request","DISCARD_REL"), envir = environment()) + + + +# ---- Load input data ---- +dat.true <- read.csv(INPUT_FILE) + +if (ncol(dat.true) < 3) + dat.true$label <- T + +# ---- Construct false labeled data ----- +set.seed(233) + +# TODO: reformat this so it is universal and file independent +dat.false <- rbind.fill(apply(dat.true, 1, function(x){ + candidates <- unique(dat.true[which(dat.true[,1] != x[1]), 2]) + candidates <- unlist(lapply(candidates, function(y){ + if(length(which(dat.true[,1] == x[1] & dat.true[,2] == y) != 0)) { + return(NULL) + } + return(y) + })) + return(data.frame(src=x[1], + dst=sample(candidates, FALSE_PER_TRUE), + label=F)) +})) + +colnames(dat.true) <- c("src","dst","label") +dat <- rbind(dat.true, dat.false) + +elapsed.time <- data.frame() + +## Adamic Adar + +experiment.aa <- eval.aa(dat, DISCARD_REL) +write.csv(experiment.aa$raw, "../result/company/president_company_no_keyperson_all.aa.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="aa", + elapsed = experiment.aa$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + +## Semantic Proximity + +experiment.sp <- eval.sp(dat, DISCARD_REL) +write.csv(experiment.sp$raw, "../result/company/president_company_no_keyperson_all.sp.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="sp", + elapsed = experiment.sp$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## Personalized PageRank + +experiment.ppr <- eval.ppr(dat, DISCARD_REL) +write.csv(experiment.ppr$raw, 
"../result/company/president_company_no_keyperson_all.ppr.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="ppr", + elapsed = experiment.ppr$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## Preferential Attachment + +experiment.pa <- eval.pa(dat, DISCARD_REL) +write.csv(experiment.pa$raw, "../result/company/president_company_no_keyperson_all.pa.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="pa", + elapsed = experiment.pa$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## Katz + +experiment.katz <- eval.katz(dat, DISCARD_REL) +write.csv(experiment.katz$raw, "../result/company/president_company_no_keyperson_all.katz.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="katz", + elapsed = experiment.katz$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## AMIE + +experiment.amie <- eval.amie(dat, ASSOCIATE_REL) +write.csv(experiment.amie$raw, "../result/company/president_company_no_keyperson_all.amie.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="amie", + elapsed = experiment.amie$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + +## Test Method + +experiment.fullpath.test <- eval.fullpath.test(dat, DISCARD_REL) +write.csv(experiment.fullpath.test$raw, "../result/company/president_company_no_keyperson_all.fullpath.test.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="fullpath.test", + elapsed = experiment.fullpath.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + +experiment.test <- eval.test(dat, DISCARD_REL) +write.csv(experiment.test$raw, "../result/company/president_company_no_keyperson_all.test.csv", row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="test", + elapsed = experiment.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + +experiment.pcrwamie <- eval.pcrw(dat, c(169,20,21)) +write.csv(experiment.pcrwamie$raw, "../result/company/president_company_no_keyperson_all.pcrwamie.csv", row.names=F) + +elapsed.time <- 
rbind(elapsed.time, data.frame(method="pcrw", + elapsed = experiment.pcrwamie$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + +write.csv(elapsed.time, paste("../result/company/president_company_no_keyperson_all.elapsed.csv",sep=""), row.names=F) + +stopCluster(cl) + + +experiment.simrank <- read.csv("../facts/company_president.simrank.csv", header=F) +colnames(experiment.simrank) <- c("src", "dst", "score") +experiment.simrank <- merge(experiment.simrank, dat)[, c("label","score")] +experiment.simrank <- eval.df(experiment.simrank) +write.csv(experiment.simrank$raw, "../result/company/president_company_no_keyperson_all.simrank.csv", row.names=F) + diff --git a/Rscript/president_us.R b/Rscript/president_us.R new file mode 100644 index 0000000..fb8c4dd --- /dev/null +++ b/Rscript/president_us.R @@ -0,0 +1,156 @@ +# ---- Cleanup everything before start ---- +rm(list = ls()) +gc() + +### Find battle commander pairs + +# ---- GBSERVER API ---- +source("./experimentAPI.R") + +# ---- INPUT and CONFIGURATIONS ---- + +EDGE_TYPE_FILE = "../data/infobox.edgetypes" # Example : "../data/lobbyist.edgetypes" +INPUT_FILE = "../facts/government/president_vice_id.csv" # Example : "../facts/lobbyist/firm_payee.csv" col 1 and 2 are ids and 3 is label +CLUSTER_SIZE = 48 # Number of workers in gbserver +FALSE_PER_TRUE = 5 +DISCARD_REL = 467 +ASSOCIATE_REL = c(284,124,56) +OUTPUT_PREFIX = "president/president" # "result/" OUTPUT_PREFIX + +# ---- Load edge type file ---- + +mapfile <- read.csv(EDGE_TYPE_FILE, sep="\t", header=F) +mapfile$V1 <- as.numeric(mapfile$V1) +mapfile$V2 <- as.character(mapfile$V2) + +# ---- Init workers ---- + +cl <- makeCluster(CLUSTER_SIZE) +clusterExport(cl = cl, varlist=c("adamic_adar", "semantic_proximity", "ppagerank", "heter_path", + "preferential_attachment", "katz", "pcrw", "heter_full_path", "meta_path", + "multidimensional_adamic_adar", "heterogeneous_adamic_adar", + "connectedby", "rel_path", "truelabeled", "falselabeled", "str_split", + "as.numeric", 
"request","DISCARD_REL"), envir = environment()) + + +# ---- Load input data ---- +dat.true <- unique(read.csv(INPUT_FILE)) + +if (ncol(dat.true) < 3) + dat.true$label <- T + +dat.false <- rbind.fill(apply(dat.true, 1, function(x){ + candidates <- unique(dat.true[which(dat.true[,1] != x[1]), 2]) + candidates <- unlist(lapply(candidates, function(y){ + if(length(which(dat.true[,1] == x[1] & dat.true[,2] == y) != 0)) { + return(NULL) + } + return(y) + })) + return(data.frame(src=x[1], + dst=sample(candidates, FALSE_PER_TRUE), + label=F)) +})) + +colnames(dat.true) <- c("src","dst","label") +colnames(dat.false) <- c("src","dst","label") + +dat <- rbind(dat.true,dat.false) +dat <- unique(dat) + +elapsed.time <- data.frame() + +## Adamic Adar + +experiment.aa <- eval.aa(dat, DISCARD_REL) +write.csv(experiment.aa$raw, paste("../result/", OUTPUT_PREFIX ,".aa.csv",sep=""), row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="aa", + elapsed = experiment.aa$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## Semantic Proximity + +experiment.sp <- eval.sp(dat, DISCARD_REL) +write.csv(experiment.sp$raw, paste("../result/", OUTPUT_PREFIX ,".sp.csv",sep=""), row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="sp", + elapsed = experiment.sp$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## Personalized PageRank + +experiment.ppr <- eval.ppr(dat, DISCARD_REL) +write.csv(experiment.ppr$raw, paste("../result/", OUTPUT_PREFIX ,".ppr.csv",sep=""), row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="ppr", + elapsed = experiment.ppr$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## Preferential Attachment + +experiment.pa <- eval.pa(dat, DISCARD_REL) +write.csv(experiment.pa$raw, paste("../result/", OUTPUT_PREFIX ,".pa.csv",sep=""), row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="pa", + elapsed = experiment.pa$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## Katz + +experiment.katz <- eval.katz(dat, 
DISCARD_REL) +write.csv(experiment.katz$raw, paste("../result/", OUTPUT_PREFIX ,".katz.csv",sep=""), row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="katz", + elapsed = experiment.katz$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + +## AMIE + +experiment.amie <- eval.amie(dat, ASSOCIATE_REL) +write.csv(experiment.amie$raw, paste("../result/", OUTPUT_PREFIX ,".amie.csv",sep=""), row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="amie", + elapsed = experiment.amie$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + + +## Test Method + +experiment.fullpath.test <- eval.fullpath.test(dat, DISCARD_REL) +write.csv(experiment.fullpath.test$raw, paste("../result/", OUTPUT_PREFIX ,".fullpath.test.csv",sep=""), row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="fullpath.test", + elapsed = experiment.fullpath.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))) + +experiment.test <- eval.test(dat, DISCARD_REL) +write.csv(experiment.test$raw, paste("../result/", OUTPUT_PREFIX ,".test.csv",sep=""), row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="test", + elapsed = experiment.test$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + + + + +experiment.pcrwamie <- eval.pcrw(dat, c(284,124,56)) +write.csv(experiment.pcrwamie$raw, paste("../result/", OUTPUT_PREFIX ,".pcrwamie.csv",sep=""), row.names=F) + +elapsed.time <- rbind(elapsed.time, data.frame(method="pcrw", + elapsed = experiment.pcrwamie$elapsed[3] * CLUSTER_SIZE / nrow(dat))); + +write.csv(elapsed.time, paste("../result/", OUTPUT_PREFIX ,".elapsed.csv",sep=""), row.names=F) + + +stopCluster(cl) + +experiment.simrank <- read.csv("../facts/president.simrank.csv", header=F) +colnames(experiment.simrank) <- c("src", "dst", "score") +experiment.simrank <- merge(experiment.simrank, dat)[, c("label","score")] +experiment.simrank <- eval.df(experiment.simrank) +write.csv(experiment.simrank$raw, "../result/president//president.simrank.csv", row.names=F) diff --git 
a/Rscript/robustness.R b/Rscript/robustness.R new file mode 100644 index 0000000..3d3fbf8 --- /dev/null +++ b/Rscript/robustness.R @@ -0,0 +1,178 @@ +# Test robustness of each algorithms with different data partition +library(FSelector) +library(ggplot2) +library(bear) +library(RWeka) +source("experimentAPI.R") + +# Data frame format +# res.df <- data.frame(algorithm, partition, roc) + +getRoc <- function(resdf) { + return(as.numeric(str_split(str_split(resdf$eval$string,"\n")[[1]][26], " +")[[1]][9])) +} + +best_val <- function(dat, weights, cl) { + clusterExport(cl = cl, varlist=c("dat", "weights", "eval.df", "cutoff.k", "getRoc","str_split", + "as.numeric"), envir = environment()) + res <- rbind.fill(parLapply(cl, seq(1,ncol(dat)-1,by = 1), function(x){ + library(FSelector) + library(RWeka) + wekares<-eval.df(dat[,c("label", cutoff.k(weights, x))]) + return(data.frame(roc=getRoc(wekares),nfeature=x)) + })) + return(res[which.max(res$roc),]) +} + +calculate <- function(filepath, algorithm, fselection = F) { + cl <- makeCluster(56) + + set.seed(233) + dat <- read.csv(filepath) + + ntrue <- length(which(dat$label == T)) + + res <- NULL + + for(frac in seq(0.1,0.9,by=0.1)) { + # Construct true labeled and false labeled data + total <- floor(ntrue / frac) + nfalse <- total - ntrue + truelabeled <- dat[which(dat$label == T),] + falselabeled <- dat[sample(which(dat$label == F), nfalse, replace = T),] + + ndat <- rbind(truelabeled, falselabeled) + + if (fselection) { + weights <- information.gain(label~., ndat) + rrr <- best_val(ndat, weights, cl) + res <- rbind(res, data.frame(algorithm = algorithm, roc = rrr$roc, frac = frac, ntrue = ntrue, nfalse = nfalse)) + } else { + res <- rbind(res, data.frame(algorithm = algorithm, roc = getRoc(eval.df(ndat)), frac = frac, ntrue = ntrue, nfalse = nfalse)) + } + + } + + stopCluster(cl) + + return(res) +} + +savegraph <- function(df, filepath) { + g <- ggplot(df, aes(x=frac, y=roc, shape=algorithm, color=algorithm)) + + 
geom_point(size=4, position = "dodge") + geom_line() + + ylab("AUROC") + + xlab("Precentage of True Labeled Data") + + scale_shape_manual(values=c(16,0,1,2,5,6,3,4,18)) + + scale_color_manual(values=c("blue",rep("red",8))) + + theme_classic() + + theme(panel.background = element_rect(colour = "black", size=1), + legend.title=element_blank(), + legend.position="none", + legend.background = element_rect(fill = "transparent")) + ggsave(filepath, g, width=5,height=5) +} + +capital_state <- rbind(calculate("../result/city/capital_state_all.test.csv", "test_method", T), + calculate("../result/city/capital_state_all.amie.csv", "AMIE"), + calculate("../result/city/capital_state_all.katz.csv", "KATZ"), + calculate("../result/city/capital_state_all.pa.csv", "Preferential Attachment"), + calculate("../result/city/capital_state_all.pcrwamie.csv", "PCRW"), + calculate("../result/city/capital_state_all.ppr.csv", "PPR"), + calculate("../result/city/state_capital_all.simrank.csv", "SimRank"), + calculate("../result/city/capital_state_all.sp.csv", "Semantic Proximity"), + calculate("../result/city/capital_state_all.aa.csv", "Adamic Adar")) + +savegraph(capital_state, "../result/robustness/capital_state.eps") + +city_capital <- rbind(calculate("../result/city/city_capital_all.test.csv", "test_method", T), + calculate("../result/city/city_capital_all.amie.csv", "AMIE"), + calculate("../result/city/city_capital_all.katz.csv", "KATZ"), + calculate("../result/city/city_capital_all.pa.csv", "Preferential Attachment"), + calculate("../result/city/city_capital_all.pcrwamie.csv", "PCRW"), + calculate("../result/city/city_capital_all.ppr.csv", "PPR"), + calculate("../result/city//city_capital_all.simrank.csv", "SimRank"), + calculate("../result/city/city_capital_all.sp.csv", "Semantic Proximity"), + calculate("../result/city/city_capital_all.aa.csv", "Adamic Adar")) + + + +savegraph(city_capital, "../result/robustness/city_capital.eps") + +company_president <- 
rbind(calculate("../result/company/president_company_no_keyperson_all.test.csv", "test_method", T), + calculate("../result/company/president_company_no_keyperson_all.amie.csv", "AMIE"), + calculate("../result/company/president_company_no_keyperson_all.katz.csv", "KATZ"), + calculate("../result/company/president_company_no_keyperson_all.pa.csv", "Preferential Attachment"), + calculate("../result/company/president_company_no_keyperson_all.pcrwamie.csv", "PCRW"), + calculate("../result/company/president_company_no_keyperson_all.ppr.csv", "PPR"), + calculate("../result/company/president_company_no_keyperson_all.simrank.csv", "SimRank"), + calculate("../result/company/president_company_no_keyperson_all.sp.csv", "Semantic Proximity"), + calculate("../result/company/president_company_no_keyperson_all.aa.csv", "Adamic Adar")) + + +savegraph(company_president, "../result/robustness/company_president.eps") + + +nyt_bestseller <- rbind(calculate("../result/best_seller/best_seller.test.csv", "test_method", T), + calculate("../result/best_seller/best_seller.amie.csv", "AMIE"), + calculate("../result/best_seller/best_seller.katz.csv", "KATZ"), + calculate("../result/best_seller/best_seller.pa.csv", "Preferential Attachment"), + calculate("../result/best_seller/best_seller.pcrwamie.csv", "PCRW"), + calculate("../result/best_seller/best_seller.ppr.csv", "PPR"), + calculate("../result/best_seller/best_seller.simrank.csv", "SimRank"), + calculate("../result/best_seller/best_seller.sp.csv", "Semantic Proximity"), + calculate("../result/best_seller/best_seller.aa.csv", "Adamic Adar")) + + + +savegraph(nyt_bestseller, "../result/robustness/nyt_bestseller.eps") + + +civil_war <- rbind(calculate("../result/civil_war/battle_commander.test.csv", "test_method", T), + calculate("../result/civil_war/battle_commander.amie.csv", "AMIE"), + calculate("../result/civil_war/battle_commander.katz.csv", "KATZ"), + calculate("../result/civil_war/battle_commander.pa.csv", "Preferential Attachment"), + 
calculate("../result/civil_war/battle_commander.pcrwamie.csv", "PCRW"), + calculate("../result/civil_war/battle_commander.ppr.csv", "PPR"), + calculate("../result/civil_war/civil_war.simrank.csv", "SimRank"), + calculate("../result/civil_war/battle_commander.sp.csv", "Semantic Proximity"), + calculate("../result/civil_war/battle_commander.aa.csv", "Adamic Adar")) + + + +savegraph(civil_war, "../result/robustness/civil_war.eps") + +uspresident <- rbind(calculate("../result/president/president.test.csv", "test_method", T), + calculate("../result/president/president.amie.csv", "AMIE"), + calculate("../result/president/president.katz.csv", "KATZ"), + calculate("../result/president/president.pa.csv", "Preferential Attachment"), + calculate("../result/president/president.pcrwamie.csv", "PCRW"), + calculate("../result/president/president.ppr.csv", "PPR"), + calculate("../result/president/president.simrank.csv", "SimRank"), + calculate("../result/president/president.sp.csv", "Semantic Proximity"), + calculate("../result/president/president.aa.csv", "Adamic Adar")) + + +savegraph(uspresident, "../result/robustness/uspresident.eps") + +aapp_dsyn <- rbind(calculate("../result/aapp_dsyn/aapp_dsyn.test.csv", "test_method", T), + calculate("../result/aapp_dsyn/aapp_dsyn.amie.csv", "AMIE"), + calculate("../result/aapp_dsyn/aapp_dsyn.katz.csv", "KATZ"), + calculate("../result/aapp_dsyn/aapp_dsyn.pa.csv", "Preferential Attachment"), + calculate("../result/aapp_dsyn/aapp_dsyn.pcrwamie.csv", "PCRW"), + calculate("../result/aapp_dsyn/aapp_dsyn.ppr.csv", "PPR"), + calculate("../result/aapp_dsyn/aapp_dsyn.simrank.csv", "SimRank"), + calculate("../result/aapp_dsyn/aapp_dsyn.sp.csv", "Semantic Proximity"), + calculate("../result/aapp_dsyn/aapp_dsyn.aa.csv", "Adamic Adar")) +savegraph(aapp_dsyn, "../result/robustness/aapp_dsyn.eps") + +gngm_celf <- rbind(calculate("../result/gngm_celf/gngm_celf.test.csv", "test_method", T), + calculate("../result/gngm_celf/gngm_celf.amie.csv", "AMIE"), + 
calculate("../result/gngm_celf/gngm_celf.katz.csv", "KATZ"), + calculate("../result/gngm_celf/gngm_celf.pa.csv", "Preferential Attachment"), + calculate("../result/gngm_celf/gngm_celf.pcrwamie.csv", "PCRW"), + calculate("../result/gngm_celf/gngm_celf.ppr.csv", "PPR"), + calculate("../result/gngm_celf/gngm_celf.simrank.csv", "SimRank"), + calculate("../result/gngm_celf/gngm_celf.sp.csv", "Semantic Proximity"), + calculate("../result/gngm_celf/gngm_celf.aa.csv", "Adamic Adar")) +savegraph(gngm_celf, "../result/robustness/gngm_celf.eps") \ No newline at end of file diff --git a/Rscript/time_consumption.R b/Rscript/time_consumption.R new file mode 100644 index 0000000..2fceb52 --- /dev/null +++ b/Rscript/time_consumption.R @@ -0,0 +1,50 @@ +library(ggplot2) +library(bear) +library(Rmisc) + +files <- c("../result/best_seller/best_seller.elapsed.csv", "../result//civil_war//battle_commander.elapsed.csv", + "../result/company/president_company_no_keyperson_all.elapsed.csv", "../result/city/capital_state_all.elapsed.csv", + "../result/city/city_capital_all.elapsed.csv", "../result/president/president.elapsed.csv") + +dat <- rbind.fill(lapply(files, read.csv)) + +simrank <- rbind.fill(lapply(c("../facts/best_seller.simrank.csv", + "../facts/city_capital.simrank.csv", + "../facts/civil_war.simrank.csv", + "../facts/company_president.simrank.csv", + "../facts/president.simrank.csv", + "../facts/state_capital.simrank.csv"), read.csv, header=F))[,4] + +titlem <- list() +titlem[["aa"]] = "AA" +titlem[["amie"]] = "AMIE" +titlem[["katz"]] = "Katz" +titlem[["sp"]] = "SP" +titlem[["pa"]] = "PA" +titlem[["ppr"]] = "PPR" +titlem[["pcrw"]] = "PCRW" +titlem[["test"]] = "Predicate\nPath" + + +dat$method <- unlist(lapply(dat$method, function(x){ + return (titlem[[x]]) +})) + +dat.se <- summarySE(data = dat, measurevar = "elapsed", groupvars = c("method")) +simrank.se <- STDERR(simrank) / 1000 +dat.se <- rbind(dat.se, 
data.frame(method="SimRank",N=6,elapsed=simrank.se[2],sd=0,se=simrank.se[1]-simrank.se[2],ci=0)) +g <- ggplot(dat.se[order(dat.se$elapsed),], aes(y=elapsed, x=method)) + + geom_point(size=3) + + geom_errorbar(aes(ymax=elapsed+ci, ymin=elapsed-ci), width=.5) + + scale_y_log10(expand=c(0,0.1)) + + scale_x_discrete(expand=c(0.05,0.05)) + + ylab("Execution Time Per Query (second)") + + xlab("Algorithm") + + guides(color=guide_legend(override.aes = list(size=0.5))) + + theme_classic() + + theme(panel.background = element_rect(colour = "black", size=1), + legend.justification=c(0,0),legend.title=element_blank(), + legend.position=c(0.2,0)) + +ggsave("../result/time.eps", g, width=7, height=5) + diff --git a/Rscript/top_related_rels.R b/Rscript/top_related_rels.R new file mode 100644 index 0000000..bd4ec68 --- /dev/null +++ b/Rscript/top_related_rels.R @@ -0,0 +1,24 @@ +library(FSelector) +library(ggplot2) +source("experimentAPI.R") + +EDGE_TYPE_FILE = "../data/semmed.edgetypes" # Example : "../data/lobbyist.edgetypes" +mapfile <- read.csv(EDGE_TYPE_FILE, sep="\t", header=F) +mapfile$V1 <- as.numeric(mapfile$V1) +mapfile$V2 <- as.character(mapfile$V2) + +dat <- read.csv("../result/aapp_dsyn//aapp_dsyn.test.csv") + +# Change path name +colnames(dat) <- str_replace_all(str_replace(colnames(dat), "X",""), "\\.",",") + +# Get weights of raw features + +weights <- information.gain(label~., dat) + +features <- data.frame(importance=weights$attr_importance, path=row.names(weights)) + +features <- features[order(-features$importance),] +features$path <- idpath2title(features$path, mapfile) + +head(features) diff --git a/src/gbserver/edge_loader.h b/src/gbserver/edge_loader.h index 6b6c549..3a87818 100644 --- a/src/gbserver/edge_loader.h +++ b/src/gbserver/edge_loader.h @@ -297,6 +297,24 @@ class edge_loader { inline unsigned int get_type_rel() { return type_rel; } + unsigned int get_nentity_connected_by_rel(unsigned int rel) { + unsigned int counter = 0; + for (int id = 0; id < 
max_id; id++) { + std::set > &edges = get_edges(id).get_backward(); + for (auto it = edges.cbegin(); it != edges.cend(); ++it) { + if (it->second == rel) { + counter++; // if id is connected by rel type, break; + break; + } + } + } + return counter; + } + + unsigned int get_nontology() { + return get_nentity_connected_by_rel(type_rel); + } + }; diff --git a/src/gbserver/graph.h b/src/gbserver/graph.h index a52cbea..61d3632 100644 --- a/src/gbserver/graph.h +++ b/src/gbserver/graph.h @@ -678,6 +678,8 @@ class graph { } + unsigned int get_nontology() { return edges_ptr->get_nontology(); } + }; #endif //GBPEDIA_GRAPH_H \ No newline at end of file diff --git a/src/gbserver/socket_server.h b/src/gbserver/socket_server.h index 7d9b97e..fcc1e42 100644 --- a/src/gbserver/socket_server.h +++ b/src/gbserver/socket_server.h @@ -378,6 +378,8 @@ void worker(local::stream_protocol::socket *socket, graph +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + + +vector<vector<int> > edge; +vector<vector<int> > invedge; +vector<vector<int> > foredge; +double c; +int times, numr; + +double singlepair(int sid, int eid, int r) { + int *u; + int *v; + int t, i, j, id, alpha, beta; + double simrvalue = 0; + + + u = (int *) malloc(r * sizeof(int)); + v = (int *) malloc(r * sizeof(int)); + for (i = 0; i < r; i++) { + u[i] = sid; + v[i] = eid; + } + + + for (t = 0; t < times; t++) { + vector<int> inters(r); + vector<int>::iterator it; + sort(u, u + r); + sort(v, v + r); + vector<int> uu(u, u + r); + vector<int> uv(v, v + r); + /* for (it=uu.begin();it!=uu.end();++it) + printf("%d ",*it); + printf("\n"); + for (it=uv.begin();it!=uv.end();++it) + printf("%d ",*it); + printf("\n"); + printf("\n");*/ + + it = unique(uu.begin(), uu.end()); + uu.resize(distance(uu.begin(), it)); + it = unique(uv.begin(), uv.end()); + uv.resize(distance(uv.begin(), it)); + /*for (it=uu.begin();it!=uu.end();++it) + printf("%d ",*it); + printf("\n"); + for (it=uv.begin();it!=uv.end();++it) + printf("%d ",*it); 
+ printf("\n"); + printf("\n");*/ + + + it = set_intersection(uu.begin(), uu.end(), uv.begin(), uv.end(), inters.begin()); + + inters.resize(it - inters.begin()); + + // printf("size:%ld\n",inters.size()); + if (inters.size() > 0) { + for (it = inters.begin(); it != inters.end(); ++it) { + id = *it; + alpha = 0; + beta = 0; + for (i = 0; i < r; i++) { + if (u[i] == id) + alpha++; + if (v[i] == id) + beta++; + } + simrvalue = simrvalue + pow(c, t) * (1 - c) * double(alpha) * double(beta) / pow(r, 2); + + } + } + for (i = 0; i < r; i++) { + if (invedge[u[i]].size() > 1) + u[i] = invedge[u[i]][rand() % (invedge[u[i]].size() - 1) + 1]; + if (invedge[v[i]].size() > 1) + v[i] = invedge[v[i]][rand() % (invedge[v[i]].size() - 1) + 1]; + } + +// printf("%f \n",simrvalue); + } + return simrvalue; +} + +int main(int argc, char **argv) { + FILE *fin, *fout; + char line[300]; + char file[100], ofile[100]; + int len, topK, tempid, i, j, a, b, numedge = 0, maxvv = 0, newid = 0, start, sid, Maxdis; + int *trans; + int *invtrans; + int *dist, *outputid; + double *simv, *sortsimv, *topsimv; + double tempv; + srand(time(NULL)); + if (argc != 9) { + printf("Usage: Edge_file Output_file StartId T R c EndID Maxdis\n"); + return 2; + } + + strcpy(file, argv[1]); + strcpy(ofile, argv[2]); + sscanf(argv[3], "%d", &start); + sscanf(argv[4], "%d", &times); + sscanf(argv[5], "%d", &numr); + sscanf(argv[6], "%lf", &c); + sscanf(argv[7], "%d", &topK); + sscanf(argv[8], "%d", &Maxdis); + + fin = fopen(file, "r"); + if (!fin) { + printf("File doesn't exist\n"); + return 1; + } + while (fgets(line, 100, fin) != NULL) { + sscanf(line, "%d %d", &a, &b); + numedge++; + if (a > maxvv) + maxvv = a; + if (b > maxvv) + maxvv = b; + //printf("%d %d\n",a,b); + } + maxvv++; + trans = (int *) malloc(maxvv * sizeof(int)); + for (i = 0; i < maxvv; i++) + trans[i] = -1; + fclose(fin); + a = 0; + b = 0; + fin = fopen(file, "r"); + while (fgets(line, 100, fin) != NULL) { + sscanf(line, "%d %d", &a, &b); + if (trans[a] 
== -1) { + trans[a] = newid; + newid++; + } + if (trans[b] == -1) { + trans[b] = newid; + newid++; + } + } + invtrans = (int *) malloc(newid * sizeof(int)); + dist = (int *) malloc(newid * sizeof(int)); + outputid = (int *) malloc(newid * sizeof(int)); + + simv = (double *) malloc(newid * sizeof(double)); + sortsimv = (double *) malloc(newid * sizeof(double)); + topsimv = (double *) malloc(newid * sizeof(double)); + for (i = 0; i < maxvv; i++) + if (trans[i] != -1) + invtrans[trans[i]] = i; + + for (i = 0; i < newid; i++) { + vector<int> temprow(1, -1); + edge.push_back(temprow); + invedge.push_back(temprow); + foredge.push_back(temprow); + simv[i] = 0; + dist[i] = 1000000000; + outputid[i] = i; + } + fclose(fin); + fin = fopen(file, "r"); + vector<int>::iterator checkit; + while (fgets(line, 100, fin) != NULL) { + sscanf(line, "%d %d", &a, &b); + checkit = find(edge[trans[a]].begin(), edge[trans[a]].end(), trans[b]); + if (checkit == edge[trans[a]].end()) { + edge[trans[a]].push_back(trans[b]); + edge[trans[b]].push_back(trans[a]); + invedge[trans[b]].push_back(trans[a]); + invedge[trans[a]].push_back(trans[b]); + } + } + fclose(fin); + + fout = fopen(ofile, "w+"); + clock_t start_t = clock(); + double simscore = singlepair(trans[start], trans[topK], numr); + clock_t end_t = clock(); + fprintf(fout, "%d %d %lf %lu\n", start, topK, simscore, (unsigned long) (end_t - start_t) * 1000 / CLOCKS_PER_SEC); + fclose(fout); + return 0; +} +