Skip to content

Commit

Permalink
Merge pull request #25 from bxshi/dev
Browse files Browse the repository at this point in the history
clean up and upload R scripts
  • Loading branch information
bxshi committed Sep 24, 2015
2 parents 4802f13 + adec27f commit 5013099
Show file tree
Hide file tree
Showing 24 changed files with 1,787 additions and 77 deletions.
110 changes: 110 additions & 0 deletions Rscript/aapp_dsyn.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# ---- Start from an empty workspace ----
# NOTE(review): this wipes every object in the calling environment, so the
# script is only safe to run in a session that holds nothing worth keeping.
rm(list = ls())
gc()

### Experiment on the aapp_cause_dsyn fact pairs

# ---- GBSERVER API ----
# Names used below but not defined in this script are presumably provided by
# this file — verify against experimentAPI.R.
source("./experimentAPI.R")

# ---- INPUT and CONFIGURATIONS ----

# Tab-separated edge type table. Example : "../data/lobbyist.edgetypes"
EDGE_TYPE_FILE <- "../data/semmed_unique.edgetypes"
# Fact file: col 1 and 2 are ids and 3 is label.
# Example : "../facts/lobbyist/firm_payee.csv"
INPUT_FILE <- "../facts/semmed/aapp_cause_dsyn_clean_id.csv"
# Number of workers in gbserver
CLUSTER_SIZE <- 48
# Number of false pairs drawn for each true pair
FALSE_PER_TRUE <- 5
# Relation id handed to the eval.* calls that take DISCARD_REL
DISCARD_REL <- 5
# Relation ids handed to eval.amie
ASSOCIATE_REL <- c(22, 28, 25, 17, 39, 20, 37)
# Result files are written to "../result/" followed by this prefix
OUTPUT_PREFIX <- "aapp_dsyn/aapp_dsyn"

# ---- Load edge type file ----

# The file has no header row, so the columns get the default names V1 and V2.
mapfile <- read.csv(file = EDGE_TYPE_FILE, header = FALSE, sep = "\t")
mapfile[["V1"]] <- as.numeric(mapfile[["V1"]])
mapfile[["V2"]] <- as.character(mapfile[["V2"]])

# ---- Init workers ----

cl <- makeCluster(CLUSTER_SIZE)

# Copy the listed objects from this environment to every worker.
clusterExport(
  cl = cl,
  varlist = c(
    "adamic_adar", "semantic_proximity", "ppagerank",
    "preferential_attachment", "katz", "pcrw",
    "multidimensional_adamic_adar", "heterogeneous_adamic_adar",
    "connectedby", "rel_path", "truelabeled", "falselabeled", "str_split",
    "as.numeric", "request", "DISCARD_REL"
  ),
  envir = environment()
)

# ---- Load input data ----
# Column 1 and 2 are the source / destination ids; an optional third column is
# the label. Exact duplicate rows are dropped.
dat.true <- unique(read.csv(INPUT_FILE))

# A file that only lists pairs gets an explicit TRUE label for every row.
if (ncol(dat.true) < 3) {
  dat.true$label <- TRUE
}

# ---- Construct false labeled data -----
set.seed(233)

# Draw false (negative) pairs for every row of `pairs`.
#
# For each row, the candidate destinations are the distinct values of column 2
# that appear with some other source and never appear together with this row's
# source. Up to `n_false` of them are drawn without replacement.
#
# pairs   : data frame whose first two columns are source and destination ids.
# n_false : number of false destinations to draw per row.
# Returns a data frame with columns src, dst, label (label is always FALSE),
# or NULL when no row has any candidate.
sample_false_pairs <- function(pairs, n_false) {
  # Work on the columns directly instead of apply(): apply() pushes the data
  # frame through as.matrix(), which can turn ids into formatted strings and
  # silently break the equality tests below.
  src <- pairs[[1]]
  dst <- pairs[[2]]

  per_row <- lapply(seq_len(nrow(pairs)), function(i) {
    this_src <- src[i]
    pool <- unique(dst[which(src != this_src)])
    pool <- pool[!(pool %in% dst[which(src == this_src)])]

    # Cap the draw so a short candidate list yields fewer negatives instead
    # of an error.
    n_draw <- min(n_false, length(pool))
    if (n_draw == 0) {
      return(NULL)
    }

    # Index with sample.int() rather than sample(pool, n): sample() treats a
    # single numeric value x as 1:x and would invent ids. For longer pools the
    # random draws are identical to sample(pool, n).
    data.frame(src = this_src,
               dst = pool[sample.int(length(pool), n_draw)],
               label = FALSE)
  })

  rbind.fill(Filter(Negate(is.null), per_row))
}

dat.false <- sample_false_pairs(dat.true, FALSE_PER_TRUE)

# Align the column names so the true and false pairs can be stacked.
colnames(dat.true) <- c("src", "dst", "label")
dat <- rbind(dat.true, dat.false)
dat <- unique(dat)

# ---- Run every method and save its raw output ----

# Path of the result file for one method:
# "../result/" + OUTPUT_PREFIX + "." + method + ".csv"
result_file <- function(method) {
  paste0("../result/", OUTPUT_PREFIX, ".", method, ".csv")
}

# Write the `raw` element of an experiment result to that method's CSV.
# The target directory is created first so a missing folder does not throw
# away a finished computation.
save_raw <- function(experiment, method) {
  out <- result_file(method)
  dir.create(dirname(out), recursive = TRUE, showWarnings = FALSE)
  write.csv(experiment$raw, out, row.names = FALSE)
}

# The workers are released in `finally`, so a failing experiment no longer
# leaves CLUSTER_SIZE worker processes running. Errors still propagate.
tryCatch({
  ## Adamic Adar
  experiment.aa <- eval.aa(dat, DISCARD_REL)
  save_raw(experiment.aa, "aa")

  ## Semantic Proximity
  experiment.sp <- eval.sp(dat, DISCARD_REL)
  save_raw(experiment.sp, "sp")

  ## Personalized PageRank
  experiment.ppr <- eval.ppr(dat, DISCARD_REL)
  save_raw(experiment.ppr, "ppr")

  ## Preferential Attachment
  experiment.pa <- eval.pa(dat, DISCARD_REL)
  save_raw(experiment.pa, "pa")

  ## Katz
  experiment.katz <- eval.katz(dat, DISCARD_REL)
  save_raw(experiment.katz, "katz")

  ## AMIE
  experiment.amie <- eval.amie(dat, ASSOCIATE_REL)
  save_raw(experiment.amie, "amie")

  ## Test Method
  #experiment.test <- eval.test(dat, DISCARD_REL)
  #write.csv(experiment.test$raw, paste("../result/", OUTPUT_PREFIX ,".test.csv",sep=""), row.names=F)

  ## PCRW over the same relation ids that are given to AMIE
  experiment.pcrwamie <- eval.pcrw(dat, ASSOCIATE_REL)
  save_raw(experiment.pcrwamie, "pcrwamie")
}, finally = {
  stopCluster(cl)
})

## SimRank
# Scores are read from a file rather than computed here. The file has no
# header; its three columns are named src, dst, score.
experiment.simrank <- read.csv("../facts/aapp_dsyn.simrank.csv", header = FALSE)
colnames(experiment.simrank) <- c("src", "dst", "score")
# merge() joins on the shared columns (src, dst), which attaches the label of
# each pair to its score; pairs missing from either side are dropped.
experiment.simrank <- merge(experiment.simrank, dat)[, c("label", "score")]
experiment.simrank <- eval.df(experiment.simrank)
save_raw(experiment.simrank, "simrank")


151 changes: 151 additions & 0 deletions Rscript/battle_commander.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# ---- Start from an empty workspace ----
# NOTE(review): this wipes every object in the calling environment, so the
# script is only safe to run in a session that holds nothing worth keeping.
rm(list = ls())
gc()

### Find battle commander pairs

# ---- GBSERVER API ----
# Names used below but not defined in this script are presumably provided by
# this file — verify against experimentAPI.R.
source("./experimentAPI.R")

# ---- INPUT and CONFIGURATIONS ----

# Tab-separated edge type table. Example : "../data/lobbyist.edgetypes"
EDGE_TYPE_FILE <- "../data/infobox.edgetypes"
# Fact file: col 1 and 2 are ids and 3 is label.
# Example : "../facts/lobbyist/firm_payee.csv"
INPUT_FILE <- "../facts/civil_war//battle_commander_id.tsv"
# Number of workers in gbserver
CLUSTER_SIZE <- 48
# Number of false pairs drawn for each true pair
FALSE_PER_TRUE <- 5
# Relation id handed to the eval.* calls that take DISCARD_REL
DISCARD_REL <- 78
# Relation ids handed to eval.amie
ASSOCIATE_REL <- c(144)

# ---- Load edge type file ----

# The file has no header row, so the columns get the default names V1 and V2.
mapfile <- read.csv(file = EDGE_TYPE_FILE, header = FALSE, sep = "\t")
mapfile[["V1"]] <- as.numeric(mapfile[["V1"]])
mapfile[["V2"]] <- as.character(mapfile[["V2"]])

# ---- Init workers ----

cl <- makeCluster(CLUSTER_SIZE)

# Copy the listed objects from this environment to every worker.
clusterExport(
  cl = cl,
  varlist = c(
    "adamic_adar", "semantic_proximity", "ppagerank", "heter_path",
    "preferential_attachment", "katz", "pcrw", "heter_full_path", "meta_path",
    "multidimensional_adamic_adar", "heterogeneous_adamic_adar",
    "connectedby", "rel_path", "truelabeled", "falselabeled", "str_split",
    "as.numeric", "request", "DISCARD_REL"
  ),
  envir = environment()
)



# ---- Load input data ----
# Column 1 and 2 are the source / destination ids; an optional third column is
# the label. Exact duplicate rows are dropped.
# NOTE(review): INPUT_FILE is named *.tsv but is read with read.csv()
# defaults (comma separator, header row) — confirm the file really is
# comma-separated.
dat.true <- unique(read.csv(INPUT_FILE))

# A file that only lists pairs gets an explicit TRUE label for every row.
if (ncol(dat.true) < 3) {
  dat.true$label <- TRUE
}

# ---- Construct false labeled data -----
set.seed(233)

# Draw false (negative) pairs for every row of `pairs`.
#
# For each row, the candidate destinations are the distinct values of column 2
# that appear with some other source and never appear together with this row's
# source. Up to `n_false` of them are drawn without replacement.
#
# pairs   : data frame whose first two columns are source and destination ids.
# n_false : number of false destinations to draw per row.
# Returns a data frame with columns src, dst, label (label is always FALSE),
# or NULL when no row has any candidate.
sample_false_pairs <- function(pairs, n_false) {
  # Work on the columns directly instead of apply(): apply() pushes the data
  # frame through as.matrix(), which can turn ids into formatted strings and
  # silently break the equality tests below.
  src <- pairs[[1]]
  dst <- pairs[[2]]

  per_row <- lapply(seq_len(nrow(pairs)), function(i) {
    this_src <- src[i]
    pool <- unique(dst[which(src != this_src)])
    pool <- pool[!(pool %in% dst[which(src == this_src)])]

    # Cap the draw so a short candidate list yields fewer negatives instead
    # of an error.
    n_draw <- min(n_false, length(pool))
    if (n_draw == 0) {
      return(NULL)
    }

    # Index with sample.int() rather than sample(pool, n): sample() treats a
    # single numeric value x as 1:x and would invent ids. For longer pools the
    # random draws are identical to sample(pool, n).
    data.frame(src = this_src,
               dst = pool[sample.int(length(pool), n_draw)],
               label = FALSE)
  })

  rbind.fill(Filter(Negate(is.null), per_row))
}

dat.false <- sample_false_pairs(dat.true, FALSE_PER_TRUE)

# Align the column names so the true and false pairs can be stacked.
colnames(dat.true) <- c("src", "dst", "label")
dat <- rbind(dat.true, dat.false)
dat <- unique(dat)

# ---- Run every method, save its raw output and record its timing ----

# Path of a result file inside ../result/civil_war, e.g.
# result_file("battle_commander.aa") -> "../result/civil_war/battle_commander.aa.csv"
result_file <- function(stem) {
  file.path("../result/civil_war", paste0(stem, ".csv"))
}

# Write the `raw` element of an experiment result to the given result file.
save_raw <- function(experiment, stem) {
  write.csv(experiment$raw, result_file(stem), row.names = FALSE)
}

# One row of the timing table for a finished experiment: the third entry of
# its `elapsed` element, scaled by CLUSTER_SIZE / nrow(dat).
# NOTE(review): presumably `elapsed` is a system.time()-style vector, which
# would make this a per-pair, per-worker time — confirm in experimentAPI.R.
timing_row <- function(method, experiment) {
  data.frame(method = method,
             elapsed = experiment$elapsed[3] * CLUSTER_SIZE / nrow(dat))
}

elapsed.time <- data.frame()

# The workers are released in `finally`, so a failing experiment no longer
# leaves CLUSTER_SIZE worker processes running. Errors still propagate.
tryCatch({
  ## Adamic Adar
  experiment.aa <- eval.aa(dat, DISCARD_REL)
  save_raw(experiment.aa, "battle_commander.aa")
  elapsed.time <- rbind(elapsed.time, timing_row("aa", experiment.aa))

  ## Semantic Proximity
  experiment.sp <- eval.sp(dat, DISCARD_REL)
  save_raw(experiment.sp, "battle_commander.sp")
  elapsed.time <- rbind(elapsed.time, timing_row("sp", experiment.sp))

  ## Personalized PageRank
  experiment.ppr <- eval.ppr(dat, DISCARD_REL)
  save_raw(experiment.ppr, "battle_commander.ppr")
  elapsed.time <- rbind(elapsed.time, timing_row("ppr", experiment.ppr))

  ## Preferential Attachment
  experiment.pa <- eval.pa(dat, DISCARD_REL)
  save_raw(experiment.pa, "battle_commander.pa")
  elapsed.time <- rbind(elapsed.time, timing_row("pa", experiment.pa))

  ## Katz
  experiment.katz <- eval.katz(dat, DISCARD_REL)
  save_raw(experiment.katz, "battle_commander.katz")
  elapsed.time <- rbind(elapsed.time, timing_row("katz", experiment.katz))

  ## AMIE
  experiment.amie <- eval.amie(dat, ASSOCIATE_REL)
  save_raw(experiment.amie, "battle_commander.amie")
  elapsed.time <- rbind(elapsed.time, timing_row("amie", experiment.amie))

  ## Test Method
  experiment.fullpath.test <- eval.fullpath.test(dat, DISCARD_REL)
  save_raw(experiment.fullpath.test, "battle_commander.fullpath.test")
  elapsed.time <- rbind(elapsed.time,
                        timing_row("fullpath.test", experiment.fullpath.test))

  experiment.test <- eval.test(dat, DISCARD_REL)
  save_raw(experiment.test, "battle_commander.test")
  elapsed.time <- rbind(elapsed.time, timing_row("test", experiment.test))

  ## PCRW over the same relation ids that are given to AMIE.
  # The timing row keeps the label "pcrw" while the file keeps the
  # "pcrwamie" suffix, matching the names used by earlier runs.
  experiment.pcrwamie <- eval.pcrw(dat, ASSOCIATE_REL)
  save_raw(experiment.pcrwamie, "battle_commander.pcrwamie")
  elapsed.time <- rbind(elapsed.time, timing_row("pcrw", experiment.pcrwamie))

  write.csv(elapsed.time, result_file("battle_commander.elapsed"),
            row.names = FALSE)
}, finally = {
  stopCluster(cl)
})

## SimRank
# Scores are read from a file rather than computed here. The file has no
# header; its three columns are named src, dst, score.
experiment.simrank <- read.csv("../facts/civil_war.simrank.csv", header = FALSE)
colnames(experiment.simrank) <- c("src", "dst", "score")
# merge() joins on the shared columns (src, dst), which attaches the label of
# each pair to its score; pairs missing from either side are dropped.
experiment.simrank <- merge(experiment.simrank, dat)[, c("label", "score")]
experiment.simrank <- eval.df(experiment.simrank)
save_raw(experiment.simrank, "civil_war.simrank")
Loading

0 comments on commit 5013099

Please sign in to comment.