Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Random seed for random walk generation #71

Merged
merged 11 commits into from
Feb 12, 2022
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pytest==6.2.5
pytest-cov==3.0.0
twine==3.4.1
pre-commit==2.16.0
parameterized==0.8.1
10 changes: 9 additions & 1 deletion src/pecanpy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,13 @@ def parse_args():
help="Noisy edge threshold parameter.",
)

parser.add_argument(
"--random_state",
type=int,
default=None,
help="Random seed for generating random walks.",
)

parser.add_argument(
"--delimiter",
type=str,
Expand Down Expand Up @@ -249,6 +256,7 @@ def read_graph(args):
directed = args.directed
extend = args.extend
gamma = args.gamma
random_state = args.random_state
mode = args.mode
task = args.task
delimiter = args.delimiter
Expand All @@ -266,7 +274,7 @@ def read_graph(args):
exit()

pecanpy_mode = getattr(pecanpy, mode, None)
g = pecanpy_mode(p, q, workers, verbose, extend, gamma)
g = pecanpy_mode(p, q, workers, verbose, extend, gamma, random_state)

if fp.endswith(".npz"):
g.read_npz(fp, weighted)
Expand Down
30 changes: 20 additions & 10 deletions src/pecanpy/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,18 +456,29 @@ class DenseGraph(BaseGraph):
def __init__(self):
"""Initialize DenseGraph object."""
super().__init__()
self.data = None
self.nonzero = None
self._data = None
self._nonzero = None

@property
def num_edges(self):
"""Return the number of edges in the graph."""
return self.nonzero.sum()

def _set_data(self, data):
"""Set data and update nonzero."""
self.data = data
self.nonzero = data != 0
@property
def data(self):
"""Return the adjacency matrix."""
return self._data

@property
def nonzero(self):
"""Return the nonzero mask for the adjacency matrix."""
return self._nonzero

@data.setter
def data(self, data):
"""Set adjacency matrix and the corresponding nonzero matrix."""
self._data = data.astype(float)
self._nonzero = self._data != 0

def read_npz(self, fp, weighted):
"""Read ``.npz`` file and create dense graph.
Expand All @@ -479,7 +490,7 @@ def read_npz(self, fp, weighted):

"""
raw = np.load(fp)
self._set_data(raw["data"])
self.data = raw["data"]
if not weighted: # overwrite edge weights with constant
self.data = self.nonzero * 1.0
self.set_ids(raw["IDs"].tolist())
Expand All @@ -490,7 +501,7 @@ def read_edg(self, edg_fp, weighted, directed, delimiter="\t"):
g.read(edg_fp, weighted, directed, delimiter)

self.set_ids(g.IDlst)
self._set_data(g.to_dense())
self.data = g.to_dense()

def save(self, fp):
"""Save dense graph as ``.dense.npz`` file."""
Expand All @@ -507,7 +518,7 @@ def from_adjlst_graph(cls, adjlst_graph, **kwargs):
"""
g = cls(**kwargs)
g.set_ids(adjlst_graph.IDlst)
g._set_data(adjlst_graph.to_dense())
g.data = adjlst_graph.to_dense()
return g

@classmethod
Expand All @@ -521,6 +532,5 @@ def from_mat(cls, adj_mat, node_ids, **kwargs):
"""
g = cls(**kwargs)
g.data = adj_mat
g.nonzero = adj_mat != 0
g.set_ids(node_ids)
return g
43 changes: 36 additions & 7 deletions src/pecanpy/pecanpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np
from gensim.models import Word2Vec
from numba import njit, prange
from numba.np.ufunc.parallel import _get_thread_id
from numba_progress import ProgressBar
from pecanpy.rw import DenseRWGraph, SparseRWGraph
from pecanpy.wrappers import Timer
Expand Down Expand Up @@ -41,32 +42,46 @@ class Base:

"""

def __init__(self, p, q, workers, verbose=False, extend=False, gamma=0):
def __init__(
self,
p=1,
q=1,
workers=1,
verbose=False,
extend=False,
gamma=0,
random_state=None,
):
"""Initialize node2vec base class.

Args:
p (float): return parameter, value less than 1 encourages returning
back to previous vertex, and discourage for value greater than 1.
back to previous vertex, and discourage for value greater than 1
(default: 1).
q (float): in-out parameter, value less than 1 encourages walks to
go "outward", and value greater than 1 encourage walking within
a localized neighborhood.
workers (int): number of threads to be spawned for running node2vec
including walk generation and word2vec embedding.
a localized neighborhood (default: 1)
workers (int): number of threads to be spawned for running node2vec
including walk generation and word2vec embedding (default: 1)
verbose (bool): show progress bar for walk generation.
extend (bool): use node2vec+ extension if set to :obj:`True`
(default: :obj:`False`).
gamma (float): Multiplication factor for the std term of edge
weights added to the average edge weights as the noisy edge
threshold, only used by node2vec+ (default: 0)
random_state (int, optional): Random seed for generating random
walks (default: :obj:`None`).

"""
super().__init__()
self.p = p
self.q = q
self.workers = workers
self.workers = workers # TODO: not doing anything, need to fix.
self.verbose = verbose
self.extend = extend
self.gamma = gamma
self.random_state = random_state
self._preprocessed = False

def _map_walk(self, walk_idx_ary):
"""Map walk from node index to node ID.
Expand Down Expand Up @@ -94,19 +109,28 @@ def simulate_walks(self, num_walks, walk_length):
walk_length (int): length of walk.

"""
self._preprocess_transition_probs()

num_nodes = len(self.IDlst)
nodes = np.array(range(num_nodes), dtype=np.uint32)
start_node_idx_ary = np.concatenate([nodes] * num_walks)
np.random.shuffle(start_node_idx_ary)
tot_num_jobs = start_node_idx_ary.size

random_state = self.random_state
np.random.seed(random_state)
np.random.shuffle(start_node_idx_ary) # for balanced work load

move_forward = self.get_move_forward()
has_nbrs = self.get_has_nbrs()
verbose = self.verbose

@njit(parallel=True, nogil=True)
def node2vec_walks(num_iter, progress_proxy):
"""Simulate a random walk starting from start node."""
# Seed the random number generator
if random_state is not None:
np.random.seed(random_state + _get_thread_id())

# use the last entry of each walk index array to keep track of the
# effective walk length
walk_idx_mat = np.zeros((num_iter, walk_length + 2), dtype=np.uint32)
Expand Down Expand Up @@ -168,6 +192,11 @@ def preprocess_transition_probs(self):
"""Null default preprocess method."""
pass

def _preprocess_transition_probs(self):
if not self._preprocessed:
self.preprocess_transition_probs()
self._preprocessed = True

def embed(
self,
dim=128,
Expand Down
53 changes: 20 additions & 33 deletions test/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,33 @@
import os
import os.path as op
import os.path as osp
import shutil
import subprocess
import tempfile
import unittest
from unittest.mock import patch

from numba import set_num_threads
from parameterized import parameterized
from pecanpy import cli

set_num_threads(1)

DATA_DIR = op.abspath(op.join(__file__, op.pardir, op.pardir, "demo"))
EDG_FP = op.join(DATA_DIR, "karate.edg")
DATA_DIR = osp.abspath(osp.join(__file__, osp.pardir, osp.pardir, "demo"))
EDG_FP = osp.join(DATA_DIR, "karate.edg")

TMP_DATA_DIR = tempfile.mkdtemp()
CSR_FP = op.join(TMP_DATA_DIR, "karate.csr.npz")
DENSE_FP = op.join(TMP_DATA_DIR, "karate.dense.npz")
CSR_FP = osp.join(TMP_DATA_DIR, "karate.csr.npz")
DENSE_FP = osp.join(TMP_DATA_DIR, "karate.dense.npz")
COM = ["pecanpy", "--input", EDG_FP, "--output"]

SETTINGS = [
("FirstOrderUnweighted",),
("PreCompFirstOrder",),
("PreComp",),
("SparseOTF",),
("DenseOTF",),
]


class TestCli(unittest.TestCase):
@classmethod
Expand Down Expand Up @@ -69,35 +78,13 @@ def test_precompfirstorder_catch(self):
with self.assertRaises(ValueError):
self.execute("PreCompFirstOrder", EDG_FP, p, q)

def test_firstorderunweighted_from_edg(self):
self.execute("FirstOrderUnweighted", EDG_FP)

def test_precompfirstorder_from_edg(self):
self.execute("PreCompFirstOrder", EDG_FP)

def test_precomp_from_edg(self):
self.execute("PreComp", EDG_FP)

def test_sparseotf_from_edg(self):
self.execute("SparseOTF", EDG_FP)

def test_denseotf_from_edg(self):
self.execute("DenseOTF", EDG_FP)

def test_firstorderunweighted_from_csr(self):
self.execute("FirstOrderUnweighted", CSR_FP)

def test_precompfirstorder_from_npz(self):
self.execute("PreCompFirstOrder", CSR_FP)

def test_precomp_from_npz(self):
self.execute("PreComp", CSR_FP)

def test_sparseotf_from_npz(self):
self.execute("SparseOTF", CSR_FP)
@parameterized.expand(SETTINGS)
def test_from_edg(self, name):
self.execute(name, EDG_FP)

def test_denseotf_from_npz(self):
self.execute("DenseOTF", DENSE_FP)
@parameterized.expand(SETTINGS)
def test_from_npz(self, name):
self.execute(name, DENSE_FP if name == "DenseOTF" else CSR_FP)


if __name__ == "__main__":
Expand Down
82 changes: 26 additions & 56 deletions test/test_pecanpy.py
Original file line number Diff line number Diff line change
@@ -1,74 +1,44 @@
import os.path as op
import os.path as osp
import unittest

from numba import set_num_threads
from parameterized import parameterized
from pecanpy import graph
from pecanpy import pecanpy

set_num_threads(1)

DATA_DIR = op.abspath(op.join(__file__, op.pardir, op.pardir, "demo"))
EDG_FP = op.join(DATA_DIR, "karate.edg")
DATA_DIR = osp.abspath(osp.join(__file__, osp.pardir, osp.pardir, "demo"))
EDG_FP = osp.join(DATA_DIR, "karate.edg")
SETTINGS = [
("SparseOTF", pecanpy.SparseOTF),
("DenseOTF", pecanpy.DenseOTF),
("PreComp", pecanpy.PreComp),
("PreCompFirstOrder", pecanpy.PreCompFirstOrder),
("FirstOrderUnweighted", pecanpy.FirstOrderUnweighted),
]


class TestPecanPyFromMat(unittest.TestCase):
def setUp(self):
class TestPecanPy(unittest.TestCase):
@classmethod
def setUpClass(self):
g = graph.DenseGraph()
g.read_edg(EDG_FP, weighted=False, directed=False)
self.mat = g.data
self.ids = g.IDlst
self.kwargs = {"p": 1, "q": 1, "workers": 1}

def test_sparseotf_from_mat(self):
g = pecanpy.SparseOTF.from_mat(self.mat, self.ids, **self.kwargs)
g.embed()

def test_denseotf_from_mat(self):
g = pecanpy.DenseOTF.from_mat(self.mat, self.ids, **self.kwargs)
g.embed()

def test_precomp_from_mat(self):
g = pecanpy.PreComp.from_mat(self.mat, self.ids, **self.kwargs)
g.preprocess_transition_probs()
g.embed()

def test_precompfirtorder_from_mat(self):
g = pecanpy.PreCompFirstOrder.from_mat(self.mat, self.ids, **self.kwargs)
g.preprocess_transition_probs()
g.embed()

def test_firtorderunweighted_from_mat(self):
g = pecanpy.FirstOrderUnweighted.from_mat(self.mat, self.ids, **self.kwargs)
g.embed()


class TestPecanPyFromEdg(unittest.TestCase):
def test_sparseotf_from_edg(self):
g = pecanpy.SparseOTF(1, 1, 1)
g.read_edg(EDG_FP, weighted=False, directed=False)
g.embed()

def test_denseotf_from_edg(self):
g = pecanpy.DenseOTF(1, 1, 1)
g.read_edg(EDG_FP, weighted=False, directed=False)
g.embed()

def test_precomp_from_edg(self):
g = pecanpy.PreComp(1, 1, 1)
g.read_edg(EDG_FP, weighted=False, directed=False)
g.preprocess_transition_probs()
g.embed()

def test_precompfirstorder_from_edg(self):
g = pecanpy.PreCompFirstOrder(1, 1, 1)
g.read_edg(EDG_FP, weighted=False, directed=False)
g.preprocess_transition_probs()
g.embed()

def test_firstorderunweighted_from_edg(self):
g = pecanpy.FirstOrderUnweighted(1, 1, 1)
g.read_edg(EDG_FP, weighted=False, directed=False)
g.embed()
@parameterized.expand(SETTINGS)
def test_from_mat(self, name, mode):
with self.subTest(name):
g = mode.from_mat(self.mat, self.ids, p=1, q=1)
g.embed()

@parameterized.expand(SETTINGS)
def test_from_edg(self, name, mode):
with self.subTest(name):
g = mode(p=1, q=1)
g.read_edg(EDG_FP, weighted=False, directed=False)
g.embed()


if __name__ == "__main__":
Expand Down
Loading