diff --git a/pypathway/hotnet/HotNet2.py b/pypathway/hotnet/HotNet2.py new file mode 100755 index 0000000..b0b2b2f --- /dev/null +++ b/pypathway/hotnet/HotNet2.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python + +# Load required modules +import sys, os, json +from itertools import product + +# Load HotNet2 modules +from hotnet2 import run as hnrun, hnap, hnio, heat as hnheat, consensus_with_stats, viz as hnviz +from hotnet2.constants import ITERATION_REPLACEMENT_TOKEN, HN2_INFMAT_NAME + +sys.path.append(os.path.normpath(os.path.dirname(os.path.realpath(__file__)) + '/scripts/')) +import createDendrogram as CD + +def get_parser(): + description = "Helper script for simple runs of generalized HotNet2, including automated"\ + "parameter selection." + parser = hnap.HotNetArgParser(description=description, fromfile_prefix_chars='@') + + parser.add_argument('-nf', '--network_files', required=True, nargs='*', + help='Path to HDF5 (.h5) file containing influence matrix and edge list.') + parser.add_argument('-pnp', '--permuted_network_paths', required=True, default='', + help='Path to influence matrices for permuted networks, one path '\ + 'per network file. Include ' + ITERATION_REPLACEMENT_TOKEN + ' '\ + 'in the path to be replaced with the iteration number', nargs='*') + parser.add_argument('-hf', '--heat_files', required=True, nargs='*', + help='Path to heat file containing gene names and scores. This can either'\ + 'be a JSON file created by generateHeat.py, in which case the file'\ + 'name must end in .json, or a tab-separated file containing a gene'\ + 'name in the first column and the heat score for that gene in the'\ + 'second column of each line.') + parser.add_argument('-ccs', '--min_cc_size', type=int, default=2, + help='Minimum size connected components that should be returned.') + parser.add_argument('-d', '--deltas', nargs='*', type=float, default=[], + help='Delta value(s).') + parser.add_argument('-np', '--network_permutations', type=int, default=100, + help='Number of permutations to be used for delta parameter selection.') + parser.add_argument('-cp', '--consensus_permutations', type=int, default=0, + help='Number of permutations to be used for consensus statistical significance testing.') + parser.add_argument('-hp', '--heat_permutations', type=int, default=100, + help='Number of permutations to be used for statistical significance testing.') + parser.add_argument('-o', '--output_directory', required=True, default=None, + help='Output directory. Files results.json, components.txt, and'\ + 'significance.txt will be generated in subdirectories for each delta.') + parser.add_argument('-c', '--num_cores', type=int, default=1, + help='Number of cores to use for running permutation tests in parallel. 
If'\ + '-1, all available cores will be used.') + parser.add_argument('-dsf', '--display_score_file', + help='Path to a tab-separated file containing a gene name in the first'\ + 'column and the display score for that gene in the second column of'\ + 'each line.') + parser.add_argument('-dnf', '--display_name_file', + help='Path to a tab-separated file containing a gene name in the first'\ + 'column and the display name for that gene in the second column of'\ + 'each line.') + parser.add_argument('--output_hierarchy', default=False, required=False, action='store_true', + help='Output the hierarchical decomposition of the HotNet2 similarity matrix.') + parser.add_argument('--verbose', default=1, choices=list(range(5)), type=int, required=False, + help='Set verbosity of output (minimum: 0, maximum: 5).') + + return parser + +def run(args): + # Load the network and heat files + assert( len(args.network_files) == len(args.permuted_network_paths) ) + networks, graph_map = [], dict() + for network_file, pnp in zip(args.network_files, args.permuted_network_paths): + infmat, indexToGene, G, network_name = hnio.load_network(network_file, HN2_INFMAT_NAME) + graph_map[network_name] = G + networks.append( (infmat, indexToGene, G, network_name, pnp) ) + + heats, json_heat_map, heat_map, mutation_map, heat_file_map = [], dict(), dict(), dict(), dict() + for heat_file in args.heat_files: + json_heat = os.path.splitext(heat_file.lower())[1] == '.json' + heat, heat_name, mutations = hnio.load_heat_file(heat_file, json_heat) + json_heat_map[heat_name] = json_heat + heat_map[heat_name] = heat + heat_file_map[heat_name] = heat_file + mutation_map[heat_name] = mutations + heats.append( (heat, heat_name) ) + + # Run HotNet2 on each pair of network and heat files + if args.verbose > 0: + print('* Running HotNet2 in consensus mode...') + + single_runs, consensus, linkers, auto_deltas, consensus_stats = consensus_with_stats(args, networks, heats) + + # Output the single runs + if args.verbose > 0: + print('* Outputting results to file...') + + params = vars(args) + result_dirs = [] + for (network_name, heat_name, run) in single_runs: + # Set up the output directory and record for later + output_dir = '%s/%s-%s' % (args.output_directory, network_name.lower(), heat_name.lower()) + result_dirs.append(output_dir) + hnio.setup_output_dir(output_dir) + + # Output to file + hnio.output_hotnet2_run(run, params, network_name, heat_map[heat_name], heat_name, heat_file_map[heat_name], json_heat_map[heat_name], output_dir) + + # create the hierarchy if necessary + if args.output_hierarchy: + hierarchy_out_dir = '{}/hierarchy/'.format(output_dir) + if not os.path.isdir(hierarchy_out_dir): os.mkdir(hierarchy_out_dir) + CD.createDendrogram( sim, list(index2gene.values()), hierarchy_out_dir, params, verbose=False) + + # Output the consensus + hnio.output_consensus(consensus, linkers, auto_deltas, consensus_stats, params, args.output_directory) + + # Create the visualization(s). This has to be after the consensus procedure + # is run because we want to default to the auto-selected deltas. 
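+    # Each per-run viz-data.json written below contains the payload returned by
+    # hnviz.generate_viz_json() plus a 'params' entry recording the run settings
+    # (consensus flag, network name, heat name, and the auto-selected delta).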
+ if args.verbose > 0: + print('* Generating and outputting visualization data...') + + d_score = hnio.load_display_score_tsv(args.display_score_file) if args.display_score_file else None + d_name = hnio.load_display_name_tsv(args.display_name_file) if args.display_name_file else dict() + for (network_name, heat_name, run), result_dir, auto_delta in zip(single_runs, result_dirs, auto_deltas): + snvs, cnas, sampleToType = mutation_map[heat_name] + G = graph_map[network_name] + + output = hnviz.generate_viz_json(run, G.edges(), network_name, heat_map[heat_name], snvs, cnas, sampleToType, d_score, d_name) + + with open('{}/viz-data.json'.format(result_dir), 'w') as OUT: + output['params'] = dict(consensus=False, network_name=network_name, heat_name=heat_name, auto_delta=format(auto_delta, 'g')) + json.dump( output, OUT ) + + # Add the consensus visualization + snvs, cnas, sampleToType = mutations + consensus_ccs = [ d['core'] + d['expansion'] for d in consensus ] + consensus_auto_delta = 0 + results = [[consensus_ccs, consensus_stats, consensus_auto_delta]] + with open('{}/consensus/viz-data.json'.format(args.output_directory), 'w') as OUT: + output = hnviz.generate_viz_json(results, G.edges(), network_name, heat, snvs, cnas, sampleToType, d_score, d_name) + output['params'] = dict(consensus=True, auto_delta=format(consensus_auto_delta, 'g')) + json.dump( output, OUT ) + +if __name__ == "__main__": + run(get_parser().parse_args(sys.argv[1:])) diff --git a/pypathway/hotnet/__init__.py b/pypathway/hotnet/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/pypathway/hotnet/hotnet2/__init__.py b/pypathway/hotnet/hotnet2/__init__.py new file mode 100755 index 0000000..79e4e80 --- /dev/null +++ b/pypathway/hotnet/hotnet2/__init__.py @@ -0,0 +1,14 @@ +from .constants import * +from . import delta +from . import heat +from . import hnap +from . import hnio +from . import hotnet2 +from . import permutations +from . import run +from . import stats +from . import union_find +from . import viz +from . import hierarchy +from .consensus import * + diff --git a/pypathway/hotnet/hotnet2/c_ext_src/basic.c b/pypathway/hotnet/hotnet2/c_ext_src/basic.c new file mode 100644 index 0000000..0540e65 --- /dev/null +++ b/pypathway/hotnet/hotnet2/c_ext_src/basic.c @@ -0,0 +1,122 @@ +// +// basic.c +// color_coding +// +// Created by Yang Xu on 2017/8/16. +// Copyright © 2017年 sheep. All rights reserved. 
+// + +#include "basic.h" +#include "data_structure.h" +#include +#include + +struct Graph* DiGraphFromMatrix(struct MatrixDes* md) { + struct Graph* G = (struct Graph*)malloc(sizeof(struct Graph)); + G->nodes = (struct Node*)malloc(sizeof(struct Node) * md->length); + char * mt = md->matrix; + for (int i = 0; i < md->length; i++) { + G->nodes[i].degree = 0; + G->nodes[i].id = i; + for (int j = 0; j < md->length; j++) { + if (*(mt + md->length * i + j)) { + if (G->nodes[i].degree == 0) { + G->nodes[i].neighbours = (int*)malloc(sizeof(int)); + G->nodes[i].neighbours[0] = j; + G->nodes[i].degree++; + }else{ + G->nodes[i].neighbours = (int*)realloc(G->nodes[i].neighbours, + (G->nodes[i].degree + 1) * sizeof(int)); + G->nodes[i].neighbours[G->nodes[i].degree] = j; + G->nodes[i].degree++; + } + } + } + } + G->node_count = md->length; + return G; +} + + +struct SubQueue* stronglyConnectedComponent(struct Graph* G) { + // store the result + struct SubQueue* result = (struct SubQueue*)malloc(sizeof(struct SubQueue)); + struct SubQueue* current = result; + result->next = NULL; + result->queue = NULL; + int * preorder = (int *)malloc(sizeof(int) * G->node_count); + memset(preorder, 0, sizeof(int) * G->node_count); + int * lowlink = (int *)malloc(sizeof(int) * G->node_count); + memset(lowlink, 0, sizeof(int) * G->node_count); + int * scc_found = (int *)malloc(sizeof(int) * G->node_count); + memset(scc_found, 0, sizeof(int) * G->node_count); + struct QueueNode* scc_queue = NULL; + int i = 0; + for (int j = 0; j < G->node_count; j++) { + if (G->nodes[j].degree == 0) { + // this is an empty node + continue; + } + if (scc_found[G->nodes[j].id]) { + // already done + continue; + } + struct QueueNode* queue = initQueue(j); + // check if queue is empty while it should be empty + while (queue) { + int v = queuePeakRight(queue); + if (preorder[v] == 0) { + i += 1; + preorder[v] = i; + } + int done = 1; + // go through beighbours + for (int k = 0; k < G->nodes[v].degree; k++) { + int nei_id = G->nodes[v].neighbours[k]; + if (preorder[nei_id] == 0) { + queueAppend(&queue, nei_id); + done = 0; + break; + } + } + if (done == 1) { + lowlink[v] = preorder[v]; + for (int k = 0; k < G->nodes[v].degree; k++) { + int nei_id = G->nodes[v].neighbours[k]; + if (scc_found[nei_id] == 0) { + if (preorder[nei_id] > preorder[v]) { + lowlink[v] = lowlink[v] < lowlink[nei_id] ? lowlink[v] : lowlink[nei_id]; + }else{ + lowlink[v] = lowlink[v] < preorder[nei_id] ? lowlink[v] : preorder[nei_id]; + } + } + } + queuePopRight(&queue); + if (lowlink[v] == preorder[v]) { + scc_found[v] = 1; + struct QueueNode* resultQueue = initQueue(v); + // printf("%i, ", v); + while (scc_queue && preorder[queuePeakRight(scc_queue)] > preorder[v]) { + int k = queuePopRight(&scc_queue); + scc_found[k] = 1; + // printf("%i, ", k); + queueAppend(&resultQueue, k); + } + // printf("\n"); + if (result->queue == NULL) { + result->queue = resultQueue; + }else{ + struct SubQueue* next = (struct SubQueue*)malloc(sizeof(struct SubQueue)); + next->queue = resultQueue; + next->next = NULL; + current->next = next; + current=next; + } + }else{ + queueAppend(&scc_queue, v); + } + } + } + } + return result; +} diff --git a/pypathway/hotnet/hotnet2/c_ext_src/basic.h b/pypathway/hotnet/hotnet2/c_ext_src/basic.h new file mode 100644 index 0000000..fb9930e --- /dev/null +++ b/pypathway/hotnet/hotnet2/c_ext_src/basic.h @@ -0,0 +1,19 @@ +// +// basic.h +// color_coding +// +// Created by Yang Xu on 2017/8/16. +// Copyright © 2017年 sheep. All rights reserved. 
+// + +#ifndef basic_h +#define basic_h + +#include +#include "data_structure.h" + +struct Graph* DiGraphFromMatrix(struct MatrixDes* md); + +struct SubQueue* stronglyConnectedComponent(struct Graph* G); + +#endif /* basic_h */ diff --git a/pypathway/hotnet/hotnet2/c_ext_src/data_structure.c b/pypathway/hotnet/hotnet2/c_ext_src/data_structure.c new file mode 100755 index 0000000..d5fa872 --- /dev/null +++ b/pypathway/hotnet/hotnet2/c_ext_src/data_structure.c @@ -0,0 +1,85 @@ +// +// data_structure.c +// color_coding +// +// Created by sheep on 2017/8/2. +// Copyright © 2017年 sheep. All rights reserved. +// + +#include "data_structure.h" +#include + +void queueAppend(struct QueueNode** queue, int val){ + // if the queue is enpty + if (*queue == NULL) { + *queue = initQueue(val); + return; + } + struct QueueNode* node = (struct QueueNode*)malloc(sizeof(struct QueueNode)); + node->value = val; + node->next = NULL; + node->end = node; + struct QueueNode* perious_end = (*queue)->end; + node->perious = perious_end; + (*queue)->end->next = node; + (*queue)->end = node; +} + +int queuePopLeft(struct QueueNode** queue){ + if ((*queue)->next == NULL) { + // only has one node + int val = (*queue)->value; + *queue = NULL; + return val; + } + struct QueueNode* left = *queue; + struct QueueNode* new_left = left->next; + new_left->end = left->end; + int left_val = left->value; + *queue = new_left; + return left_val; +} + +int queuePopRight(struct QueueNode** queue){ + struct QueueNode* right = NULL; + if (*queue == NULL) { + return -1; + }else if ((*queue)->next == NULL) { + int val = (*queue)->value; + right = *queue; + free(right); + *queue = NULL; + return val; + }else{ + struct QueueNode* cur_end = (*queue)->end; + struct QueueNode* new_end = cur_end->perious; + (*queue)->end = new_end; + new_end->next = NULL; + int val = cur_end->value; + free(cur_end); + return val; + } +} + +int queuePeakLeft(struct QueueNode* queue){ + return queue->value; +} + +int queuePeakRight(struct QueueNode* queue){ + return queue->end->value; +} + +struct QueueNode* initQueue(int val){ + struct QueueNode* node = (struct QueueNode*)malloc(sizeof(struct QueueNode)); + node->next = NULL; + node->value = val; + node->end = node; + node->perious = NULL; + return node; +} + +// The structure representation of the graph node + + + + diff --git a/pypathway/hotnet/hotnet2/c_ext_src/data_structure.h b/pypathway/hotnet/hotnet2/c_ext_src/data_structure.h new file mode 100755 index 0000000..2431d9a --- /dev/null +++ b/pypathway/hotnet/hotnet2/c_ext_src/data_structure.h @@ -0,0 +1,82 @@ +// +// data_structure.h +// color_coding +// +// Created by sheep on 2017/8/2. +// Copyright © 2017年 sheep. All rights reserved. 
+// + +#ifndef data_structure_h +#define data_structure_h + +#include + +struct SubQueue { + struct QueueNode* queue; + struct SubQueue* next; +}; + +struct MatrixDes { + char * matrix; + int length; +}; + +struct Node { + int id; + int degree; + int* neighbours; +}; + +struct Graph { + int node_count; + struct Node * nodes; +}; + +struct GraphSet { + int graph_count; + struct Graph * graphs; +}; + +struct CompactGraph { + int* node_index; + int* edge_list; + int node_count; + int edge_count; +}; + +struct State { + int id; + struct Node* node; + struct State* perious; + int color; + int score; + int depth; +}; + +struct TodoNode { + int id; + struct Node* node; + struct TodoNode* perious; + struct State* state; +}; + +struct QueueNode { + int value; + struct QueueNode* next; + struct QueueNode* end; + struct QueueNode* perious; +}; + +struct QueueNode* initQueue(int val); + +int queuePeakLeft(struct QueueNode* queue); + +int queuePeakRight(struct QueueNode* queue); + +int queuePopRight(struct QueueNode** queue); + +int queuePopLeft(struct QueueNode** queue); + +void queueAppend(struct QueueNode** queue, int val); + +#endif /* data_structure_h */ diff --git a/pypathway/hotnet/hotnet2/c_ext_src/fast_scc.c b/pypathway/hotnet/hotnet2/c_ext_src/fast_scc.c new file mode 100755 index 0000000..305400a --- /dev/null +++ b/pypathway/hotnet/hotnet2/c_ext_src/fast_scc.c @@ -0,0 +1,132 @@ +#include +#include +#include +#include "data_structure.h" +#include "graphic.h" +#include "test_data.h" +#include "basic.h" +#include + + +/* Docstrings */ +static char module_docstring[] = + "This module provides an interface for calculating chi-squared using C."; +static char fast_scc_docstring[] = + "Calculate the chi-squared of some data given a model."; + +/* Available functions */ +static PyObject *fast_scc_scc(PyObject *self, PyObject *args); + +/* Module specification */ +static PyMethodDef module_methods[] = { + {"scc", fast_scc_scc, METH_VARARGS, fast_scc_docstring}, + {NULL, NULL, 0, NULL} +}; + +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "fast_scc", + module_docstring, + -1, + module_methods, + NULL, + NULL, + NULL, + NULL, +}; + +/* Initialize the module */ +PyObject *PyInit_fast_scc(void) +{ + PyObject *module = PyModule_Create(&moduledef); + if (module == NULL) + return NULL; + + /* Load `numpy` functionality. */ + import_array(); + + return module; +} + +static PyObject *fast_scc_scc(PyObject *self, PyObject *args) +{ + clock_t cstart = clock(); + clock_t cend = 0; + + + double m; + PyObject *x_obj; + + /* Parse the input tuple */ + if (!PyArg_ParseTuple(args, "dO", &m, &x_obj)) + return NULL; + + /* Interpret the input objects as numpy arrays. */ + PyObject *x_array = PyArray_FROM_OTF(x_obj, NPY_BOOL, NPY_IN_ARRAY); + + /* If that didn't work, throw an exception. */ + if (x_array == NULL) { + Py_XDECREF(x_array); + return NULL; + } + + /* How many data points are there? */ + int N = (int)PyArray_DIM(x_array, 0); + // int M = (int)PyArray_DIM(x_array, 1); + + // if (N != M) { + // printf("ToDo: raise exception"); + // } + + /* Get pointers to the data as C-types. */ + char *x = (char*)PyArray_DATA(x_array); + + /* Call the external C function to compute the chi-squared. 
*/ + // struct MatrixDes* md = generateTestMatrix(x, N); + struct MatrixDes* md = (struct MatrixDes*)malloc(sizeof(struct MatrixDes)); + md->length = N; + md->matrix = x; + cend = clock(); + printf ("%.6f cpu sec\n", ((double)cend - (double)cstart)* 1.0e-6); + cstart = clock(); + cend = 0; + struct Graph* G = DiGraphFromMatrix(md); + cend = clock(); + printf ("%.6f cpu sec\n", ((double)cend - (double)cstart)* 1.0e-6); + cstart = clock(); + // freeMatrixDes(md); + struct SubQueue* sq = stronglyConnectedComponent(G); + cend = clock(); + printf ("%.6f cpu sec\n", ((double)cend - (double)cstart)* 1.0e-6); + int * result = (int *)malloc(sizeof(int) * N * 3); + memset(result, 0, sizeof(int) * N * 3); + int curpos = 0; + while (sq) { + int start_pos = curpos + 1; + while (sq->queue) { + // printf("%i, ", sq->queue->value); + result[curpos] += 1; + result[start_pos] = sq->queue->value; + start_pos += 1; + sq->queue = sq->queue->next; + } + // printf("\n"); + sq = sq->next; + curpos = start_pos; + // printf("curpos: %i\n", curpos); + } + + /* Clean up. */ + Py_DECREF(x_array); + + npy_intp Dims[1]; + Dims[0] = curpos; + + // int res[3] = {1, 2, 3}; + // int* r = (int *)malloc(sizeof(int) * 3); + // memcpy(r, res, sizeof(int) * 3); + + PyObject* mat = PyArray_SimpleNewFromData(1, Dims, NPY_INT, result); + + return mat; +} \ No newline at end of file diff --git a/pypathway/hotnet/hotnet2/c_ext_src/graphic.c b/pypathway/hotnet/hotnet2/c_ext_src/graphic.c new file mode 100755 index 0000000..798c443 --- /dev/null +++ b/pypathway/hotnet/hotnet2/c_ext_src/graphic.c @@ -0,0 +1,158 @@ +// +// graphic.c +// color_coding +// +// Created by sheep on 2017/8/2. +// Copyright © 2017年 sheep. All rights reserved. +// + +#include "graphic.h" +#include "data_structure.h" +#include "string.h" +#include "stdlib.h" +#include "time.h" + +struct TodoNode** todoList; +struct TodoNode** nextTodo; +struct State** states; + +// the score/color-coding related function +void init_graph_variable(int parallel_length, struct Graph* G, int* scores){ + // init the space for the two variable, todoList and the states + todoList = (struct TodoNode**)malloc(sizeof(struct TodoNode*) * parallel_length); + states = (struct State**)malloc(sizeof(struct State*) * parallel_length); + nextTodo = (struct TodoNode**)malloc(sizeof(struct TodoNode*) * parallel_length); + //memset(nextTodo, 0, sizeof(struct TodoNode*) * parallel_length); + for (int i = 0; i < parallel_length; i++) { + nextTodo[i] = NULL; + } + // first do not consider the situation + for (int i = 0; i < G->node_count; i ++) { + struct State* node_state = (struct State*)malloc(sizeof(struct State)); + struct TodoNode* todo_node = (struct TodoNode*)malloc(sizeof(struct State)); + node_state->node = G->nodes + i; + node_state->id = G->nodes[i].id; + node_state->perious = NULL; + // fix me if necessary + node_state->color = 0; + node_state->score = scores[i]; + node_state->depth = 0; + // init todo node + todo_node->node = G->nodes + i; + todo_node->perious = NULL; + todo_node->state = node_state; + todo_node->id = G->nodes[i].id; + todoList[i] = todo_node; + states[i] = node_state; + } +} + +void reinit_graph_variable(int parallel_length) { + todoList = nextTodo; + nextTodo = (struct TodoNode**)malloc(sizeof(struct TodoNode*) * parallel_length); +} + +int simulate_GPU(int parallel_length, struct Graph* G, int terminate_depth, int* max_score, int* scores){ + int remain = 0; + // simulate the GPU running state, undependent threads + for (int i = 0; i < parallel_length; i ++) { + struct 
TodoNode* todo = todoList[i]; + while (todo != NULL) { + if (todo->state->depth == terminate_depth) { + // the path has ended + if (todo->state->score > *max_score) { + printf("The high score now is %i\n", todo->state->score); + *max_score = todo->state->score; + } + todo = todo->perious; + continue; + } else { + printf("current score is %i, depth is %i\n", todo->state->score, todo->state->depth); + } + for (int nei = 0; nei < todo->node->degree; nei ++) { + // neighbour node + struct Node* neighbour_node = G->nodes + todo->node->neighbours[nei]; + // init a state + struct State* node_state = (struct State*)malloc(sizeof(struct State)); + node_state->perious = todo->state; + node_state->depth = todo->state->depth + 1; + node_state->id = neighbour_node->id; + node_state->color = 0; + node_state->score = todo->state->score + scores[neighbour_node->id]; + node_state->node = neighbour_node; + // init a todoNode + struct TodoNode* todo_node = (struct TodoNode*)malloc(sizeof(struct TodoNode)); + todo_node->id = neighbour_node->id; + todo_node->node = neighbour_node; + todo_node->state = node_state; + todo_node->perious = NULL; + if (nextTodo[i] == NULL) { + nextTodo[i] = todo_node; + }else{ + struct TodoNode* last = nextTodo[i]; + while (last->perious) { + last = last->perious; + } + last->perious = todo_node; + } + remain ++; + } + todo = todo->perious; + } + } + return remain; +} + +void simulate_loop(struct Graph* G, int parallel_length, int* scores){ + int terminate_length = 2; + int* max_score = (int *)malloc(sizeof(int)); + *max_score = 0; + init_graph_variable(parallel_length, G, scores); + while (1) { + int remain = simulate_GPU(parallel_length, G, terminate_length, max_score, scores); + if (remain == 0) { + printf("max score is %i\n", *max_score); + break; + } + reinit_graph_variable(parallel_length); + } +} + +// Basic graphic function +void display_graphic(struct Graph* G){ + printf("This graphic contains %i nodes\n", G->node_count); + for (int i = 0; i < G->node_count; i++) { + printf("Node %i: \n", G->nodes[i].id); + for (int j = 0; j < G->nodes[i].degree; j ++) { + printf("\t\t%i -- %i\n", G->nodes[i].id, G->nodes[i].neighbours[j]); + } + } +} + +struct Graph* init_10_node_test_graph(){ + struct Graph* graph = (struct Graph*)malloc(sizeof(struct Graph)); + graph->nodes = (struct Node*)malloc(sizeof(struct Node) * 10); + graph->node_count = 10; + for (int i = 0; i < 10; i++) { + struct Node* node = (struct Node*)malloc(sizeof(struct Node)); + node->id = i; + node->degree = 2; + node->neighbours = (int *)malloc(sizeof(int)); + if (i == 0) { + node->neighbours[0] = 1; + node->neighbours[1] = 9; + }else if (i == 9){ + node->neighbours[0] = 8; + node->neighbours[1] = 0; + }else{ + node->neighbours[0] = i - 1; + node->neighbours[1] = i + 1; + } + graph->nodes[i] = *node; + } + return graph; +} + + + + diff --git a/pypathway/hotnet/hotnet2/c_ext_src/graphic.h b/pypathway/hotnet/hotnet2/c_ext_src/graphic.h new file mode 100755 index 0000000..4394d2b --- /dev/null +++ b/pypathway/hotnet/hotnet2/c_ext_src/graphic.h @@ -0,0 +1,23 @@ +// +// graphic.h +// color_coding +// +// Created by sheep on 2017/8/2. +// Copyright © 2017年 sheep. All rights reserved. 
+// + +#ifndef graphic_h +#define graphic_h + +#include +#include "data_structure.h" + +struct Graph* init_10_node_test_graph(); + +void display_graphic(struct Graph* G); + +void init_graph_variable(int parallel_length, struct Graph* G, int* scores); + +void simulate_loop(struct Graph* G, int parallel_length, int* scores); + +#endif /* graphic_h */ diff --git a/pypathway/hotnet/hotnet2/c_ext_src/main.c b/pypathway/hotnet/hotnet2/c_ext_src/main.c new file mode 100755 index 0000000..d3e20ef --- /dev/null +++ b/pypathway/hotnet/hotnet2/c_ext_src/main.c @@ -0,0 +1,51 @@ +// +// main.c +// color_coding +// +// Created by sheep on 2017/8/2. +// Copyright © 2017年 sheep. All rights reserved. +// + +#include +#include "data_structure.h" +#include "graphic.h" +#include "test_data.h" +#include "basic.h" + + +void testQueue(){ + struct QueueNode* queue = initQueue(1); + queuePopLeft(&queue); + queueAppend(&queue, 2); + queuePopRight(&queue); + queueAppend(&queue, 3); + queueAppend(&queue, 4); + queueAppend(&queue, 5); + printf("%i\n", queuePopRight(&queue)); + printf("%i\n", queuePopLeft(&queue)); + printf("%i\n", queuePopRight(&queue)); + printf("%i\n", queuePopLeft(&queue)); +} + +int main(int argc, const char * argv[]) { +// struct Graph* test_graph = init_10_node_test_graph(); +// int score[10] = {2, 1, 4, 3, 2, 5, 6, 4, 2, 1}; +// simulate_loop(test_graph, test_graph->node_count, &score); + int mt[7][7] = {{0, 1, 0, 0, 0, 0, 0}, + {0, 0, 1, 0, 0, 0, 0}, {0, 0, 0, 1, 0, 0, 0}, + {1, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 1, 0}, + {0, 0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 1, 0, 0}}; + struct MatrixDes* md = generateTestMatrix(mt, 7); + struct Graph* G = DiGraphFromMatrix(md); + struct SubQueue* sq = stronglyConnectedComponent(G); + while (sq) { + while (sq->queue) { + printf("%i, ", sq->queue->value); + sq->queue = sq->queue->next; + } + printf("\n"); + sq = sq->next; + } +// testQueue(); +} + diff --git a/pypathway/hotnet/hotnet2/c_ext_src/test_data.c b/pypathway/hotnet/hotnet2/c_ext_src/test_data.c new file mode 100755 index 0000000..6fa2e4b --- /dev/null +++ b/pypathway/hotnet/hotnet2/c_ext_src/test_data.c @@ -0,0 +1,97 @@ +// +// test_data.c +// color_coding +// +// Created by sheep on 2017/8/3. +// Copyright © 2017年 sheep. All rights reserved. 
+// + +#include "test_data.h" +#include "data_structure.h" +#include +#include + +struct Graph* load_string_network(){ + return NULL; +} + +struct Graph* load_biogrid_network(){ + return NULL; +} + +struct Graph* load_hint_network(){ + return load_generate_tab_network("/Users/yangxu/submodule/raw_network/hint.txt"); +} + + +struct Graph* load_generate_tab_network(char* path){ + FILE* fp = fopen(path, "r"); + int node1, node2; + float weight; + int* degree = (int*)malloc(sizeof(int) * 1000000); + memset(degree, 0, sizeof(int) * 1000000); + while (fscanf(fp, "%i\t%i\t%f\n", &node1, &node2, &weight) > 0) { + // printf("%i -- %i : weight %f\n", node1, node2, weight); + degree[node1] ++; + degree[node2] ++; + } + int count = 0; + for (int i = 0; i < 1000000; i ++) { + if (degree[i] == 0) { + count = i; + break; + } + } + struct Graph* graph = (struct Graph*)malloc(sizeof(struct Graph)); + graph->node_count = count; + graph->nodes = (struct Node*)malloc(sizeof(struct Node) * graph->node_count); + for (int i = 0; i < graph->node_count; i ++) { + graph->nodes[i].degree = 0; + graph->nodes[i].id = i; + graph->nodes[i].neighbours = (int*)malloc(sizeof(int) * degree[i]); + } + free(degree); + fseek(fp, 0, SEEK_SET); + while (fscanf(fp, "%i\t%i\t%f\n", &node1, &node2, &weight) > 0) { + graph->nodes[node1].neighbours[graph->nodes[node1].degree] = node2; + graph->nodes[node2].neighbours[graph->nodes[node2].degree] = node1; + graph->nodes[node1].degree ++; + graph->nodes[node2].degree ++; + } + return graph; +} + + +struct MatrixDes* simpleTestMatrix(){ + int mt[3][3] = {{0, 1, 0}, {1, 0, 1}, {0, 1, 0}}; + int* matrix = (int *)malloc(sizeof(int) * 9); + struct MatrixDes* md = malloc(sizeof(struct MatrixDes)); + memcpy(matrix, mt, sizeof(int) * 9); + md->length = 3; + md->matrix = matrix; + return md; +} + +void freeMatrixDes(struct MatrixDes* md){ + free(md->matrix); + free(md); +} + + +struct MatrixDes* generateTestMatrix(int ** mat, int length){ + int * matrix = (int *)malloc(sizeof(int) * length * length); + struct MatrixDes* md = malloc(sizeof(struct MatrixDes)); + memcpy(matrix, mat, length * length * sizeof(int)); + md->length = length; + md->matrix = matrix; + return md; +} + +void printMatrix(struct MatrixDes* md) { + for (int i = 0; i < md->length; i++) { + for (int j = 0; j < md->length; j ++) { + printf("%i, ", *(md->matrix + md->length * i + j)); + } + printf("\b \b\b \b\n"); + } +} diff --git a/pypathway/hotnet/hotnet2/c_ext_src/test_data.h b/pypathway/hotnet/hotnet2/c_ext_src/test_data.h new file mode 100755 index 0000000..2881077 --- /dev/null +++ b/pypathway/hotnet/hotnet2/c_ext_src/test_data.h @@ -0,0 +1,26 @@ +// +// test_data.h +// color_coding +// +// Created by sheep on 2017/8/3. +// Copyright © 2017年 sheep. All rights reserved. 
+// + +#ifndef test_data_h +#define test_data_h + +#include + +struct Graph* load_generate_tab_network(); + +struct Graph* load_hint_network(); + +struct MatrixDes* simpleTestMatrix(); + +void printMatrix(struct MatrixDes* md); + +struct MatrixDes* generateTestMatrix(int ** mat, int length); + +void freeMatrixDes(struct MatrixDes* md); + +#endif /* test_data_h */ diff --git a/pypathway/hotnet/hotnet2/consensus.py b/pypathway/hotnet/hotnet2/consensus.py new file mode 100755 index 0000000..6f05379 --- /dev/null +++ b/pypathway/hotnet/hotnet2/consensus.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python + +import sys, os, networkx as nx, numpy +from itertools import product, combinations +from collections import defaultdict +from .run import run_helper, get_deltas_hotnet2 +from .constants import * +from .heat import filter_heat, filter_heat_to_network_genes +from .permutations import permute_heat + +def count_consensus(consensus, sizes=HN2_STATS_SIZES): + cc_sizes = [ len(d['core'] + d['expansion']) for d in consensus ] + return dict( (s, sum(1 for cc_size in cc_sizes if cc_size >= s)) for s in sizes ) + +def consensus_with_stats(args, networks, heats, verbose=0): + # Run with the input heat + single_runs, consensus, linkers, auto_deltas = consensus_run( args, networks, heats, verbose ) + + # Generate permuted heats + np = args.consensus_permutations + permuted_single_runs = defaultdict(list) + for (infmat, indexToGene, G, nname, pnp), (heat, hname) in product(networks, heats): + # 1) Filter the heat scores + # 1a) Remove enes not in the network + heat = filter_heat_to_network_genes(heat, set(indexToGene.values()), verbose) + + # 1b) Genes with score 0 cannot be in output components, but are eligible for heat in permutations + heat, addtl_genes = filter_heat(heat, None, False, 'There are ## genes with heat score 0') + + for permutation in permute_heat(heat, list(indexToGene.values()), np, addtl_genes, args.num_cores): + result = run_helper(args, infmat, indexToGene, G, nname, pnp, heat, hname, addtl_genes, get_deltas_hotnet2, HN2_INFMAT_NAME, HN2_MAX_CC_SIZES, verbose=verbose) + permuted_single_runs[(hname, nname)].append(result) + + # Run consensus to compute observed statistics + network_heat_pairs = list(permuted_single_runs.keys()) + permuted_counts = [] + for i in range(args.heat_permutations): + runs = [ (n, h, permuted_single_runs[(n, h)][i]) for n, h in network_heat_pairs ] + permuted_consensus, _, _ = identify_consensus( runs, verbose=verbose ) + permuted_counts.append(count_consensus(permuted_consensus)) + + # Summarize stats + consensus_stats = dict() + for k, count in count_consensus(consensus).items(): + empirical = [ permuted_count[k] for permuted_count in permuted_counts ] + if np == 0: + pval = 1. + expected = 0. + else: + expected = numpy.mean(empirical) + pval = sum(1. 
for p in empirical if p >= count )/np + consensus_stats[k] = dict(observed=count, expected=expected, pval=pval) + + return single_runs, consensus, linkers, auto_deltas, consensus_stats + +def consensus_run(args, networks, heats, verbose): + # Perform the single runs + single_runs = [] + for (infmat, indexToGene, G, nname, pnp), (heat, hname) in product(networks, heats): + # Simple progress bar + if args.verbose > 0: print('\t-', nname, hname) + + # 1) Filter the heat scores + # 1a) Remove enes not in the network + heat = filter_heat_to_network_genes(heat, set(indexToGene.values()), verbose) + + # 1b) Genes with score 0 cannot be in output components, but are eligible for heat in permutations + heat, addtl_genes = filter_heat(heat, None, False, 'There are ## genes with heat score 0') + + if args.verbose > 1: + print("\t\t- Loaded '%s' heat scores for %s genes" % (hname, len(heat))) + + result = run_helper(args, infmat, indexToGene, G, nname, pnp, heat, hname, addtl_genes, get_deltas_hotnet2, HN2_INFMAT_NAME, HN2_MAX_CC_SIZES, args.verbose) + single_runs.append( (nname, hname, result) ) + + # Perform the consensus + consensus, linkers, auto_deltas = identify_consensus( single_runs, verbose=verbose ) + + return single_runs, consensus, linkers, auto_deltas + +def identify_consensus(single_runs, pval_threshold=0.01, min_cc_size=2, verbose=0): + if verbose > 0: print('* Constructing HotNet(2) consensus network...') + + # Choose single runs and count the number of networks + components, networks, auto_deltas = choose_deltas(single_runs, pval_threshold, min_cc_size, verbose) + num_networks = len(set(networks)) + + # Create the consensus graph + edges = consensus_edges(components, networks) + G = nx.Graph() + G.add_weighted_edges_from( (u, v, w) for (u, v), w in edges.items() ) + + # Extract the connected components when restricted to edges in all networks. + H = nx.Graph() + H.add_edges_from( (u, v) for (u, v), w in edges.items() if w >= num_networks ) + consensus = [ set(cc) for cc in nx.connected_components( H ) ] + consensus_genes = set( g for cc in consensus for g in cc ) + + # Expand each consensus by adding back any edges not in all networks. + expanded_consensus = [] + linkers = set() + for cc in consensus: + other_consensus_genes = consensus_genes - cc + neighbors = set( v for u in cc for v in G.neighbors(u) if v not in consensus_genes ) + expansion = set() + for u in neighbors: + cc_networks = max( G[u][v]['weight'] for v in set(G.neighbors(u)) & cc ) + consensus_neighbors = set( v for v in G.neighbors(u) if v in other_consensus_genes and G[u][v]['weight'] >= cc_networks ) + if len(consensus_neighbors) > 0: + if any([ G[u][v]['weight'] > 1 for v in consensus_neighbors ]): + linkers.add( u ) + else: + expansion.add( u ) + expanded_consensus.append( dict(core=list(cc), expansion=list(expansion)) ) + + consensus_genes = set( g for cc in expanded_consensus for g in cc['core'] + cc['expansion'] ) + linkers -= consensus_genes + + return expanded_consensus, linkers, auto_deltas + +# Choose the results files when given directories of results files. +def choose_deltas(single_runs, pval_threshold, min_cc_size, verbose=0): + components, networks, auto_deltas = [], [], [] + # Consider each run. + for network, heat, run in single_runs: + result_statistics = [] + for ccs, stats, delta in run: + # Consider each \delta value for each run. 
+ # For each \delt value, load the results for each \delta value and record the + # number of component sizes k with p-values below a given threshold when k \geq + # min_cc_size. + count = sum( 1 for k in stats if int(k) >= min_cc_size and stats[k]['pval'] > pval_threshold ) + result_statistics.append((count, delta, ccs)) + + # Find the smallest \delta value with the largest number of component sizes k with p-values + # below a threshold. For convenience, we find the smallest number of \delta values greater + # greater than or equal to the threshold, and we sort from smallest to largest. + selected_result_statistics = sorted(result_statistics)[0] + if verbose > 3: + print('\t- Selected delta = {} for {}...'.format(selected_result_statistics[1], directory)) + + components.append(selected_result_statistics[2]) + auto_deltas.append(selected_result_statistics[1]) + networks.append(network) + + return components, networks, auto_deltas + +# Construct consensus graph. +def consensus_edges(components, networks): + edges_to_networks = defaultdict(set) + for ccs, network in zip(components, networks): + for cc in ccs: + for u, v in combinations(cc, 2): + edges_to_networks[(u, v)].add(network) + return dict( (edge, len(edge_networks)) for edge, edge_networks in edges_to_networks.items() ) diff --git a/pypathway/hotnet/hotnet2/constants.py b/pypathway/hotnet/hotnet2/constants.py new file mode 100755 index 0000000..8e4b1e9 --- /dev/null +++ b/pypathway/hotnet/hotnet2/constants.py @@ -0,0 +1,37 @@ +from collections import namedtuple + +# Names +HOTNET2 = "hotnet2" +HOTNET = "hotnet" + +# mutation types +SNV = "snv" +AMP = "amp" +DEL = "del" +INACTIVE_SNV = "inactive_snv" +FUSION = "fus" + +# test statistics +MAX_CC_SIZE = 'max_cc_size' +NUM_CCS = 'num_ccs' + +# output file names +HEAT_JSON = "heat.json" +JSON_OUTPUT = "results.json" +COMPONENTS_TSV = "components.txt" +SIGNIFICANCE_TSV = "significance.txt" + +VIZ_INDEX = 'index.html' +VIZ_SUBNETWORKS = 'subnetworks.html' +HIERARCHY_WEB_FILE = 'hierarchy.html' + +Mutation = namedtuple("Mutation", ["sample", "gene", "mut_type", "valid"]) +Mutation.__new__.__defaults__ = (True,) # default valid field to True +Fusion = namedtuple("Fusion", ["sample", "genes"]) + +ITERATION_REPLACEMENT_TOKEN = '##NUM##' + +# HotNet2 params +HN2_MAX_CC_SIZES = [5, 10, 15, 20] +HN2_INFMAT_NAME = "PPR" +HN2_STATS_SIZES = list(range(2, 11)) diff --git a/pypathway/hotnet/hotnet2/delta.py b/pypathway/hotnet/hotnet2/delta.py new file mode 100755 index 0000000..0d0c232 --- /dev/null +++ b/pypathway/hotnet/hotnet2/delta.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python + +# Load required modules +import multiprocessing as mp, numpy as np, scipy as sp, networkx as nx +from collections import namedtuple, defaultdict + +# Load local modules +from . 
import hotnet2 as hn, hnio +from .union_find import UnionFind +from .constants import * +import traceback + +import sys +import os +sys.path.append(os.path.dirname(os.path.realpath(__file__))) + +try: + import fast_scc + use_fast_scc = True + print("fast_scc") +except: + use_fast_scc = False + print(traceback.format_exc()) + print("slow_scc") + +strong_ccs = nx.strongly_connected_components + +def max_connected_component(mat, delta): + r = fast_scc.scc(2.0, mat > delta) + return parse_result(r) + +def parse_result(res): + cur_pos = 0 + lens = [] + while cur_pos < len(res): + lens.append(res[cur_pos]) + cur_pos += res[cur_pos] + 1 + return lens + +def get_component_sizes(arrs): + return [len(arr) for arr in arrs] + +def delta_too_small(component_sizes, max_size ): + # print "\t\t\t", max(component_sizes) + if max_size > max(component_sizes): return True + else: return False + +def find_best_delta_by_largest_cc(permuted_sim, permuted_index, sizes, directed, start_quant=0.99, verbose=0): + """Return a dict mapping each size in sizes to the smallest delta such that the size of the + largest CC in the graph corresponding the the given similarity matrix is <= that size. + + Arguments: + permuted_sim -- 2D ndarray representing a similarity matrix + permuted_index -- dict mapping an index in the matrix to the name of the gene represented at that + index in the influence matrix + sizes -- list of sizes for the largest connected component + directed -- whether or not the graph constructed from the similarity matrix should be directed + start_quant -- percentile of edge weights that should be used as the starting delta for the + binary search procedure + + """ + + if verbose > 4: + print("Finding smallest delta such that size of largest CC is <= l") + component_fn = strong_ccs if directed else nx.connected_components + # Construct weighted digraphs for each network for each delta + sorted_edges = np.unique(permuted_sim) # unique edge weights in sorted ascending + size2delta = dict() + for max_size in sizes: + # print "\t\tk=", max_size + delta = -1 + index = round(len(sorted_edges)* start_quant) + left, right = 0., float(len(sorted_edges)) #TODO: why are these floats rather than ints? + visited = [] + + while len(visited) < 100: + # print "\t\t\t%s: %s" % (len(visited), index/len(sorted_edges)), + # print "(%s)" % format(sorted_edges[int(index)], 'g') + + # construct graph using new delta + delta = sorted_edges[int(index)] + if delta in visited: + size2delta[max_size] = delta + break + else: + visited.append( delta ) + if use_fast_scc: + lengths = max_connected_component(permuted_sim, delta) + else: + G = hn.weighted_graph(permuted_sim, permuted_index, delta, directed) + lengths = get_component_sizes( component_fn( G) ) + # increment / decrement index based on whether components meet size specifications + if delta_too_small( lengths , max_size ): + right = index + index -= round( (index-left)/2. ) + else: + left = index + index += round( (right-index)/2. ) + + if max_size not in size2delta: + raise AssertionError("NO DELTA SELECTED FOR k = " + str(max_size)) + + return size2delta + +Edge = namedtuple("Edge", ["node1", "node2", "weight"]) + +def find_best_delta_by_num_ccs(permuted_sim, ks, start=0.05): + """Return a dict mapping each size in ks to the median delta that maximizes the number of + connected components of size at least k in the graph corresponding the the given similarity + matrix. 
+ + Arguments: + permuted_sim -- 2D ndarray representing a similarity matrix + ks -- list of minimum sizes for connected components to be counted. This must be at least 2. + start -- only deltas in the top start proportion of edge weights will be considered when + searching for deltas that maximize the number of connected components + """ + + print("Finding median delta that maximizes the # of CCs of size >= l") + edges = get_edges(permuted_sim, start) + k2delta = {} + + for k in ks: + _, bestDeltas = find_best_delta_by_num_ccs_for_given_k(permuted_sim, edges, k) + k2delta[k] = np.median(bestDeltas) + + return k2delta + +def find_best_delta_by_num_ccs_for_given_k(permuted_sim, edges, k): + + if k < 2: + raise ValueError("k must be at least 2") + + max_num_ccs = 0 #initially, each node is its own CC of size 1, so none is of size >= k for k >= 2 + bestDeltas = [edges[0].weight] + uf = UnionFind() + + for edge in edges: + uf.union(edge.node1, edge.node2) + num_ccs = len([root for root in uf.roots if uf.weights[root] >= k]) + if num_ccs > max_num_ccs: + max_num_ccs = num_ccs + bestDeltas = [edge.weight] + elif num_ccs == max_num_ccs: + bestDeltas.append(edge.weight) + + return max_num_ccs, bestDeltas + +def get_edges(sim, start=.05): + """Return a list of Edge tuples representing edges in the top start% of edge weights""" + flattened = np.ndarray.flatten(sim) + if np.array_equal(sim, sim.transpose()): + edges = [Edge(i/len(sim), i%len(sim), flattened[i]) for i in range(len(flattened)) if i/len(sim) <= i%len(sim)] + else: + edges = [Edge(i/len(sim), i%len(sim), flattened[i]) for i in range(len(flattened))] + edges = sorted(edges, key=lambda x: x.weight, reverse=True) + edges = edges[:int(start*len(edges))] + return edges + +def network_delta_wrapper(xxx_todo_changeme): + (network_path, infmat_name, index2gene, heat, sizes, directed, + selection_function, verbose) = xxx_todo_changeme + permuted_mat = hnio.load_hdf5(network_path)[infmat_name] + sim, index2gene = hn.similarity_matrix(permuted_mat, index2gene, heat, directed, verbose) + if selection_function is find_best_delta_by_largest_cc: + return selection_function(sim, index2gene, sizes, directed, verbose=verbose) + elif selection_function is find_best_delta_by_num_ccs: + return selection_function(sim, sizes, verbose=verbose) + else: + raise ValueError("Unknown delta selection function: %s" % (selection_function)) + +def network_delta_selection(network_paths, infmat_name, index2gene, heat, sizes, directed=True, + num_cores=1, selection_fn=find_best_delta_by_largest_cc, verbose=0): + """Return a dict mapping each size in sizes to a list of the best deltas for each permuted + network for that size. 
+ + Arguments: + network_paths -- iterable of paths to .mat files containing permuted networks + infmat_name -- name of influence matrix in .mat files + index2gene -- dict mapping an index in the matrix to the name of the gene represented at that + index in the influence matrix + heat -- dict mapping a gene name to the heat score for that gene + sizes -- list of sizes for largest CC / min size for CCs to be counted (based on selection_fn) + directed -- whether or not the graph constructed from the similarity matrix should be directed + num_cores -- number of cores to use for running in parallel + selection_fn -- function that should be used for finding the best delta + + """ + if num_cores != 1: + pool = mp.Pool(None if num_cores == -1 else num_cores) + map_fn = pool.map + else: + map_fn = map + + args = [(network_path, infmat_name, index2gene, heat, sizes, directed, + selection_fn, verbose) for network_path in network_paths] + delta_maps = map_fn(network_delta_wrapper, args) + + if num_cores != 1: + pool.close() + pool.join() + + # Parse the deltas into one dictionary + sizes2deltas = defaultdict(list) + for size2delta in delta_maps: + for s in sizes: sizes2deltas[s].append(size2delta[s]) + + return sizes2deltas + +def heat_delta_wrapper(xxx_todo_changeme1): + (infmat, index2gene, heat_permutation, directed, sizes, selection_function) = xxx_todo_changeme1 + sim, index2gene = hn.similarity_matrix(infmat, index2gene, heat_permutation, directed) + if selection_function is find_best_delta_by_largest_cc: + return selection_function(sim, index2gene, sizes, directed) + elif selection_function is find_best_delta_by_num_ccs: + return selection_function(sim, sizes) + else: + raise ValueError("Unknown delta selection function: %s" % (selection_function)) + +#list of num_permutations dicts of max cc size => best delta +def heat_delta_selection(infmat, index2gene, heat_permutations, sizes, directed=True, num_cores=1, + selection_fn=find_best_delta_by_largest_cc): + """Return a dict mapping each size in sizes to a list of the best deltas for each heat + permutation for that size. 
+ + Arguments: + infmat -- 2D ndarray representing an influence matrix + index2gene -- dict mapping an index in the matrix to the name of the gene represented at that + index in the influence matrix + heat_permutations -- list of heat permutations (dicts mapping gene name to heat score) + sizes -- list of sizes for largest CC / min size for CCs to be counted (based on selection_fn) + directed -- whether or not the graph constructed from the similarity matrix should be directed + num_cores -- number of cores to use for running in parallel + selection_fn -- function that should be used for finding the best delta + + """ + if num_cores != 1: + pool = mp.Pool(None if num_cores == -1 else num_cores) + map_fn = pool.map + else: + map_fn = map + args = [(infmat, index2gene, heat_permutation, directed, sizes, selection_fn) + for heat_permutation in heat_permutations] + deltas = map_fn(heat_delta_wrapper, args) + + if num_cores != 1: + pool.close() + pool.join() + + # Parse the deltas into one dictionary + sizes2deltas = defaultdict(list) + for size2delta in deltas: + for s in sizes: sizes2deltas[s].append(size2delta[s]) + + return sizes2deltas + + +################################################################################ +# HOTNET AND HOTNET2 DELTA SELECTION WRAPPERS +################################################################################ + +def get_deltas_for_network(permuted_networks_path, heat, infmat_name, index2gene, test_statistic, + sizes, classic, num_permutations, num_cores, verbose=0): + if verbose > 3: print("* Performing permuted network delta selection...") + + #construct list of paths to the first num_permutations + permuted_network_paths = [permuted_networks_path.replace(ITERATION_REPLACEMENT_TOKEN, str(i)) + for i in range(1, num_permutations+1)] + + delta_selection_fn = find_best_delta_by_largest_cc if test_statistic == "max_cc_size" \ + else find_best_delta_by_num_ccs + + return network_delta_selection(permuted_network_paths, infmat_name, index2gene, heat, + sizes, not classic, num_cores, delta_selection_fn, verbose) + +def get_deltas_for_heat(infmat, index2gene, gene2heat, addtl_genes, num_permutations, test_statistic, + sizes, classic, num_cores, verbose=0): + if verbose > 3: print("* Performing permuted heat delta selection...") + heat_permutations = permutations.permute_heat(gene2heat, list(index2gene.values()), num_permutations, + addtl_genes, num_cores) + return get_deltas_from_heat_permutations(infmat, index2gene, heat_permutations, test_statistic, + sizes, classic, num_cores, verbose) + +def get_deltas_for_mutations(args, infmat, index2gene, heat_params, verbose=0): + print("* Performing permuted mutation data delta selection...") + index2gene = hnio.load_index(args.infmat_index_file) + + heat_permutations = permutations.generate_mutation_permutation_heat( + heat_params["heat_fn"], heat_params["sample_file"], + heat_params["gene_file"], list(index2gene.values()), heat_params["snv_file"], + args.gene_length_file, args.bmr, args.bmr_file, heat_params["cna_file"], + args.gene_order_file, heat_params["cna_filter_threshold"], + heat_params["min_freq"], args.num_permutations, args.num_cores) + return get_deltas_from_heat_permutations(infmat, index2gene, heat_permutations, args.test_statistic, + args.sizes, args.classic, args.num_cores, verbose) + +def get_deltas_from_heat_permutations(infmat, gene_index, heat_permutations, test_statistic, sizes, + classic, num_cores, verbose=0): + delta_selection_fn = find_best_delta_by_largest_cc if test_statistic == 
"max_cc_size" \ + else find_best_delta_by_num_ccs + + deltas = delta.heat_delta_selection(infmat, gene_index, heat_permutations, sizes, not classic, + num_cores, delta_selection_fn, verbose) + return deltas diff --git a/pypathway/hotnet/hotnet2/fast_swap.py b/pypathway/hotnet/hotnet2/fast_swap.py new file mode 100755 index 0000000..9cce5db --- /dev/null +++ b/pypathway/hotnet/hotnet2/fast_swap.py @@ -0,0 +1,71 @@ +from ......utils import _node +from ctypes import * +import networkx as nx +import sys +import numpy as np +import struct + +sys.path.append('/Users/yangxu/PyPathway') + + +class Node(Structure): + _fields_ = [("node_id", c_int), + ('neighbours', POINTER(c_int)), + ("neighbour_count", c_int)] + + def __repr__(self): + return "id: {} nei_c {}, neis: {}".format(self.node_id, self.neighbour_count, self.neighbours) + + +def nx2c(G): + res = [] + for k, v in G.edge.items(): + nei = (c_int * len(v))(*[x for x in v]) + n = Node(node_id=k, neighbours=nei, neighbour_count=len(v)) + res.append(n) + return (Node * len(G.edge))(*res) + + +def c2nx(r): + G = nx.Graph() + for x in r.contents: + for i in range(x.neighbour_count): + G.add_edge(x.node_id, x.neighbours[i]) + return G + + +def translate(Graph): + ''' + From networkx graph to adjacency matrix + + :param Graph: the input graph + :return the id2node dict and the adjancency matrix + ''' + length = len(Graph.node) + id2_node = {} + adjacency = np.zeros((length, length), dtype=np.int8) + for i, n in enumerate(Graph.node): + id2_node[int(n)] = i + for x in Graph.edges(): + i1, i2 = id2_node[int(x[0])], id2_node[int(x[1])] + adjacency[i1][i2] = 1 + adjacency[i2][i1] = 1 + ids = list(id2_node.keys()) + id_list = (c_int * len(ids))(*ids) + return id_list, adjacency, length + + +def parse_adjacency_matrix(id2name, matrix, count): + ids = struct.unpack('i' * count, id2name) + id2node = {i: x for i, x in enumerate(ids)} + rG = nx.Graph() + for i, j in zip(*np.where(matrix > 0)): + rG.add_edge(str(id2node[i]), str(id2node[j])) + return rG + + +def node_swap(G, nswap, windows_threshold=3): + id2node, matrix, count = translate(G) + ids, mat, count = _node.swap(id2node, matrix, count, int(nswap), windows_threshold) + NG = parse_adjacency_matrix(ids, mat, count) + return NG \ No newline at end of file diff --git a/pypathway/hotnet/hotnet2/heat.py b/pypathway/hotnet/hotnet2/heat.py new file mode 100755 index 0000000..0dd7db4 --- /dev/null +++ b/pypathway/hotnet/hotnet2/heat.py @@ -0,0 +1,224 @@ +from collections import defaultdict +from math import log10 +import scipy +from .constants import Mutation, SNV, AMP, DEL + +def filter_heat(heat, min_score, zero_genes=False, message=None): + """Returns (1) a dict mapping gene names to heat scores where all non-zero heat scores are at + least min_score and any genes that originally had score less than min_score either have score 0 + or are excluded depending on the value of zero_genes; and (2) a set of the genes that had scores + less than min_score + + Arguments: + heat -- dict mapping gene names to heat scores + min_score -- minimum heat score a gene must have to be included or non-zero (depending on the + value of zero_genes) in the returned dict. 
If None, the minimum score will be + calculated as the minimum of the non-zero heat scores included in the input dict + zero_genes -- if true, genes with score below min_score will be included in the returned heat + dict with their scores set to zero; otherwise, they will be excluded + message -- message to print about number of filtered genes; '##' will be replaced with the number + + """ + if not min_score: + min_score = min([score for score in list(heat.values()) if score > 0]) + + filtered_genes = set() + filtered_heat = dict() + for gene, score in heat.items(): + if score >= min_score: + filtered_heat[gene] = score + else: + filtered_genes.add(gene) + if zero_genes: + filtered_heat[gene] = 0 + + if message and len(filtered_genes) > 0: + print('\t- ' + message.replace('##', str(len(filtered_genes)))) + + return filtered_heat, filtered_genes + +def num_snvs(mutations): + """Return the number of valid SNVs in the given iterable of Mutations + (those with mut_type == SNV and valid == True). + + Arguments: + mutations -- iterable of Mutation tuples + + """ + return len([mut for mut in mutations if mut.mut_type == SNV and mut.valid]) + +def num_cnas(mutations): + """Return the number of valid CNAs in the given iterable of Mutations + (those with mut_type == AMP or DEL and valid == True). + + Arguments: + mutations -- iterable of Mutation tuples + + """ + return len([mut for mut in mutations if (mut.mut_type == AMP or mut.mut_type == DEL) and mut.valid]) + +def filter_cnas(cnas, filter_thresh): + """Return a list of Mutation tuples in which CNAs in genes where the proportion of CNAs in the + gene across samples is less than filter_thresh have their "valid" field set to False. CNAs in + genes whose CNAs pass the threshold that are of the opposite type of the dominant type in the gene + also have their "valid" field set to False. + + Arguments: + cnas -- a list of Mutation tuples representing CNAs. All are assumed to be of mut_type AMP or DEL + filter_thresh -- the proportion of CNAs in a gene that must be of the same type + + """ + if filter_thresh <= .5: + raise ValueError("filter_thresh must be greater than .5") + + filtered_cnas = list() + genes2cnas = defaultdict(list) + for cna in cnas: + genes2cnas[cna.gene].append(cna) + + for gene_cnas in genes2cnas.values(): + amp_count = float(len([cna for cna in gene_cnas if cna.mut_type == AMP])) + del_count = float(len([cna for cna in gene_cnas if cna.mut_type == DEL])) + if (amp_count / (amp_count + del_count)) >= filter_thresh: + invalidate_opposite_cnas(filtered_cnas, gene_cnas, AMP) + elif (del_count / (amp_count + del_count)) >= filter_thresh: + invalidate_opposite_cnas(filtered_cnas, gene_cnas, DEL) + else: + for cna in gene_cnas: + filtered_cnas.append(get_invalidated_mutation(cna)) + + return filtered_cnas + +def invalidate_opposite_cnas(cnas, gene_cnas, mut_type): + for cna in gene_cnas: + if cna.mut_type == mut_type: + cnas.append(cna) + else: + cnas.append(get_invalidated_mutation(cna)) + +def get_invalidated_mutation(mutation): + return Mutation(mutation.sample, mutation.gene, mutation.mut_type, False) + +def mut_heat(genes, num_samples, snvs, cnas, min_freq): + """Return a dict mapping gene name to heat score based on the given mutation data. 
+ + Arguments: + genes -- iterable of genes tested for mutations + num_samples -- the number of samples tested for mutations + snvs -- iterable of Mutation tuples representing SNVs (mut_type == SNV) + cnas -- iterable of Mutation tuples representing CNAs (mut_type == AMP or DEL) + min_freq -- the minimum number of samples in which a gene must have an SNV to be considered + mutated in the heat score calculation. + + """ + + genes2mutations = dict((gene, set()) for gene in genes) + for snv in snvs: + genes2mutations[snv.gene].add(snv) + for cna in cnas: + genes2mutations[cna.gene].add(cna) + print(("* Calculating heat scores for %s genes in %s samples at min frequency %s" % + (len(genes2mutations), num_samples, min_freq))) + + gene2heat = dict() + for gene, mutations in genes2mutations.items(): + snv_mut_samples = set( m.sample for m in mutations if m.mut_type == SNV and m.valid ) + cna_mut_samples = set( m.sample for m in mutations if (m.mut_type == AMP or m.mut_type == DEL) and m.valid ) + + # Minimum frequency is for SNVs *only*, so we just use CNAs if the SNVs + # are below min_freq + if len(snv_mut_samples) < min_freq: + gene2heat[gene] = len(cna_mut_samples) / float(num_samples) + else: + gene2heat[gene] = len(snv_mut_samples | cna_mut_samples) / float(num_samples) + + return gene2heat + +NULL = 100 +def fm_heat(gene2heat, fm_threshold, cis_threshold=0.01, CIS=False): + print("* Creating oncodrive heat map...") + if CIS: print("\tIncluding CIS scores at threshold", cis_threshold, "...") + heat = dict() + src_fm, src_cis_amp, src_cis_del = 0, 0, 0 + for g, scores in list(gene2heat.items()): + if CIS: + del_score = scores["del"] if scores["del"] < cis_threshold else NULL + amp_score = scores["amp"] if scores["amp"] < cis_threshold else NULL + fm_score = scores["fm"] if scores["fm"] < fm_threshold else NULL + if fm_score == NULL and amp_score == NULL and del_score == NULL: continue + min_val = min(del_score, amp_score, fm_score) + heat[g] = -log10( min_val ) + if min_val == scores["fm"]: src_fm += 1 + elif min_val == scores["amp"]: src_cis_amp += 1 + elif min_val == scores["del"]: src_cis_del += 1 + else: + if scores["fm"] >= fm_threshold: continue + heat[g] = -log10(scores["fm"]) + src_fm += 1 + print("\t- Genes using FM score:", src_fm) + print("\t- Genes using CIS AMP score:", src_cis_amp) + print("\t- Genes using CIS DEL score:", src_cis_del) + + return heat + +def mutsig_heat(gene2mutsig, threshold=1.0): + print("* Creating MutSig heat map...") + gene2heat = dict((gene, -log10(score["qval"])) + for gene, score in list(gene2mutsig.items()) + if score["qval"] < threshold) + print("\t- Including", len(gene2heat), "genes at threshold", threshold) + return gene2heat + +def music_heat(gene2music, threshold=1.0, max_heat=15): + print("* Creating MuSiC heat map...") + print("\t- Mapping q-values of 0 to", max_heat) + def music_heat(qvals): + heat = scipy.median([ qvals["FDR_CT"], qvals["FDR_LRT"], qvals["FDR_FCPT"] ]) + return -log10(heat) if heat != 0 else max_heat + gene2heat = dict((gene, music_heat(scores)) for gene, scores in list(gene2music.items()) + if scipy.median(list(scores.values())) < threshold) + print("\t- Including", len(gene2heat), "genes at threshold", threshold) + return gene2heat + +def filter_heat_to_network_genes(gene2heat, network_genes, verbose): + """Return a dict mapping genes to heat scores such that only genes in the network are included. 
+ + Arguments: + gene2heat -- dict mapping gene names to heat scores + network_genes -- set of network_genes + + """ + filtered_heat = dict() + num_removed = 0 + for gene, heat in gene2heat.items(): + if gene in network_genes: + filtered_heat[gene] = heat + else: + num_removed += 1 + + if verbose > 1 and num_removed > 0: + print("\t- Removing %s genes not in the network" % num_removed) + + return filtered_heat + +def reconcile_heat_with_tested_genes(gene2heat, tested_genes): + """Return a dict mapping gene names to heat scores containing for each gene in tested_genes + and only for genes in tested_genes. Genes in tested_genes not in gene2heat will be given a + score of 0. + + Arguments: + gene2heat -- dict mapping gene names to heat scores + tested_genes -- set of genes that should have heat scores in the returned dict + + """ + filtered_heat = dict((g, gene2heat[g] if g in gene2heat else 0) for g in tested_genes) + + num_removed = len(set(gene2heat.keys()).difference(tested_genes)) + if num_removed > 0: + print("\t- Removing %s genes not in gene_filter_file" % num_removed) + + num_zeroed = len(set(tested_genes).difference(list(gene2heat.keys()))) + if num_zeroed > 0: + print("\t- Assigned score 0 to %s genes in gene file without scores" % num_zeroed) + + return filtered_heat diff --git a/pypathway/hotnet/hotnet2/hierarchy/.Rhistory b/pypathway/hotnet/hotnet2/hierarchy/.Rhistory new file mode 100644 index 0000000..e69de29 diff --git a/pypathway/hotnet/hotnet2/hierarchy/README.md b/pypathway/hotnet/hotnet2/hierarchy/README.md new file mode 100755 index 0000000..5a8f9bc --- /dev/null +++ b/pypathway/hotnet/hotnet2/hierarchy/README.md @@ -0,0 +1,153 @@ +Overview +----------- +Tarjan (1982, 1983) describes a hierarchical clustering algorithm that uses strongly connected components. Below, we describe how to use Tarjan's algorithm to select the `delta` parameter in HotNet2. + +Installation +------------ +Run the included `setup_fortran.py` script or, (almost) equivalently, run the following command in terminal: + + f2py -c fortran_routines.f95 -m fortran_routines + +This process requires a Fortran compiler and NumPy, which contains the F2PY program. Our implementation uses Fortran in several places where large performance gains are possible. However, if a Fortran compiler is unavailable or unsuccessful, our implementation will automatically revert to functionally equivalent (but slower) Python functions. + +Usage +----- +For illustration, consider the example given in Tarjan (1983). The graph in this example can be given by the vertices `V` and weighted adjacency matrix `A`. + +`V`: + + ['a', 'b', 'c', 'd', 'e', 'f', 'g'] + +`A`: + + [[ 0. 10. 0. 0. 0. 0. 15.] + [ 12. 0. 30. 0. 0. 0. 0.] + [ 0. 0. 0. 0. 0. 0. 45.] + [ 0. 0. 6. 0. 16. 0. 14.] + [ 0. 0. 0. 13. 0. 8. 0.] + [ 26. 0. 0. 0. 0. 0. 20.] + [ 0. 35. 22. 0. 50. 0. 0.]] + +For example, the edge from vertex `a` to vertex `b` has weight `10.`. + +For our implementation, the entries of the list `V` may be any hashable type, e.g., integers and/or strings. The entries of the NumPy array `A` must be double-precision floating-point numbers. The entries of `A` need not be unique. An exception is raised when the graph described by `A` is not strongly connected. + +To generate the hierarchical decomposition tree, import `hierarchical_clustering.py` and run + + T = HD(V,A,increasing) + +where `increasing = True` adds edge weights in increasing order and `increasing = False` adds edge weights in decreasing order; the latter is the case for HotNet2. 
The `increasing` argument is optional, and `increasing = False` is its default value when omitted. For this example, set `increasing=True`. The output is a tree represented as a dictionary `T` whose keys and values are tuples containing edge weights and leaf nodes. + +`T`: + + (0.0, 'a') : (12.0, 'a', 'b') + (0.0, 'b') : (12.0, 'a', 'b') + (0.0, 'c') : (45.0, 'a', 'b', 'c', 'g') + (0.0, 'd') : (16.0, 'd', 'e') + (0.0, 'e') : (16.0, 'd', 'e') + (0.0, 'f') : (50.0, 'a', 'b', 'c', 'd', 'e', 'f', 'g') + (0.0, 'g') : (35.0, 'a', 'b', 'g') + (12.0, 'a', 'b') : (35.0, 'a', 'b', 'g') + (16.0, 'd', 'e') : (50.0, 'a', 'b', 'c', 'd', 'e', 'f', 'g') + (35.0, 'a', 'b', 'g') : (45.0, 'a', 'b', 'c', 'g') + (45.0, 'a', 'b', 'c', 'g') : (50.0, 'a', 'b', 'c', 'd', 'e', 'f', 'g') + +For example, vertices `a` and `b` condense into a single component after the addition of weight `12.0`. The later addition of `35.0` combines the component containing both `a` and `b` with the component containing only `g` into a single component containing `a`, `b`, and `g`. The eventual addition of `50.0` completes the tree, combining `a`, `b`, `c`, `d`, `e`, `f`, and `g` into a single component. + +To convert this specialized tree representation to the more standard Newick format, run + + newick_representation = newick(T) + +The output is a string representing the tree. + +`newick_representation`: + + (f:50.0,(d:16.0,e:16.0):34.0,(c:45.0,(g:35.0,(a:12.0,b:12.0):23.0):10.0):5.0); + +To find the clusters formed from the hierarchical decomposition, run + + weights,clusters = cluster(T) + +The output is a collection of condensing weights and a collection of vertex clusters. + +`weights`: + + [0.0, 12.0, 16.0, 35.0, 45.0, 50.0] + +`clusters`: + + [['a'], ['b'], ['c'], ['d'], ['e'], ['f'], ['g']] + [['a', 'b'], ['c'], ['d'], ['e'], ['f'], ['g']] + [['a', 'b'], ['c'], ['d', 'e'], ['f'], ['g']] + [['a', 'b', 'g'], ['c'], ['d', 'e'], ['f']] + [['a', 'b', 'c', 'g'], ['d', 'e'], ['f']] + [['a', 'b', 'c', 'd', 'e', 'f', 'g']] + +For example, each vertex initially forms its own strongly connected component. Later, after the addition of `12.0`, the components with `a` and `b` merge to form one component with both `a` and `b`. After the eventual addition of `50.0`, the vertices `a`, `b`, `c`, `d`, `e`, `f`, and `g` form a single connected component. + +Minimal working example +----------------------- +Consider the following minimal working example. Note that `increasing` has been omitted from `HD`, so it is `False` by default. 
+ +** Input ** + + from hierarchical_clustering import * + V = ['a', 'b', 'c', 'd', 'e', 'f', 'g'] + A = np.array([[ 0., 10., 0., 0., 0., 0., 15.], + [ 12., 0., 30., 0., 0., 0., 0.], + [ 0., 0., 0., 0., 0., 0., 45.], + [ 0., 0., 6., 0., 16., 0., 14.], + [ 0., 0., 0., 13., 0., 8., 0.], + [ 26., 0., 0., 0., 0., 0., 20.], + [ 0., 35., 22., 0., 50., 0., 0.]]) + + T = HD(V,A) + weights,clusters = cluster(T) + newick_representation = newick(T) + + print 'Tree in Newick format:' + print ' ',newick_representation + print 'Tree in our specialized format:' + for v in T: + print ' ',v,':',T[v] + print 'Condensing weights:' + print ' ',weights + print 'Clusters:' + for c in clusters: + print ' ',c + +** Output ** + + Tree in Newick format: + (f:42.0,(a:38.0,(d:37.0,e:37.0,(b:20.0,c:20.0,g:20.0):17.0):1.0):4.0); + + Tree in our specialized format: + (50.0, 'a') : (12.0, 'a', 'b', 'c', 'd', 'e', 'g') + (50.0, 'b') : (30.0, 'b', 'c', 'g') + (50.0, 'c') : (30.0, 'b', 'c', 'g') + (13.0, 'b', 'c', 'd', 'e', 'g') : (12.0, 'a', 'b', 'c', 'd', 'e', 'g') + (50.0, 'd') : (13.0, 'b', 'c', 'd', 'e', 'g') + (50.0, 'e') : (13.0, 'b', 'c', 'd', 'e', 'g') + (50.0, 'f') : (8.0, 'a', 'b', 'c', 'd', 'e', 'f', 'g') + (30.0, 'b', 'c', 'g') : (13.0, 'b', 'c', 'd', 'e', 'g') + (50.0, 'g') : (30.0, 'b', 'c', 'g') + (12.0, 'a', 'b', 'c', 'd', 'e', 'g') : (8.0, 'a', 'b', 'c', 'd', 'e', 'f', 'g') + + Condensing weights: + 0 : 30.0 + 1 : 13.0 + 2 : 12.0 + 3 : 8.0 + 4 : 0.0 + + Vertex clusters: + 0 : [['a'], ['b'], ['c'], ['d'], ['e'], ['f'], ['g']] + 1 : [['a'], ['b', 'c', 'g'], ['d'], ['e'], ['f']] + 2 : [['a'], ['b', 'c', 'd', 'e', 'g'], ['f']] + 3 : [['a', 'b', 'c', 'd', 'e', 'g'], ['f']] + 4 : [['a', 'b', 'c', 'd', 'e', 'f', 'g']] + +References +---------- +* R.E. Tarjan. A Hierarchical Clustering Algorithm Using Strong Components. *Information Processing Letters*. (1982) 14(1):26-29. +* R.E. Tarjan. An Improved Algorithm for Hierarchical Clustering Using Strong Components. *Information Processing Letters*. (1983) 17(1):37-41. diff --git a/pypathway/hotnet/hotnet2/hierarchy/__init__.py b/pypathway/hotnet/hotnet2/hierarchy/__init__.py new file mode 100755 index 0000000..ab6e4f5 --- /dev/null +++ b/pypathway/hotnet/hotnet2/hierarchy/__init__.py @@ -0,0 +1,2 @@ +from .hierarchical_clustering import * +from .hierarchical_clustering_io import linkage as convertToLinkage, newick as convertToNewick \ No newline at end of file diff --git a/pypathway/hotnet/hotnet2/hierarchy/hierarchical_clustering.py b/pypathway/hotnet/hotnet2/hierarchy/hierarchical_clustering.py new file mode 100755 index 0000000..b00b678 --- /dev/null +++ b/pypathway/hotnet/hotnet2/hierarchy/hierarchical_clustering.py @@ -0,0 +1,899 @@ +#!/usr/bin/python + +############################################################################### +# +# Setup +# +############################################################################### + +import math +import numpy as np + +try: + try: + from .. import fortran_routines + available_routines = 1 + except ImportError: + import fortran_routines + available_routines = 1 +except ImportError: + available_routines = 0 + +############################################################################### +# +# Tarjan's hierarchical decomposition algorithm +# +############################################################################### + +# +# HD(V,A,increasing=False) +# +# This function finds the decomposition of a graph into a hierarchy of +# strongly connected components following the algorithm of Tarjan (1983). 
+# It is a wrapper for tarjan_HD, which actually performs the decomposition. +# +# Inputs: +# V: list of vertices of the graph +# A: weighted adjacency matrix representing the edges of the graph +# increasing: order for adding edges according to weight +# +# Output: +# T: hierarchical decomposition tree +# + +def HD(V,A,increasing=False): + + SCCs = strongly_connected_components(A) + + if len(SCCs)>1: + largest_component_size = max(list(map(len,SCCs))) + number_vertices = len(V) + for component in SCCs: + if len(component)==largest_component_size: + break + A = A[np.ix_(component,component)] + V = slice_vertices(V,component) + print('The graph has %d strongly connected components, but the hierarchical decomposition requires a strongly connected graph. The decomposition tree considers a subgraph defined by a largest strongly connected component of the original graph %d of its %d vertices.' % (len(SCCs),largest_component_size,number_vertices)) + + if increasing: + base = 0.0 + W = [tuple([base,v]) for v in V] + T,root = tarjan_HD(W,A,{},0) + elif not increasing: + base = np.max(A) + W = [tuple([base,v]) for v in V] + S,root = tarjan_HD(W,reverse_matrix(A),{},0) + T = reverse_tree(W,A,S) + + return T + +# +# tarjan_HD(V,A,T,i) +# +# This function implements the hierarchical decomposition algorithm +# described in Tarjan (1983). Whenever possible, we reuse the notation and +# labels, i.e., Case 1, Case 2, Case 2a, and Case 2b, used in the paper. +# +# Inputs: +# V: list of vertices of the graph +# A: weighted adjacency matrix representing the edges of the graph +# T: hierarchical decomposition tree +# i: weight index; see Tarjan (1983) for description +# +# Outputs: +# T: hierarchical decomposition tree +# root: root of the tree +# + +def tarjan_HD(V,A,T,i): + + A_sort = sort(A) + m = len(A_sort)-1 + r = m-i + + if r==1: + # Case 1 + root = form_edge(A_sort[-1],V) + S = {v:root for v in V} + T.update(S) + return T,root + + else: + # Case 2 + j = int(math.ceil(0.5*float(i+m))) + weight_i = A_sort[i] + weight_j = A_sort[j] + A_j = remove_edges(A,weight_j) + SCCs = strongly_connected_components(A_j) + + if len(SCCs)==1: + # Case 2a + return tarjan_HD(V,A_j,T,i) + + else: + # Case 2b + W = [] + for SCC in SCCs: + if len(SCC)>1: + B = slice_array(A_j,SCC,SCC) + k = subproblem_index(B,weight_i) + X = slice_vertices(V,SCC) + S,root = tarjan_HD(X,B,{},k) + T.update(S) + W.append(root) + else: + W.extend(slice_vertices(V,SCC)) + + B = condense_graph(A,SCCs) + k = subproblem_index(B,weight_j) + return tarjan_HD(W,B,T,k) + +# +# clustering(T) +# +# This function finds the clusters of vertices given in the hierarchical +# decomposition tree T. Each vertex of the original graph forms its own +# cluster at the leaf nodes of the tree while the same vertices form a single +# cluster at the root of the tree. 
+# +# Input: +# T: hierarchical decomposition tree +# +# Outputs: +# weights: weights of the inner nodes of the tree +# clusters: clusters of vertices corresponding to each weight +# + +def clustering(T): + + condensations = [v for v in T if len(v)==2] + increasing = condensations[0][0]weight: + break + + while i=minimum_size] + + if selection_criterion==0: + if len(counts)>=minimum_number: + if truncated: + clusters = [v[1:] for v in condensations if len(v[1:])>=minimum_size] + if not truncated: + clusters = [v[1:] for v in condensations] + break + elif selection_criterion==1: + if sum(counts)>=minimum_number: + if truncated: + clusters = [v[1:] for v in condensations if len(v[1:])>=minimum_size] + if not truncated: + clusters = [v[1:] for v in condensations] + break + + return clusters, delta + +############################################################################### +# +# Helper functions +# +############################################################################### + +def closest(Y,x): + + m = len(Y)//2 + n = len(Y)//2 + r = len(Y)-1 + + while m>1: + m = int(round(m/2.0)) + if x<=Y[n]: + n = max(n-m,0) + else: + n = min(n+m,r) + + p = max(n-1,0) + q = min(n+1,r) + + if Y[p]<=x<=Y[n]: + if abs(x-Y[p])<=abs(x-Y[n]): + index = p + else: + index = n + else: + if abs(x-Y[n])<=abs(x-Y[q]): + index = n + else: + index = q + + return index + +def closest_reversed(Y,x): + + return len(Y)-1-closest(Y[::-1],x) + +def condense_graph(A,SCCs): + + if available_routines==1: + + vertices = np.array([vertex for component in SCCs for vertex in component],dtype=np.int) + sizes = [len(component) for component in SCCs] + indices = np.array([sum(sizes[:i]) for i in range(len(SCCs)+1)],dtype=np.int) + return fortran_routines.condense_graph(A,vertices+1,indices+1) + + else: + + n = len(SCCs) + B = np.zeros((n,n),dtype=np.float) + + for i in range(n): + for j in range(n): + if i!=j: + C = slice_array(A,SCCs[j],SCCs[i]) + D = np.nonzero(C) + if np.size(D)>0: + B[i,j] = np.min(C[D]) + + return B + +def form_edge(weight,V): + + W = [] + for v in V: + W.extend(v[1:]) + return tuple([weight]+sorted(W)) + +def remove_edges(A,weight): + + if available_routines==1: + + return fortran_routines.remove_edges(A,weight) + + else: + + B = A.copy() + B[B>weight] = 0 + return B + +def slice_vertices(vertices,indices): + + return [vertices[index] for index in indices] + +# The next function is a hack to reverse the order of the edge weights in the +# adjacency matrix: the largest weight is mapped to the smallest weight, the +# second largest to the second smallest, etc. The reverse_tree function +# effectively reverses the reverse_matrix function. + +def reverse_matrix(A): + + shift = int(math.ceil(np.max(A)))+1 + B = -A+shift + B[B==shift] = 0 + return B + +# The next function is a hack to reverse the order of the edge weights in the +# hierarchical. Due to floating-point precision errors, it is also necessary +# to match each reversed edge weight with its original edge weight in A. The +# reverse_tree function effectively reverses the reverse_matrix function. 
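+# For example, if the largest weight in A is 50.0, then reverse_matrix maps each nonzero
+# weight w to 51.0 - w (zero entries stay zero), and reverse_tree maps the reversed weights
+# stored on the interior tree nodes back to the closest original weights in A.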
+ +def reverse_tree(V,A,S): + + weights = np.unique(A) + interior_nodes = set(S.values()) + shift = int(math.ceil(weights[-1]))+1 + mapping = {v: v for v in V} + + for e in interior_nodes: + u = -e[0]+shift + w = weights[closest(weights,u)] + mapping[e] = tuple([w]+list(e[1:])) + + return {mapping[v]: mapping[S[v]] for v in S} + +def slice_array(A,rows,columns): + + if available_routines==1: + + return fortran_routines.slice_array(A,np.array(columns,dtype=np.int)+1,np.array(rows,dtype=np.int)+1) + + else: + + return A[np.ix_(rows,columns)] + +def sort(A): + + return np.unique(A) + +def strongly_connected_components(A): + + if available_routines==1: + + components = fortran_routines.strongly_connected_components(A) + return [np.where(components==i+1)[0].tolist() for i in range(np.max(components))] + + else: + + components = strongly_connected_components_from_matrix(A) + return sorted([sorted(c) for c in components]) + +# I adapted the next function from NetworkX's documentation to use adjacency +# matrices instead of adjacency lists. + +def strongly_connected_components_from_matrix(A): + + n=len(A) + preorder={} + lowlink={} + scc_found={} + scc_queue = [] + i=0 + for source in range(n): + if source not in scc_found: + queue=[source] + while queue: + v=queue[-1] + if v not in preorder: + i=i+1 + preorder[v]=i + done=True + v_nbrs=[w for w in range(n) if A[v,w]!=0] + for w in v_nbrs: + if w not in preorder: + queue.append(w) + done=False + break + if done==True: + lowlink[v]=preorder[v] + for w in v_nbrs: + if w not in scc_found: + if preorder[w]>preorder[v]: + lowlink[v]=min([lowlink[v],lowlink[w]]) + else: + lowlink[v]=min([lowlink[v],preorder[w]]) + queue.pop() + if lowlink[v]==preorder[v]: + scc_found[v]=True + scc=[v] + while scc_queue and preorder[scc_queue[-1]]>preorder[v]: + k=scc_queue.pop() + scc_found[k]=True + scc.append(k) + yield scc + else: + scc_queue.append(v) + +def subproblem_index(B,A_weight): + + C = np.unique(B) + index = closest(C,A_weight) + maximum_index = len(C)-1 + + while C[index]A_weight and index>0: + index -= 1 + return index + +if __name__ == "__main__": + + ########################################################################### + # + # Testing functions + # + ########################################################################### + + def edges_to_matrix(E): + + V = edges_to_vertices(E) + n = len(V) + indices = {V[i]:i for i in range(n)} + + A = np.zeros((n,n),dtype=np.float) + for (source,target,weight) in E: + A[indices[source],indices[target]] = weight + + return V,A + + def edges_to_vertices(E): + + V = set() + + for (source,target,weight) in E: + V.add(source) + V.add(target) + + return sorted(list(V)) + + # The next function computes the hierarchical decomposition naively, i.e., + # by adding one edge at a time. The disadvantage of the naive approach + # compared to Tarjan's algorithm is the need to compute strongly connected + # components many more times. + + def HD_naive(E,increasing): + + import networkx as nx + + vertices = edges_to_vertices(E) + edges = sorted(E, key = lambda e:e[2], reverse = not increasing) + weights = sorted(list(set(e[2] for e in edges)), reverse = not increasing) + + # Add all of the vertices to the graph. + + G = nx.DiGraph() + G.add_nodes_from(vertices) + + T = {} + if increasing: + roots = {v:tuple([0.0,v]) for v in vertices} + elif not increasing: + roots = {v:tuple([max(weights),v]) for v in vertices} + + i = 0 + m = len(edges) + n = len(vertices) + + # Consider each weight. 
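+        # Since edges are only ever added, strongly connected components can merge as each
+        # weight is processed but can never split.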
+ + for weight in weights: + + # Add all of the edges with the same weight to the graph. + + while edges[i][2]==weight: + G.add_edge(edges[i][0], edges[i][1], weight=edges[i][2]) + i += 1 + if i==m: + break + + # Find the SCCs of the resulting graph. + + components = [component for component in nx.strongly_connected_components(G)] + + # Check if any the components contracted by comparing the number of + # components previously to the number of components currently. + + if len(components)1: + for i in range(m): + for j in range(m): + if i!=j: + p = np.random.randint(len(components[i])) + q = np.random.randint(len(components[j])) + A[components[i][p],components[j][q]] = np.random.rand() + + return V,A + + # The next function provides the example given by Tarjan in Tarjan (1983). + + def tarjan_1983_example(): + + E = [["a","b",10.0], ["b","a",12.0], ["b","c",30.0], ["d","c",6.0], ["d","e",16.0], + ["e","d",13.0], ["e","f",8.0], ["f","a",26.0], ["a","g",15.0], ["g","b",35.0], + ["c","g",45.0], ["g","c",22.0], ["d","g",14.0], ["g","e",50.0], ["f","g",20.0]] + + V,A = edges_to_matrix(E) + + return V,A,E + + # The next function tests the correctness of our implementation of the + # hierarchical decomposition algorithm by comparing its results with those + # from a naive implementation on various random graphs. If the functions + # return different trees for the same graph, then we display trees, the + # seed for generating the adjacency matrix, and the adjacency matrix + # itself. + + def test_correctness(m,n,increasing,sparsity=0.0,nonuniqueness=0.0): + + for i in range(n): + + progress("Progress: %d/%d" % (i+1,n)) + V,A = random_adjacency_matrix(m,seed=i,sparsity=sparsity,nonuniqueness=nonuniqueness) + E = matrix_to_edges(V,A) + S = HD_naive(E,increasing) + T = HD(V,A,increasing) + + if S!=T: + progress("") + print(i) + print(A) + print(S) + print(T) + return False + + progress("") + return True + + # The next function tests the performance of our implementation of the + # hierarchical decomposition algorithm as well as our implementation of a + # clustering algorithm by running then on various complete random graphs + # of various sizes. + + def test_performance(trials,repetitions,increasing): + + import time + + dimensions = np.zeros(trials, dtype=np.int) + HD_times = np.zeros((trials,repetitions),dtype=np.float) + clustering_times = np.zeros((trials,repetitions),dtype=np.float) + + for trial in range(trials): + + n = int(10*2**trial) + dimensions[trial] = n + + progress("Progress: %d/%d of %d/%d" % (0,repetitions,trial+1,trials)) + np.random.seed(seed=trial) + V,A = random_adjacency_matrix(n) + + for repetition in range(repetitions): + + progress("Progress: %d/%d of %d/%d" % (repetition+1,repetitions,trial+1,trials)) + first_time = time.time() + T = HD(V,A,increasing) + second_time = time.time() + weights,clusters = cluster(T) + third_time = time.time() + + HD_times[trial,repetition] = second_time-first_time + clustering_times[trial,repetition] = third_time-second_time + + progress("") + + return dimensions,np.min(HD_times,axis=1),np.min(clustering_times,axis=1) + + # The next function profiles our implementation of the hierarchical + # decomposition algorithm on a complete random graph; search online for + # cProfile for details. 
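+    # The saved profile can be inspected afterwards with the standard pstats module, e.g.,
+    # pstats.Stats(filename).sort_stats('cumulative').print_stats(20).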
+ + def test_profile(n,increasing,filename): + + import cProfile + + V,A = random_adjacency_matrix(n) + cProfile.runctx("T = HD(V,A,"+str(increasing)+")", globals(), locals(), filename=filename) + + ########################################################################### + # + # Tests + # + ########################################################################### + + # Excluding for the profiling code, this collection of tests should require + # less than 1 minute of runtime and 300 MB of memory on a modern machine. + # No additional storage space or a network connection is necessary. We use + # the NetworkX package only for the test cases. We import several Fortran + # routines to expedite several of the computations, but the code falls back + # on functionally equivalent Python functions when the Fortran routines are + # unavailable. + + # See Tarjan (1983) for a description of hierarchical clustering with + # strongly connected components (SCCs). In our implementation, we + # represent the hierarchical decomposition (HD) tree of a strongly + # connected graph as a dictionary. The formation of a SCC is represented + # as a tuple whose first entry is the condensing weight and remaining + # entries are strongly connected vertices. + + # For example, in the example in Tarjan (1983), the addition of weight 12 + # forms SCC consisting of vertices a and b, which we represent with the + # dictionary entries + # + # (0.0, 'a') : (12.0, 'a', 'b') + # (0.0, 'b') : (12.0, 'a', 'b') + # + # The further addition of weight 35 forms a SCC consisting of vertices a, + # b, and g, which we represent with the dictionary entries + # + # (0.0, 'g') : (35.0, 'a', 'b', 'g') + # (12.0, 'a', 'b') : (35.0, 'a', 'b', 'g') + + # First, we run our implementation of Tarjan's HD algorithm on the example + # given in Tarjan (1983) and output the tree, the condensing weights, and + # the vertex clusters. We reverse the edge order from increasing weights + # to decreasing weights and repeat. + + print('=== Test 1 ===') + V, A, E = tarjan_1983_example() + + T = HD(V,A,increasing=True) + weights, clusters = cluster(T) + + print('Weights added in increasing order:') + print('') + print('Tree:') + print(prettify(T)) + print('Condensing weights:') + print(prettify(weights)) + print('Vertex clusters:') + print(prettify(clusters)) + + T = HD(V,A,increasing=False) + weights, clusters = cluster(T) + + print('Weights added in decreasing order:') + print('') + print('Tree:') + print(prettify(T)) + print('Condensing weights:') + print(prettify(weights)) + print('Vertex clusters:') + print(prettify(clusters)) + + # Second, we compare tree from of our implementation of Tarjan's HD + # algorithm with the tree from our implementation of the naive algorithm, + # i.e., the addition of one weight at a time. Our dictionary + # representation of the tree is unique, so the the trees are the same when + # the dictionaries are the same and different when different. + + print('=== Test 2 ===') + print(HD(V,A,True)==HD_naive(E,True)) + print(HD(V,A,False)==HD_naive(E,False)) + print('') + + # Third, we repeat the previous test on a complete graph, a graph with + # roughly 70% of its edges removed, and a graph with roughly 40% of its + # edge weights repeated, many to the same weights. Each graph has 25 + # vertices and edge weights chosen randomly from a uniform distribution. + # Each test repeats 20 times with a different graph each time for both + # increasing and decreasing edge weights. 
The test returns `True` if the + # trees are the same and `False` with additional information if they + # differ; see `test_HD_correctness` for more details. + + print('=== Test 3 ===') + print(test_correctness(25,20,True)) + print(test_correctness(25,20,True,sparsity=0.7)) + print(test_correctness(25,20,True,nonuniqueness=0.4)) + print(test_correctness(25,20,False)) + print(test_correctness(25,20,False,sparsity=0.7)) + print(test_correctness(25,20,False,nonuniqueness=0.4)) + print('') + + # Fourth, we generate profiling data for our implementation of Tarjan's HD + # algorithm for a complete random graph with 2000 vertices. Search for + # cProfile online for details. + + print('=== Test 4 ===') + # test_profile(2000,True,'tarjan_HD.cprof') + print('') + + # Fifth, we examine the performance of our implementation of Tarjan's HD + # algorithm and the performance of our clustering routine on complete + # random graphs of various sizes, returning the best runtime from three + # repetitions on the same random graph. We start with 10 vertices, double + # to 20 vertices, and so on for a total of eight trials. We return the + # number of vertices and the shortest runtimes for each trial, and we + # perform the test for both increasing and decreasing edge weights. + + print('=== Test 5 ===') + number_of_vertices, HD_runtimes, clustering_runtimes = test_performance(8,3,True) + + print('Number of vertices:') + print(number_of_vertices) + print('Runtimes for increasing edge weights:') + print(HD_runtimes) + print('Runtimes for vertex clustering:') + print(clustering_runtimes) + print('') + + number_of_vertices, HD_runtimes, clustering_runtimes = test_performance(8,3,False) + + print('Number of vertices:') + print(number_of_vertices) + print('Runtimes for decreasing edge weights:') + print(HD_runtimes) + print('Runtimes for vertex clustering:') + print(clustering_runtimes) diff --git a/pypathway/hotnet/hotnet2/hierarchy/hierarchical_clustering_io.py b/pypathway/hotnet/hotnet2/hierarchy/hierarchical_clustering_io.py new file mode 100755 index 0000000..39aed7e --- /dev/null +++ b/pypathway/hotnet/hotnet2/hierarchy/hierarchical_clustering_io.py @@ -0,0 +1,120 @@ +def linkage(T): + """ + This function converts a tree from our current format to a linkage matrix Z + and leaf node list V. + """ + condensations = sorted([v for v in T if len(v)==2]) + increasing = condensations[0][0]Z[i+1][2]: + raise Warning("The rows in the linkage matrix are not monotonically nondecreasing by distance; crossings may occur in the dendrogram.") + break + + # Below, we have the following variables: + + # m: index for reordered leaf nodes + # n: number of leaf nodes + # queue: list of inner nodes, which we append and pop as we work from right to left and from top to bottom in the dendrogram. + # order: dictionary for the reordering of the leaf nodes + # transpose_order: transpose the key, value pairs in order + + m = 0 + n = len(V) + queue = [len(Z)-1] + order = {} + transpose_order = {} + + # Continue while inner nodes remain. + + while len(queue): + + # Consider the right-most inner node. + + i = queue.pop() + subqueue = [] + + # For the children of the inner node, reorder them if they are leaf nodes and append them if they are inner nodes. + + for j in reversed(list(range(2))): + if Z[i][j]chromosome and chromosome->ordered gene list mappings. 
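+    Chromosomes are identified by the 0-based line number on which they appear in the file
+    (see the note below).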
+ + Arguments: + gene_order_file -- path to file containing tab-separated lists of genes on each chromosme, + one chromosome per line + + Note that numeric chromosome identifier used is simply the line number for the chromosome in + the given file and does not indicate the true chromosome number. + + """ + chromo2genes = {} + gene2chromo = {} + + cid = 0 + with open(gene_order_file) as f: + for line in f: + genes = line.split() + chromo2genes[cid] = genes + gene2chromo.update((gene, cid) for gene in genes) + cid += 1 + + return gene2chromo, chromo2genes + +def load_gene_specific_bmrs(bmr_file): + """Load gene BMR information from a file and return as a dict mapping gene name to BMR for the gene. + + Arguments: + bmr_file -- path to TSV file with gene names in the first column and the background mutation rate + for the gene in the second column + + """ + with open(bmr_file) as f: + arrs = [l.split() for l in f] + return dict((arr[0], float(arr[1])) for arr in arrs) + +def load_samples(sample_file): + """Load sample IDs from a file and return as a set. + + Arguments: + sample_file -- path to TSV file containing sample IDs as the first column. Any other columns + will be ignored + + """ + with open(sample_file) as f: + return set(l.rstrip().split()[0] for l in f) + +def include(item, whitelist): + return item in whitelist if whitelist else True + +def load_snvs(snv_file, gene_wlst=None, sample_wlst=None): + """Load SNV data from a file and return as a list of Mutation tuples with mut_type == SNV. + + Arguments: + snv_file -- path to TSV file containing SNVs where the first column of each line is a sample ID + and subsequent columns contain the names of SNVs with mutations in that sample. + Lines starting with "#" will be ignored. + gene_wlist -- whitelist of allowed genes (default None). Genes not in this list will be ignored. + If None, all mutated genes will be included. + sample_wlist -- whitelist of allowed samples (default None). Samples not in this list will be + ignored. If None, all samples will be included. + + """ + with open(snv_file) as f: + arrs = [l.rstrip().split("\t") for l in f if not l.startswith("#")] + return [Mutation(arr[0], gene, SNV) for arr in arrs if include(arr[0], sample_wlst) + for gene in arr[1:] if include(gene, gene_wlst)] + +def load_inactivating_snvs(inactivating_snvs_file, gene_wlst=None, sample_wlst=None): + """Load inactivating SNVs from a file and return as a list of Mutation tuples with + mut_type == INACTIVE_SNV. + + Arguments: + inactivating_snvs_file -- path to TSV file listing inactivating SNVs where the first column of + each line is a gene name and the second column is a sample ID. + Lines starting with "#" will be ignored. + gene_wlist -- whitelist of allowed genes (default None). Genes not in this list will be ignored. + If None, all mutated genes will be included. + sample_wlist -- whitelist of allowed samples (default None). Samples not in this list will be + ignored. If None, all samples will be included. + + """ + with open(inactivating_snvs_file) as f: + arrs = [line.split() for line in f if not line.startswith("#")] + return [Mutation(arr[1], arr[0], INACTIVE_SNV) + for arr in arrs if include(arr[1], sample_wlst) and include(arr[0], gene_wlst)] + +def load_cnas(cna_file, gene_wlst=None, sample_wlst=None): + """Load CNA data from a file and return as a list of Mutation tuples with mut_type == AMP or DEL. 
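+    Each entry of the form GENE(A) or GENE(D) is parsed into a Mutation whose mut_type is AMP
+    or DEL, respectively.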
+
+    Arguments:
+    cna_file -- path to TSV file containing CNAs where the first column of each line is a sample ID
+                and subsequent columns contain gene names followed by "(A)" or "(D)" indicating an
+                amplification or deletion in that gene for the sample. Lines starting with '#'
+                will be ignored.
+    gene_wlst -- whitelist of allowed genes (default None). Genes not in this list will be ignored.
+                 If None, all mutated genes will be included.
+    sample_wlst -- whitelist of allowed samples (default None). Samples not in this list will be
+                   ignored. If None, all samples will be included.
+
+    """
+    with open(cna_file) as f:
+        arrs = [l.rstrip().split("\t") for l in f if not l.startswith("#")]
+        return [Mutation(arr[0], cna.split("(")[0], get_mut_type(cna))
+                for arr in arrs if include(arr[0], sample_wlst)
+                for cna in arr[1:] if include(cna.split("(")[0], gene_wlst)]
+
+def load_fusions(fusion_file, gene_wlst=None, sample_wlst=None):
+    """Load fusion information from a file and return as a list of Fusion objects.
+
+    Arguments:
+    fusion_file -- path to TSV file containing a sample ID in the first column and gene names in
+                   the second and third columns of each line. Lines starting with "#" will be ignored.
+    gene_wlst -- whitelist of allowed genes (default None). Genes not in this list will be ignored.
+                 If None, all mutated genes will be included. If only one gene of a fusion is in
+                 the allowed whitelist, an exception is raised.
+    sample_wlst -- whitelist of allowed samples (default None). Samples not in this list will be
+                   ignored. If None, all samples will be included.
+    """
+    with open(fusion_file) as f:
+        arrs = [line.split() for line in f if not line.startswith("#")]
+        # Only the sample ID and the two gene names are needed for the whitelist check.
+        return [Fusion(arr[0], (arr[1], arr[2])) for arr in arrs
+                if include_fusion(sample_wlst, gene_wlst, *arr[:3])]
+
+def include_fusion(sample_wlst, gene_wlst, sample, gene1, gene2):
+    if sample_wlst and sample not in sample_wlst: return False
+    if not gene_wlst: return True
+    if gene1 not in gene_wlst and gene2 not in gene_wlst: return False
+    elif (gene1 in gene_wlst and gene2 not in gene_wlst) or \
+         (gene2 in gene_wlst and gene1 not in gene_wlst):
+        raise ValueError('Genes %s and %s are in a fusion, but one is disallowed by the gene '\
+                         'whitelist' % (gene1, gene2))
+    return True
+
+def load_sample_types(type_file):
+    """Load sample type information from a file and return as a dict mapping sample ID to type string.
+
+    Arguments:
+    type_file -- Path to tab-separated file listing sample types where the first column of each
+                 line is a sample ID and the second column is a type.
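+                 Lines are split on whitespace, so sample IDs and type names must not contain spaces.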
+ """ + with open(type_file) as f: + arrs = [line.split() for line in f] + return dict((arr[0], arr[1]) for arr in arrs) + +def get_mut_type(cna): + if cna.endswith("(A)"): return AMP + elif cna.endswith("(D)"): return DEL + else: raise ValueError("Unknown CNA type in '%s'", cna) + +def load_oncodrive_data(fm_scores, cis_amp_scores, cis_del_scores): + print("* Loading oncodrive data...") + # Create defaultdicts to hold the fm and cis scores + one = lambda: 1 + gene2fm = defaultdict(one) + gene2cis_amp, gene2cis_del = defaultdict(one), defaultdict(one) + + # Load fm scores (pvals, not z-scores) + with open(fm_scores) as f: + arrs = [l.rstrip().split("\t") for l in f if not l.startswith("#")] + gene2fm.update((arr[1], float(arr[2])) for arr in arrs + if arr[2] != "" and arr[2] != "-0" and arr[2] != "-") + print("\tFM genes:", len(list(gene2fm.keys()))) + + # Load amplifications + with open(cis_amp_scores) as f: + arrs = [l.rstrip().split("\t") for l in f if not l.startswith("#")] + gene2cis_amp.update((arr[0], float(arr[-1])) for arr in arrs) + print("\tCIS AMP genes:", len(list(gene2cis_amp.keys()))) + + # Load deletions + with open(cis_del_scores) as f: + arrs = [l.rstrip().split("\t") for l in f if not l.startswith("#")] + gene2cis_del.update((arr[0], float(arr[-1])) for arr in arrs) + print("\tCIS DEL genes:", len(list(gene2cis_del.keys()))) + + # Merge data + genes = set(gene2cis_del.keys()) | set(gene2cis_amp.keys()) | set(gene2fm.keys()) + print("\t- No. genes:", len(genes)) + gene2heat = dict() + for g in genes: + gene2heat[g] = {"del": gene2cis_del[g], "amp": gene2cis_amp[g], + "fm": gene2fm[g] } + + return gene2heat + +def load_mutsig_scores(scores_file): + with open(scores_file) as f: + arrs = [l.rstrip().split("\t") for l in f if not l.startswith("#")] + print("* Loading MutSig scores in", len(arrs), "genes...") + return dict((arr[0], {"pval": float(arr[-2]), "qval": float(arr[-1])}) + for arr in arrs) + + +FDR_CT, FDR_LRT, FDR_FCPT = 12, 11, 10 +music_score2name = {FDR_CT: "FDR_CT", FDR_LRT: "FDR_LRT", FDR_FCPT: "FDR_FCPT"} +def load_music_scores(scores_file): + print("* Loading MuSiC scores using the median of the 3 q-values...") + + with open(scores_file) as f: + # Load file and tab-split lines + arrs = [l.rstrip().split("\t") for l in f if not l.startswith("#")] + + # Indices for the columns we may be interested in + gene2music = dict((arr[0], {"FDR_CT": float(arr[FDR_CT]), + "FDR_FCPT": float(arr[FDR_FCPT]), + "FDR_LRT":float(arr[FDR_LRT])}) + for arr in arrs) + + # Output parsing info + print("\t- Loaded %s genes." % len(gene2music)) + return gene2music + +################################################################################ +# Data saving functions + +def write_components_as_tsv(output_file, ccs): + """Save connected components to file where each line represents a connected component and genes + within each CC are delimited by tabs. + + Arguments: + output_file -- path to which the output file should be written + ccs -- list of lists of gene names representing connected components + + """ + with open(output_file, 'w') as out_f: + for cc in ccs: + out_f.write('\t'.join(cc) + '\n') + +def write_significance_as_tsv(output_file, sizes2stats): + """Save significance information to tab-separated file. 
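+    The output contains a header line followed by one row per component size, with columns
+    Size, Expected, Actual, and p-value.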
+ + Arguments: + output_file -- path to which the output file should be written + sizes2stats -- dict mapping a CC size to a dict with the expected number of CCs of at least + that size based on permuted data, the observed number of CCs of at least that + size in the real data, and the p-value for the observed number + + """ + with open(output_file, 'w') as out_f: + out_f.write("Size\tExpected\tActual\tp-value\n") + for size, stats in sizes2stats.items(): + out_f.write("%s\t%s\t%s\t%s\n" % (size, stats["expected"], stats["observed"], stats["pval"])) + +def write_gene_list(output_file, genelist): + """Save a list of genes to a file, one gene per line. + + Arguments: + output_file -- path to which the output file should be written + genelist -- iterable of genes that should be included in the output file + + """ + with open(output_file, 'w') as out_f: + for gene in genelist: + out_f.write(gene+'\n') + +################################################################################ +# General data loading and saving functions + +def load_file(file_path): + with open(file_path) as f: + return f.read() + +def write_file(file_path, text): + with open(file_path, 'w') as f: + f.write(text) + + +def load_network(file_path, infmat_name): + """ + Load an influence matrix from the file path, using the file path extension + to figure out how to load the file. + """ + H = load_hdf5(file_path) + PPR = np.asarray(H[infmat_name]) + indexToGene = dict( list(zip(list(range(np.shape(PPR)[0])), H['nodes'])) ) + G = nx.Graph() + G.add_edges_from(H['edges']) + return PPR, indexToGene, G, H['network_name'] + +def load_hdf5(file_path, keys=None): + """ + Load a dictionary from an HDF5 file + + file_path: + HDF5 filename + keys (optional): + if given, return a dictionary with only the provided keys (provided that + they are also keys in f); otherwise, return a dictionary with every key + in f + dictionary: + dictionary loaded from the HDF5 file + """ + f = h5py.File(file_path, 'r') + if keys: + dictionary = {key:f[key].value for key in keys if key in f} + else: + dictionary = {key:f[key].value for key in f} + f.close() + # ['nodes', 'network_name', 'beta', 'PPR', 'edges'] + # dictionary = convert_back(dictionary) + dictionary['nodes'] = [x.decode('utf8') for x in dictionary['nodes']] + dictionary['network_name'] = dictionary['network_name'].decode('utf8') + dictionary['edges'] = dictionary['edges'].astype(str) + return dictionary + + +def convert_back(item): + if type(item) == bytes: + return item.decode("utf8") + elif type(item) == list: + return [convert_back(x) for x in item] + elif type(item) == tuple: + return tuple([convert_back(x) for x in item]) + elif type(item) == dict: + return {convert_back(k): convert_back(v) for k, v in item.items()} + elif type(item) == np.ndarray: + return item.astype('str') + elif type(item) == float or type(item) == int or type(item) == str: + return item + elif type(item) == np.float64 or type(item) == np.ndarray: + return item + else: + raise Exception("Unknown type: {}".format(type(item))) + + +def save_hdf5(file_path, dictionary, compression=False): + """ + Save or append a dictionary to an HDF5 file + + file_path: + HDF5 filename + dictionary: + dictionary to save to the HDF5 file; if any of the keys in dictionary + are already keys in f, then the corresponding values in dictionary + overwrite the existing values in f + compression (optional): + if given as True, compress values of the dictionary; otherwise, do not + compress the values + """ + f = h5py.File(file_path, 'a') 
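+    # Delete any existing dataset that shares a key with the dictionary, then write each value,
+    # optionally with gzip compression.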
+    for key in dictionary:
+        if key in f:
+            del f[key]
+        if compression:
+            f.create_dataset(key, data=dictionary[key], compression='gzip')
+        else:
+            f[key] = dictionary[key]
+    f.close()
+
+# Wrapper for loading a heat file, automatically detecting if it's JSON or TSV
+def load_heat_file(heat_file, json_heat):
+    mutations = [], [], dict() # default (blank) mutation data
+    if json_heat:
+        with open(heat_file, 'r') as IN:
+            obj = json.load(IN)
+        heat_params = obj['parameters']
+        heat_name = heat_params['name']
+        heat = obj['heat']
+        heat_fn = heat_params['heat_fn'] if 'heat_fn' in heat_params else None
+
+        # Load the mutation data if the heat scores were computed from mutations;
+        # otherwise, keep the blank mutation data defined above
+        if heat_fn == 'load_mutation_heat':
+            samples = load_samples(heat_params['sample_file']) if heat_params['sample_file'] else None
+            genes = load_genes(heat_params['gene_file']) if heat_params['gene_file'] else None
+            snvs = load_snvs(heat_params['snv_file'], genes, samples) if heat_params['snv_file'] else []
+            cnas = load_cnas(heat_params['cna_file'], genes, samples) if heat_params['cna_file'] else []
+
+            if heat_params.get('sample_type_file'):
+                with open(heat_params['sample_type_file']) as f:
+                    sampleToType = dict(l.rstrip().split() for l in f if not l.startswith("#") )
+            else:
+                if not samples:
+                    samples = set( m.sample for m in snvs ) | set( m.sample for m in cnas )
+                sampleToType = dict( (s, "Cancer") for s in samples )
+
+            mutations = snvs, cnas, sampleToType
+    else:
+        heat = load_heat_tsv(heat_file)
+        heat_name = heat_file.split('/')[-1].split('.')[0]
+
+    return [heat, heat_name, mutations]
+
+###############################################################################
+# Main output functions
+###############################################################################
+
+# Create the output directory if it does not already exist
+def setup_output_dir(output_dir):
+    if not os.path.exists(output_dir): os.makedirs(output_dir)
+
+# Output a single run of HotNet2
+def output_hotnet2_run(result, params, network_name, heat, heat_name, heat_file, using_json_heat, output_dir):
+    for ccs, sizes2stats, delta in result:
+        # create output directory
+        delta_out_dir = os.path.abspath(output_dir + "/delta_" + str(delta))
+        if not os.path.isdir(delta_out_dir): os.mkdir(delta_out_dir)
+
+        # Output a heat JSON file if JSON heat wasn't included
+        if not using_json_heat:
+            heat_dict = {"heat": heat, "parameters": {"heat_file": heat_file}}
+            heat_out = open(os.path.abspath(delta_out_dir) + "/" + HEAT_JSON, 'w')
+            json.dump(heat_dict, heat_out, indent=4)
+            heat_out.close()
+            heat_file = os.path.abspath(delta_out_dir) + "/" + HEAT_JSON
+
+        write_significance_as_tsv(delta_out_dir + "/" + SIGNIFICANCE_TSV, sizes2stats)
+
+        params['heat_name'] = heat_name
+        params['network_name'] = network_name
+        params['delta'] = delta
+        output_dict = {"parameters": params, "sizes": component_sizes(ccs),
+                       "components": ccs, "statistics": sizes2stats}
+        json_out = open(delta_out_dir + "/" + JSON_OUTPUT, 'w')
+        json.dump(output_dict, json_out, indent=4)
+        json_out.close()
+
+        write_components_as_tsv(os.path.abspath(delta_out_dir) + "/" + COMPONENTS_TSV, ccs)
+
+# Output a consensus HotNet2 run
+def output_consensus(consensus, linkers, auto_deltas, consensus_stats, params, output_dir):
+    output_dir = '{}/consensus/'.format(output_dir)
+    setup_output_dir(output_dir)
+
+    # Output to JSON
+    with open(output_dir + '/subnetworks.json', 'w') as OUT:
+        # Convert the consensus to lists
+        output = dict(params=params,
deltas=auto_deltas, stats=consensus_stats, + linkers=list(linkers), consensus=consensus) + json.dump(output, OUT, sort_keys=True, indent=4) + + # Output to TSV + with open(output_dir + '/subnetworks.tsv', 'w') as OUT: + OUT.write('#Linkers: {}\n'.format(', '.join(sorted(linkers)))) + OUT.write('#Core\tExpansion\n') + OUT.write('\n'.join([ '{}\t{}'.format(' '.join(sorted(d['core'])), ' '.join(sorted(d['expansion']))) for d in consensus ]) ) + + with open(output_dir + 'stats.tsv', 'w') as OUT: + rows = [ (k, d['observed'], d['expected'], d['pval']) for k, d in sorted(consensus_stats.items()) ] + OUT.write('#Size\tObserved\tExpected\tP-value\n') + OUT.write('\n'.join([ '\t'.join(map(str, r)) for r in rows ]) ) diff --git a/pypathway/hotnet/hotnet2/hotnet2.py b/pypathway/hotnet/hotnet2/hotnet2.py new file mode 100755 index 0000000..4d69d16 --- /dev/null +++ b/pypathway/hotnet/hotnet2/hotnet2.py @@ -0,0 +1,116 @@ +# -*- coding: iso-8859-1 -*- +from collections import defaultdict +import networkx as nx, numpy as np, scipy as sp + +try: + from . import c_routines + fast_similarity_matrix = True +except ImportError: + print("WARNING: Could not import either C module; " + "falling back to NumPy for similarity matrix creation.") + import traceback + print(traceback.format_exc()) + fast_similarity_matrix = False + +################################################################################ +# Influence and similarity matrix functions + +def similarity_matrix(infmat, index2gene, gene2heat, directed=True, verbose=0): + """Create and return a similarity matrix and index to gene mapping for the given influence + matrix and heat. Only genes with heat that are in the network will be included in the returned + similarity matrix and index to gene mapping. + + Arguments: + infmat -- 2D ndarray representing the full influence matrix + index2gene -- dict mapping an index in the matrix to the name of the gene represented at that + index in the influence matrix + gene2heat -- dict mapping a gene name to the heat score for that gene + directed -- if True, sim[i][j] = inf(i,j)*heat[i] and sim[i][j] != sim[j][i] + if False, sim[i][j] = min(inf(i,j), inf(j,i))*max(heat(i), heat(j)) + + """ + start_index = min(index2gene.keys()) + gene2index = dict((gene, index) for index, gene in index2gene.items()) + + # Identify genes in the given list that are also in the network + genelist = sorted(set(gene2heat.keys()).intersection(list(gene2index.keys()))) + index2gene = dict(enumerate(genelist)) + if verbose > 4: + print("\t- Genes in similarity matrix:", len(genelist)) + + infmat = np.asarray(infmat, dtype=np.float64) + h = np.array([gene2heat[g] for g in genelist], dtype=np.float64) + indices = np.array([gene2index[g]-start_index for g in genelist], dtype=np.int) + m = np.shape(infmat)[0] + n = np.shape(h)[0] + + if fast_similarity_matrix: + if directed: + sim = c_routines.compute_sim(infmat, h, indices, m, n) + else: + sim = c_routines.compute_sim_classic(infmat, h, indices, m, n) + else: + M = infmat[np.ix_(indices, indices)] + if directed: + sim = M * h + else: + M = np.minimum(M, M.transpose()) # Ensure that the influence matrix is symmetric + sim = np.empty_like(M) + for i in range(n): + for j in range(i, n): + sim[i, j] = max(h[i], h[j]) * M[i, j] + sim[j, i] = sim[i, j] + + return sim, index2gene + +################################################################################ +# Weighted graph functions + +def weighted_graph(sim_mat, index2gene, delta, directed=True): + """Construct and return weighted graph 
in which nodes are labeled with gene names and edges + between nodes have weight equal to the similarity score of the two genes. + + Arguments: + sim_mat -- similarity matrix obtained from similarity_matrix + index2gene -- dict mapping an index in the matrix to the name of the gene represented at that + index in the similarity matrix + delta -- weight threshold for edge inclusion. Only gene pairs with similarity >= delta will + have edges connecting their corresponding nodes in the returned graph. + directed -- whether or not the resulting graph should be directed. If true, a networkx Digraph + instance is returned. If false, a networkx Graph instance is returned. + + """ + e = list(zip( *sp.where(sim_mat >= delta))) + edges = [(int(j), int(i), dict(weight=sim_mat[i,j])) for i, j in e] + G = nx.DiGraph() if directed else nx.Graph() + G.add_edges_from([(index2gene[i], index2gene[j], d) for i, j, d in edges]) + return G + +def connected_components(G, min_size=1): + """Find connected components in the given graph and return as a list of lists of gene names. + + If the graph contains no connected components of size at least min_size, an empty list is returned. + + Arguments: + G -- weighted graph in which connected components should be found + min_size -- minimum size for connected components included in the returned component list + + """ + ccs = nx.strongly_connected_components(G) if isinstance(G, nx.DiGraph) else nx.connected_components(G) + ccs = [cc for cc in ccs if len(cc) >= min_size] + return ccs + +def component_sizes(ccs): + """Return dict mapping a CC size to the number of connected components of that size. + + Only sizes for which there is at least one connected component of that size will have an entry + in the returned dict. If the given component list is empty, an empty dict is returned. + + Arguments: + ccs -- list of lists of gene names representing connected components + + """ + size_dict = defaultdict(int) + for cc in ccs: + size_dict[len(cc)] += 1 + return size_dict diff --git a/pypathway/hotnet/hotnet2/permutations.py b/pypathway/hotnet/hotnet2/permutations.py new file mode 100755 index 0000000..07f239d --- /dev/null +++ b/pypathway/hotnet/hotnet2/permutations.py @@ -0,0 +1,223 @@ +from collections import defaultdict, namedtuple +import multiprocessing as mp +import random +from .constants import Mutation, SNV +from . import heat +from . import hnio + +################################################################################ +# Heat permutation + +def heat_permutation_wrapper(xxx_todo_changeme): + (heat_scores, eligible_genes) = xxx_todo_changeme + permuted_genes = list(eligible_genes) + random.shuffle(permuted_genes) + permuted_genes = permuted_genes[:len(heat_scores)] + + permuted_heat = dict((gene, heat) for gene, heat in zip(permuted_genes, heat_scores)) + + return permuted_heat + +def permute_heat(heat, network_genes, num_permutations, addtl_genes=None, num_cores=1): + """Return a list of num_permutation dicts, each mapping gene names to permuted heat scores. + + Arguments: + heat -- dict mapping a gene name to the heat score for that gene + network_genes -- iterable of names of genes in the network + num_permutations -- number of heat permutations to produce + addtl_genes -- iterable of names of genes that do not have heat scores in the real data but + which may have heat scores assigned in permutations. Defaults to None. 
+ num_cores -- number of cores to use for running in parallel + + """ + if num_cores != 1: + pool = mp.Pool(None if num_cores == -1 else num_cores) + map_fn = pool.map + else: + map_fn = map + + heat_scores = list(heat.values()) + if not addtl_genes: addtl_genes = set() + genes_eligible_for_heat = set(heat.keys()).union(addtl_genes).intersection(network_genes) + + args = [(heat_scores, genes_eligible_for_heat)] * num_permutations + permutations = map_fn(heat_permutation_wrapper, args) + + if num_cores != 1: + pool.close() + pool.join() + + return permutations + +################################################################################ +# Mutation permutation + +def mutation_permuation_heat_wrapper(xxx_todo_changeme1): + (samples, genes, cnas, gene2length, bmr, gene2bmr, gene2chromo, + chromo2genes, cna_filter_threshold, min_freq) = xxx_todo_changeme1 + permuted_snvs = permute_snvs(samples, genes, gene2length, bmr, gene2bmr) + permuted_cnas = permute_cnas(cnas, gene2chromo, chromo2genes) + if cna_filter_threshold: + permuted_cnas = heat.filter_cnas(permuted_cnas, cna_filter_threshold) + + return heat.mut_heat(genes, len(samples), permuted_snvs, permuted_cnas, min_freq) + + +def generate_mutation_permutation_heat(heat_fn, sample_file, gene_file, genes_in_network, snv_file, + gene_length_file, bmr, bmr_file, cna_file, gene_order_file, + cna_filter_threshold, min_freq, num_permutations, num_cores=1): + """Return a list of num_permutation dicts, each mapping gene names to heat scores calculated + from permuted mutation data. + + Arguments: + heat_fn -- the name of the function used to calculate heat scores for the real data. + A RuntimeError is raised if this is not "load_mutation_heat" + sample_file -- path to TSV file containing tested sample IDs as the first column. If None, the + set of samples is assumed to be all samples that are provided in the SNV or CNA data. + gene_file -- path to file containing names of tested genes, one per line. If None, the set of + tested genes is assumed to be all genes that have mutations in either the SNV or + CNA data. + genes_in_network -- iterable of names of genes in the PPI network + snv_file -- path to TSV file containing SNVs where the first column of each line is a sample ID + and subsequent columns contain the names of SNVs with mutations in that sample. + gene_length_file -- path to TSV file containing gene names in the first column and the length + of the gene in base pairs in the second column + bmr -- default background mutation rate + bmr_file -- path to TSV file with gene names in the first column and the background mutation rate + for the gene in the second column. The default BMR will be used for any gene without + a BMR in this file. If None, the default BMR will be used for all genes. + cna_file -- path to TSV file containing CNAs where the first column of each line is a sample ID + and subsequent columns contain gene names followed by "(A)" or "(D)" indicating an + ammplification or deletion in that gene for the sample. Lines starting with '#' + will be ignored. + gene_order_file -- path to file containing tab-separated lists of genes on each chromosme, + one chromosome per line + cna_filter_threshold -- proportion of CNAs in a gene across samples that must share the same + CNA type in order for the CNAs to be included. Must be > .5 if not None. + If None, all CNAs will be included. + min_freq -- the minimum number of samples in which a gene must have an SNV to be considered + mutated in the heat score calculation. 
+ num_permutations -- the number of permuted data sets to be generated + num_cores -- number of cores to use for running in parallel + + """ + if heat_fn != "load_mutation_heat": + raise RuntimeError("Heat scores must be based on mutation data to perform\ + delta selection based on mutation data permutation.") + + samples = hnio.load_samples(sample_file) if sample_file else None + genes = hnio.load_genes(gene_file) if gene_file else None + cnas = hnio.load_cnas(cna_file, genes, samples) + gene2length = hnio.load_gene_lengths(gene_length_file) + gene2bmr = hnio.load_gene_specific_bmrs(bmr_file) if bmr_file else {} + gene2chromo, chromo2genes = hnio.load_gene_order(gene_order_file) + + if not samples: + snvs = hnio.load_snvs(snv_file, genes, samples) + samples = set([snv.sample for snv in snvs] + [cna.sample for cna in cnas]) + if not genes: + genes = set([snv.gene for snv in snvs] + [cna.gene for cna in cnas]) + + #only generate mutations for genes that are in the network + genes = set(genes).intersection(genes_in_network) + + if num_cores != 1: + pool = mp.Pool(None if num_cores == -1 else num_cores) + map_fn = pool.map + else: + map_fn = map + + args = [(samples, genes, cnas, gene2length, bmr, gene2bmr, gene2chromo,chromo2genes, + cna_filter_threshold, min_freq)] * num_permutations + heat_permutations = map_fn(mutation_permuation_heat_wrapper, args) + + if num_cores != 1: + pool.close() + pool.join() + + return heat_permutations + +def permute_snvs(samples, tested_genes, gene2length, bmr, gene2bmr): + permuted_snvs = [] + for sample in samples: + for gene in tested_genes: + gene_bmr = gene2bmr[gene] if gene in gene2bmr else bmr + gene_length = gene2length[gene] + prob = 1 - pow(1 - gene_bmr, gene_length) + if random.random() <= prob: + permuted_snvs.append(Mutation(sample, gene, SNV)) + + return permuted_snvs + +Block = namedtuple("Block", ["chromosome", "start_index", "mut_type", "genes"]) + +def permute_cnas(cnas, gene2chromo, chromo2genes): + samples2cnas = defaultdict(list) + for cna in cnas: + samples2cnas[cna.sample].append(cna) + + permuted_cnas = [] + for sample in samples2cnas: + chromo2blocks = get_cna_blocks_for_sample(samples2cnas[sample], gene2chromo, chromo2genes) + for chromo, blocks in chromo2blocks.items(): + genes = chromo2genes[chromo] + invalid_indices = [] + for block in blocks: + permuted_indices = get_block_indices(len(genes), len(block.genes), invalid_indices) + for index in permuted_indices: + permuted_cnas.append(Mutation(sample, genes[index], block.mut_type)) + + new_invalid_indices = permuted_indices +\ + [min(permuted_indices) - 1, max(permuted_indices) + 1] + invalid_indices.extend(new_invalid_indices) + + return permuted_cnas + +def get_block_indices(chromo_length, block_length, invalid_indices, max_attempts=10): + start = random.randint(0, chromo_length - block_length) + attempts = 1 + while not is_block_valid(start, block_length, invalid_indices): + if attempts == max_attempts: + raise RuntimeError("Cannot place CNA block after %s attempts" % (max_attempts)) + start = random.randint(0, chromo_length - block_length) + attempts += 1 + + return list(range(start, start + block_length)) + + +def is_block_valid(start, block_length, invalid_indices): + for i in range(start, start + block_length): + if i in invalid_indices: + return False + return True + +def get_cna_blocks_for_sample(cnas, gene2chromo, chromo2genes): + #sort cnas by chromosome, then by position in chromosome + cnas.sort(key = lambda cna: (gene2chromo[cna.gene], + 
chromo2genes[gene2chromo[cna.gene]].index(cna.gene))) + + chromo2blocks = defaultdict(list) + curr_chromo = gene2chromo[cnas[0].gene] + curr_start_index = chromo2genes[curr_chromo].index(cnas[0].gene) + curr_mut_type = cnas[0].mut_type + curr_genes = [cnas[0].gene] + for cna in cnas[1:]: + cna_index = chromo2genes[gene2chromo[cna.gene]].index(cna.gene) + if (gene2chromo[cna.gene] == curr_chromo and + cna_index == curr_start_index + len(curr_genes) and + cna.mut_type == curr_mut_type): + curr_genes.append(cna.gene) + else: + #store completed block + chromo2blocks[curr_chromo].append(Block(curr_chromo, curr_start_index, + curr_mut_type, curr_genes)) + + #start new block + curr_chromo = gene2chromo[cna.gene] + curr_start_index = cna_index + curr_mut_type = cna.mut_type + curr_genes = [cna.gene] + chromo2blocks[curr_chromo].append(Block(curr_chromo, curr_start_index, + curr_mut_type, curr_genes)) + + return chromo2blocks diff --git a/pypathway/hotnet/hotnet2/run.py b/pypathway/hotnet/hotnet2/run.py new file mode 100755 index 0000000..f4703a0 --- /dev/null +++ b/pypathway/hotnet/hotnet2/run.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +# Load required modules +import json +import sys, os, shutil +import numpy as np + +# Load local modules +from . import heat as hnheat, hotnet2 as hn, hnio, stats, permutations as p +from .delta import get_deltas_for_network, get_deltas_for_heat +from .constants import * + +def run_helper(args, infmat, full_index2gene, G, nname, pnp, heat, hname, addtl_genes, get_deltas_fn, infmat_name="PPR", max_cc_sizes=[5, 10, 15, 20], verbose=0): + """Helper shared by runHotNet2 and runClassicHotNet. + """ + # Perform delta selection (if necessary) + if args.deltas: + deltas = args.deltas + else: + deltas = get_deltas_fn(full_index2gene, heat, args.network_permutations, args.num_cores, infmat, addtl_genes, pnp, infmat_name, max_cc_sizes, verbose) + + sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True, verbose=verbose) + + results = [] + for delta in deltas: + + # find connected components + G = hn.weighted_graph(sim, index2gene, delta, directed=True) + ccs = hn.connected_components(G, args.min_cc_size) + + # calculate significance (using all genes with heat scores) + if verbose > 4: + print("* Performing permuted heat statistical significance...") + # print("\t- Using no. 
of components >= k (k \\in", end=' ') + print("[%s, %s]) as statistic" % (min(HN2_STATS_SIZES), max(HN2_STATS_SIZES))) + + heat_permutations = p.permute_heat(heat, list(full_index2gene.values()), + args.heat_permutations, addtl_genes, + args.num_cores) + sizes2counts = stats.calculate_permuted_cc_counts(infmat, full_index2gene, + heat_permutations, delta, HN2_STATS_SIZES, True, + args.num_cores) + real_counts = stats.num_components_min_size(G, HN2_STATS_SIZES) + size2real_counts = dict(list(zip(HN2_STATS_SIZES, real_counts))) + sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts, + args.heat_permutations) + + # sort ccs list such that genes within components are sorted alphanumerically, and components + # are sorted first by length, then alphanumerically by name of the first gene in the component + ccs = [sorted(cc) for cc in ccs] + ccs.sort(key=lambda comp: comp[0]) + ccs.sort(key=len, reverse=True) + + # Record the results for this delta + results.append( (ccs, sizes2stats, delta) ) + + return results + +def get_deltas_hotnet2(full_index2gene, heat, num_perms, num_cores, _infmat, _addtl_genes, + permuted_networks_path, infmat_name, max_cc_sizes, verbose): + # find smallest delta + deltas = get_deltas_for_network(permuted_networks_path, heat, infmat_name, full_index2gene, + MAX_CC_SIZE, max_cc_sizes, False, num_perms, num_cores, verbose) + + # and run HotNet with the median delta for each size + return [np.median(deltas[size]) for size in deltas] + +def get_deltas_classic(full_index2gene, heat, num_perms, num_cores, infmat, addtl_genes, min_cc_size, max_cc_size, verbose): + # find delta that maximizes # CCs of size >= min_cc_size for each permuted data set + deltas = get_deltas_for_heat(infmat, full_index2gene, heat, addtl_genes, num_perms, NUM_CCS, + [min_cc_size], True, num_cores, verbose) + + # find the multiple of the median delta s.t. 
the size of the largest CC in the real data + # is <= MAX_CC_SIZE + medianDelta = np.median(deltas[min_cc_size]) + + sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, False) + + for i in range(1, 11): + G = hn.weighted_graph(sim, index2gene, i*medianDelta) + largest_cc_size = max([len(cc) for cc in hn.connected_components(G)]) + if largest_cc_size <= max_cc_size: + break + + # and run HotNet with that multiple and the next 4 multiples + return [i*medianDelta for i in range(i, i+5)] diff --git a/pypathway/hotnet/hotnet2/src/c/c_routines.c b/pypathway/hotnet/hotnet2/src/c/c_routines.c new file mode 100755 index 0000000..0f875d6 --- /dev/null +++ b/pypathway/hotnet/hotnet2/src/c/c_routines.c @@ -0,0 +1,30 @@ +#include + +#define index_in(a, i, j) a[i*p + j] +#define index_out(a, i, j) a[i*q + j] + +void compute_sim(double *infmat, double *h, long *indices, long p, long q, double *M) +{ + int i, j; + for (i = 0; i < q; ++i) + { + for (j = 0; j < q; ++j) + { + index_out(M, i, j) = index_in(infmat, indices[i], indices[j]) * h[j]; + } + } +} + +void compute_sim_classic(double *infmat, double *h, long *indices, long p, long q, double *M) +{ + int i, j; + for (i = 0; i < q; ++i) + { + for (j = 0; j < q; ++j) + { + index_out(M, i, j) = index_out(M, j, i) = fmin(index_in(infmat, indices[i], indices[j]), + index_in(infmat, indices[j], indices[i])) + * fmax(h[i], h[j]); + } + } +} diff --git a/pypathway/hotnet/hotnet2/src/c/c_routines.pyf b/pypathway/hotnet/hotnet2/src/c/c_routines.pyf new file mode 100755 index 0000000..813e7e7 --- /dev/null +++ b/pypathway/hotnet/hotnet2/src/c/c_routines.pyf @@ -0,0 +1,24 @@ +python module c_routines +interface + subroutine compute_sim(infmat, h, indices, p, q, M) + intent(c) compute_sim ! foo is a C function + intent(c) ! all foo arguments are + ! considered as C based + integer, intent(in) :: p, q + integer*8, intent(in) :: indices(q) + double precision, intent(in) :: infmat(p,p), h(q) + double precision, intent(out) :: M(q,q) + + end subroutine compute_sim + subroutine compute_sim_classic(infmat, h, indices, p, q, M) + intent(c) compute_sim_classic ! foo is a C function + intent(c) ! all foo arguments are + ! considered as C based + integer, intent(in) :: p, q + integer*8, intent(in) :: indices(q) + double precision, intent(in) :: infmat(p,p), h(q) + double precision, intent(out) :: M(q,q) + + end subroutine compute_sim_classic +end interface +end python module m diff --git a/pypathway/hotnet/hotnet2/src/fortran/fortran_routines.f95 b/pypathway/hotnet/hotnet2/src/fortran/fortran_routines.f95 new file mode 100755 index 0000000..646a359 --- /dev/null +++ b/pypathway/hotnet/hotnet2/src/fortran/fortran_routines.f95 @@ -0,0 +1,316 @@ +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! +! Instructions +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! +! To compile for use with Python/F2PY, run either the included setup.py script +! or the following command: +! +! f2py -c fortran_routines.f95 -m fortran_routines +! +! Both approaches, which are generally equivalent, require a Fortran compiler +! and NumPy, which contains F2PY. +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! +! Routines +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +! +! 
compute_sim(M,infmat,h,indices,p,q) +! +! condenses the similarity matrix given the influence matrix, heat scores, +! and indices indicating for which genes the influence matrix and heat scores +! overlap; for HotNet2 +! + +subroutine compute_sim(M,infmat,h,indices,p,q) + + implicit none + + integer, intent(in) :: p, q + integer, intent(in) :: indices(q) + double precision, intent(in) :: infmat(p,p), h(q) + double precision, intent(out) :: M(q,q) + + integer :: i, j + + do j=1,q + do i=1,q + M(i,j) = infmat(indices(i),indices(j))*h(j) + end do + end do + +end subroutine compute_sim + +! +! compute_sim_classic(M,infmat,h,indices,p,q) +! +! condenses the similarity matrix given the influence matrix, heat scores, +! and indices indicating for which genes the influence matrix and heat scores +! overlap; for HotNet +! + +subroutine compute_sim_classic(M,infmat,h,indices,p,q) + + implicit none + + integer, intent(in) :: p, q + integer, intent(in) :: indices(q) + double precision, intent(in) :: infmat(p,p), h(q) + double precision, intent(out) :: M(q,q) + + integer :: i, j + + do j=1,q + do i=j,q + M(i,j) = min(infmat(indices(i),indices(j)), infmat(indices(j),indices(i)))*max(h(i), h(j)) + M(j,i) = M(i,j) + end do + end do + +end subroutine compute_sim_classic + +! +! condense_graph(B,A,V,k,m,n) +! +! condenses the graph given by the weighted adjacency matrix A into a graph +! given by the weighted adjacency matrix B by contracting the vertices in V +! + +subroutine condense_graph(B,A,V,k,m,n) + + implicit none + + integer, intent(in) :: m, n + integer, intent(in) :: V(m), k(n+1) + double precision, intent(in) :: A(m,m) + + double precision, intent(out) :: B(n,n) + + integer :: i, j + + ! m: number of vertices; dimension of A + ! n: number of SCCs; dimension of B + ! V: vertices of the components; component(k(i):k(i+1)-1) are the vertices + ! in component i + ! k: vertex indices of the components; k(i) is the index of the first + ! vertex in component i + ! A: weighted adjacency matrix + ! B: condensed weighted adjacency matrix + + do i=1,n + do j=1,n + if (i/=j) then + B(i,j) = minimum_slice(V(k(i):k(i+1)-1),V(k(j):k(j+1)-1),k(i+1)-k(i),k(j+1)-k(j)) + else + B(i,j) = 0.d0 + end if + end do + end do + +contains + + function minimum_slice(columns,rows,p,q) + + integer, intent(in) :: p, q + integer, intent(in) :: columns(p), rows(q) + + double precision :: minimum_slice + + double precision :: weight + integer :: ii, jj + + minimum_slice = huge(0.d0) + + do jj=1,q + do ii=1,p + weight = A(columns(ii),rows(jj)) + if (weight>0.d0) then + if (weight0) + + v = queue(p) + if (included(v,preorder_collection(1:j),j) .eqv. .false.) then + j = j+1 + preorder_collection(j) = v + i = i+1 + preorder(v) = i + end if + done = .true. + do w=1,n + if (A(v,w)/=0.d0) then + if (included(w,preorder_collection(1:j),j) .eqv. .false.) then + p = p+1 + queue(p) = w + done = .false. + exit + end if + end if + end do + + if (done .eqv. .true.) then + lowlink(v) = preorder(v) + do w=1,n + if (A(v,w)/=0.d0) then + if (scc_found(w) .eqv. .false.) then + if (preorder(w) > preorder(v)) then + lowlink(v) = min(lowlink(v),lowlink(w)) + else + lowlink(v) = min(lowlink(v),preorder(w)) + end if + end if + end if + end do + p = p-1 + if (lowlink(v)==preorder(v)) then + scc_found(v) = .true. + r = r+1 + scc_vertices(v) = r + do while (q>0 .and. preorder(scc_queue(q))>preorder(v)) + k = scc_queue(q) + q = q-1 + scc_found(k) = .true. 
+ scc_vertices(k) = r + end do + else + q = q+1 + scc_queue(q) = v + end if + end if + + end do + end if + end do + + contains + + function included(s,t,m) + + integer, intent(in) :: s, m + integer, intent(in) :: t(m) + + logical :: included + integer :: ii + + included = .false. + + do ii=1,m + if (s==t(ii)) then + included = .true. + exit + end if + end do + + end function included + +end subroutine strongly_connected_components \ No newline at end of file diff --git a/pypathway/hotnet/hotnet2/stats.py b/pypathway/hotnet/hotnet2/stats.py new file mode 100755 index 0000000..248c7fc --- /dev/null +++ b/pypathway/hotnet/hotnet2/stats.py @@ -0,0 +1,119 @@ +# -*- coding: iso-8859-1 -*- +from collections import defaultdict +import multiprocessing as mp +import networkx as nx +from . import hotnet2 as hn +from . import hnio + +strong_ccs = nx.strongly_connected_components + +def num_components_min_size(G, sizes): + """Return a list of the number of connected components of size at least s for each s in sizes. + + Arguments: + G -- a networkx Graph or DiGraph + sizes -- an iterable of minimum connected component sizes + + """ + ccs = strong_ccs(G) if isinstance(G, nx.DiGraph) else nx.connected_components(G) + cc_sizes = [len(cc) for cc in ccs] + return [sum(1 for cc_size in cc_sizes if cc_size >= s) for s in sizes] + +def significance_wrapper(xxx_todo_changeme): + (infmat, index2gene, heat_permutation, delta, sizes, directed) = xxx_todo_changeme + sim, index2gene = hn.similarity_matrix(infmat, index2gene, heat_permutation, directed) + G = hn.weighted_graph(sim, index2gene, delta, directed) + return num_components_min_size(G, sizes) + +def network_significance_wrapper(xxx_todo_changeme1): + (network_path, infmat_name, index2gene, heat, delta, sizes, directed) = xxx_todo_changeme1 + permuted_mat = hnio.load_hdf5(network_path)[infmat_name] + sim, index2gene = hn.similarity_matrix(permuted_mat, index2gene, heat, directed) + G = hn.weighted_graph(sim, index2gene, delta, directed) + return num_components_min_size(G, sizes) + +def calculate_permuted_cc_counts_network(network_paths, infmat_name, index2gene, heat, delta, + sizes=list(range(2,11)), directed=True, num_cores=1): + """Return a dict mapping a CC size to a list of the number of CCs of that size or greater in + each permutation. + """ + if num_cores != 1: + pool = mp.Pool(None if num_cores == -1 else num_cores) + map_fn = pool.map + else: + map_fn = map + + args = [(network_path, infmat_name, index2gene, heat, delta, sizes, directed) + for network_path in network_paths] + all_counts = map_fn(network_significance_wrapper, args) + + if num_cores != 1: + pool.close() + pool.join() + + # Parse the results into a map of k -> counts + size2counts = defaultdict(list) + for counts in all_counts: + for size, count in zip(sizes, counts): size2counts[size].append(count) + + return size2counts + +def calculate_permuted_cc_counts(infmat, index2gene, heat_permutations, delta, + sizes=list(range(2,11)), directed=True, num_cores=1): + """Return a dict mapping a CC size to a list of the number of CCs of that size or greater in + each permutation. 
+ + Arguments: + infmat -- 2D ndarray representing an influence matrix + index2gene -- dict mapping an index in the matrix to the name of the gene represented at that + index in the influence matrix + heat_permutations -- iterable of dicts mapping a gene name to the permuted heat score for that gene + delta -- threshold for edge weight removal + sizes -- list of sizes for which the number of connected components of that sizes should be + calculated in each permutation + directed -- whether the graph constructed from each permuted similarity matrix should be directed + num_cores -- number of cores to use for running in parallel + + """ + if num_cores != 1: + pool = mp.Pool(None if num_cores == -1 else num_cores) + map_fn = pool.map + else: + map_fn = map + + args = [(infmat, index2gene, heat_permutation, delta, sizes, directed) + for heat_permutation in heat_permutations] + all_counts = map_fn(significance_wrapper, args) + + if num_cores != 1: + pool.close() + pool.join() + + # Parse the results into a map of k -> counts + size2counts = defaultdict(list) + for counts in all_counts: + for size, count in zip(sizes, counts): size2counts[size].append(count) + + return size2counts + +def compute_statistics(size2counts_real, size2counts_permuted, num_permutations): + """Return a dict mapping a CC size to a tuple with the expected number of CCs of at least that + size based on permuted data, the observed number of CCs of at least that size in the real data, + and the p-value for the observed number. + + sizes2counts_real -- dict mapping a CC size to the number of CCs of that size or greater + observed in the real data + sizes2counts_permuted -- dict mapping a CC size to a list of the number of CCs of that size or + greater in each permutation + num_permutations -- the number of permuted data sets represented in sizes2counts_permuted + + """ + num_permutations = float(num_permutations) + size2stats = dict() + for size, counts in list(size2counts_permuted.items()): + observed = size2counts_real[size] + expected = sum(counts) / num_permutations + pval = len([c for c in counts if c >= observed]) / num_permutations + size2stats[size] = dict(observed=observed, expected=expected, pval=pval) + + return size2stats diff --git a/pypathway/hotnet/hotnet2/viz.py b/pypathway/hotnet/hotnet2/viz.py new file mode 100755 index 0000000..a9cef17 --- /dev/null +++ b/pypathway/hotnet/hotnet2/viz.py @@ -0,0 +1,66 @@ +from . 
import hnio +from collections import defaultdict + +def generate_viz_json(results, edges, network_name, gene2heat, snvs, cnas, sampleToType, d_score, d_name): + output = dict(deltas=[], subnetworks=dict(), stats=dict(), gene2heat=gene2heat) + predictions = set() + samples = list(sampleToType.keys()) + for ccs, stats, delta in results: + delta = format(delta, 'g') + output['stats'][delta] = stats + output['subnetworks'][delta] = [] + for cc in ccs: + output['subnetworks'][delta].append(get_component_json(cc, gene2heat, edges, network_name, d_score, d_name)) + predictions |= set(cc) + + if snvs or cnas: + for i, cc in enumerate(ccs): + output['subnetworks'][delta][i]['coverage'] = get_coverage(cc, snvs, cnas, samples) + + # Load the mutation data + if snvs or cnas: + output['geneToMutations'] = get_mutations_json(predictions, snvs, cnas, d_name) + output['sampleToType'] = sampleToType + + return output + +def get_nodes(cc, gene2heat, d_score, d_name): + scores = d_score if d_score else gene2heat + return [{'name': d_name.get(gene, gene), 'value': scores.get(gene, float('nan'))} for gene in cc] + +def get_edges(cc, edges, networkName, d_name): + edgeData = list() + for i in range(len(cc)): + for j in range(i+1, len(cc)): + gene1 = min(cc[i], cc[j]) + gene2 = max(cc[i], cc[j]) + if (gene1, gene2) in edges or (gene2, gene1) in edges: + edgeData.append({'source': d_name.get(gene1, gene1), 'target': d_name.get(gene2, gene2), 'categories': [networkName]}) + + return edgeData + +def get_component_json(cc, gene2heat, edges, networkName, d_score, d_name): + nodes = get_nodes(cc, gene2heat, d_score, d_name) + cc_edges = get_edges(cc, edges, networkName, d_name) + + return {'nodes': nodes, 'edges': cc_edges} + +def get_mutations_json(genes, snvs, cnas, d_name): + M = defaultdict(lambda: defaultdict(list)) + for mut in snvs + cnas: + if mut.gene in genes: + M[d_name.get(mut.gene, mut.gene)][mut.sample].append(mut.mut_type) + + return M + +def get_coverage(cc, snvs, cnas, samples): + coverage = len(set( m.sample for m in snvs + cnas if m.gene in cc )) + return '{} ({:.2f}%)'.format(coverage, float(100*coverage)/len(samples)) + +def write_index_file(index_file, out_file, deltas): + index = hnio.load_file(index_file) + index += '
<ul>\n' + for delta in sorted(deltas): + index += '<li><a href="delta%s/">δ = %s</a></li>\n' % (delta, delta) + index += '</ul>
' + hnio.write_file(out_file, index) diff --git a/pypathway/hotnet/makeHeatFile.py b/pypathway/hotnet/makeHeatFile.py new file mode 100755 index 0000000..c783440 --- /dev/null +++ b/pypathway/hotnet/makeHeatFile.py @@ -0,0 +1,173 @@ +import argparse +import json +import sys +from hotnet2 import heat as hnheat, hnap, hnio + +def get_parser(): + description = "Generates a JSON heat file for input to runHotNet2." + parser = hnap.HotNetArgParser(description=description, fromfile_prefix_chars='@') + + parent_parser = hnap.HotNetArgParser(add_help=False, fromfile_prefix_chars='@') + parent_parser.add_argument('-o', '--output_file', + help='Output file. If none given, output will be written to stdout.') + parent_parser.add_argument('-n', '--name', + help='Name/Label describing the heat scores.') + + subparsers = parser.add_subparsers(title='Heat score type') + + heat_parser = subparsers.add_parser('scores', help='Pre-computed heat scores', parents=[parent_parser]) + heat_parser.add_argument('-hf', '--heat_file', required=True, + help='Path to a tab-separated file containing a gene name in the first\ + column and the heat score for that gene in the second column of\ + each line.') + heat_parser.add_argument('-ms', '--min_heat_score', type=float, default=0, + help='Minimum heat score for genes to have their original heat score\ + in the resulting output file. Genes with score below this value\ + will be assigned score 0.') + heat_parser.add_argument('-gff', '--gene_filter_file', default=None, + help='Path to file listing genes whose heat scores should be\ + preserved, one per line. If present, all other heat scores\ + will be discarded.') + heat_parser.set_defaults(heat_fn=load_direct_heat) + + mutation_parser = subparsers.add_parser('mutation', help='Mutation data', parents=[parent_parser]) + mutation_parser.add_argument('--snv_file', required=True, + help='Path to a tab-separated file containing SNVs where the first\ + column of each line is a sample ID and subsequent columns\ + contain the names of genes with SNVs in that sample. Lines\ + starting with "#" will be ignored.') + mutation_parser.add_argument('--cna_file', + help='Path to a tab-separated file containing CNAs where the first\ + column of each line is a sample ID and subsequent columns\ + contain gene names followed by "(A)" or "(D)" indicating an\ + amplification or deletion in that gene for the sample.\ + Lines starting with "#" will be ignored.') + mutation_parser.add_argument('--sample_file', default=None, + help='File listing samples. Any SNVs or CNAs in samples not listed\ + in this file will be ignored. If HotNet is run with mutation\ + permutation testing, all samples in this file will be eligible\ + for random mutations even if the sample did not have any\ + mutations in the real data. If not provided, the set of samples\ + is assumed to be all samples that are provided in the SNV\ + or CNA data.') + mutation_parser.add_argument('--sample_type_file', default=None, + help='File listing type (e.g. cancer, datasets, etc.) of samples\ + (see --sample_file). Each line is a space-separated row\ + listing one sample and its type. The sample types are used\ + for creating the HotNet(2) web output.') + mutation_parser.add_argument('--gene_file', default=None, + help='File listing tested genes. SNVs or CNAs in genes not listed\ + in this file will be ignored. 
If HotNet is run with mutation\ + permutation testing, every gene in this file will be eligible\ + for random mutations even if the gene did not have mutations\ + in any samples in the original data. If not provided, the set\ + of tested genes is assumed to be all genes that have mutations\ + in either the SNV or CNA data.') + mutation_parser.add_argument('--min_freq', type=int, default=1, + help='The minimum number of samples in which a gene must have an\ + SNV to be considered mutated in the heat score calculation.') + mutation_parser.add_argument('--cna_filter_threshold', type=valid_cna_filter_thresh, + default=None, + help='Proportion of CNAs in a gene across samples that must share\ + the same CNA type in order for the CNAs to be included. This\ + must either be > .5, or the default, None, in which case all\ + CNAs will be included.') + mutation_parser.set_defaults(heat_fn=load_mutation_heat) + + oncodrive_parser = subparsers.add_parser('oncodrive', help='Oncodrive scores', parents=[parent_parser]) + oncodrive_parser.add_argument('--fm_scores', required=True, help='Oncodrive-FM scores (gene to q-value).') + oncodrive_parser.add_argument('--cis_amp_scores', required=True, + help='Oncodrive-CIS scores (gene to q-value); amplifications only.') + oncodrive_parser.add_argument('--cis_del_scores', required=True, + help='Oncodrive-CIS scores (gene to q-value); deletions only.') + oncodrive_parser.add_argument('--fm_threshold', type=float, default=0.2, + help='Maximum Oncodrive-FM q-value threshold') + oncodrive_parser.add_argument('--cis_threshold', type=float, default=0.2, + help='Maximum Oncodrive-CIS q-value threshold') + oncodrive_parser.add_argument('--cis', default=False, action='store_true', + help='Flag whether to include Oncodrive-CIS scores when generating '\ + 'the Oncodrive heat file.') + oncodrive_parser.add_argument('--gene_filter_file', default=None, + help='File listing genes whose heat scores should be preserved.\ + If present, all other heat scores will be discarded.') + oncodrive_parser.set_defaults(heat_fn=load_oncodrive_heat) + + mutsig_parser = subparsers.add_parser('mutsig', help='MutSig scores', parents=[parent_parser]) + mutsig_parser.add_argument('--mutsig_score_file', required=True, help='MutSig score file (gene to q-value).') + mutsig_parser.add_argument('--threshold', type=float, default=1.0, help='Maximum q-value threshold.') + mutsig_parser.add_argument('--gene_filter_file', default=None, + help='File listing genes whose heat scores should be preserved.\ + If present, all other heat scores will be discarded.') + mutsig_parser.set_defaults(heat_fn=load_mutsig_heat) + + music_parser = subparsers.add_parser('music', help='MuSiC scores', parents=[parent_parser]) + music_parser.add_argument('--music_score_file', required=True, help='MuSiC score file (gene to q-value).') + music_parser.add_argument('--threshold', type=float, default=1.0, help='Maximum q-value threshold.') + music_parser.add_argument('--max_heat', type=float, default=15, help='Max heat') + music_parser.add_argument('--gene_filter_file', default=None, + help='File listing genes whose heat scores should be preserved.\ + If present, all other heat scores will be discarded.') + music_parser.set_defaults(heat_fn=load_music_heat) + + return parser + +def valid_cna_filter_thresh(string): + value = float(string) + if value <= .5: + raise argparse.ArgumentTypeError("cna_filter_threshold must be > .5") + return value + +def load_direct_heat(args): + heat = hnio.load_heat_tsv(args.heat_file) + print("* Loading 
heat scores for %s genes" % len(heat)) + + #ensure that all heat scores are positive + bad_genes = [gene for gene in heat if heat[gene] < 0] + if bad_genes: + raise ValueError("ERROR: All gene heat scores must be non-negative. There are %s genes with\ + negative heat scores: %s" % (len(bad_genes), bad_genes)) + + heat, _ = hnheat.filter_heat(heat, args.min_heat_score, True, + 'Assigning score 0 to ## genes with score below %s' % args.min_heat_score) + return heat + +def load_mutation_heat(args): + genes = hnio.load_genes(args.gene_file) if args.gene_file else None + samples = hnio.load_samples(args.sample_file) if args.sample_file else None + snvs = hnio.load_snvs(args.snv_file, genes, samples) + cnas = hnio.load_cnas(args.cna_file, genes, samples) if args.cna_file else [] + if args.cna_filter_threshold: + cnas = hnheat.filter_cnas(cnas, args.cna_filter_threshold) + + if not samples: + samples = set([snv.sample for snv in snvs] + [cna.sample for cna in cnas]) + if not genes: + genes = set([snv.gene for snv in snvs] + [cna.gene for cna in cnas]) + return hnheat.mut_heat(genes, len(samples), snvs, cnas, args.min_freq) + +def load_oncodrive_heat(args): + gene2heat = hnio.load_oncodrive_data(args.fm_scores, args.cis_amp_scores, args.cis_del_scores) + return hnheat.fm_heat(gene2heat, args.fm_threshold, args.cis_threshold, args.cis) + +def load_mutsig_heat(args): + gene2mutsig = hnio.load_mutsig_scores(args.mutsig_score_file) + return hnheat.mutsig_heat(gene2mutsig, args.threshold) + +def load_music_heat(args): + gene2music = hnio.load_music_scores(args.music_score_file) + return hnheat.music_heat(gene2music, args.threshold, args.max_heat) + +def run(args): + heat = args.heat_fn(args) + if args.heat_fn != load_mutation_heat and args.gene_filter_file: + heat = hnheat.reconcile_heat_with_tested_genes(heat, hnio.load_genes(args.gene_filter_file)) + + args.heat_fn = args.heat_fn.__name__ + output_dict = {"parameters": vars(args), "heat": heat} + + output_file = open(args.output_file, 'w') if args.output_file else sys.stdout + json.dump(output_dict, output_file, indent=4) + if (args.output_file): output_file.close() + +if __name__ == "__main__": + run(get_parser().parse_args(sys.argv[1:])) diff --git a/pypathway/hotnet/makeNetworkFiles.py b/pypathway/hotnet/makeNetworkFiles.py new file mode 100755 index 0000000..f08577b --- /dev/null +++ b/pypathway/hotnet/makeNetworkFiles.py @@ -0,0 +1,92 @@ +#!/usr/bin/python + +# Load required modules +import os, sys, multiprocessing as mp + +# Load HotNet2 and script for permuting networks +sys.path.append(os.path.normpath(os.path.dirname(os.path.realpath(__file__)) + '/scripts/')) +from hotnet2 import * +import permuteNetwork as permute + +# Argument parser +def get_parser(): + description = 'Create the personalized pagerank matrix and 100 permuted PPR matrices for the'\ + 'given network and restart probability beta.' 
+ parser = hnap.HotNetArgParser(description=description, fromfile_prefix_chars='@') + parser.add_argument('-e', '--edgelist_file', required=True, + help='Path to TSV file listing edges of the interaction network, where'\ + 'each row contains the indices of two genes that are connected in the'\ + 'network.') + parser.add_argument('-i', '--gene_index_file', required=True, + help='Path to tab-separated file containing an index in the first column'\ + 'and the name of the gene represented at that index in the second'\ + 'column of each line.') + parser.add_argument('-nn', '--network_name', required=True, + help='Name of network.') + parser.add_argument('-p', '--prefix', required=True, + help='Output prefix.') + parser.add_argument('-is', '--index_file_start_index', default=1, type=int, + help='Minimum index in the index file.') + parser.add_argument('-b', '--beta', required=True, type=float, + help='Beta is the restart probability for the '\ + 'insulated heat diffusion process.') + + parser.add_argument('-op', '--only_permutations', action='store_true', + help='Only permutations, i.e., do not generate influence matrix for'\ + 'observed data. Useful for generating permuted network files on'\ + 'multiple machines.') + parser.add_argument('-q', '--Q', default=115, type=float, + help='Edge swap constant. The script will attempt Q*|E| edge swaps') + parser.add_argument('-ps', '--permutation_start_index', default=1, type=int, + help='Index at which to start permutation file names.') + parser.add_argument('-np', '--num_permutations', default=100, type=int, + help='Number of permuted networks to create.') + + parser.add_argument('-o', '--output_dir', required=True, + help='Output directory.') + parser.add_argument('-c', '--cores', default=1, type=int, + help='Use given number of cores. Pass -1 to use all available.') + + return parser + +def run(args): + # create output directory if doesn't exist; warn if it exists and is not empty + if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) + if len(os.listdir(args.output_dir)) > 0: + print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. 
" + "(Ctrl-c to cancel).") + + # make real PPR + if not args.only_permutations: + print("\nCreating PPR matrix for real network") + print("--------------------------------------") + pprfile = "{}/{}_ppr_{:g}.h5".format(args.output_dir, args.prefix, args.beta) + params = dict(network_name=args.network_name) + save_diffusion_to_file( HOTNET2, args.beta, args.gene_index_file, args.edgelist_file, pprfile, params=params) + + # make permuted edge lists + if args.num_permutations > 0: + print("\nCreating edge lists for permuted networks") + print("-------------------------------------------") + perm_dir = '%s/permuted' % args.output_dir + perm_path = '{}/{}_ppr_{:g}_##NUM##.h5'.format(perm_dir, args.prefix, args.beta) + if not os.path.exists(perm_dir): os.makedirs(perm_dir) + pargs = '-q %s -s %s -e %s -p %s -o %s -n %s -c %s' % (args.Q, args.permutation_start_index, args.edgelist_file, + args.prefix, perm_dir, args.num_permutations, args.cores) + permute.run(permute.get_parser().parse_args(pargs.split())) + + # make permuted PPRs + print("\nCreating PPR matrices for permuted networks") + print("---------------------------------------------") + diffusion_args = [] + params = dict(network_name=args.network_name, beta=args.beta) + for i in range(args.permutation_start_index, args.permutation_start_index + args.num_permutations): + sys.stdout.write("\r{}/{}".format(i, args.permutation_start_index + args.num_permutations - 1)) + sys.stdout.flush() + + edge_file = '%s/%s_edgelist_%s' % (perm_dir, args.prefix, i) + output_file = "{}/{}_ppr_{:g}_{}.h5".format(perm_dir, args.prefix, args.beta, i) + save_diffusion_to_file( HOTNET2, args.beta, args.gene_index_file, edge_file, output_file, params=params, verbose=0 ) + +if __name__ == "__main__": + run(get_parser().parse_args(sys.argv[1:])) diff --git a/pypathway/hotnet/requirements.txt b/pypathway/hotnet/requirements.txt new file mode 100755 index 0000000..589258a --- /dev/null +++ b/pypathway/hotnet/requirements.txt @@ -0,0 +1,4 @@ +scipy +numpy +networkx +h5py diff --git a/pypathway/hotnet/scripts/consensus.py b/pypathway/hotnet/scripts/consensus.py new file mode 100755 index 0000000..2c9314c --- /dev/null +++ b/pypathway/hotnet/scripts/consensus.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python + +# Load required modules +import os, sys, argparse, json, networkx as nx +from itertools import combinations +from collections import defaultdict + +# Load HotNet2 +sys.path.append(os.path.normpath(os.path.dirname(os.path.realpath(__file__)) + '/../')) +from hotnet2.consensus import identify_consensus + +# Argument parser +def get_parser(): + description = 'Constructs consensus subnetworks from HotNet(2) results.' 
+ + parser = argparse.ArgumentParser(description=description, fromfile_prefix_chars='@') + parser.add_argument('-d', '--directories', nargs='*', default=[], + help='Paths to HotNet(2) results directories; the consensus procedure chooses from these directories.') + parser.add_argument('-f', '--files', nargs='*', default=[], + help='Paths to HotNet(2) results files.') + parser.add_argument('-n', '--networks', nargs='*', required=True, + help='Networks for HotNet(2) results directories or files.') + parser.add_argument('-p', '--p_value_threshold', type=float, default=0.01, + help='Threshold for p-values; default is 0.01.') + parser.add_argument('-m', '--min_cc_size', type=int, default=2, + help='Minimum connected component size; default is 2.') + parser.add_argument('-o', '--output_file', required=True, + help='Output file; provide .json extension for JSON output.') + parser.add_argument('-v', '--verbose', action='store_true', + help='Verbose') + + return parser + +# Find the HotNet2 results file in each directory, grouping each file +# by directory +def find_results_files(directories, verbose=0): + # Identify the results files + results_files = [] + for directory in directories: + # Consider each \delta value for each run. + result_group = [] + for subdirectory in os.listdir(directory): + if os.path.isdir(os.path.join(directory, subdirectory)) and subdirectory.startswith('delta'): + result_group.append(os.path.join(directory, subdirectory, 'results.json')) + results_files.append(result_group) + + return results_files + +# Choose the results files when given directories of results files. +def load_single_runs(results_groups, verbose=0): + # Load each run + single_runs = [] + for result_group in results_groups: + # We are assuming all the results in the same directory will be + # for the same heat score and network + run_group = [] + for results_file in result_group: + with open(results_file, 'r') as f: + data = json.load(f) + heat_name = data['parameters']['heat_name'] + network_name = data['parameters']['network_name'] + run_group.append( (data['components'], data['statistics'], data['parameters']['delta']) ) + single_runs.append((network_name, heat_name, run_group)) + + return single_runs + +def run(args): + # Check arguments. + if not args.directories and not args.files: + raise ValueError('Neither the directories or files argument is specified; specify one argument.') + if args.directories and args.files: + raise ValueError('Both the directories and files arguments are specified; specify only one argument.') + + # Choose results files if given directories of results files. + if args.directories: + result_files = find_results_files(args.directories, args.verbose) + else: + result_files = [ [f] for f in args.files ] + + # Load results. + single_runs = load_single_runs(result_files, args.verbose) + + # Create the full consensus graph. + consensus, linkers, auto_deltas = identify_consensus(single_runs) + consensus_genes = set( g for cc in consensus for g in cc['core'] + cc['expansion'] ) + + # Summarize the results. + if args.verbose: + total_genes = set(v for ccs in results for cc in ccs for v in cc) + print('* Returning {} consensus genes from {} original genes...'.format(len(consensus_genes), len(total_genes))) + + # Output to file (either JSON or text depending on the file extension). 
+ with open(args.output_file, 'w') as f: + if args.output_file.lower().endswith('.json'): + # Convert the consensus to lists + consensus = [ dict(core=list(c['core']), expansion=list(c['expansion'])) for c in consensus ] + output = dict(linkers=list(linkers), consensus=consensus, deltas=auto_deltas) + json.dump(output, f, sort_keys=True, indent=4) + else: + output = [ '# Linkers: {}'.format(', '.join(sorted(linkers))), '# Core\tExpansion' ] + output += [ '{}\t{}'.format(', '.join(sorted(c['core'])), ', '.join(sorted(c['expansion']))) for c in consensus ] + f.write( '\n'.join(output) ) + +if __name__ == '__main__': + run(get_parser().parse_args( sys.argv[1:]) ) diff --git a/pypathway/hotnet/scripts/consensus.py.bak b/pypathway/hotnet/scripts/consensus.py.bak new file mode 100755 index 0000000..6d1a96a --- /dev/null +++ b/pypathway/hotnet/scripts/consensus.py.bak @@ -0,0 +1,105 @@ +#!/usr/bin/env python + +# Load required modules +import os, sys, argparse, json, networkx as nx +from itertools import combinations +from collections import defaultdict + +# Load HotNet2 +sys.path.append(os.path.normpath(os.path.dirname(os.path.realpath(__file__)) + '/../')) +from hotnet2.consensus import identify_consensus + +# Argument parser +def get_parser(): + description = 'Constructs consensus subnetworks from HotNet(2) results.' + + parser = argparse.ArgumentParser(description=description, fromfile_prefix_chars='@') + parser.add_argument('-d', '--directories', nargs='*', default=[], + help='Paths to HotNet(2) results directories; the consensus procedure chooses from these directories.') + parser.add_argument('-f', '--files', nargs='*', default=[], + help='Paths to HotNet(2) results files.') + parser.add_argument('-n', '--networks', nargs='*', required=True, + help='Networks for HotNet(2) results directories or files.') + parser.add_argument('-p', '--p_value_threshold', type=float, default=0.01, + help='Threshold for p-values; default is 0.01.') + parser.add_argument('-m', '--min_cc_size', type=int, default=2, + help='Minimum connected component size; default is 2.') + parser.add_argument('-o', '--output_file', required=True, + help='Output file; provide .json extension for JSON output.') + parser.add_argument('-v', '--verbose', action='store_true', + help='Verbose') + + return parser + +# Find the HotNet2 results file in each directory, grouping each file +# by directory +def find_results_files(directories, verbose=0): + # Identify the results files + results_files = [] + for directory in directories: + # Consider each \delta value for each run. + result_group = [] + for subdirectory in os.listdir(directory): + if os.path.isdir(os.path.join(directory, subdirectory)) and subdirectory.startswith('delta'): + result_group.append(os.path.join(directory, subdirectory, 'results.json')) + results_files.append(result_group) + + return results_files + +# Choose the results files when given directories of results files. 
+def load_single_runs(results_groups, verbose=0): + # Load each run + single_runs = [] + for result_group in results_groups: + # We are assuming all the results in the same directory will be + # for the same heat score and network + run_group = [] + for results_file in result_group: + with open(results_file, 'r') as f: + data = json.load(f) + heat_name = data['parameters']['heat_name'] + network_name = data['parameters']['network_name'] + run_group.append( (data['components'], data['statistics'], data['parameters']['delta']) ) + single_runs.append((network_name, heat_name, run_group)) + + return single_runs + +def run(args): + # Check arguments. + if not args.directories and not args.files: + raise ValueError('Neither the directories or files argument is specified; specify one argument.') + if args.directories and args.files: + raise ValueError('Both the directories and files arguments are specified; specify only one argument.') + + # Choose results files if given directories of results files. + if args.directories: + result_files = find_results_files(args.directories, args.verbose) + else: + result_files = [ [f] for f in args.files ] + + # Load results. + single_runs = load_single_runs(result_files, args.verbose) + + # Create the full consensus graph. + consensus, linkers, auto_deltas = identify_consensus(single_runs) + consensus_genes = set( g for cc in consensus for g in cc['core'] + cc['expansion'] ) + + # Summarize the results. + if args.verbose: + total_genes = set(v for ccs in results for cc in ccs for v in cc) + print '* Returning {} consensus genes from {} original genes...'.format(len(consensus_genes), len(total_genes)) + + # Output to file (either JSON or text depending on the file extension). + with open(args.output_file, 'w') as f: + if args.output_file.lower().endswith('.json'): + # Convert the consensus to lists + consensus = [ dict(core=list(c['core']), expansion=list(c['expansion'])) for c in consensus ] + output = dict(linkers=list(linkers), consensus=consensus, deltas=auto_deltas) + json.dump(output, f, sort_keys=True, indent=4) + else: + output = [ '# Linkers: {}'.format(', '.join(sorted(linkers))), '# Core\tExpansion' ] + output += [ '{}\t{}'.format(', '.join(sorted(c['core'])), ', '.join(sorted(c['expansion']))) for c in consensus ] + f.write( '\n'.join(output) ) + +if __name__ == '__main__': + run(get_parser().parse_args( sys.argv[1:]) ) diff --git a/pypathway/hotnet/scripts/createDendrogram.py b/pypathway/hotnet/scripts/createDendrogram.py new file mode 100755 index 0000000..eff2ac0 --- /dev/null +++ b/pypathway/hotnet/scripts/createDendrogram.py @@ -0,0 +1,84 @@ +#!/usr/bin/python + +# Load the required modules +import sys, os, json, pickle, os.path, scipy.io +sys.path.append(os.path.normpath(os.path.dirname(os.path.realpath(__file__)) + '/../')) +from hotnet2 import hotnet2 as hn +from hotnet2 import hnap, hnio, heat as hnheat +from hotnet2.hierarchy import HD, convertToLinkage, convertToNewick + +# Parse arguments +def get_parser(): + description = 'Create a hierarchical decomposition of the HotNet2 similarity matrix.' 
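# Illustrative invocation only (the input file names are hypothetical): build the
# hierarchical decomposition for one influence matrix / heat file pair. The script
# writes hn2-hierarchy.json and hn2-hierarchy.newick.txt into the output directory.
#
#   python scripts/createDendrogram.py -mf hint_ppr_0.4.h5 -if hint_index_gene.tsv \
#       -hf heat.json -o hierarchy/ -v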
+ parser = hnap.HotNetArgParser(description=description, fromfile_prefix_chars='@') + parser.add_argument('-r', '--run_name', required=False, default='Hotnet2', + help='Name of run to appear in output files.') + parser.add_argument('-mf', '--infmat_file', required=True, + help='Path to .mat file containing influence matrix') + parser.add_argument('-if', '--infmat_index_file', required=True, + help='Path to tab-separated file containing an index in the first column\ + and the name of the gene represented at that index in the second\ + column of each line.') + parser.add_argument('-hf', '--heat_file', required=True, + help='Path to heat file containing gene names and scores. This can either\ + be a JSON file created by generateHeat.py, in which case the file\ + name must end in .json, or a tab-separated file containing a gene\ + name in the first column and the heat score for that gene in the\ + second column of each line.') + parser.add_argument('-in', '--infmat_name', required=False, default='PPR', + help='Name of matrix in MATLAB file.') + parser.add_argument('-v', '--verbose', required=False, default=False, action='store_true', + help='Flag verbose output.') + parser.add_argument('-o', '--output_directory', required=True, + help='Output directory in which the hierarchy should be created.') + return parser + +# Create the dendrogram and output in .json and .pickle formats +def createDendrogram(sim, genespace, output_directory, params, verbose=False): + if verbose: print('* Creating dendrogram...') + + # Perform the clustering + if verbose: print('\t- Performing hierarchical clustering...') + T = HD(genespace, sim, False) + + # Convert to SciPy linkage matrix (http://goo.gl/yd6z3V) + if verbose: print('\t- Converting into SciPy linkage matrix format...') + Z, labels = convertToLinkage(T) + hierarchyOutput = dict(Z=Z, labels=labels, params=params) + + newickOutput = convertToNewick(T) + + # Output the linkage matrix to JSON and Newick file + if verbose: print('\t- Outputting to JSON and Newick files...') + with open('{}/hn2-hierarchy.json'.format(output_directory), "w") as out: + json.dump(hierarchyOutput, out) + + with open('{}/hn2-hierarchy.newick.txt'.format(output_directory), "w") as out: + out.write(newickOutput) + +# Run +def run(args): + # Load the input data + if args.verbose: print('* Loading infmat and heat files...') + infmat = hnio.load_infmat(args.infmat_file, args.infmat_name) + full_index2gene = hnio.load_index(args.infmat_index_file) + + using_json_heat = os.path.splitext(args.heat_file.lower())[1] == '.json' + if using_json_heat: + heat = json.load(open(args.heat_file))['heat'] + else: + heat = hnio.load_heat_tsv(args.heat_file) + print("* Loaded heat scores for %s genes" % len(heat)) + + # filter out genes not in the network + heat = hnheat.filter_heat_to_network_genes(heat, set(full_index2gene.values())) + + # genes with score 0 cannot be in output components, but are eligible for heat in permutations + heat, addtl_genes = hnheat.filter_heat(heat, None, False, 'There are ## genes with heat score 0') + if args.verbose: print('* Creating similarity matrix...') + sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True) + + # Create and output the dendrogram + createDendrogram( sim, list(index2gene.values()), args.output_directory, vars(args), args.verbose ) + +if __name__ == "__main__": run( get_parser().parse_args(sys.argv[1:])) diff --git a/pypathway/hotnet/scripts/createDendrogram.py.bak b/pypathway/hotnet/scripts/createDendrogram.py.bak new file mode 
100755 index 0000000..8cdde82 --- /dev/null +++ b/pypathway/hotnet/scripts/createDendrogram.py.bak @@ -0,0 +1,84 @@ +#!/usr/bin/python + +# Load the required modules +import sys, os, json, pickle, os.path, scipy.io +sys.path.append(os.path.normpath(os.path.dirname(os.path.realpath(__file__)) + '/../')) +from hotnet2 import hotnet2 as hn +from hotnet2 import hnap, hnio, heat as hnheat +from hotnet2.hierarchy import HD, convertToLinkage, convertToNewick + +# Parse arguments +def get_parser(): + description = 'Create a hierarchical decomposition of the HotNet2 similarity matrix.' + parser = hnap.HotNetArgParser(description=description, fromfile_prefix_chars='@') + parser.add_argument('-r', '--run_name', required=False, default='Hotnet2', + help='Name of run to appear in output files.') + parser.add_argument('-mf', '--infmat_file', required=True, + help='Path to .mat file containing influence matrix') + parser.add_argument('-if', '--infmat_index_file', required=True, + help='Path to tab-separated file containing an index in the first column\ + and the name of the gene represented at that index in the second\ + column of each line.') + parser.add_argument('-hf', '--heat_file', required=True, + help='Path to heat file containing gene names and scores. This can either\ + be a JSON file created by generateHeat.py, in which case the file\ + name must end in .json, or a tab-separated file containing a gene\ + name in the first column and the heat score for that gene in the\ + second column of each line.') + parser.add_argument('-in', '--infmat_name', required=False, default='PPR', + help='Name of matrix in MATLAB file.') + parser.add_argument('-v', '--verbose', required=False, default=False, action='store_true', + help='Flag verbose output.') + parser.add_argument('-o', '--output_directory', required=True, + help='Output directory in which the hierarchy should be created.') + return parser + +# Create the dendrogram and output in .json and .pickle formats +def createDendrogram(sim, genespace, output_directory, params, verbose=False): + if verbose: print '* Creating dendrogram...' + + # Perform the clustering + if verbose: print '\t- Performing hierarchical clustering...' + T = HD(genespace, sim, False) + + # Convert to SciPy linkage matrix (http://goo.gl/yd6z3V) + if verbose: print '\t- Converting into SciPy linkage matrix format...' + Z, labels = convertToLinkage(T) + hierarchyOutput = dict(Z=Z, labels=labels, params=params) + + newickOutput = convertToNewick(T) + + # Output the linkage matrix to JSON and Newick file + if verbose: print '\t- Outputting to JSON and Newick files...' + with open('{}/hn2-hierarchy.json'.format(output_directory), "w") as out: + json.dump(hierarchyOutput, out) + + with open('{}/hn2-hierarchy.newick.txt'.format(output_directory), "w") as out: + out.write(newickOutput) + +# Run +def run(args): + # Load the input data + if args.verbose: print '* Loading infmat and heat files...' 
+ infmat = hnio.load_infmat(args.infmat_file, args.infmat_name) + full_index2gene = hnio.load_index(args.infmat_index_file) + + using_json_heat = os.path.splitext(args.heat_file.lower())[1] == '.json' + if using_json_heat: + heat = json.load(open(args.heat_file))['heat'] + else: + heat = hnio.load_heat_tsv(args.heat_file) + print "* Loaded heat scores for %s genes" % len(heat) + + # filter out genes not in the network + heat = hnheat.filter_heat_to_network_genes(heat, set(full_index2gene.values())) + + # genes with score 0 cannot be in output components, but are eligible for heat in permutations + heat, addtl_genes = hnheat.filter_heat(heat, None, False, 'There are ## genes with heat score 0') + if args.verbose: print '* Creating similarity matrix...' + sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True) + + # Create and output the dendrogram + createDendrogram( sim, index2gene.values(), args.output_directory, vars(args), args.verbose ) + +if __name__ == "__main__": run( get_parser().parse_args(sys.argv[1:])) diff --git a/pypathway/hotnet/scripts/createInfluenceMatrix.py b/pypathway/hotnet/scripts/createInfluenceMatrix.py new file mode 100755 index 0000000..5f1b7ff --- /dev/null +++ b/pypathway/hotnet/scripts/createInfluenceMatrix.py @@ -0,0 +1,48 @@ +#!/usr/bin/python + +# Load required modules +import sys, os, numpy as np, networkx as nx, scipy as sp, scipy.io +sys.path.append(os.path.normpath(os.path.dirname(os.path.realpath(__file__)) + '/../')) +from hotnet2 import * + +# Parse arguments +def get_parser(): + # Common arguments + description = 'Create the personalized PageRank matrix for the given '\ + 'network and restart probability beta.' + parser = hnap.HotNetArgParser(description=description, fromfile_prefix_chars='@') + parser.add_argument('-e', '--edgelist_file', required=True, + help='Location of edgelist file.') + parser.add_argument('-i', '--gene_index_file', required=True, + help='Location of gene-index file.') + parser.add_argument('-n', '--network_name', required=True, + help='Name of network.') + parser.add_argument('-o', '--output_file', required=True, + help="Output file.") + parser.add_argument('-v', '--verbose', required=False, default=0, type=int, choices=list(range(5)), + help="Control verbosity of output.") + + # Subparsers for the different diffusion types + subparser = parser.add_subparsers(dest='diffusion_type', help='Diffusion method') + + hn2_parser = subparser.add_parser(HOTNET2) + hn2_parser.add_argument('-b', '--beta', required=True, type=float, help="Restart probability beta.") + + hn_parser = subparser.add_parser(HOTNET) + hn_parser.add_argument('-t', '--time', required=True, type=float, help='Diffusion time.') + + return parser + +def run(args): + params = dict(network_name=args.network_name) + if args.diffusion_type == HOTNET2: + diffusion_param = args.beta + elif args.diffusion_type == HOTNET: + diffusion_param = args.time + else: + raise NotImplementedError('Diffusion of type "%s" not implemented' % args.diffusion_type) + save_diffusion_to_file(args.diffusion_type, diffusion_param, args.gene_index_file, args.edgelist_file, + args.output_file, params=params, verbose=args.verbose) + +if __name__ == "__main__": + run(get_parser().parse_args(sys.argv[1:])) diff --git a/pypathway/hotnet/scripts/createInfluenceMatrix.py.bak b/pypathway/hotnet/scripts/createInfluenceMatrix.py.bak new file mode 100755 index 0000000..10e67c4 --- /dev/null +++ b/pypathway/hotnet/scripts/createInfluenceMatrix.py.bak @@ -0,0 +1,48 @@ +#!/usr/bin/python + 
+# Load required modules +import sys, os, numpy as np, networkx as nx, scipy as sp, scipy.io +sys.path.append(os.path.normpath(os.path.dirname(os.path.realpath(__file__)) + '/../')) +from hotnet2 import * + +# Parse arguments +def get_parser(): + # Common arguments + description = 'Create the personalized PageRank matrix for the given '\ + 'network and restart probability beta.' + parser = hnap.HotNetArgParser(description=description, fromfile_prefix_chars='@') + parser.add_argument('-e', '--edgelist_file', required=True, + help='Location of edgelist file.') + parser.add_argument('-i', '--gene_index_file', required=True, + help='Location of gene-index file.') + parser.add_argument('-n', '--network_name', required=True, + help='Name of network.') + parser.add_argument('-o', '--output_file', required=True, + help="Output file.") + parser.add_argument('-v', '--verbose', required=False, default=0, type=int, choices=range(5), + help="Control verbosity of output.") + + # Subparsers for the different diffusion types + subparser = parser.add_subparsers(dest='diffusion_type', help='Diffusion method') + + hn2_parser = subparser.add_parser(HOTNET2) + hn2_parser.add_argument('-b', '--beta', required=True, type=float, help="Restart probability beta.") + + hn_parser = subparser.add_parser(HOTNET) + hn_parser.add_argument('-t', '--time', required=True, type=float, help='Diffusion time.') + + return parser + +def run(args): + params = dict(network_name=args.network_name) + if args.diffusion_type == HOTNET2: + diffusion_param = args.beta + elif args.diffusion_type == HOTNET: + diffusion_param = args.time + else: + raise NotImplementedError('Diffusion of type "%s" not implemented' % args.diffusion_type) + save_diffusion_to_file(args.diffusion_type, diffusion_param, args.gene_index_file, args.edgelist_file, + args.output_file, params=params, verbose=args.verbose) + +if __name__ == "__main__": + run(get_parser().parse_args(sys.argv[1:])) diff --git a/pypathway/hotnet/scripts/permuteNetwork.py b/pypathway/hotnet/scripts/permuteNetwork.py new file mode 100755 index 0000000..a61484f --- /dev/null +++ b/pypathway/hotnet/scripts/permuteNetwork.py @@ -0,0 +1,105 @@ +#!/usr/bin/python + +# Load required modules +import sys, os, networkx as nx, multiprocessing as mp +sys.path.append(os.path.normpath(os.path.dirname(os.path.realpath(__file__)) + '/../')) +from traceback import format_exc + +try: + from pypathway.analysis.modelling.third_party.hotnet2.hotnet2.fast_swap import node_swap +except: + pass + # print(format_exc()) + # print("slow node swap") + node_swap = None +else: + pass + +# Parse arguments +def get_parser(): + from hotnet2 import hnap + description = 'Creates permuted versions of the given network, where each '\ + 'node retains its degree.' 
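# For readers unfamiliar with degree-preserving permutation: a connected double edge
# swap takes two edges (u, v) and (x, y) and rewires them to (u, x) and (v, y) whenever
# the graph stays connected, so every node keeps its degree while the topology is
# randomized. A minimal self-contained sketch with networkx (toy graph and variable
# names are illustrative; the factor 115 mirrors the Q default below):

import networkx as nx
G_demo = nx.barabasi_albert_graph(100, 2, seed=0)          # toy connected graph
H_demo = G_demo.copy()
nx.connected_double_edge_swap(H_demo, nswap=int(115 * H_demo.number_of_edges()))
assert dict(G_demo.degree()) == dict(H_demo.degree())      # degrees are preserved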
+ parser = hnap.HotNetArgParser(description=description, fromfile_prefix_chars='@') + # We set Q to be 115 by default (instead of the standard 100) because some + # proportion of swaps fail as we are doing *connected* edge swaps + parser.add_argument('-q', '--Q', default=115, type=float, + help='Edge swap constant.') + parser.add_argument('-s', '--start_index', default=1, type=int, + help='Index to start output of permutations at.') + parser.add_argument('-e', '--edgelist_file', required=True, + help='Edgelist file path.') + parser.add_argument('-p', '--output_prefix', required=True, + help='Output prefix.') + parser.add_argument('-o', '--output_dir', required=True, + help='Output directory.') + parser.add_argument('-n', '--num_permutations', default=100, type=int, + help='Number of permuted networks to create.') + parser.add_argument('-c', '--cores', default=1, type=int, + help='Use given number of cores. Pass -1 to use all available.') + + return parser + +def permute_network( G, Q, numEdges, outputFile ): + # Permutes network by swapping edges Q * numEdges times + H = G.copy() + nswap = Q*numEdges + if node_swap: + #print("fast node swap") + H = node_swap(H, nswap=nswap) + # fix me + swaps = nswap + else: + swaps = nx.connected_double_edge_swap(H, nswap=nswap) + nx.write_edgelist(H, outputFile) + return swaps + +store = dict(maxSeen=-1) +def permute_network_wrapper(xxx_todo_changeme): + (G, Q, numEdges, outputFile, i, n) = xxx_todo_changeme + swaps = permute_network( G, Q, numEdges, outputFile ) + store['maxSeen'] = max(store['maxSeen'], i) + sys.stdout.write("\r{}/{}".format(store['maxSeen'], n)) + sys.stdout.flush() + return swaps + +def run(args): + # Load graph + from hotnet2 import largest_component + print("* Loading edge list..") + G = nx.Graph() + with open(args.edgelist_file) as infile: + G.add_edges_from([ l.rstrip().split()[:2] for l in infile ]) + G = largest_component(G) + numEdges, numNodes = len(G.edges()), len(G.nodes()) + + # Report info about the graph and number of swaps + Q = args.Q + print("\t- {} edges among {} nodes.".format(numEdges, numNodes)) + print("\t- No. swaps to attempt = {}".format(Q*numEdges)) + + # Permute graph the prescribed number of times + print("* Creating permuted networks...") + def outputFileName(i): + return "{}/{}_edgelist_{}".format(args.output_dir, args.output_prefix, i+args.start_index) + + n = args.num_permutations + if args.cores != 1: + cores = mp.cpu_count() if args.cores == -1 else min(args.cores, mp.cpu_count) + jobArgs = [ (G, Q, numEdges, outputFileName(i), i+args.start_index, n) for i in range(n) ] + pool = mp.Pool(cores) + swaps = pool.map(permute_network_wrapper, jobArgs) + pool.close() + pool.join() + else: + swaps = [ permute_network_wrapper((G, Q, numEdges, outputFileName(i), i+args.start_index, n)) + for i in range(n) ] + + print() + + # Report how many swaps were actually made + avgSwaps = int(sum(swaps) / float(len(swaps))) + print("* Avg. No. 
Swaps Made: {}".format(avgSwaps)) + +if __name__ == "__main__": + run(get_parser().parse_args(sys.argv[1:])) diff --git a/pypathway/hotnet/scripts/permuteNetwork.py.bak b/pypathway/hotnet/scripts/permuteNetwork.py.bak new file mode 100755 index 0000000..6011bb0 --- /dev/null +++ b/pypathway/hotnet/scripts/permuteNetwork.py.bak @@ -0,0 +1,87 @@ +#!/usr/bin/python + +# Load required modules +import sys, os, networkx as nx, multiprocessing as mp +sys.path.append(os.path.normpath(os.path.dirname(os.path.realpath(__file__)) + '/../')) + +# Parse arguments +def get_parser(): + from hotnet2 import hnap + description = 'Creates permuted versions of the given network, where each '\ + 'node retains its degree.' + parser = hnap.HotNetArgParser(description=description, fromfile_prefix_chars='@') + # We set Q to be 115 by default (instead of the standard 100) because some + # proportion of swaps fail as we are doing *connected* edge swaps + parser.add_argument('-q', '--Q', default=115, type=float, + help='Edge swap constant.') + parser.add_argument('-s', '--start_index', default=1, type=int, + help='Index to start output of permutations at.') + parser.add_argument('-e', '--edgelist_file', required=True, + help='Edgelist file path.') + parser.add_argument('-p', '--output_prefix', required=True, + help='Output prefix.') + parser.add_argument('-o', '--output_dir', required=True, + help='Output directory.') + parser.add_argument('-n', '--num_permutations', default=100, type=int, + help='Number of permuted networks to create.') + parser.add_argument('-c', '--cores', default=1, type=int, + help='Use given number of cores. Pass -1 to use all available.') + + return parser + +def permute_network( G, Q, numEdges, outputFile ): + # Permutes network by swapping edges Q * numEdges times + H = G.copy() + nswap = Q*numEdges + swaps = nx.connected_double_edge_swap(H, nswap=nswap) + nx.write_edgelist(H, outputFile) + return swaps + +store = dict(maxSeen=-1) +def permute_network_wrapper((G, Q, numEdges, outputFile, i, n)): + swaps = permute_network( G, Q, numEdges, outputFile ) + store['maxSeen'] = max(store['maxSeen'], i) + sys.stdout.write("\r{}/{}".format(store['maxSeen'], n)) + sys.stdout.flush() + return swaps + +def run(args): + # Load graph + from hotnet2 import largest_component + print "* Loading edge list.." + G = nx.Graph() + with open(args.edgelist_file) as infile: + G.add_edges_from([ l.rstrip().split()[:2] for l in infile ]) + G = largest_component(G) + numEdges, numNodes = len(G.edges()), len(G.nodes()) + + # Report info about the graph and number of swaps + Q = args.Q + print "\t- {} edges among {} nodes.".format(numEdges, numNodes) + print "\t- No. swaps to attempt = {}".format(Q*numEdges) + + # Permute graph the prescribed number of times + print "* Creating permuted networks..." + def outputFileName(i): + return "{}/{}_edgelist_{}".format(args.output_dir, args.output_prefix, i+args.start_index) + + n = args.num_permutations + if args.cores != 1: + cores = mp.cpu_count() if args.cores == -1 else min(args.cores, mp.cpu_count) + jobArgs = [ (G, Q, numEdges, outputFileName(i), i+args.start_index, n) for i in range(n) ] + pool = mp.Pool(cores) + swaps = pool.map(permute_network_wrapper, jobArgs) + pool.close() + pool.join() + else: + swaps = [ permute_network_wrapper((G, Q, numEdges, outputFileName(i), i+args.start_index, n)) + for i in range(n) ] + + print + + # Report how many swaps were actually made + avgSwaps = int(sum(swaps) / float(len(swaps))) + print "* Avg. No. 
Swaps Made: {}".format(avgSwaps) + +if __name__ == "__main__": + run(get_parser().parse_args(sys.argv[1:]))