Commit 06b2914a authored by Tiago Peixoto's avatar Tiago Peixoto
Browse files

Speed up motifs() and motif_significance()

In both functions we can save some time by hashing the subgraphs found,
according to their degree distribution signature (instead of number of
edges). This avoids a large number of useless exactness or isomorphism
comparisons.

This also removes the "seed" parameter, which is redundant to
the numpy.random.seed() function.
parent 4d69fce0
...@@ -272,6 +272,23 @@ struct wrap_undirected ...@@ -272,6 +272,23 @@ struct wrap_undirected
}; };
}; };
// get the signature of the graph: concatenated out + in degree histograms
template <class Graph>
void get_sig(Graph& g, vector<size_t>& sig)
{
size_t N = num_vertices(g) + 1;
sig.resize(is_directed::apply<Graph>::type::value ? 2*N : N);
for (size_t i = 0; i < sig.size(); ++i)
sig[i] = 0;
typename graph_traits<Graph>::vertex_iterator v, v_end;
for (tie(v, v_end) = vertices(g); v != v_end; ++v)
{
sig[out_degree(*v,g)]++;
if(is_directed::apply<Graph>::type::value)
sig[in_degreeS()(*v,g)+N]++;
}
}
// gets (or samples) all the subgraphs in graph g // gets (or samples) all the subgraphs in graph g
struct get_all_motifs struct get_all_motifs
{ {
...@@ -288,14 +305,24 @@ struct get_all_motifs ...@@ -288,14 +305,24 @@ struct get_all_motifs
vector<graph_sg_t>& subgraph_list = vector<graph_sg_t>& subgraph_list =
any_cast<vector<graph_sg_t>&>(list); any_cast<vector<graph_sg_t>&>(list);
tr1::unordered_map<size_t, vector<pair<size_t, graph_sg_t> > > sub_list; // this hashes subgraphs according to their signature
tr1::unordered_map<vector<size_t>,
vector<pair<size_t, graph_sg_t> >,
hash<vector<size_t> > > sub_list;
vector<size_t> sig; // current signature
for (size_t i = 0; i < subgraph_list.size(); ++i) for (size_t i = 0; i < subgraph_list.size(); ++i)
sub_list[num_edges(subgraph_list[i])].\ {
push_back(make_pair(i,subgraph_list[i])); get_sig(subgraph_list[i], sig);
sub_list[sig].push_back(make_pair(i,subgraph_list[i]));
}
// the subgraph count // the subgraph count
hist.resize(subgraph_list.size()); hist.resize(subgraph_list.size());
typedef tr1::uniform_real<double> rdist_t;
tr1::variate_generator<rng_t&, rdist_t> random(rng, rdist_t());
// the set of vertices V to be sampled (filled only if p < 1) // the set of vertices V to be sampled (filled only if p < 1)
vector<size_t> V; vector<size_t> V;
if (p < 1) if (p < 1)
...@@ -304,9 +331,6 @@ struct get_all_motifs ...@@ -304,9 +331,6 @@ struct get_all_motifs
for (tie(v, v_end) = vertices(g); v != v_end; ++v) for (tie(v, v_end) = vertices(g); v != v_end; ++v)
V.push_back(*v); V.push_back(*v);
typedef tr1::uniform_real<double> rdist_t;
tr1::variate_generator<rng_t&, rdist_t> random(rng, rdist_t());
size_t n; size_t n;
if (random() < p) if (random() < p)
n = ceil(V.size()*p); n = ceil(V.size()*p);
...@@ -331,7 +355,8 @@ struct get_all_motifs ...@@ -331,7 +355,8 @@ struct get_all_motifs
#endif #endif
int i, N = (p < 1) ? V.size() : num_vertices(g); int i, N = (p < 1) ? V.size() : num_vertices(g);
#pragma omp parallel for default(shared) private(i) schedule(dynamic) #pragma omp parallel for default(shared) private(i, sig) \
schedule(dynamic)
for (i = 0; i < N; ++i) for (i = 0; i < N; ++i)
{ {
vector<vector<typename graph_traits<Graph>::vertex_descriptor> > vector<vector<typename graph_traits<Graph>::vertex_descriptor> >
...@@ -348,16 +373,26 @@ struct get_all_motifs ...@@ -348,16 +373,26 @@ struct get_all_motifs
{ {
graph_sg_t sub; graph_sg_t sub;
make_subgraph(subgraphs[j], g, sub); make_subgraph(subgraphs[j], g, sub);
get_sig(sub, sig);
#ifdef USING_OPENMP #ifdef USING_OPENMP
if (fill_list) if (fill_list)
omp_set_lock(&lock); omp_set_lock(&lock);
#endif #endif
typeof(sub_list.begin()) iter = sub_list.find(sig);
if(iter == sub_list.end())
{
if (!fill_list)
continue; // avoid inserting an element in sub_list
sub_list[sig] = vector<pair<size_t,graph_sg_t> >();
}
bool found = false; bool found = false;
for (size_t l = 0; l < sub_list[num_edges(sub)].size(); ++l) vector<pair<size_t, graph_sg_t> >& sl = sub_list[sig];
for (size_t l = 0; l < sl.size(); ++l)
{ {
graph_sg_t& motif = sub_list[num_edges(sub)][l].second; graph_sg_t& motif = sl[l].second;
if (comp_iso) if (comp_iso)
{ {
if (isomorphism(motif, sub)) if (isomorphism(motif, sub))
...@@ -368,11 +403,10 @@ struct get_all_motifs ...@@ -368,11 +403,10 @@ struct get_all_motifs
if (graph_cmp(motif, sub)) if (graph_cmp(motif, sub))
found = true; found = true;
} }
if (found) if (found)
{ {
#pragma omp critical #pragma omp critical
hist[sub_list[num_edges(sub)][l].first]++; hist[sl[l].first]++;
break; break;
} }
} }
...@@ -380,9 +414,8 @@ struct get_all_motifs ...@@ -380,9 +414,8 @@ struct get_all_motifs
if (found == false && fill_list) if (found == false && fill_list)
{ {
subgraph_list.push_back(sub); subgraph_list.push_back(sub);
sub_list[num_edges(sub)]. sl.push_back(make_pair(subgraph_list.size()-1,sub));
push_back(make_pair(subgraph_list.size()-1,sub)); hist.push_back(1);
hist.push_back(1);
} }
#ifdef USING_OPENMP #ifdef USING_OPENMP
......
...@@ -30,7 +30,9 @@ dl_import("import libgraph_tool_clustering as _gt") ...@@ -30,7 +30,9 @@ dl_import("import libgraph_tool_clustering as _gt")
from .. core import _degree, _prop, Graph from .. core import _degree, _prop, Graph
from .. topology import isomorphism from .. topology import isomorphism
from .. generation import random_rewire from .. generation import random_rewire
from .. stats import vertex_hist
from itertools import izip from itertools import izip
from collections import defaultdict
from numpy import * from numpy import *
import sys import sys
...@@ -251,7 +253,7 @@ def extended_clustering(g, props=None, max_depth=3, undirected=False): ...@@ -251,7 +253,7 @@ def extended_clustering(g, props=None, max_depth=3, undirected=False):
return props return props
def motifs(g, k, p=1.0, motif_list=None, undirected=None, seed=0): def motifs(g, k, p=1.0, motif_list=None, undirected=None):
r""" r"""
Count the occurrence of k-size subgraphs (motifs). A tuple with two lists is Count the occurrence of k-size subgraphs (motifs). A tuple with two lists is
returned: the list of motifs found, and the list with their respective returned: the list of motifs found, and the list with their respective
...@@ -263,20 +265,17 @@ def motifs(g, k, p=1.0, motif_list=None, undirected=None, seed=0): ...@@ -263,20 +265,17 @@ def motifs(g, k, p=1.0, motif_list=None, undirected=None, seed=0):
Graph to be used. Graph to be used.
k : int k : int
number of vertices of the motifs number of vertices of the motifs
p : float or float list, optional (default: 1.0) p : float or float list (optional, default: 1.0)
uniform fraction of the motifs to be sampled. If a float list is uniform fraction of the motifs to be sampled. If a float list is
provided, it will be used as the fraction at each depth provided, it will be used as the fraction at each depth
:math:`[1,\dots,k]` in the algorithm. See [wernicke_efficient_2006]_ for :math:`[1,\dots,k]` in the algorithm. See [wernicke_efficient_2006]_ for
more details. more details.
motif_list : list of Graph objects, optional motif_list : list of Graph objects, optional
If supplied, the algorithms will only search for the motifs in this list If supplied, the algorithms will only search for the motifs in this list
(or isomorphisms thereof) (or isomorphisms)
undirected : bool, optional undirected : bool, optional
Treat the graph as *undirected*, if graph is directed Treat the graph as *undirected*, if graph is directed
(this option has no effect if the graph is undirected). (this option has no effect if the graph is undirected).
seed : int, optional (default: 0)
Seed for the random number generator. It the value is 0, a random seed
is used.
Returns Returns
------- -------
...@@ -318,8 +317,7 @@ def motifs(g, k, p=1.0, motif_list=None, undirected=None, seed=0): ...@@ -318,8 +317,7 @@ def motifs(g, k, p=1.0, motif_list=None, undirected=None, seed=0):
Bioinformatics (TCBB), Volume 3, Issue 4, Pages 347-359, 2006. Bioinformatics (TCBB), Volume 3, Issue 4, Pages 347-359, 2006.
""" """
if seed == 0: seed = random.randint(0, sys.maxint)
seed = random.randint(0, sys.maxint)
sub_list = [] sub_list = []
directed_motifs = g.is_directed() if undirected == None else not undirected directed_motifs = g.is_directed() if undirected == None else not undirected
...@@ -376,10 +374,23 @@ def motifs(g, k, p=1.0, motif_list=None, undirected=None, seed=0): ...@@ -376,10 +374,23 @@ def motifs(g, k, p=1.0, motif_list=None, undirected=None, seed=0):
return sub_list, hist return sub_list, hist
def _graph_sig(g):
"""return the graph signature, i.e., the in and out degree distribution as
concatenated as a tuple."""
bins = range(0, g.num_vertices()+1)
in_dist = vertex_hist(g, "in", bins = bins if g.is_directed() else [0],
float_count=False)
out_dist = vertex_hist(g, "out", bins = bins, float_count=False)
sig = tuple([(in_dist[1][i],in_dist[0][i]) for \
i in xrange(len(in_dist[0]))] +
[(out_dist[1][i],out_dist[0][i]) for\
i in xrange(len(out_dist[0]))])
return sig
def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None, def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None,
undirected=None, self_loops=False, parallel_edges=False, threshold=0, undirected=None, self_loops=False,
full_output=False, shuffle_strategy="uncorrelated", parallel_edges=False, full_output=False,
seed=0): shuffle_strategy= "uncorrelated"):
r""" r"""
Obtain the motif significance profile, for subgraphs with k vertices. A Obtain the motif significance profile, for subgraphs with k vertices. A
tuple with two lists is returned: the list of motifs found, and their tuple with two lists is returned: the list of motifs found, and their
...@@ -391,35 +402,34 @@ def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None, ...@@ -391,35 +402,34 @@ def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None,
Graph to be used. Graph to be used.
k : int k : int
number of vertices of the motifs number of vertices of the motifs
n_shuffles : int, optional (default: 100) n_shuffles : int (optional, default: 100)
number of shuffled networks to consider for the z-score number of shuffled networks to consider for the z-score
p : float or float list, optional (default: 1.0) p : float or float list (optional, default: 1.0)
uniform fraction of the motifs to be sampled. If a float list is uniform fraction of the motifs to be sampled. If a float list is
provided, it will be used as the fraction at each depth provided, it will be used as the fraction at each depth
:math:`[1,\dots,k]` in the algorithm. See [wernicke_efficient_2006]_ for :math:`[1,\dots,k]` in the algorithm. See [wernicke_efficient_2006]_ for
more details. more details.
motif_list : list of Graph objects, optional motif_list : list of Graph objects (optional, default: None)
If supplied, the algorithms will only search for the motifs in this list If supplied, the algorithms will only search for the motifs in this list
(or isomorphisms thereof) (isomorphisms)
undirected : bool, optional threshold : int (optional, default: 0)
If a given motif count is below this level, it is not considered.
undirected : bool (optional, default: None)
Treat the graph as *undirected*, if graph is directed Treat the graph as *undirected*, if graph is directed
(this option has no effect if the graph is undirected). (this option has no effect if the graph is undirected).
self_loops : bool, optional (default: False) self_loops : bool (optional, default: False)
Whether or not the shuffled graphs are allowed to contain self-loops Whether or not the shuffled graphs are allowed to contain self-loops
parallel_edges : bool, optional (default: False) parallel_edges : bool (optional, default: False)
Whether or not the shuffled graphs are allowed to contain parallel Whether or not the shuffled graphs are allowed to contain parallel
edges. edges.
full_output : bool, optional (default: False) full_output : bool (optional, default: False)
If set to True, three additional lists are returned: the count If set to True, three additional lists are returned: the count
of each motif, the average count of each motif in the shuffled networks, of each motif, the average count of each motif in the shuffled networks,
and the standard deviation of the average count of each motif in the and the standard deviation of the average count of each motif in the
shuffled networks. shuffled networks.
shuffle_strategy : string, optional (default: "uncorrelated") shuffle_strategy : string (optional, default: "uncorrelated")
Shuffle strategy to use. Can be either "correlated" or "uncorrelated". Shuffle strategy to use. Can be either "correlated" or "uncorrelated".
See random_rewire() for details. See random_rewire() for details.
seed : int, optional (default: 0)
Seed for the random number generator. It the value is 0, a random seed
is used.
Returns Returns
------- -------
...@@ -458,6 +468,8 @@ def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None, ...@@ -458,6 +468,8 @@ def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None,
Examples Examples
-------- --------
>>> from numpy import random
>>> random.seed(42)
>>> g = gt.random_graph(100, lambda: (3,3), seed=10) >>> g = gt.random_graph(100, lambda: (3,3), seed=10)
>>> motifs, zscores = gt.motif_significance(g, 3) >>> motifs, zscores = gt.motif_significance(g, 3)
>>> print len(motifs) >>> print len(motifs)
...@@ -466,19 +478,31 @@ def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None, ...@@ -466,19 +478,31 @@ def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None,
[1.6197267471265697, 1.6271265351937325, 0.99350347553112339, -1.4729502635694927, -1.1004922497513825, -1.1181853811005562, 0.92339606894139736, -0.11, -0.10000000000000001, -0.28999999999999998, -0.22, -0.01] [1.6197267471265697, 1.6271265351937325, 0.99350347553112339, -1.4729502635694927, -1.1004922497513825, -1.1181853811005562, 0.92339606894139736, -0.11, -0.10000000000000001, -0.28999999999999998, -0.22, -0.01]
""" """
s_ms, counts = motifs(g, k, p, motif_list, undirected, seed) s_ms, counts = motifs(g, k, p, motif_list, undirected)
if threshold > 0:
s_ms, counts = zip(*[x for x in zip(s_ms, counts) if x[1] > threshold])
s_ms = list(s_ms)
counts = list(counts)
s_counts = [0]*len(s_ms) s_counts = [0]*len(s_ms)
s_dev = [0]*len(s_ms) s_dev = [0]*len(s_ms)
# group subgraphs by number of edges
m_e = defaultdict(lambda: [])
for i in xrange(len(s_ms)):
m_e[_graph_sig(s_ms[i])].append(i)
# get samples # get samples
sg = g.copy() sg = g.copy()
for i in xrange(0, n_shuffles): for i in xrange(0, n_shuffles):
random_rewire(sg, shuffle_strategy, self_loops=self_loops, random_rewire(sg, shuffle_strategy, self_loops=self_loops,
parallel_edges=parallel_edges) parallel_edges=parallel_edges)
m_temp, count_temp = motifs(sg, k, p, motif_list, undirected, seed) m_temp, count_temp = motifs(sg, k, p, motif_list, undirected)
if threshold > 0:
m_temp, count_temp = zip(*[x for x in zip(m_temp, count_temp) \
if x[1] > threshold])
for j in xrange(0, len(m_temp)): for j in xrange(0, len(m_temp)):
found = False found = False
for l in xrange(0, len(s_ms)): for l in m_e[_graph_sig(m_temp[j])]:
if isomorphism(s_ms[l], m_temp[j]): if isomorphism(s_ms[l], m_temp[j]):
found = True found = True
s_counts[l] += count_temp[j] s_counts[l] += count_temp[j]
...@@ -488,6 +512,7 @@ def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None, ...@@ -488,6 +512,7 @@ def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None,
s_counts.append(count_temp[j]) s_counts.append(count_temp[j])
s_dev.append(count_temp[j]**2) s_dev.append(count_temp[j]**2)
counts.append(0) counts.append(0)
m_e[_graph_sig(m_temp[j])].append(len(s_ms)-1)
s_counts = [ x/float(n_shuffles) for x in s_counts ] s_counts = [ x/float(n_shuffles) for x in s_counts ]
s_dev = [ max(sqrt(x[0]/float(n_shuffles) - x[1]**2),1) \ s_dev = [ max(sqrt(x[0]/float(n_shuffles) - x[1]**2),1) \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment