__init__.py 19.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#! /usr/bin/env python
# graph_tool.py -- a general graph manipulation python module
#
# Copyright (C) 2007 Tiago de Paula Peixoto <tiago@forked.de>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

19
"""
20
21
``graph_tool.clustering`` - Clustering coefficients
---------------------------------------------------
22
23
24

Provides algorithms for calculation of clustering coefficients,
aka. transitivity.
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

Summary
+++++++

.. autosummary::
   :nosignatures:

   local_clustering
   global_clustering
   extended_clustering
   motifs
   motif_significance

Contents
++++++++
40
41
"""

Tiago Peixoto's avatar
Tiago Peixoto committed
42
from .. dl_import import dl_import
43
dl_import("import libgraph_tool_clustering as _gt")
44

45
from .. core import _degree, _prop, Graph
46
47
from .. topology import isomorphism
from .. generation import random_rewire
48
from .. stats import vertex_hist
49
from itertools import izip
50
from collections import defaultdict
51
from numpy import *
52
from numpy import random
53
import sys
54

55
__all__ = ["local_clustering", "global_clustering", "extended_clustering",
56
           "motifs", "motif_significance"]
57

58
59
def local_clustering(g, prop=None, undirected=False):
    r"""
Tiago Peixoto's avatar
Tiago Peixoto committed
60
    Return the local clustering coefficients for all vertices.
61
62
63

    Parameters
    ----------
64
    g : :class:`~graph_tool.Graph`
65
        Graph to be used.
66
    prop : :class:`~graph_tool.PropertyMap` or string, optional
67
68
69
70
71
72
73
74
        Vertex property map where results will be stored. If specified, this
        parameter will also be the return value.
    undirected : bool, optional
        Calculate the *undirected* clustering coefficient, if graph is directed
        (this option has no effect if the graph is undirected).

    Returns
    -------
75
    prop : :class:`~graph_tool.PropertyMap`
76
77
78
79
80
81
        Vertex property containing the clustering coefficients.

    See Also
    --------
    global_clustering: global clustering coefficient
    extended_clustering: extended (generalized) clustering coefficient
82
    motifs: motif counting
83
84
85

    Notes
    -----
86
87
    The local clustering coefficient [watts-collective-1998]_ :math:`c_i` is
    defined as
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109

    .. math::
       c_i = \frac{|\{e_{jk}\}|}{k_i(k_i-1)} :\, v_j,v_k \in N_i,\, e_{jk} \in E

    where :math:`k_i` is the out-degree of vertex :math:`i`, and

    .. math::
       N_i = \{v_j : e_{ij} \in E\}

    is the set of out-neighbours of vertex :math:`i`. For undirected graphs the
    value of :math:`c_i` is normalized as

    .. math::
       c'_i = 2c_i.

    The implemented algorithm runs in :math:`O(|V|\left< k\right>^3)` time,
    where :math:`\left< k\right>` is the average out-degree.

    If enabled during compilation, this algorithm runs in parallel.

    Examples
    --------
110
111
112
    >>> from numpy.random import seed
    >>> seed(42)
    >>> g = gt.random_graph(1000, lambda: (5,5))
113
114
    >>> clust = gt.local_clustering(g)
    >>> print gt.vertex_average(g, clust)
Tiago Peixoto's avatar
Tiago Peixoto committed
115
    (0.0052816666666666671, 0.00046415526317530143)
116
117
118

    References
    ----------
119
    .. [watts-collective-1998] D. J. Watts and Steven Strogatz, "Collective
Tiago Peixoto's avatar
Tiago Peixoto committed
120
       dynamics of 'small-world' networks", Nature, vol. 393, pp 440-442, 1998.
121
122
123
       doi:10.1038/30918
    """

124
125
    if prop == None:
        prop = g.new_vertex_property("double")
126
127
    was_directed = g.is_directed()
    if g.is_directed() and undirected:
128
129
        g.set_directed(False)
    try:
130
       _gt.extended_clustering(g._Graph__graph,
131
132
133
134
                                [_prop("v", g, prop)])
    finally:
        if was_directed and undirected:
            g.set_directed(True)
135
136
137
    return prop

def global_clustering(g):
138
    r"""
Tiago Peixoto's avatar
Tiago Peixoto committed
139
    Return the global clustering coefficient.
140
141
142

    Parameters
    ----------
143
    g : :class:`~graph_tool.Graph`
144
145
146
147
148
149
150
151
152
153
154
        Graph to be used.

    Returns
    -------
    c : tuple of floats
        Global clustering coefficient and standard deviation (jacknife method)

    See Also
    --------
    local_clustering: local clustering coefficient
    extended_clustering: extended (generalized) clustering coefficient
155
    motifs: motif counting
156
157
158

    Notes
    -----
159
160
    The global clustering coefficient [newman-structure-2003]_ :math:`c` is
    defined as
161
162
163
164
165
166
167
168
169
170
171
172

    .. math::
       c = 3 \times \frac{\text{number of triangles}}
                          {\text{number of connected triples}}

    The implemented algorithm runs in :math:`O(|V|\left< k\right>^3)` time,
    where :math:`\left< k\right>` is the average (total) degree.

    If enabled during compilation, this algorithm runs in parallel.

    Examples
    --------
173
174
175
    >>> from numpy.random import seed
    >>> seed(42)
    >>> g = gt.random_graph(1000, lambda: (5,5))
176
    >>> print gt.global_clustering(g)
Tiago Peixoto's avatar
Tiago Peixoto committed
177
    (0.0075696677384780283, 0.00039465997422993533)
178
179
180

    References
    ----------
181
182
    .. [newman-structure-2003] M. E. J. Newman, "The structure and function of
       complex networks", SIAM Review, vol. 45, pp. 167-256, 2003
183
184
    """

185
    c =_gt.global_clustering(g._Graph__graph)
186
187
    return c

188
189
def extended_clustering(g, props=None, max_depth=3, undirected=False):
    r"""
Tiago Peixoto's avatar
Tiago Peixoto committed
190
    Return the extended clustering coefficients for all vertices.
191
192
193

    Parameters
    ----------
194
    g : :class:`~graph_tool.Graph`
195
        Graph to be used.
196
    props : list of :class:`~graph_tool.PropertyMap` objects, optional
197
198
199
200
201
202
203
204
205
206
        list of vertex property maps where results will be stored. If specified,
        this parameter will also be the return value.
    max_depth : int, optional
        Maximum clustering order (default: 3).
    undirected : bool, optional
        Calculate the *undirected* clustering coefficients, if graph is directed
        (this option has no effect if the graph is undirected).

    Returns
    -------
207
    prop : list of :class:`~graph_tool.PropertyMap` objects
208
209
210
211
212
213
        List of vertex properties containing the clustering coefficients.

    See Also
    --------
    local_clustering: local clustering coefficient
    global_clustering: global clustering coefficient
214
    motifs: motif counting
215
216
217

    Notes
    -----
218
219
    The extended clustering coefficient :math:`c^d_i` of order :math:`d` is
    defined as
220
221

    .. math::
222
223
       c^d_i = \frac{\left|\right\{ \{u,v\}; u,v \in N_i | d_{G(V\setminus
       \{i\})}(u,v) = d \left\}\right|}{{\left|N_i\right| \choose 2}},
224
225
226
227
228
229
230
231
232
233
234

    where :math:`d_G(u,v)` is the shortest distance from vertex :math:`u` to
    :math:`v` in graph :math:`G`, and

    .. math::
       N_i = \{v_j : e_{ij} \in E\}

    is the set of out-neighbours of :math:`i`. According to the above
    definition, we have that the traditional local clustering coefficient is
    recovered for :math:`d=1`, i.e., :math:`c^1_i = c_i`.

235
    The implemented algorithm runs in
Tiago Peixoto's avatar
Tiago Peixoto committed
236
    :math:`O(|V|\left<k\right>^{2+\text{max-depth}})` worst time, where
237
    :math:`\left< k\right>` is the average out-degree.
238
239
240
241
242

    If enabled during compilation, this algorithm runs in parallel.

    Examples
    --------
243
244
245
    >>> from numpy.random import seed
    >>> seed(42)
    >>> g = gt.random_graph(1000, lambda: (5,5))
246
247
248
249
    >>> clusts = gt.extended_clustering(g, max_depth=5)
    >>> for i in xrange(0, 5):
    ...    print gt.vertex_average(g, clusts[i])
    ...
Tiago Peixoto's avatar
Tiago Peixoto committed
250
251
252
253
254
    (0.0052816666666666671, 0.00046415526317530143)
    (0.026543333333333332, 0.0010405374199048405)
    (0.11648, 0.0019761350156302583)
    (0.40672499999999995, 0.0031128844140121867)
    (0.42646999999999996, 0.0030644539462829075)
255
256
257

    References
    ----------
258
259
    .. [abdo-clustering] A. H. Abdo, A. P. S. de Moura, "Clustering as a
       measure of the local topology of networks", arXiv:physics/0605235v4
260
261
    """

262
263
    was_directed = g.is_directed()
    if g.is_directed() and undirected:
264
        g.set_directed(False)
265
266
267
268
    if props == None:
        props = []
        for i in xrange(0, max_depth):
            props.append(g.new_vertex_property("double"))
269
    try:
270
271
       _gt.extended_clustering(g._Graph__graph,
                               [_prop("v", g, p) for p in props])
272
273
274
    finally:
        if was_directed and undirected:
            g.set_directed(True)
275
    return props
276
277


278
def motifs(g, k, p=1.0, motif_list=None, undirected=None):
279
280
281
282
283
284
285
    r"""
    Count the occurrence of k-size subgraphs (motifs). A tuple with two lists is
    returned: the list of motifs found, and the list with their respective
    counts.

    Parameters
    ----------
286
    g : :class:`~graph_tool.Graph`
287
288
289
        Graph to be used.
    k : int
        number of vertices of the motifs
290
    p : float or float list (optional, default: 1.0)
291
292
        uniform fraction of the motifs to be sampled. If a float list is
        provided, it will be used as the fraction at each depth
293
        :math:`[1,\dots,k]` in the algorithm. See [wernicke-efficient-2006]_ for
294
        more details.
295
    motif_list : list of :class:`~graph_tool.Graph` objects, optional
296
        If supplied, the algorithms will only search for the motifs in this list
Tiago Peixoto's avatar
Tiago Peixoto committed
297
        (or isomorphisms).
298
299
300
301
302
303
    undirected : bool, optional
        Treat the graph as *undirected*, if graph is directed
        (this option has no effect if the graph is undirected).

    Returns
    -------
304
    motifs : list of :class:`~graph_tool.Graph` objects
305
306
307
308
309
310
311
312
313
        List of motifs of size k found in the Graph. Graphs are grouped
        according to their isomorphism class, and only one of each class appears
        in this list. The list is sorted according to in-degree sequence,
        out-degree-sequence, and number of edges (in this order).
    counts : list of ints
        The number of times the respective motif in the motifs list was counted

    See Also
    --------
314
    motif_significance: significance profile of motifs
315
316
317
318
319
320
321
    local_clustering: local clustering coefficient
    global_clustering: global clustering coefficient
    extended_clustering: extended (generalized) clustering coefficient

    Notes
    -----
    This functions implements the ESU and RAND-ESU algorithms described in
322
    [wernicke-efficient-2006]_.
323
324
325
326
327

    If enabled during compilation, this algorithm runs in parallel.

    Examples
    --------
328
329
330
    >>> from numpy.random import seed
    >>> seed(42)
    >>> g = gt.random_graph(1000, lambda: (5,5))
331
    >>> motifs, counts = gt.motifs(g, 4, undirected=True)
332
    >>> print len(motifs)
Tiago Peixoto's avatar
Tiago Peixoto committed
333
    11
334
    >>> print counts
Tiago Peixoto's avatar
Tiago Peixoto committed
335
336
    [115780, 390603, 667, 764, 2666, 1694, 821, 5, 9, 4, 4]

337
338
339

    References
    ----------
340
    .. [wernicke-efficient-2006] S. Wernicke, "Efficient detection of network
341
342
343
344
       motifs", IEEE/ACM Transactions on Computational Biology and
       Bioinformatics (TCBB), Volume 3, Issue 4, Pages 347-359, 2006.
    """

345
    seed = random.randint(0, sys.maxint)
346
347
348
349

    sub_list = []
    directed_motifs = g.is_directed() if undirected == None else not undirected

350
351
352
    if motif_list != None:
        directed_motifs = motif_list[0].is_directed()
        for m in motif_list:
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
            if m.is_directed() != directed_motifs:
                raise ValueError("all motif graphs must be either directed or undirected")
            if m.num_vertices() != k:
                raise ValueError("all motifs must have the same number of vertices: " + k)
            sub_list.append(m._Graph__graph)

    if type(p) == float:
        pd = [1.0]*(k-1)
        pd.append(p)
    if type(p) == list:
        pd = [float(x) for x in p]

    hist = []
    was_directed = g.is_directed()
    if g.is_directed() and not directed_motifs:
        g.set_directed(False)
    try:
       _gt.get_motifs(g._Graph__graph, k, sub_list, hist, pd,
                      True, len(sub_list) == 0,
                      seed)
    finally:
        if was_directed and not directed_motifs:
            g.set_directed(True)

    # assemble graphs
    temp = []
    for m in sub_list:
        mg = Graph()
        mg._Graph__graph = m
        temp.append(mg)
    sub_list = temp

    list_hist = zip(sub_list, hist)
    # sort according to in-degree sequence
    list_hist.sort(lambda x,y: cmp(sorted([v.in_degree() for v in x[0].vertices()]),
                                   sorted([v.in_degree() for v in y[0].vertices()])))

    # sort according to out-degree sequence
    list_hist.sort(lambda x,y: cmp(sorted([v.out_degree() for v in x[0].vertices()]),
                                   sorted([v.out_degree() for v in y[0].vertices()])))

    # sort according to ascending number of edges
    list_hist.sort(lambda x,y: cmp(x[0].num_edges(), y[0].num_edges()))

    sub_list = [x[0] for x in list_hist]
    hist = [x[1] for x in list_hist]

    return sub_list, hist
401

402
403
404
405
406
407
408
409
410
411
412
413
414
def _graph_sig(g):
    """return the graph signature, i.e., the in and out degree distribution as
    concatenated as a tuple."""
    bins = range(0, g.num_vertices()+1)
    in_dist = vertex_hist(g, "in", bins = bins if g.is_directed() else [0],
                          float_count=False)
    out_dist = vertex_hist(g, "out", bins = bins, float_count=False)
    sig = tuple([(in_dist[1][i],in_dist[0][i]) for \
                 i in xrange(len(in_dist[0]))] +
                [(out_dist[1][i],out_dist[0][i]) for\
                 i in xrange(len(out_dist[0]))])
    return sig

415
def motif_significance(g, k, n_shuffles=100, p=1.0, motif_list=None,
416
417
418
                       threshold=0, undirected=None, self_loops=False,
                       parallel_edges=False, full_output=False,
                       shuffle_strategy= "uncorrelated"):
419
420
421
422
423
424
425
    r"""
    Obtain the motif significance profile, for subgraphs with k vertices. A
    tuple with two lists is returned: the list of motifs found, and their
    respective z-scores.

    Parameters
    ----------
426
    g : :class:`~graph_tool.Graph`
427
428
429
        Graph to be used.
    k : int
        number of vertices of the motifs
430
    n_shuffles : int (optional, default: 100)
431
        number of shuffled networks to consider for the z-score
432
    p : float or float list (optional, default: 1.0)
433
434
        uniform fraction of the motifs to be sampled. If a float list is
        provided, it will be used as the fraction at each depth
435
        :math:`[1,\dots,k]` in the algorithm. See [wernicke-efficient-2006]_ for
436
        more details.
437
    motif_list : list of :class:`~graph_tool.Graph` objects (optional, default: None)
438
        If supplied, the algorithms will only search for the motifs in this list
439
440
441
442
        (isomorphisms)
    threshold : int (optional, default: 0)
        If a given motif count is below this level, it is not considered.
    undirected : bool (optional, default: None)
443
444
        Treat the graph as *undirected*, if graph is directed
        (this option has no effect if the graph is undirected).
445
    self_loops : bool (optional, default: False)
446
        Whether or not the shuffled graphs are allowed to contain self-loops
447
    parallel_edges : bool (optional, default: False)
448
449
        Whether or not the shuffled graphs are allowed to contain parallel
        edges.
450
    full_output : bool (optional, default: False)
451
452
453
454
        If set to True, three additional lists are returned: the count
        of each motif, the average count of each motif in the shuffled networks,
        and the standard deviation of the average count of each motif in the
        shuffled networks.
455
    shuffle_strategy : string (optional, default: "uncorrelated")
456
457
        Shuffle strategy to use. Can be either "correlated" or "uncorrelated".
        See random_rewire() for details.
458
459
460

    Returns
    -------
461
    motifs : list of :class:`~graph_tool.Graph` objects
462
463
464
465
466
467
        List of motifs of size k found in the Graph. Graphs are grouped
        according to their isomorphism class, and only one of each class appears
        in this list. The list is sorted according to in-degree sequence,
        out-degree-sequence, and number of edges (in this order).
    z-scores : list of floats
        The z-score of the respective motives. See below for the definition of
Tiago Peixoto's avatar
Tiago Peixoto committed
468
        the z-score.
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484

    See Also
    --------
    motifs: motif counting or sampling
    local_clustering: local clustering coefficient
    global_clustering: global clustering coefficient
    extended_clustering: extended (generalized) clustering coefficient

    Notes
    -----
    The z-score :math:`z_i` of motif i is defined as

    .. math::
         z_i = \frac{N_i - \left<N^s_i\right>}
         {\sqrt{\left<(N^s_i)^2\right> - \left<N^s_i\right>^2}},

485
    where :math:`N_i` is the number of times motif i found, and :math:`N^s_i`
486
487
488
489
490
491
492
493
494
495
    is the count of the same motif but on a shuffled network. It measures how
    many standard deviations is each motif count, in respect to a ensemble of
    randomly shuffled graphs with the same degree sequence.

    The z-scores values are not normalized.

    If enabled during compilation, this algorithm runs in parallel.

    Examples
    --------
496
    >>> from numpy import random
497
498
    >>> random.seed(10)
    >>> g = gt.random_graph(100, lambda: (3,3))
499
500
    >>> motifs, zscores = gt.motif_significance(g, 3)
    >>> print len(motifs)
Tiago Peixoto's avatar
Tiago Peixoto committed
501
    12
502
    >>> print zscores
Tiago Peixoto's avatar
Tiago Peixoto committed
503
    [0.84493166311546375, 0.83875258032645894, 1.2117425659306142, -0.20405718722884647, -0.69886762637118316, -0.68343227667794837, -0.92481997609648403, -0.11, -0.14999999999999999, -0.26000000000000001, -0.14000000000000001, -0.01]
504
505
    """

506
507
508
509
510
    s_ms, counts = motifs(g, k, p, motif_list, undirected)
    if threshold > 0:
        s_ms, counts = zip(*[x for x in zip(s_ms, counts) if x[1] > threshold])
        s_ms = list(s_ms)
        counts = list(counts)
511
512
513
    s_counts = [0]*len(s_ms)
    s_dev = [0]*len(s_ms)

514
515
516
517
518
    # group subgraphs by number of edges
    m_e = defaultdict(lambda: [])
    for i in xrange(len(s_ms)):
        m_e[_graph_sig(s_ms[i])].append(i)

519
520
521
    # get samples
    sg = g.copy()
    for i in xrange(0, n_shuffles):
522
523
        random_rewire(sg, shuffle_strategy, self_loops=self_loops,
                      parallel_edges=parallel_edges)
524
525
526
527
        m_temp, count_temp = motifs(sg, k, p, motif_list, undirected)
        if threshold > 0:
            m_temp, count_temp = zip(*[x for x in zip(m_temp, count_temp) \
                                       if x[1] > threshold])
528
529
        for j in xrange(0, len(m_temp)):
            found = False
530
            for l in m_e[_graph_sig(m_temp[j])]:
531
532
533
534
535
536
537
538
539
                if isomorphism(s_ms[l], m_temp[j]):
                    found = True
                    s_counts[l] += count_temp[j]
                    s_dev[l] += count_temp[j]**2
            if not found:
                s_ms.append(m_temp[j])
                s_counts.append(count_temp[j])
                s_dev.append(count_temp[j]**2)
                counts.append(0)
540
                m_e[_graph_sig(m_temp[j])].append(len(s_ms)-1)
541
542

    s_counts = [ x/float(n_shuffles) for x in s_counts ]
543
    s_dev = [ max(sqrt(x[0]/float(n_shuffles) - x[1]**2),1) \
544
545
546
547
              for x in izip(s_dev,s_counts) ]

    list_hist = zip(s_ms, s_counts, s_dev)
    # sort according to in-degree sequence
548
549
550
551
    list_hist.sort(lambda x,y: cmp(sorted([v.in_degree()\
                                           for v in x[0].vertices()]),
                                   sorted([v.in_degree()\
                                           for v in y[0].vertices()])))
552
553

    # sort according to out-degree sequence
554
555
556
557
    list_hist.sort(lambda x,y: cmp(sorted([v.out_degree()\
                                           for v in x[0].vertices()]),
                                   sorted([v.out_degree()\
                                           for v in y[0].vertices()])))
558
559
560
561
562
563
564
565
566
567
568
569

    # sort according to ascending number of edges
    list_hist.sort(lambda x,y: cmp(x[0].num_edges(), y[0].num_edges()))

    s_ms, s_counts, s_dev = zip(*list_hist)

    zscore = [(x[0] - x[1])/x[2] for x in izip(counts, s_counts, s_dev)]

    if full_output:
        return s_ms, zscore, counts, s_counts, s_dev
    else:
        return s_ms, zscore