Commit 9137c542 authored by Tiago Peixoto

vertex_similarity(): Add support for weighted and multigraphs

This fixes issue #592.
parent 2c084876
Pipeline #492 failed with stage
in 395 minutes and 23 seconds
......@@ -25,107 +25,135 @@ using namespace std;
using namespace boost;
using namespace graph_tool;
void get_dice_similarity(GraphInterface& gi, boost::any as, bool self_loop)
typedef UnityPropertyMap<uint8_t, GraphInterface::edge_t> ecmap_t;
typedef boost::mpl::push_back<edge_scalar_properties, ecmap_t>::type
weight_props_t;
void get_dice_similarity(GraphInterface& gi, boost::any as, boost::any weight)
{
if (weight.empty())
weight = ecmap_t();
gt_dispatch<>()
([&](auto& g, auto& s)
([&](auto& g, auto& s, auto& w)
{
all_pairs_similarity(g, s,
[&](auto u, auto v, auto& mask)
[&](auto u, auto v, auto& mask, auto& w)
{
return dice(u, v, self_loop, mask, g);
});
return dice(u, v, mask, w, g);
}, w);
},
all_graph_views(), vertex_floating_vector_properties())
(gi.get_graph_view(), as);
all_graph_views(), vertex_floating_vector_properties(),
weight_props_t())
(gi.get_graph_view(), as, weight);
}
void get_dice_similarity_pairs(GraphInterface& gi, python::object opairs,
python::object osim, bool self_loop)
python::object osim, boost::any weight)
{
multi_array_ref<int64_t,2> pairs = get_array<int64_t,2>(opairs);
multi_array_ref<double,1> sim = get_array<double,1>(osim);
if (weight.empty())
weight = ecmap_t();
gt_dispatch<>()
([&](auto& g)
([&](auto& g, auto w)
{
some_pairs_similarity(g, pairs, sim,
[&](auto u, auto v, auto& mask)
[&](auto u, auto v, auto& mask, auto& w)
{
return dice(u, v, self_loop, mask, g);
});
return dice(u, v, mask, w, g);
}, w);
},
all_graph_views())
(gi.get_graph_view());
all_graph_views(), weight_props_t())
(gi.get_graph_view(), weight);
}
void get_jaccard_similarity(GraphInterface& gi, boost::any as, bool self_loop)
void get_jaccard_similarity(GraphInterface& gi, boost::any as, boost::any weight)
{
if (weight.empty())
weight = ecmap_t();
gt_dispatch<>()
([&](auto& g, auto& s)
([&](auto& g, auto& s, auto w)
{
all_pairs_similarity(g, s,
[&](auto u, auto v, auto& mask)
[&](auto u, auto v, auto& mask, auto w)
{
return jaccard(u, v, self_loop, mask, g);
});
return jaccard(u, v, mask, w, g);
}, w);
},
all_graph_views(), vertex_floating_vector_properties())
(gi.get_graph_view(), as);
all_graph_views(), vertex_floating_vector_properties(),
weight_props_t())
(gi.get_graph_view(), as, weight);
}
void get_jaccard_similarity_pairs(GraphInterface& gi, python::object opairs,
python::object osim, bool self_loop)
python::object osim, boost::any weight)
{
multi_array_ref<int64_t,2> pairs = get_array<int64_t,2>(opairs);
multi_array_ref<double,1> sim = get_array<double,1>(osim);
if (weight.empty())
weight = ecmap_t();
gt_dispatch<>()
([&](auto& g)
([&](auto& g, auto w)
{
some_pairs_similarity(g, pairs, sim,
[&](auto u, auto v, auto& mask)
[&](auto u, auto v, auto& mask, auto w)
{
return jaccard(u, v, self_loop, mask, g);
});
return jaccard(u, v, mask, w, g);
}, w);
},
all_graph_views())
(gi.get_graph_view());
all_graph_views(), weight_props_t())
(gi.get_graph_view(), weight);
}
void get_inv_log_weight_similarity(GraphInterface& gi, boost::any as)
void get_inv_log_weight_similarity(GraphInterface& gi, boost::any as,
boost::any weight)
{
if (weight.empty())
weight = ecmap_t();
gt_dispatch<>()
([&](auto& g, auto& s)
([&](auto& g, auto& s, auto w)
{
all_pairs_similarity(g, s,
[&](auto u, auto v, auto& mask)
[&](auto u, auto v, auto& mask, auto w)
{
return inv_log_weighted(u, v, mask, g);
});
return inv_log_weighted(u, v, mask, w, g);
}, w);
},
all_graph_views(), vertex_floating_vector_properties())
(gi.get_graph_view(), as);
all_graph_views(), vertex_floating_vector_properties(),
weight_props_t())
(gi.get_graph_view(), as, weight);
}
void get_inv_log_weight_similarity_pairs(GraphInterface& gi,
python::object opairs,
python::object osim)
python::object osim,
boost::any weight)
{
multi_array_ref<int64_t,2> pairs = get_array<int64_t,2>(opairs);
multi_array_ref<double,1> sim = get_array<double,1>(osim);
if (weight.empty())
weight = ecmap_t();
gt_dispatch<>()
([&](auto& g)
([&](auto& g, auto w)
{
some_pairs_similarity(g, pairs, sim,
[&](auto u, auto v, auto& mask)
[&](auto u, auto v, auto& mask, auto w)
{
return inv_log_weighted(u, v, mask, g);
});
return inv_log_weighted(u, v, mask, w, g);
}, w);
},
all_graph_views())
(gi.get_graph_view());
all_graph_views(), weight_props_t())
(gi.get_graph_view(), weight);
}
......
......@@ -25,80 +25,81 @@ namespace graph_tool
using namespace std;
using namespace boost;
template <class Graph, class Vertex, class Mark>
double dice(Vertex u, Vertex v, bool self_loop, Mark& mark, Graph& g)
template <class Graph, class Vertex, class Mark, class Weight>
double dice(Vertex u, Vertex v, Mark& mark, Weight& weight, Graph& g)
{
size_t count = 0;
for (auto w : adjacent_vertices_range(u, g))
mark[w] = true;
if (self_loop)
mark[u] = true;
for (auto w : adjacent_vertices_range(v, g))
typename property_traits<Weight>::value_type count = 0, ku = 0, kv = 0;
for (auto e : out_edges_range(u, g))
{
if (mark[w])
count++;
auto w = weight[e];
mark[target(e, g)] += w;
ku += w;
}
for (auto e : out_edges_range(v, g))
{
auto w = weight[e];
auto dw = std::min(w, mark[target(e, g)]);
mark[target(e, g)] -= dw;
count += dw;
kv += w;
}
for (auto w : adjacent_vertices_range(u, g))
mark[w] = false;
if (self_loop)
mark[u] = false;
return 2 * count / double(out_degree(u, g) + out_degree(v, g));
mark[w] = 0;
return 2 * count / double(ku + kv);
}
template <class Graph, class Vertex, class Mark>
double jaccard(Vertex u, Vertex v, bool self_loop, Mark& mark, Graph& g)
template <class Graph, class Vertex, class Mark, class Weight>
double jaccard(Vertex u, Vertex v, Mark& mark, Weight& weight, Graph& g)
{
size_t count = 0, total = 0;
for (auto w : adjacent_vertices_range(u, g))
typename property_traits<Weight>::value_type count = 0, total = 0;
for (auto e : out_edges_range(u, g))
{
mark[w] = true;
total++;
auto w = weight[e];
mark[target(e, g)] += w;
total += w;
}
if (self_loop)
mark[u] = true;
for (auto w : adjacent_vertices_range(v, g))
for (auto e : out_edges_range(v, g))
{
if (mark[w])
count++;
else
total++;
auto w = weight[e];
auto dw = std::min(w, mark[target(e, g)]);
count += dw;
mark[target(e, g)] -= dw;
total += w - dw;
}
for (auto w : adjacent_vertices_range(u, g))
mark[w] = false;
if (self_loop)
mark[u] = false;
mark[w] = 0;
return count / double(total);
}
template <class Graph, class Vertex, class Mark>
double inv_log_weighted(Vertex u, Vertex v, Mark& mark, Graph& g)
template <class Graph, class Vertex, class Mark, class Weight>
double inv_log_weighted(Vertex u, Vertex v, Mark& mark, Weight& weight, Graph& g)
{
double count = 0;
for (auto w : adjacent_vertices_range(u, g))
mark[w] = true;
for (auto e : out_edges_range(u, g))
mark[target(e, g)] += weight[e];
for (auto w : adjacent_vertices_range(v, g))
{
if (mark[w])
if (mark[w] > 0)
{
if (graph_tool::is_directed(g))
count += 1. / log(in_degreeS()(w, g));
count += mark[w] / log(in_degreeS()(w, g, weight));
else
count += 1. / log(out_degree(w, g));
count += mark[w] / log(out_degreeS()(w, g, weight));
}
}
for (auto w : adjacent_vertices_range(u, g))
mark[w] = false;
mark[w] = 0;
return count;
}
template <class Graph, class VMap, class Sim>
void all_pairs_similarity(Graph& g, VMap s, Sim&& f)
template <class Graph, class VMap, class Sim, class Weight>
void all_pairs_similarity(Graph& g, VMap s, Sim&& f, Weight& weight)
{
vector<bool> mask(num_vertices(g), false);
vector<typename property_traits<Weight>::value_type>
mask(num_vertices(g));
#pragma omp parallel if (num_vertices(g) > OPENMP_MIN_THRESH) \
firstprivate(mask)
parallel_vertex_loop_no_spawn
......@@ -107,23 +108,25 @@ void all_pairs_similarity(Graph& g, VMap s, Sim&& f)
{
s[v].resize(num_vertices(g));
for (auto w : vertices_range(g))
s[v][w] = f(v, w, mask);
s[v][w] = f(v, w, mask, weight);
});
}
template <class Graph, class Vlist, class Slist, class Sim>
void some_pairs_similarity(Graph& g, Vlist& vlist, Slist& slist, Sim&& f)
template <class Graph, class Vlist, class Slist, class Sim, class Weight>
void some_pairs_similarity(Graph& g, Vlist& vlist, Slist& slist, Sim&& f,
Weight& weight)
{
vector<bool> mask(num_vertices(g), false);
vector<typename property_traits<Weight>::value_type>
mark(num_vertices(g));
#pragma omp parallel if (num_vertices(g) > OPENMP_MIN_THRESH) \
firstprivate(mask)
firstprivate(mark)
parallel_loop_no_spawn
(vlist,
[&](size_t i, const auto& val)
{
size_t u = val[0];
size_t v = val[1];
slist[i] = f(u, v, mask);
slist[i] = f(u, v, mark, weight);
});
}
......
......@@ -264,7 +264,7 @@ def similarity(g1, g2, eweight1=None, eweight2=None, label1=None, label2=None,
return s
@_limit_args({"sim_type": ["dice", "jaccard", "inv-log-weight"]})
def vertex_similarity(g, sim_type="jaccard", vertex_pairs=None, self_loops=True,
def vertex_similarity(g, sim_type="jaccard", vertex_pairs=None, eweight=None,
sim_map=None):
r"""Return the similarity between pairs of vertices.
......@@ -278,9 +278,8 @@ def vertex_similarity(g, sim_type="jaccard", vertex_pairs=None, self_loops=True,
vertex_pairs : iterable of pairs of integers (optional, default: ``None``)
Pairs of vertices to compute the similarity. If omitted, all pairs will
be considered.
self_loops : bool (optional, default: ``True``)
If ``True``, vertices will be considered adjacent to themselves for the
purpose of the similarity computation.
eweight : :class:`~graph_tool.EdgePropertyMap` (optional, default: ``None``)
Edge weights.
sim_map : :class:`~graph_tool.VertexPropertyMap` (optional, default: ``None``)
If provided, and ``vertex_pairs is None``, the vertex similarities will
be stored in this vector-valued property. Otherwise, a new one will be
......@@ -296,27 +295,41 @@ def vertex_similarity(g, sim_type="jaccard", vertex_pairs=None, self_loops=True,
Notes
-----
According to ``sim_type``, this function computes the following similarities:
According to ``sim_type``, this function computes one of the following
similarities:
``sim_type == "dice"``
The Sørensen–Dice similarity [sorensen-dice]_ is twice the number of
common neighbors between two vertices divided by the sum of their
degrees.
The Sørensen–Dice similarity [sorensen-dice]_ of vertices :math:`u` and
:math:`v` is defined as
.. math::
\frac{2|\Gamma(u)\cap\Gamma(v)|}{|\Gamma(u)|+|\Gamma(v)|},
where :math:`\Gamma(u)` is the set of neighbors of vertex :math:`u`.
``sim_type == "jaccard"``
The Jaccard similarity [jaccard]_ is the number of common neighbors
between two vertices divided by the size of the set of all neighbors to
both vertices.
The Jaccard similarity [jaccard]_ of vertices :math:`u` and
:math:`v` is defined as
.. math::
\frac{|\Gamma(u)\cap\Gamma(v)|}{|\Gamma(u)\cup\Gamma(v)|},
where :math:`\Gamma(u)` is the set of neighbors of vertex :math:`u`.
``sim_type == "inv-log-weight"``
The inverse log weighted similarity [adamic-friends-2003]_ is the sum of
the weights of common neighbors between two vertices, where the weights
are computed as :math:`1/\log(k)`, with :math:`k` being the degree of the
vertex.
The inverse log weighted similarity [adamic-friends-2003]_ of vertices
:math:`u` and :math:`v` is defined as
.. math::
\sum_{w \in \Gamma(u)\cap\Gamma(v)}\frac{1}{\log |\Gamma(w)|},
where :math:`\Gamma(u)` is the set of neighbors of vertex :math:`u`.
For directed graphs, only out-neighbors are considered in the above
algorithms (for "inv-log-weight", the in-degrees are used to compute the
......@@ -324,6 +337,17 @@ def vertex_similarity(g, sim_type="jaccard", vertex_pairs=None, self_loops=True,
should be used to reverse the graph, e.g. ``vertex_similarity(GraphView(g,
reversed=True))``.
For weighted graphs or multigraphs, the quantities in the above equations
are defined as follows:
.. math::
|\Gamma(u)\cap\Gamma(v)| &= \sum_w \operatorname{min}(A_{wv}, A_{wu}),\\
|\Gamma(u)\cup\Gamma(v)| &= \sum_w \operatorname{max}(A_{wv}, A_{wu}),\\
|\Gamma(u)| &= \sum_w A_{wu},
where :math:`A_{wu}` is the entry of the weighted adjacency matrix for the edge between :math:`w` and :math:`u`.
The algorithm runs with complexity :math:`O(\left<k\right>N^2)` if
``vertex_pairs is None``, otherwise with :math:`O(\left<k\right>P)` where
:math:`P` is the length of ``vertex_pairs``.
......@@ -371,8 +395,14 @@ def vertex_similarity(g, sim_type="jaccard", vertex_pairs=None, self_loops=True,
"The link-prediction problem for social networks", Journal of the
American Society for Information Science and Technology, Volume 58, Issue
7, pages 1019–1031 (2007), :doi:`10.1002/asi.20591`
"""
if eweight is None:
eweight = libcore.any()
else:
eweight = _prop("e", g, eweight)
if vertex_pairs is None:
if sim_map is None:
s = g.new_vp("vector<double>")
......@@ -381,29 +411,30 @@ def vertex_similarity(g, sim_type="jaccard", vertex_pairs=None, self_loops=True,
if sim_type == "dice":
libgraph_tool_topology.dice_similarity(g._Graph__graph,
_prop("v", g, s),
self_loops)
eweight)
elif sim_type == "jaccard":
libgraph_tool_topology.jaccard_similarity(g._Graph__graph,
_prop("v", g, s),
self_loops)
eweight)
elif sim_type == "inv-log-weight":
libgraph_tool_topology.inv_log_weight_similarity(g._Graph__graph,
_prop("v", g, s))
_prop("v", g, s),
eweight)
else:
vertex_pairs = numpy.asarray(vertex_pairs, dtype="int64")
s = numpy.zeros(vertex_pairs.shape[0], dtype="double")
if sim_type == "dice":
libgraph_tool_topology.dice_similarity_pairs(g._Graph__graph,
vertex_pairs,
s, self_loops)
s, eweight)
elif sim_type == "jaccard":
libgraph_tool_topology.jaccard_similarity_pairs(g._Graph__graph,
vertex_pairs,
s, self_loops)
s, eweight)
elif sim_type == "inv-log-weight":
libgraph_tool_topology.\
inv_log_weight_similarity_pairs(g._Graph__graph, vertex_pairs,
s)
s, eweight)
return s
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment