Commit 46ca49e7 authored by Tiago Peixoto's avatar Tiago Peixoto
Browse files

Substantial reorganization and improvement of blockmodel inference

This adds support for exact likelihood calculation, correct posterior
sampling of models with nonempty groups, much improved "distributed"
degree-corrected prior, as well as continuous edge covariates.
parent 0c6e9de7
Pipeline #187 failed with stage
in 1188 minutes and 31 seconds
......@@ -10,10 +10,11 @@ if not verbose:
out = open(os.devnull, 'w')
else:
out = sys.stdout
from collections import OrderedDict
import itertools
from graph_tool.all import *
import numpy.random
from numpy.random import randint
from numpy.random import randint, normal, random
numpy.random.seed(42)
seed_rng(42)
......@@ -23,137 +24,216 @@ graph_tool.inference.set_test(True)
g = collection.data["football"]
ec = g.new_ep("int", randint(0, 10, g.num_edges()))
def gen_state(directed, deg_corr, layers, overlap):
rec_p = g.new_ep("double", random(g.num_edges()))
rec_s = g.new_ep("double", normal(0, 10, g.num_edges()))
def gen_state(directed, deg_corr, layers, overlap, rec, allow_empty):
u = GraphView(g, directed=directed)
if layers != False:
state = graph_tool.inference.LayeredBlockState(u, B=u.num_vertices(),
deg_corr=deg_corr,
ec=ec.copy(),
rec=rec,
overlap=overlap,
layers=layers == True)
layers=layers == True,
allow_empty=allow_empty)
elif overlap:
state = graph_tool.inference.OverlapBlockState(u, B=2 * u.num_edges(),
rec=rec,
deg_corr=deg_corr)
else:
state = graph_tool.inference.BlockState(u, B=u.num_vertices(),
deg_corr=deg_corr)
rec=rec,
deg_corr=deg_corr,
allow_empty=allow_empty)
return state
for directed in [True, False]:
for overlap in [False, True]:
for layered in [False, "covariates", True]:
for deg_corr in [False, True]:
for dl in [False, True]:
pranges = [("directed", [False, True]),
("overlap", [False, True]),
("layered", [False, "covariates", True]),
("rec", [None, "positive", "signed"]),
("deg_corr", [False, True]),
("dl", [False, True]),
("degree_dl_kind", ["uniform", "distributed", "entropy"]),
("allow_empty", [False, True]),
("exact", [True, False])]
pranges = OrderedDict(pranges)
def iter_ranges(ranges):
for vals in itertools.product(*[v for k, v in ranges.items()]):
yield zip(ranges.keys(), vals)
for pvals in iter_ranges(pranges):
params = OrderedDict(pvals)
locals().update(params)
if not deg_corr and degree_dl_kind != "uniform":
continue
if (rec is not None or layered != False) and not exact:
continue
print(params, file=out)
rec_ = None
if rec == "positive":
rec_ = rec_p
elif rec == "signed":
rec_ = rec_s
print("\t mcmc (unweighted)", file=out)
state = gen_state(directed, deg_corr, layered, overlap, rec_, allow_empty)
print("\t\t",
state.mcmc_sweep(beta=0,
entropy_args=dict(dl=dl,
degree_dl_kind=degree_dl_kind,
exact=exact)),
state.get_nonempty_B(), file=out)
if overlap:
print("\t\t",
state.mcmc_sweep(beta=0, bundled=True,
entropy_args=dict(dl=dl,
degree_dl_kind=degree_dl_kind,
exact=exact)),
state.get_nonempty_B(), file=out)
state = gen_state(directed, deg_corr, layered, overlap, rec_, allow_empty)
if not overlap:
print("\t mcmc", file=out)
bstate = state.get_block_state(vweight=True, deg_corr=deg_corr)
print("\t\t",
bstate.mcmc_sweep(beta=0,
entropy_args=dict(dl=dl,
degree_dl_kind=degree_dl_kind,
exact=exact)),
bstate.get_nonempty_B(), file=out)
print("\t\t",
bstate.mcmc_sweep(beta=0,
entropy_args=dict(dl=dl,
degree_dl_kind=degree_dl_kind,
exact=exact)),
bstate.get_nonempty_B(), file=out)
print("\t\t",
bstate.gibbs_sweep(beta=0,
entropy_args=dict(dl=dl,
degree_dl_kind=degree_dl_kind,
exact=exact)),
bstate.get_nonempty_B(), file=out)
print("\t merge", file=out)
state = gen_state(directed, deg_corr, layered, overlap, rec_, allow_empty)
if not overlap:
bstate = state.get_block_state(vweight=True, deg_corr=deg_corr)
print("\t\t",
bstate.merge_sweep(50,
entropy_args=dict(dl=dl,
degree_dl_kind=degree_dl_kind,
multigraph=False,
exact=exact)),
file=out)
bstate = bstate.copy()
print("\t\t",
bstate.mcmc_sweep(beta=0,
entropy_args=dict(dl=dl,
degree_dl_kind=degree_dl_kind,
exact=exact)),
file=out)
print("\t\t",
bstate.gibbs_sweep(beta=0,
entropy_args=dict(dl=dl,
degree_dl_kind=degree_dl_kind,
exact=exact)),
file=out)
else:
print("\t\t",
state.merge_sweep(50,
entropy_args=dict(dl=dl,
degree_dl_kind=degree_dl_kind,
multigraph=False,
exact=exact)),
file=out)
print("\ndirected:", directed, "overlap:", overlap,
"layered:", layered, "deg-corr:", deg_corr, "dl:", dl,
file=out)
print("\t shrink", file=out)
state = gen_state(directed, deg_corr, layered, overlap, rec_, allow_empty)
state = state.shrink(B=5, entropy_args=dict(dl=dl,
degree_dl_kind=degree_dl_kind,
multigraph=False,
exact=exact))
print("\t\t", state.B, "\n", file=out)
print("\t mcmc (unweighted)", file=out)
state = gen_state(directed, deg_corr, layered, overlap)
print("\t\t", state.mcmc_sweep(beta=0, allow_empty=True,
entropy_args=dict(dl=dl)),
(state.wr.a > 0).sum(), file=out)
if overlap:
print("\t\t", state.mcmc_sweep(beta=0, bundled=True,
allow_empty=True,
entropy_args=dict(dl=dl)),
(state.wr.a > 0).sum(),
file=out)
pranges = [("directed", [False, True]),
("overlap", [False, True]),
("layered", [False, "covariates", True]),
("rec", [None, "positive", "signed"]),
("deg_corr", [False, True]),
("degree_dl_kind", ["uniform", "distributed", "entropy"]),
("exact", [True])]
state = gen_state(directed, deg_corr, layered, overlap)
pranges = OrderedDict(pranges)
if not overlap:
print("\t mcmc", file=out)
bstate = state.get_block_state(vweight=True,
deg_corr=deg_corr)
for pvals in iter_ranges(pranges):
params = OrderedDict(pvals)
print("\t\t",
bstate.mcmc_sweep(beta=0,
allow_empty=True,
entropy_args=dict(dl=dl,
multigraph=False)),
(bstate.wr.a > 0).sum(), file=out)
locals().update(params)
print("\t\t",
bstate.mcmc_sweep(beta=0, allow_empty=True,
entropy_args=dict(dl=dl,
multigraph=False)),
(bstate.wr.a > 0).sum(), file=out)
if not deg_corr and degree_dl_kind != "uniform":
continue
print("\t\t",
bstate.gibbs_sweep(beta=0, allow_empty=True,
entropy_args=dict(dl=dl,
multigraph=False)),
(bstate.wr.a > 0).sum(), file=out)
print(params, file=out)
print("\t merge", file=out)
rec_ = None
if rec == "positive":
rec_ = rec_p
elif rec == "signed":
rec_ = rec_s
state = gen_state(directed, deg_corr, layered, overlap)
if layered != False:
state_args = dict(ec=ec, layers=(layered == True), rec=rec_)
else:
state_args = dict(rec=rec_)
if not overlap:
bstate = state.get_block_state(vweight=True,
deg_corr=deg_corr)
entropy_args = dict(exact=exact)
state = minimize_blockmodel_dl(GraphView(g, directed=directed),
verbose=(1, "\t") if verbose else False,
deg_corr=deg_corr,
overlap=overlap,
layers=layered != False,
state_args=state_args,
mcmc_args=dict(entropy_args=entropy_args))
print(state.B, state.entropy(), file=out)
state = minimize_nested_blockmodel_dl(GraphView(g, directed=directed),
verbose=(2, "\t") if verbose else False,
deg_corr=deg_corr,
overlap=overlap,
layers=layered != False,
state_args=state_args,
mcmc_args=dict(entropy_args=entropy_args))
if verbose:
state.print_summary()
print(state.entropy(), "\n", file=out)
print("\t\t",
bstate.merge_sweep(50,
entropy_args=dict(dl=dl,
multigraph=False)),
file=out)
bstate = bstate.copy()
print("\t\t",
bstate.mcmc_sweep(beta=0, allow_empty=True,
entropy_args=dict(dl=dl,
multigraph=False)),
file=out)
print("\t\t",
bstate.gibbs_sweep(beta=0, allow_empty=True,
entropy_args=dict(dl=dl,
multigraph=False)),
file=out)
else:
print("\t\t",
state.merge_sweep(50,
entropy_args=dict(dl=dl,
multigraph=False)),
file=out)
print("\t shrink", file=out)
state = gen_state(directed, deg_corr, layered, overlap)
state = state.shrink(B=5, entropy_args=dict(dl=dl,
multigraph=False))
print("\t\t", state.B, file=out)
for directed in [True, False]:
for overlap in [False, True]:
for layered in [False, "covariates", True]:
for deg_corr in [False, True]:
print("\ndirected:", directed, "overlap:", overlap,
"layered:", layered, "deg-corr:", deg_corr, file=out)
state_args = dict(ec=ec, layers=(layered == True)) if layered != False else {}
state = minimize_blockmodel_dl(GraphView(g, directed=directed),
verbose=(1, "\t") if verbose else False,
deg_corr=deg_corr,
overlap=overlap,
layers=layered != False,
state_args=state_args)
print(state.B, state.entropy(), file=out)
state = minimize_nested_blockmodel_dl(GraphView(g, directed=directed),
verbose=(1, "\t") if verbose else False,
deg_corr=deg_corr,
overlap=overlap,
layers=layered != False,
state_args=state_args)
if verbose:
state.print_summary()
print(state.entropy(), file=out)
graph_tool.inference.set_test(False)
print("OK")
\ No newline at end of file
......@@ -16,7 +16,6 @@ verbose = __name__ == "__main__"
g = collection.data["football"]
B = 10
for directed in [True, False]:
clf()
g.set_directed(directed)
......@@ -96,32 +95,35 @@ for directed in [True, False]:
savefig("test_mcmc_move_prob_directed%s.pdf" % directed)
g = collection.data["karate"]
B = 3
g = collection.data["lesmis"]
B = 6
for directed in [True, False]:
clf()
g.set_directed(directed)
hists = {}
state = minimize_blockmodel_dl(g, deg_corr=False, B_min=B, B_max=B)
state = minimize_blockmodel_dl(g, deg_corr=True, B_min=B, B_max=B)
state = state.copy(B=B+1)
cs = list(reversed([numpy.inf, 1, 0.1, 0.01, 0.001, "gibbs"]))
for i, c in enumerate(cs):
if c != "gibbs":
mcmc_args=dict(beta=1, c=c, niter=600, allow_empty=True)
mcmc_args=dict(beta=1, c=c, niter=100, allow_vacate=True)
else:
mcmc_args=dict(beta=1, niter=600, allow_empty=True)
mcmc_args=dict(beta=1, niter=100, allow_vacate=True)
if i == 0:
mcmc_equilibrate(state, mcmc_args=mcmc_args, gibbs=c=="gibbs",
wait=1000,
mcmc_equilibrate(state,
mcmc_args=mcmc_args,
gibbs=c=="gibbs",
wait=100,
verbose=(1, "c = %s (t) " % str(c)) if verbose else False)
hists[c] = mcmc_equilibrate(state, mcmc_args=mcmc_args,
hists[c] = mcmc_equilibrate(state,
mcmc_args=mcmc_args,
gibbs=c=="gibbs",
force_niter=2000,
wait=2000,
nbreaks=12,
verbose=(1, "c = %s " % str(c)) if verbose else False,
history=True)
......@@ -146,20 +148,30 @@ for directed in [True, False]:
"(%s, %s) are not the same, with a p-value: %g (D=%g)") %
(str(directed), str(c1), str(c2), p, D))
bins = None
for c in cs:
hist = hists[c]
h = histogram(list(zip(*hist))[0], 1000000)
plot(h[-1][:-1], numpy.cumsum(h[0]), "-", label="c=%s" % str(c))
if c != numpy.inf:
hist = hists[numpy.inf]
h2 = histogram(list(zip(*hist))[0], bins=h[-1])
res = abs(numpy.cumsum(h2[0]) - numpy.cumsum(h[0]))
i = res.argmax()
axvline(h[-1][i], color="grey")
legend(loc="lower right")
savefig("test_mcmc_directed%s.pdf" % directed)
for cum in [True, False]:
clf()
bins = None
for c in cs:
hist = hists[c]
if cum:
h = histogram(list(zip(*hist))[0], 1000000, density=True)
plot(h[-1][:-1], numpy.cumsum(h[0]), "-", label="c=%s" % str(c))
if c != numpy.inf:
hist = hists[numpy.inf]
h2 = histogram(list(zip(*hist))[0], bins=h[-1], density=True)
res = abs(numpy.cumsum(h2[0]) - numpy.cumsum(h[0]))
i = res.argmax()
axvline(h[-1][i], color="grey")
else:
h = histogram(list(zip(*hist))[0], 40)
plot(h[-1][:-1], h[0], "s-", label="c=%s" % str(c))
gca().set_yscale("log")
if cum:
legend(loc="lower right")
else:
legend(loc="best")
savefig("test_mcmc_directed%s-cum%s.pdf" % (directed, str(cum)))
print("OK")
\ No newline at end of file
......@@ -17,6 +17,7 @@ libgraph_tool_inference_la_LDFLAGS = $(MOD_LDFLAGS)
libgraph_tool_inference_la_SOURCES = \
cache.cc \
graph_blockmodel.cc \
graph_blockmodel_imp.cc \
graph_blockmodel_em.cc \
graph_blockmodel_exhaustive.cc \
graph_blockmodel_gibbs.cc \
......@@ -44,6 +45,7 @@ libgraph_tool_inference_la_SOURCES = \
graph_blockmodel_overlap_multicanonical.cc \
graph_blockmodel_overlap_vacate.cc \
graph_inference.cc \
int_part.cc \
spence.cc
libgraph_tool_inference_la_include_HEADERS = \
......@@ -70,4 +72,5 @@ libgraph_tool_inference_la_include_HEADERS = \
merge_loop.hh \
multicanonical_loop.hh \
parallel_rng.hh \
int_part.hh \
util.hh
......@@ -36,6 +36,9 @@ namespace graph_tool
template <class MergeState, class RNG>
auto bundled_vacate_sweep(MergeState& state, RNG& rng)
{
if (state._nmerges == 0)
return make_pair(double(0), size_t(0));
// individual bundles can move in different directions
auto get_best_move = [&] (auto& bundle, auto& past_moves)
{
......@@ -134,7 +137,6 @@ auto bundled_vacate_sweep(MergeState& state, RNG& rng)
double dS = 0;
for (auto& bundle : bundles)
{
gt_hash_set<size_t> past_moves(forbidden_moves);
auto best_move = get_best_move(bundle, past_moves);
if (get<0>(best_move) == state._null_move)
......@@ -196,7 +198,7 @@ auto bundled_vacate_sweep(MergeState& state, RNG& rng)
double S = 0;
size_t nmerges = 0;
gt_hash_set<size_t> vacated;
while (nmerges != state._nmerges && !queue.empty())
while (nmerges < state._nmerges && !queue.empty())
{
auto pos = queue.top();
queue.pop();
......
......@@ -28,12 +28,15 @@ vector<double> __lgamma_cache;
void init_safelog(size_t x)
{
size_t old_size = __safelog_cache.size();
if (x >= old_size)
#pragma omp critical (_safelog_)
{
__safelog_cache.resize(x + 1);
for (size_t i = old_size; i < __safelog_cache.size(); ++i)
__safelog_cache[i] = safelog(double(i));
size_t old_size = __safelog_cache.size();
if (x >= old_size)
{
__safelog_cache.resize(x + 1);
for (size_t i = old_size; i < __safelog_cache.size(); ++i)
__safelog_cache[i] = safelog(double(i));
}
}
}
......@@ -45,12 +48,15 @@ void clear_safelog()
void init_xlogx(size_t x)
{
size_t old_size = __xlogx_cache.size();
if (x >= old_size)
#pragma omp critical (_xlogx_)
{
__xlogx_cache.resize(x + 1);
for (size_t i = old_size; i < __xlogx_cache.size(); ++i)
__xlogx_cache[i] = i * safelog(i);
size_t old_size = __xlogx_cache.size();
if (x >= old_size)
{
__xlogx_cache.resize(x + 1);
for (size_t i = old_size; i < __xlogx_cache.size(); ++i)
__xlogx_cache[i] = i * safelog(i);
}
}
}
......@@ -61,12 +67,19 @@ void clear_xlogx()
void init_lgamma(size_t x)
{
size_t old_size = __lgamma_cache.size();
if (x >= old_size)
using namespace boost::math::policies;
#pragma omp critical (_lgamma_)
{
__lgamma_cache.resize(x + 1);
for (size_t i = old_size; i < __lgamma_cache.size(); ++i)
__lgamma_cache[i] = lgamma(i);
size_t old_size = __lgamma_cache.size();
if (x >= old_size)
{
__lgamma_cache.resize(x + 1);
if (old_size == 0)
__lgamma_cache[0] = numeric_limits<double>::infinity();
for (size_t i = std::max(old_size, size_t(1)); i < __lgamma_cache.size(); ++i)
__lgamma_cache[i] = boost::math::lgamma(i);
}
}
}
......
......@@ -23,6 +23,8 @@
#include <vector>
#include <cmath>
#include <boost/math/special_functions/gamma.hpp>
namespace graph_tool
{
using namespace std;
......
......@@ -90,8 +90,7 @@ auto gibbs_sweep(GibbsState state, RNG& rng_)
if (state.node_weight(v) == 0)
return;
vector<size_t>& moves = state.get_moves(v);
auto& weights = state.get_weights(v);
auto& moves = state.get_moves(v);
probs.resize(moves.size());
deltas.resize(moves.size());
......@@ -114,13 +113,13 @@ auto gibbs_sweep(GibbsState state, RNG& rng_)
if (std::isinf(deltas[j]))
probs[j] = 0;
else
probs[j] = exp((-deltas[j] + dS_min) * beta) * weights[j];
probs[j] = exp((-deltas[j] + dS_min) * beta);
}
}
else
{
for (size_t j = 0; j < moves.size(); ++j)
probs[j] = (deltas[j] == dS_min) ? weights[j] : 0;
probs[j] = (deltas[j] == dS_min) ? 1 : 0;
}
Sampler<size_t> sampler(idx, probs);
......
......@@ -27,17 +27,22 @@
using namespace boost;
using namespace graph_tool;
namespace graph_tool
{