Theme: Graphs and networks
Topic: modelling graphs, graph problems, working with graphs in python
Keywords: nodes, edges, paths, graphs, networks, networkx
Presenter | James Powell james@dutc.io |
Date | Friday, May 21, 2021 |
Time | 1:30 PM PST |
“All around me I see nodes and edges! Graphs are everywhere—in all sorts of problems we need to solve, from modeling connectivity in a data center to modeling workflows and business processes! In this session, we’ll brush up on some fundamental graph theory and discuss how to work with and model graphs in Python!”
Hands-on with Graph Analysis in Python.
Join Python expert-in-residence James Powell for a hands-on, small group workshop on Working with Graphs in Python! Using a case study as a foundation, we will cover topics based on attendees interests, such as:
There are no formal prerequisites, however, experience with Python is a must! There will be plenty of time during the session for questions and discussion.
print("Let's get started!")
Question: What is a “node”? What is a “vertex”? What is an “edge”? What is a “link”? Question: What is a “directed” vs “undirected” graph? Question: No, what are these, really? Question: What is a “dense” or “sparse” graph? Question: What are the general formats for data? Question: What is a “tree”? What is a “cyclic” vs “acyclic” graph?
G = { V, E }
V = { ... }
E = { (u, v), ... }
from networkx import Graph
g = Graph()
g.add_edge('a', 'b')
g.add_edge('a', 'c')
print(f'{[*g.neighbors("a")] = }')
print(f'{[*g.neighbors("b")] = }')
print(f'{[*g.neighbors("c")] = }')
from networkx import DiGraph
g = DiGraph()
g.add_edge('a', 'b')
g.add_edge('a', 'c')
g.add_edges_from((v, u) for u, v in g.edges)
print(f'{[*g.neighbors("a")] = }')
print(f'{[*g.neighbors("b")] = }')
print(f'{[*g.neighbors("c")] = }')
from networkx import DiGraph
g = DiGraph()
g.add_edge('a', 'b', weight=123)
g.add_edge('a', 'b', weight=456)
print(f'{g.get_edge_data("a", "b") = }')
print(f'{[*g.edges] = }')
from networkx import MultiDiGraph
g = MultiDiGraph()
g.add_edge('a', 'b', weight=123)
g.add_edge('a', 'b', weight=456)
print(f'{g.get_edge_data("a", "b") = }')
# a ----> b
# ----> b
# a - ab - b
# \ ab /
# SF --a-- LON
# --b--
# --c--
# --d--
list # opaque, typically homogeneous collection of entities (`for`-loop syntax)
# → human-ordered
set # opaque, typically homogeneous collection of unique entities, mathematical
# → machine-ordered
dict # collection of key value pairs, one-way lookup
# → used to be machine-ordered, now human/insertion-order
frozenset # immutable set
tuple # heterogeneous collection of fields (unpacking syntax)
# `__iter__`
# - “human”-vs-“machine”-ordering
# - uniqueness
# - unpacking vs loop syntax
from dataclasses import dataclass
from collections import namedtuple
class T:
pass
@dataclass
class T:
left : 'T'
right : 'T'
T = namedtuple('T', '')
from dataclasses import dataclass, field
@dataclass
class Manufacturer:
name : str
devices : list = field(default_factory=list)
@dataclass
class Device:
name : str
manufacturer : Manufacturer
def __hash__(self):
return hash(self.name)
samsung = Manufacturer('samsung')
abc123 = Device('abc123', samsung)
samsung.devices.append(abc123)
print(f'{abc123 = }')
print(f'{abc123.manufacturer.devices[0].manufacturer.devices[0].manufacturer.devices[0] = }')
from collections import Counter
c1 = Counter({'a': 1, 'b': 2, 'c': 3})
c2 = Counter({ 'b': 20, 'c': 30, 'd': 4})
print(
c1 + c2
)
from numpy.random import randint
xs1 = randint(0, 255, dtype='uint8', size=(3, 3, 3))
xs2 = randint(0, 255, dtype='uint8', size=(3, 3))
xs1 += 1_000_000
print(
xs1,
# xs1 + xs2,
)
from pandas import Series, DataFrame, date_range
from numpy.random import normal
s1 = Series(normal(size=(size := 10)),
index=date_range('2021-05-21', periods=size, freq='7D'))
s2 = Series(normal(size=(size := 10)),
index=date_range('2021-05-14', periods=size, freq='7D'))
print(
# s1,
# s1 + s2,
)
df1 = DataFrame({
'a': normal(size=(size := 10)),
'b': normal(size=size),
}, date_range('2021-05-21', periods=size, freq='7D'))
df2 = DataFrame({
'b': normal(size=(size := 10)),
'c': normal(size=size),
}, date_range('2021-05-14', periods=size, freq='7D'))
print(
df1 + df2
)
from xarray import DataArray
from xarray import Dataset
from pandas import date_range
from numpy import linspace
from numpy.random import randint
da1 = DataArray(
data=randint(0, 255, dtype='uint8', size=(3, 3, 10)),
dims=[*'xyt'],
coords={
'x': linspace(0, 10, 3),
'y': linspace(1, 12, 3),
't': date_range('2021-05-21', periods=10, freq='3T'),
}
)
da2 = DataArray(
data=randint(0, 255, dtype='uint8', size=(3, 3, 10)),
dims=[*'xyt'],
coords={
'x': linspace(0, 10, 3),
'y': linspace(1, 12, 3),
't': date_range('2021-05-21', periods=10, freq='3T'),
}
)
print(
# da1,
# da2,
)
ds = Dataset({
'da1': da1,
'da2': da2,
})
print(
ds,
)
Question: What is path-finding?
Question: What are the standard traversals?
Question: What is colouring?
Question: What is bipartite matching?
Question: What is min/max flow?
Question: What else can graphs be used to model?
Question: How can we combine networkx
with other tools like pandas
?
from networkx import DiGraph
from networkx.algorithms import all_simple_paths, shortest_path
g = DiGraph()
g.add_edge('a', 'b')
g.add_edge('b', 'c')
g.add_edge('c', 'd')
g.add_edge('c', 'e')
g.add_edge('e', 'f')
g.add_edge('f', 'g')
g.add_edge('d', 'g')
# print(f'{[*all_simple_paths(g, "a", "g")] = }')
print(f'{shortest_path(g, "a", "g") = }')
help(shortest_path)
from networkx import DiGraph
from networkx.algorithms import dfs_edges, bfs_edges, topological_sort
# a
# b c
# d e f g
g = DiGraph()
g.add_edge('a', 'b')
g.add_edge('b', 'c')
g.add_edge('b', 'd')
# g.add_edge('c', 'e')
# g.add_edge('c', 'f')
# g.add_edge('c', 'g')
# g.add_edge('c', 'h')
#
# g.add_edge('d', 'i')
# g.add_edge('d', 'j')
# g.add_edge('d', 'k')
print(f'{[*bfs_edges(g, "a")] = }')
print(f'{[*dfs_edges(g, "a")] = }')
print(f'{[*topological_sort(g)] = }')
from networkx.algorithms import greedy_color
from networkx import DiGraph
from itertools import permutations
groups = {frozenset(x.split(':')[-1].split()) for x in '''
a : Nitin Herman Gilad
b : Herman Anju Salina
c : Anju Salina Hans-Jürgen
d : Nitin Salina Anju
'''.splitlines() if x.strip()}
g = DiGraph()
g.add_edges_from(uv for g in groups for uv in permutations(g, 2))
print(
greedy_color(g)
)
from networkx import DiGraph
from networkx.algorithms.bipartite import is_bipartite, sets
from itertools import permutations
groups = {frozenset(x.split()) for x in '''
Anju Gilad
Anju Salina
Herman Salina
Hans-Jürgen Gilad
Herman Salina
Nitin Gilad
Nitin Salina
'''.splitlines() if x.strip()}
g = DiGraph()
g.add_edges_from(uv for g in groups for uv in permutations(g, 2))
print(f'{is_bipartite(g) = }')
print(f'{sets(g) = }')
from networkx import DiGraph
from networkx.algorithms.flow import maximum_flow, minimum_cut
g = DiGraph()
g.add_edge('a', 'b', capacity=1.0)
g.add_edge('a', 'c', capacity=10.0)
g.add_edge('b', 'd', capacity=4.0)
g.add_edge('c', 'd', capacity=1.0)
print(f'{maximum_flow(g, "a", "d") = }')
print(f'{minimum_cut(g, "a", "d") = }')
from networkx import DiGraph
from dataclasses import dataclass
# init - a - terminal
# b
@dataclass(frozen=True)
class State:
name : str
states = {k: State(k) for k in 'init a b terminal'.split()}
g = DiGraph()
g.add_nodes_from(states)
g.add_edge(states['init'], states['a'])
g.add_edge(states['init'], states['b'])
g.add_edge(states['a'], states['terminal'])
g.add_edge(states['b'], states['terminal'])
def g():
print('init')
mode = yield
if mode:
print('a')
else
print('b')
yield
print('terminal')
yield
Question: How can we model a graph? What is an adjacency list? What is a connectivity matrix?
Question: How does networkx
model graphs?
Question: What other choices do we have for graph modellings?
Question: How does networkx
stack up against the other options?
from numpy import array
# a → a 50%
# a → b 25%
# a → c 25%
t = array([
# to: a b c # from:
[ .50, .25, .25 ], # a
[ .25, .50, .25 ], # b
[ .25, .50, .25 ], # c
])
print(t @ t)
from numpy import array
from numpy.linalg import matrix_power
# a → b → d
# a → c → d
g = array([
# to: a b c d e # from:
[ 1, 1, 1, 0, 0 ], # a
[ 0, 1, 0, 1, 0 ], # b
[ 0, 0, 1, 1, 0 ], # c
[ 0, 0, 0, 1, 0 ], # d
[ 0, 0, 0, 0, 1 ], # e
]).astype(bool)
# print(f'{matrix_power(g, 1) * 1}')
# print(f'{matrix_power(g, 2) * 1}')
# print(f'{matrix_power(g, 3) * 1}')
print(f'{matrix_power(g, 4) * 1}')
from numpy import eye
from numpy.random import randint
from scipy.sparse import coo_matrix, csc_matrix, csr_matrix, lil_matrix
g = eye(10).astype(bool)
g[randint(0, 10-1, size=3), randint(0, 10-1, size=3)] = True
print(
coo_matrix(g),
# csc_matrix(g), # efficient A + B, A * C
# csr_matrix(g), # efficient A + B, A * C
# lil_matrix(g), # fast slicing
)
from scipy.sparse.csgraph import csgraph_from_dense, shortest_path
from numpy import array
g = array([
# to: a b c d e # from:
[ 1, 1, 1, 0, 0 ], # a
[ 0, 1, 0, 1, 0 ], # b
[ 0, 0, 1, 1, 0 ], # c
[ 0, 0, 0, 1, 0 ], # d
[ 0, 0, 0, 0, 1 ], # e
]).astype(bool)
g = csgraph_from_dense(g)
print(
# g,
shortest_path(g),
# *shortest_path(g, return_predecessors=True),
# *shortest_path(g, indices=0, return_predecessors=True),
# sep='\n'
)
from networkx import DiGraph
g = DiGraph()
g.add_edge('a', 'b')
g.add_edge('a', 'c')
g.add_edge('b', 'd', weight=123)
g.add_edge('c', 'd')
print(
[f'{id(x):#_x}' for x in g._adj],
)
from time import perf_counter
from contextlib import contextmanager
@contextmanager
def timed(heading):
try:
start = perf_counter()
yield
finally:
stop = perf_counter()
print(f'Elasped {heading:<16} \N{mathematical bold capital delta}t {stop - start:.4f}s')
from numpy.random import randint as np_randint
from random import randint as py_randint
def dot(xs, ys):
return sum(x * y for x, y in zip(xs, ys))
with timed('create: np'):
np_xs = np_randint(-1_000, 1_000, size=1_000_000)
np_ys = np_randint(-1_000, 1_000, size=1_000_000)
with timed('create: py'):
py_xs = [py_randint(-1_000, 1_000) for _ in range(1_000_000)]
py_ys = [py_randint(-1_000, 1_000) for _ in range(1_000_000)]
with timed('dot: np'):
np_xs.dot(np_ys)
with timed('dot: py'):
dot(py_xs, py_ys)
with timed('dot: np&py'):
dot(np_xs, np_ys)
from pandas import Timestamp
from numpy.random import default_rng
rng = default_rng(Timestamp('2021-05-21').asm8.astype('uint32'))
from numpy import triu_indices, dstack
from pandas import DataFrame
from string import ascii_lowercase
from networkx import DiGraph
from pandas.api.extensions import register_dataframe_accessor
@register_dataframe_accessor('graph')
class GraphAccessor:
def __init__(self, obj):
self.obj = obj
self.g = DiGraph()
self.g.add_edges_from(self.obj.index)
def downlink(self, *idxs):
return self.obj.loc[self.obj.loc[[*idxs]].index]
def uplink(self, *idxs):
return self.obj.loc[[n for idx in idxs for n in self.g.predecessors(idx)]]
entities = rng.choice([*ascii_lowercase], size=(10, 9))
entities[:, -6:] = [*'.fbnet',]
entities = entities.view('<U9').ravel()
df = DataFrame(
entities[rng.choice(dstack(triu_indices(len(entities), k=1))[0], size=(size := 7))],
columns = 'uplink downlink'.split()
).pipe(lambda df: df
.assign(bytes_in=rng.integers(1_000, size=size))
.assign(bytes_out=-rng.integers(1_000, size=size))
.set_index(['uplink', 'downlink'])
.sort_index()
)
print(
# df,
# df.loc['cpj.fbnet'],
# df.graph.downlink('cpj.fbnet'),
# df.graph.downlink('cpj.fbnet', 'fex.fbnet'),
df.graph.uplink('aaz.fbnet', 'jmb.fbnet'),
)
Question: Are nodes first-class entities in a networkx
graph? What is a valid node?
Question: Are edges first-class entities in a networkx
graph? What is a valid edge?
Question: How do we store node & edge data?
Question: How do we safely manipulate the networkx
graph? Is it mutable or immutable?
Question: How do we customise algorithms?
Question: Is the graph static or dynamic?
from networkx import DiGraph
from networkx.algorithms.flow import maximum_flow, minimum_cut
g = DiGraph()
g.add_edge('a', 'b', capacity=1.0)
g.add_edge('a', 'c', capacity=10.0)
g.add_edge('b', 'd', capacity=4.0)
g.add_edge('c', 'd', capacity=1.0)
for u, v in g.edges():
g.get_edge_data(u, v)['capacity'] += 10
print(f'{maximum_flow(g, "a", "d") = }')
g.get_edge_data(u, v)['capacity'] -= 10
from dataclasses import dataclass
from networkx import DiGraph
@dataclass
class Node:
state : None = None
def __hash__(self):
return hash(id(self))
g = DiGraph()
g.add_edge(Node(), Node())
for x in g.nodes():
...
#
# a - b
# \ c
#
Given a large, mostly connected, nearly bipartite graph, color bands of nodes such that they are either red or blue.
Given a sequence of messages from some destination in the red group to some other destination in the red group, compute the path that each message would take.
Assume that the message must move subject to a specified policy, e.g.,:
Recompute the paths that each message would take and construct a heatmap of which nodes participate most heavily in the various paths.