Graphs can be found almost anywhere. Here's an example built from Wikipedia.
# First we need to make sure we have wikipedia installed...
# FIX: pip.main() was removed from pip's public API in pip 10; the
# supported way to install programmatically is to invoke pip as a
# subprocess of the current interpreter.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "wikipedia"])
import wikipedia
We want to focus on Wikipedia articles related to complex networks.
We'll construct a network centered on the "Complex Network" article and look at all articles within 1 hop of this article and their connections (i.e., the 1.5-degree ego net).
"""
Construct a network of Wikipedia pages anchored at Complex Networks
"""
from operator import itemgetter

import networkx as nx
import wikipedia

# Where will we start
SEED = "Complex network".title()

# Wikipedia articles to ignore (bibliographic/identifier pages that
# link to nearly everything and would drown the topical structure)
STOPS = ("International Standard Serial Number",
         "International Standard Book Number",
         "National Diet Library",
         "International Standard Name Identifier",
         "International Standard Book Number (Identifier)",
         "Pubmed Identifier", "Pubmed Central",
         "Digital Object Identifier", "Arxiv",
         "Proc Natl Acad Sci Usa", "Bibcode",
         "Library Of Congress Control Number", "Jstor")

# Data structures for tracking what articles we've read.
# BUG FIX: the original `set(SEED)` built a set of the *characters*
# of the seed string; we want a set containing the seed title itself.
todo_lst = [(0, SEED)]  # The SEED is at degree 0
todo_set = {SEED}       # Titles queued (or processed) — for O(1) dedup
done_set = set()        # Nothing is done yet

# We want a directed graph since links only go one way
F = nx.DiGraph()

# Initialize our search
layer, page = todo_lst[0]
# Look for all pages less than 2 hops away
while layer < 2:
    # Remove the current article from the queue (it is still
    # todo_lst[0] here; we already hold it in `page`)
    del todo_lst[0]
    # Add this article to the set of articles we've processed
    done_set.add(page)
    print(layer, page)  # Show progress
    # Try to load the wikipedia page of this name
    try:
        wiki = wikipedia.page(page)
    except Exception:
        # BUG FIX: report the page that actually failed *before*
        # advancing to the next queue entry (the original reassigned
        # `page` first and therefore printed the wrong title). Also
        # avoid a bare `except:`, which would swallow SystemExit and
        # KeyboardInterrupt too; best-effort skipping is preserved.
        print("Could not load", page)
        if not todo_lst:  # robustness: queue exhausted
            break
        layer, page = todo_lst[0]
        continue
    # Now traverse links on this page
    for link in wiki.links:
        link = link.title()
        # Skip links we decided to ignore or that start with "List Of"
        if link not in STOPS and not link.startswith("List Of"):
            # If we haven't already processed or queued this link, add it
            if link not in todo_set and link not in done_set:
                todo_lst.append((layer + 1, link))
                todo_set.add(link)
            # Add an edge from the current page to this link
            F.add_edge(page, link)
    # Get the next layer and page to process
    if not todo_lst:  # robustness: nothing left to visit
        break
    layer, page = todo_lst[0]
print("{} nodes, {} edges".format(len(F), nx.number_of_edges(F)))
# 11597 nodes, 21331 edges (counts from one past run; yours will differ)
# How dense is this network?
print("Density:", nx.density(F))
# Delete all self-loops.
# FIX: Graph.selfloop_edges() was removed in networkx 2.4; the function
# now lives at module level as nx.selfloop_edges(). Materialize the
# edges into a list first so we don't mutate the graph while iterating
# a live view over it.
F.remove_edges_from(list(nx.selfloop_edges(F)))
# We occasionally see "Complex Network" and "Complex Networks"
# in Wikipedia. Let's collapse those singular/plural pairs.
duplicates = [(node, node + "s") for node in F if node + "s" in F]
for dup in duplicates:
    # For each pair of duplicates, collapse into one node
    F = nx.contracted_nodes(F, *dup, self_loops=False)
# More duplicate removal: hyphenated vs. spaced variants,
# e.g. "Complex-Networks" and "Complex Networks"
duplicates = [(x, y) for x, y
              in [(node, node.replace("-", " ")) for node in F]
              if x != y and y in F]
for dup in duplicates:
    F = nx.contracted_nodes(F, *dup, self_loops=False)
# Clear all contraction attributes (necessary for graphml export,
# which cannot serialize the dict-valued "contraction" attribute)
nx.set_node_attributes(F, 0, "contraction")
print("{} nodes, {} edges".format(len(F), nx.number_of_edges(F)))
# How dense is this network after deduplication?
print("Density:", nx.density(F))
# Restrict attention to articles with degree 2 or more (the "core")
core = [node for node, deg in F.degree() if deg >= 2]
# Take the induced subgraph on those core articles
G = nx.subgraph(F, core)
print("{} nodes, {} edges".format(len(G), nx.number_of_edges(G)))
# 2995 nodes, 11817 edges
# Which articles are linked to most often (highest in-degree)?
top_indegree = sorted(G.in_degree(),
                      key=itemgetter(1), reverse=True)[:100]
print("\n".join("{} {}".format(deg, name) for name, deg in top_indegree))
# Centrality measures to compute, as (attribute label, function) pairs
centrality_pairs = [
    ("d_cent", nx.degree_centrality),
    ("c_cent", nx.closeness_centrality),
    ("b_cent", nx.betweenness_centrality),
]
# Compute each centrality, remember it, and attach it to the nodes
centrality_map = {}
for cent_label, cent_func in centrality_pairs:
    scores = cent_func(G)
    centrality_map[cent_label] = scores
    nx.set_node_attributes(G, scores, cent_label)
# Show the ten highest-scoring (node, score) pairs per centrality
for cent_label, cent_vals in centrality_map.items():
    print(cent_label)
    ranked = sorted(cent_vals.items(), key=itemgetter(1), reverse=True)
    for entry in ranked[:10]:
        print("\t", entry)
nx.write_graphml(G, "cna.graphml")