# Magic line to ensure plotting happens in Jupyter
%matplotlib inline
# Matplotlib Graphing Library
import matplotlib
# PyPlot is matplotlib's state-based (MATLAB-style) plotting interface
import matplotlib.pyplot as plt
# Sample the x axis at 0..7 and evaluate three simple functions on it
x_vals = list(range(8))
y1_vals = list(x_vals)                  # f(x) = x
y2_vals = [x * 2 for x in x_vals]       # f(x) = 2x
y3_vals = [x ** 0.5 for x in x_vals]    # f(x) = x^(1/2)

# Draw every curve on the same axes; the label is what the legend displays
for curve, curve_label in (
    (y1_vals, "identity"),
    (y2_vals, "linear"),
    (y3_vals, "sqrt"),
):
    plt.plot(x_vals, curve, label=curve_label)

# Name both axes (you should always do this)
plt.xlabel("x")
plt.ylabel("f(x)")
# Turn on the legend built from the labels passed to plot()
plt.legend()
# Render explicitly rather than relying on implicit display
plt.show()
# For Python's random number generator
import random
# Draw one random value in [0, 1) for every x position
y_r_vals = [random.random() for _ in x_vals]
# Scatter plot: unconnected markers, one per (x, y) pair
plt.scatter(x_vals, y_r_vals)
# Name both axes (you should always do this)
plt.xlabel("x")
plt.ylabel("f(x)")
# Render the figure
plt.show()
# Draw 1000 random values in [0, 1)
y_r_vals = [random.random() for _ in range(1000)]
# Histogram of the samples using matplotlib's default binning
plt.hist(y_r_vals)
# Overlay grid lines to make bar heights easier to read
plt.grid()
# Name both axes (you should always do this)
plt.xlabel("Random Value Bins")
plt.ylabel("Count")
# Render the figure
plt.show()
Way more can be done with Matplotlib. See https://matplotlib.org/gallery/index.html for more examples.
A nice YouTube tutorial on Matplotlib is available here: https://www.youtube.com/watch?v=q7Bo_J8x_dw&list=PLQVvvaa0QuDfefDfXb9Yf0la1fPDKluPF
from IPython.display import HTML
# Embed the YouTube tutorial inline as an iframe.
# The two adjacent literals are joined by Python into a single HTML string.
HTML(
    '<iframe width="560" height="315" src="https://www.youtube.com/embed/q7Bo_J8x_dw" '
    'frameborder="0" gesture="media" allow="encrypted-media" allowfullscreen></iframe>'
)
Working with lists of numbers can be simplified with NumPy. In fact, most non-trivial data analysis packages rely on NumPy directly.
# Numpy for fast numeric computation
import numpy as np
# Want to add a value to each element of the list
py_list = [1, 2, 3, 4, 5, 6, 7]
print(py_list)
# A list comprehension works, but it's a little verbose
print([x + 5 for x in py_list])
# `list + int` reads more naturally, but `+` on a list only accepts another
# list (concatenation), so this raises TypeError. Catch it and print the
# message: the failure is still demonstrated, but it no longer aborts the
# rest of the run.
try:
    print(py_list + 5)
except TypeError as err:
    print("TypeError:", err)
# Wrap the Python list in a NumPy array
arr1 = np.array(py_list)
# Arithmetic on an array is applied element-wise (the same holds for
# multiply, divide, subtract, etc.), so `arr1 + 5` adds 5 to every entry.
# Print the original array and the shifted one on separate lines.
print(arr1, arr1 + 5, sep="\n")
# NumPy arrays can be passed straight to matplotlib.
# Derive the x positions from the array itself so x and y always have the
# same length, whatever the size of arr1.
x_positions = range(len(arr1))
plt.plot(x_positions, arr1, label="Original")
# This curve is arr1 * 2, so the legend entry says "Doubled"
plt.plot(x_positions, arr1 * 2, label="Doubled")
# Set axis labels (you should always do this)
plt.xlabel("x")
plt.ylabel("f(x)")
plt.legend()
plt.show()
A dataframe is quite similar to your standard Excel spreadsheet but can be manipulated more easily in Python.
# Pandas for R-like DataFrames
import pandas as pd
# Tab-separated input file: the adjacency list for the Risk board from HW2
tsv_file = "risk.adj"
# Parse the TSV into a DataFrame; the first line of the file holds the column names
df = pd.read_csv(tsv_file, header=0, sep="\t")
# A bare expression at the end of a notebook cell is pretty-printed automatically
df
# Report the dataset dimensions as (rows, columns)
print("Dataset Size:", df.shape)
Use the adjacency list to create an adjacency matrix.
# Collect every country that appears at either end of an edge.
# dict.fromkeys de-duplicates while keeping first-appearance order, so the
# name -> ID mapping is reproducible from run to run (iteration order of a
# set of strings can change between interpreter sessions).
ordered_countries = list(dict.fromkeys([*df.SOURCE, *df.SINK]))
countries = set(ordered_countries)
country_count = len(ordered_countries)
print("Countries:", country_count)
# Map country names to IDs (the row/column index used in the matrix)
country_map = {name: idx for idx, name in enumerate(ordered_countries)}
# Initialize the adjacency matrix with no edges
adj_matrix = np.zeros((country_count, country_count))
# Populate the adjacency matrix; edges are treated as undirected, so each
# one is written in both directions to keep the matrix symmetric.
for source, sink in zip(df.SOURCE, df.SINK):
    i = country_map[source]
    j = country_map[sink]
    adj_matrix[i, j] = 1
    adj_matrix[j, i] = 1
# Every undirected edge fills two symmetric cells, so summing the whole
# matrix would count each edge twice. The upper triangle (diagonal
# included) contains each edge exactly once.
print("Number of Edges:", int(np.sum(np.triu(adj_matrix))))
Calculate the degree for each country in our list
# Reverse lookup (matrix row index -> country name) so a row of the
# adjacency matrix can be reported by name
inv_country_map = {idx: name for name, idx in country_map.items()}
# A country's degree is the sum of its row in the adjacency matrix
for row_id in range(country_count):
    row_degree = np.sum(adj_matrix[row_id])
    print("Country:", inv_country_map[row_id], "Degree:", row_degree)
# Degree of each country = sum across its row of the adjacency matrix.
# (The matrix is symmetric, so row sums and column sums are equal; axis=1
# is used so the code matches the per-row description.)
degrees = np.sum(adj_matrix, axis=1)
# Degrees are whole numbers, so give every value its own bin centered on it:
# edges at 0.5, 1.5, ..., max + 0.5. With integer edges ending at max, the
# final bin is closed on both sides and would lump the two highest degrees
# (max - 1 and max) into a single bar.
max_degree = int(np.max(degrees))
bin_edges = np.arange(0.5, max_degree + 1.5)
# Build a histogram of degrees
plt.hist(degrees, bins=bin_edges)
plt.xlabel("Degree")
plt.ylabel("Degree Frequency")
plt.grid()
plt.show()
# Degree centrality: each country's degree divided by (number of countries - 1)
degree_centrality = degrees / (country_count - 1)
# Pair every country name with its centrality score
d_cent_map = {
    inv_country_map[idx]: score for idx, score in enumerate(degree_centrality)
}
# Rank country names from most to least central
sorted_countries = sorted(
    d_cent_map, key=lambda name: d_cent_map[name], reverse=True
)
# Show the ten highest-ranked countries with their scores
for name in sorted_countries[:10]:
    print(name, d_cent_map[name])
# Highest degree centrality found in the graph
max_centrality = degree_centrality.max()
# Unnormalized centralization: the sum, over all countries, of the gap
# between the highest centrality and that country's own centrality
centralization = np.sum(max_centrality - degree_centrality)
print("Unnormalized Centralization:", centralization)