Python's `re` module is a package for defining patterns of text, searching for those patterns, and checking whether a string matches a pattern.
import re # Regular expression package
# Sample email addresses
email_addresses = [
    "cbuntain@cs.umd.edu",  # Valid email
    "cbuntain@umd.edu",  # Valid
    "cbuntain@umd",  # Invalid because "umd" has no top-level domain
    "cbuntain@",  # Invalid because no domain is given
]

# Regular expression for matching an email address
# - Local part: runs of letters/numbers, optionally separated by single dots
# - Must have the @ sign
# - Domain: one or more dot-terminated labels of letters/numbers, followed
#   by a top-level domain of at least two letters
# A raw string is used so that "\." reaches the regex engine as an escaped
# literal dot instead of being parsed as an (invalid) Python string escape.
email_match_pattern = re.compile(
    r"[A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*@(?:[A-Za-z0-9]+\.)+[A-Za-z]{2,}"
)


def is_valid_email(addr):
    """Return True if the whole of *addr* matches ``email_match_pattern``.

    ``fullmatch`` is used rather than ``match`` so that a valid-looking
    prefix followed by trailing junk is rejected.
    """
    return email_match_pattern.fullmatch(addr) is not None


# For each email address sample, check if it is valid
for addr in email_addresses:
    if is_valid_email(addr):
        print("Email address (%s) is valid" % addr)
    else:
        print("Email address (%s) is INVALID" % addr)
# Magic line to ensure plotting happens in Jupyter
%matplotlib inline
# Matplotlib Graphing Library
import matplotlib
# PyPlot is an object-oriented plot interface to matplotlib
import matplotlib.pyplot as plt
# Sample domain: the integers 0 through 7
x_vals = list(range(8))

# Three functions of x to compare on a single set of axes
y1_vals = list(x_vals)
y2_vals = [x * 2 for x in x_vals]
y3_vals = [x ** 0.5 for x in x_vals]

# Draw each dataset as its own labelled line
for series, series_label in (
    (y1_vals, "identity"),
    (y2_vals, "linear"),
    (y3_vals, "sqrt"),
):
    plt.plot(x_vals, series, label=series_label)

# Label both axes (you should always do this)
plt.xlabel("x")
plt.ylabel("f(x)")

# Turn on the legend, which is built from the labels given above
plt.legend()

# Call plt.show() explicitly to force rendering
plt.show()
# For Python's random number generator
import random
# Draw one random number per x value
y_r_vals = [random.random() for _ in x_vals]

# Scatter the random values against x
plt.scatter(x_vals, y_r_vals)

# Label both axes (you should always do this)
plt.xlabel("x")
plt.ylabel("f(x)")

# Render the figure
plt.show()
# Draw a larger sample of random numbers
sample_count = 1000
y_r_vals = [random.random() for _ in range(sample_count)]

# Bin the sample into a histogram
plt.hist(y_r_vals)

# Label both axes (you should always do this)
plt.xlabel("Random Value Bins")
plt.ylabel("Count")

# Overlay grid lines
plt.grid()

# Render the figure
plt.show()
Way more can be done with Matplotlib. See https://matplotlib.org/gallery/index.html for more examples.
A nice YouTube tutorial on Matplotlib is available here: https://www.youtube.com/watch?v=q7Bo_J8x_dw&list=PLQVvvaa0QuDfefDfXb9Yf0la1fPDKluPF
from IPython.display import HTML
# Build an HTML object wrapping an <iframe> that embeds the YouTube tutorial.
# The two adjacent string literals are joined into a single markup string.
HTML(
    '<iframe width="560" height="315" src="https://www.youtube.com/embed/q7Bo_J8x_dw" '
    'frameborder="0" gesture="media" allow="encrypted-media" allowfullscreen></iframe>'
)
Working with lists of numbers can be simplified with NumPy. In fact, most non-trivial data analysis packages rely on NumPy directly.
# Numpy for fast numeric computation
import numpy as np
# Want to add a value to each element of the list
py_list = [1, 2, 3, 4, 5, 6, 7]
print(py_list)

# Could do this, but it's a little verbose
print([x + 5 for x in py_list])

# Writing `py_list + 5` reads more naturally, but a plain Python list cannot
# be added to an int: Python raises TypeError. Catch it and show the message
# so the demonstration is visible without aborting the rest of the code.
try:
    print(py_list + 5)
except TypeError as err:
    print("TypeError:", err)
arr1 = np.array(py_list)  # create a numpy array
print(arr1)

# Can add value in element-wise operation
# also works for multiply, divide, subtract, etc.
print(arr1 + 5)

# And works natively with matplotlib.
# x positions are derived from the array length instead of a hard-coded 7,
# so the plot stays correct if py_list changes size.
x_positions = range(len(arr1))
plt.plot(x_positions, arr1, label="Original")
# This series is arr1 multiplied by 2, so it is labelled accordingly
plt.plot(x_positions, arr1 * 2, label="Doubled")

# Set axis labels (you should always do this)
plt.xlabel("x")
plt.ylabel("f(x)")

# Without a legend the labels passed to plt.plot() are never displayed
plt.legend()
plt.show()
A dataframe is quite similar to your standard Excel spreadsheet but can be manipulated more easily in Python.
# Pandas for R-like DataFrames
import pandas as pd
# Tab-separated test file to read data from: Chipotle order data
tsv_file = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"

# Read the TSV file into a DataFrame, using the tab character as the separator
df = pd.read_csv(tsv_file, sep="\t")

# A bare name as the last line of a notebook cell pretty-prints the dataframe
df
def _dollars_to_float(price_text):
    """Strip the "$" sign from a price string and parse the rest as a float."""
    return float(price_text.replace("$", ""))


# Report the dataset size as (rows, columns)
print("Dataset Size:", df.shape)

# Average item price, after converting the price strings to numbers
df["item_price"].apply(_dollars_to_float).mean()

# Keep only the rows whose item name mentions chicken
is_chicken = df["item_name"].str.contains("Chicken")
chx_df = df[is_chicken]
print("Non-chicken items:", len(df) - len(chx_df))

# Histogram of the numeric prices, drawn through pandas' plotting support
df["item_price"].apply(_dollars_to_float).hist()
import nltk
# We'll go ahead and pull in some analysis data
from nltk.corpus import stopwords # Lists of stopwords in several languages
from nltk.tokenize import word_tokenize # Function to split string into individual tokens
# Text to tokenize: the Chipotle choice descriptions, with missing values dropped
choices = df["choice_description"].dropna().tolist()

# Show the first 20 descriptions next to their token lists
for choice in choices[:20]:
    print(choice, "->", word_tokenize(choice))
# Map each lower-cased token to the number of times it occurs
map_tokens = {}

# Tokenize every choice and tally its tokens
for choice in choices:
    for raw_token in word_tokenize(choice):
        w = raw_token.lower()
        if w in map_tokens:
            map_tokens[w] += 1
        else:
            map_tokens[w] = 1

# Print tokens from most to least frequent
print("Token Count Map:")
ranked_tokens = sorted(map_tokens.items(), key=lambda pair: pair[1], reverse=True)
for token, count in ranked_tokens:
    print(token, count)
NLTK also has a bunch of corpora we can use for testing.
# Download the corpus of US President State of the Union addresses
nltk.download("state_union")
from nltk.corpus import state_union

# Look at the last 1000 characters of the raw corpus text
# (presumably the end of the 2006 address; verify against the corpus ordering)
state_union.raw()[-1000:]

# Tokenize the whole corpus so common words can be counted
word_list = word_tokenize(state_union.raw())

# `FreqDist` is NLTK's convenience class for frequency distributions;
# tokens are lower-cased before counting
state_union_freqs = nltk.FreqDist(token.lower() for token in word_list)

# Print the 20 most common tokens with their counts
top_words = state_union_freqs.most_common(20)
for word, count in top_words:
    print(word, count)
import string # we'll use this for punctuation too
# The raw counts are dominated by stop words and punctuation, which is not
# hugely useful, so remove those entries from the distribution
tokens_to_drop = stopwords.words("english") + list(string.punctuation)
for stop in tokens_to_drop:
    if stop not in state_union_freqs:
        continue
    del state_union_freqs[stop]

# Print the most common remaining words
for word, count in state_union_freqs.most_common(20):
    print(word, count)
# Histogram of how often each token count occurs, using 20 bins
token_counts = list(state_union_freqs.values())
plt.hist(token_counts, bins=20)

# Label the axes and add grid lines
plt.xlabel("Token Count")
plt.ylabel("Count Frequency")
plt.grid()

# Render the figure
plt.show()
Above were the more common packages we will use throughout the course.
Let's quickly look at a few other packages we'll utilize, just so you can see them. We'll go into more depth later.
TextBlob is a layer built on top of NLTK to make the library more accessible. Basically, you just give it a string of text, and it can do a bunch of things for you right off the bat.
# High-level Package for Text Analysis, relies on NLTK
from textblob import TextBlob
# NLTK's corpora have tokenized sentences for us, so let's use that
# to play with TextBlob. Note that a sentence from NLTK comes out
# as a list of tokens, but TextBlob expects a string, so we'll
# join the words together with white space
test_sentence = " ".join(state_union.sents()[2])
blob = TextBlob(text=test_sentence)
print(blob)

# What language is this?
# NOTE: `detect_language()` was deprecated and later removed from TextBlob,
# and in older releases it depends on a remote service. Guard the call so a
# missing method or a failed request is reported instead of stopping the demo.
try:
    print(blob.detect_language())
except (AttributeError, OSError) as err:
    print("Language detection unavailable:", err)

# Rapid sentiment. Negative means unhappy
print(blob.polarity)

# Mentioned noun phrases in this sentence
print(blob.noun_phrases)
# Compute the polarity of every sentence in the corpus. Each sentence is a
# list of tokens, so it is joined with spaces before being given to TextBlob.
sentence_polarities = [
    TextBlob(" ".join(tokens)).polarity for tokens in state_union.sents()
]
per_sentence_feels = np.array(sentence_polarities)
print("Average Sentiment:", per_sentence_feels.mean())

# Graph the distribution of per-sentence sentiment
plt.hist(per_sentence_feels)
plt.xlabel("Sentence Sentiment")
plt.ylabel("Sentiment Frequency")
plt.grid()
plt.show()
Basemap lets us plot world maps
from mpl_toolkits.basemap import Basemap
# create new figure, axes instances.
fig = plt.figure(figsize=(10, 10))
ax = fig.gca()

# setup mercator map projection.
# (No backslash continuations are needed inside the parentheses.)
m = Basemap(
    llcrnrlon=-160., llcrnrlat=-40., urcrnrlon=120., urcrnrlat=60.,
    rsphere=(6378137.00, 6356752.3142),
    resolution='l', projection='merc',
    lat_0=40., lon_0=-20., lat_ts=20.,
)

# Start point of the route. The variable names are kept from the New York /
# London version of this example, but these coordinates are the lat/lon of
# Dallas, Texas.
nylat = 32.7766642
nylon = -96.7969879

# End point of the route. Again the names are historical: these coordinates
# lie at the island of Hawaii, not London.
lonlat = 19.
lonlon = -156.

# draw great circle route between the two points
m.drawgreatcircle(nylon, nylat, lonlon, lonlat, linewidth=2, color='b')
m.drawcoastlines()
m.fillcontinents()

# draw parallels
m.drawparallels(np.arange(-80, 80, 20), labels=[1, 1, 0, 1])

# draw meridians
m.drawmeridians(np.arange(-180, 180, 30), labels=[1, 1, 0, 1])

# Title matches the coordinates actually plotted above
ax.set_title('Great Circle from Dallas to Hawaii')
plt.show()