Our goal: analyze the top word counts for two politicians (Obama and Trump).
Windows separates directories with \, while Mac/Linux separates with /, so we use the os module to write platform-independent code...
import os

transcript_directory = "political-speech-files"

def get_speech_path(politician):
    return os.path.join(transcript_directory, "{}-speeches.txt".format(politician))

print(get_speech_path("obama"))
print(get_speech_path("trump"))
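As a quick illustration of what os.path.join is handling for us, os.sep is the separator Python uses on the current platform:

print(os.sep)                           # \ on Windows, / on Mac/Linux
print(os.path.join("a", "b", "c.txt"))  # a\b\c.txt on Windows, a/b/c.txt elsewhere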
# If you haven't used nltk before, run the following once to download the stopword list
import nltk
nltk.download('stopwords')

import string
from nltk.corpus import stopwords
def preprocess(word):
    # strip punctuation and digits, but keep apostrophes (e.g., in contractions)
    bad_letters = string.punctuation + string.digits
    word = word.lower()
    return "".join([letter for letter in word if (letter == "'" or letter not in bad_letters)])
def get_all_words(politician):
    with open(get_speech_path(politician), 'r', encoding='utf8') as f:
        all_words = f.read().split()
    preprocessed = [preprocess(word) for word in all_words]
    # drop empty strings and common words we don't care about
    stop_words = stopwords.words('english') + ['—', '–']
    return [word for word in preprocessed if word and word not in stop_words]
obama_words = get_all_words('obama')
trump_words = get_all_words('trump')
print(len(obama_words))
print(len(trump_words))
print(obama_words[:30])
print(trump_words[:30])
Sometimes it's useful to look at a random sample of words (e.g., to get a feel for the dataset, or to try out a function that might take a long time on the full data)...
import random

def sample_words(words, k, seed=None):
    # passing a seed makes the sample reproducible
    if seed is not None:
        random.seed(seed)
    return random.sample(words, k)
print(sample_words(obama_words, 100, 322))
print(sample_words(trump_words, 100, 322))
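Because random.seed fixes the generator's state, calling sample_words with the same seed always returns the same sample; a quick check (illustration only):

assert sample_words(obama_words, 5, seed=322) == sample_words(obama_words, 5, seed=322)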
Our goal will be to produce a list of lists, where each inner list has two elements: [word, count].
Note: our next topic will make this MUCH faster.
def add_word_to_count(word, counts):
    # if the word already has a count, increment it
    for count in counts:
        if count[0] == word:
            count[1] += 1
            return
    # otherwise, start a new count at 1
    counts.append([word, 1])
def word_count(words):
    counts = []
    for word in words:
        add_word_to_count(word, counts)
    return counts
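A tiny sanity check on made-up input:

print(word_count(["the", "cat", "sat", "the"]))  # [['the', 2], ['cat', 1], ['sat', 1]]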
obama_count = word_count(obama_words) # ~1 minute
trump_count = word_count(trump_words) # ~10 seconds
print(obama_count[:10])
print(trump_count[:10])
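The slowness comes from add_word_to_count rescanning the entire counts list for every word. As a preview of the note above (assuming the faster approach it refers to is a dictionary, which we haven't covered yet), a dict-based version might look like this:

def word_count_fast(words):
    counts = {}
    for word in words:
        # look up the current count (0 if the word is new) and add 1
        counts[word] = counts.get(word, 0) + 1
    # convert back to the same list-of-lists shape as word_count
    return [[word, n] for word, n in counts.items()]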
# Let's sort these counts from most to least frequent
def get_count(count_pair):
    return count_pair[1]

def sort_counts(counts):
    return sorted(counts, key=get_count, reverse=True)
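Equivalently, the key function can be written inline with a lambda; this sketch does the same thing as sort_counts:

def sort_counts_inline(counts):
    return sorted(counts, key=lambda pair: pair[1], reverse=True)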
obama_sorted_counts = sort_counts(obama_count)
trump_sorted_counts = sort_counts(trump_count)
print(obama_sorted_counts[:100])
print(trump_sorted_counts[:100])
# Compare the top-k words: which words are in one politician's top k but not the other's?
k = 200
top_k_obama = [pair[0] for pair in obama_sorted_counts[:k]]
top_k_trump = [pair[0] for pair in trump_sorted_counts[:k]]
most_obama = [word for word in top_k_obama if word not in top_k_trump]
most_trump = [word for word in top_k_trump if word not in top_k_obama]
print(most_obama)
print(most_trump)
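For large k, the membership tests above scan a list for every word; an equivalent version using sets (a standard alternative, not something we've covered yet) makes each test roughly constant-time while preserving the frequency ordering:

trump_set = set(top_k_trump)
obama_set = set(top_k_obama)
print([word for word in top_k_obama if word not in trump_set])
print([word for word in top_k_trump if word not in obama_set])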