# If you haven't used nltk before, run the following
# import nltk
# nltk.download('stopwords')
import os
import string
from nltk.corpus import stopwords
def get_speech_path(politician):
    """Return the relative path of the speech file for *politician*.

    The file is expected to live in the ``political-speech-files``
    directory and to be named ``<politician>-speeches.txt``.
    """
    filename = "{}-speeches.txt".format(politician)
    return os.path.join("political-speech-files", filename)
def preprocess(word):
    """Return *word* lower-cased with ASCII punctuation and digits removed.

    Apostrophes are the one punctuation character that is kept, so
    contractions such as ``don't`` survive intact. Characters outside
    ``string.punctuation`` and ``string.digits`` pass through unchanged.
    """
    dropped = set(string.punctuation + string.digits)
    dropped.discard("'")
    return "".join(char for char in word.lower() if char not in dropped)
def get_all_words(politician):
    """Return the cleaned words of a politician's speeches, in order.

    The speech file located by ``get_speech_path(politician)`` is read as
    UTF-8 and split on whitespace. Every token is normalised with
    ``preprocess``; tokens that end up empty, NLTK English stop words and
    bare em/en dashes are dropped. Duplicates are kept.

    Raises whatever ``open`` raises if the speech file cannot be read.
    """
    with open(get_speech_path(politician), 'r', encoding='utf8') as f:
        all_words = f.read().split()
    preprocessed = [preprocess(word) for word in all_words]
    # common words we don't care about; stored in a set so that each of the
    # many membership tests below is a hash lookup instead of a list scan
    stop_words = set(stopwords.words('english'))
    stop_words.update(['—', '–'])
    return [word for word in preprocessed if word and word not in stop_words]
# Cleaned word lists for both speakers, loaded once when the script runs
# (Obama's file is read first, then Trump's).
obama_words, trump_words = (get_all_words(name) for name in ('obama', 'trump'))
class SentimentAnalyzer:
    """Score words and word collections against a word -> weight table.

    The table is built from two UTF-8 text files of whitespace-separated
    entries: a vocabulary file and a weights file. Entries are paired by
    position; if one file has more entries than the other, the surplus
    is ignored.
    """

    def __init__(self, vocab_fpath, weights_fpath):
        """Load the vocabulary and weights files into ``self.vocab_dict``."""
        with open(vocab_fpath, 'r', encoding='utf8') as vocab_file, \
                open(weights_fpath, 'r', encoding='utf8') as weights_file:
            tokens = vocab_file.read().split()
            raw_weights = weights_file.read().split()
            # Weights are converted lazily, so only paired entries are parsed.
            self.vocab_dict = dict(zip(tokens, map(float, raw_weights)))

    def score_word(self, word):
        """Return the weight stored for *word*, or 0 if it is unknown."""
        return self.vocab_dict.get(word, 0)

    def score_collection(self, phrase):
        """Return the mean word score over *phrase*, or 0 if it is empty.

        Unknown words contribute 0 to the sum but still count towards
        the length used for the average.
        """
        if phrase:
            total = sum(self.score_word(token) for token in phrase)
            return total / len(phrase)
        return 0
import os
# Build the analyzer from the vocabulary and weights files in the aclImdb
# directory.
imdb_sentiment = SentimentAnalyzer(os.path.join('aclImdb', 'imdb.vocab'), os.path.join('aclImdb', 'imdbEr.txt'))

# Spot-check the score of two individual words.
print(imdb_sentiment.score_word('terrible'))
print(imdb_sentiment.score_word('hilarious'))

review = '''
A surprisingly beautiful movie.
Beautifully conceived, beautifully directed, beautifully acted, beautifully acted and most beautifully photographed.....the cinematography is nothing short of splendid.
It is a war movie but is epic in it's scope and blends romance, tragedy and comedy into a story that is as harrowing as it is provoking.
'''
# Normalise the review tokens with the same preprocess() step the speech
# words go through, so capitalised or punctuated tokens ("A", "movie.") are
# looked up in the same form. Stop words are kept here; only tokens that
# become empty after cleaning are dropped.
review_words = [word for word in map(preprocess, review.split()) if word]
print(imdb_sentiment.score_collection(review_words))

# Average sentiment over every cleaned word of each politician's speeches.
obama_sentiment = imdb_sentiment.score_collection(obama_words)
trump_sentiment = imdb_sentiment.score_collection(trump_words)
import matplotlib.pyplot as plt
# Two-bar chart of the speech sentiment scores: Obama at x=0 (blue) and
# Trump at x=1 (red), each bar centred on its x position.
plt.bar([0, 1], [obama_sentiment, trump_sentiment], color=["Blue", "Red"], tick_label=['Obama', 'Trump'], align='center')
plt.ylabel("Sentiment score")
plt.title("Sentiment Scores calculated over Obama and Trump speeches")
# Footnote naming the source of the word weights, placed near the top of
# the plotted area.
plt.text(0.6, 0.13, '*Sentiment Values from IMDB')
# Fixed view: x from -0.5 to 1.5, y from -0.15 to 0.15.
# NOTE(review): assumes both scores lie within +/-0.15; a larger value would
# be cut off by these limits - confirm against the data.
plt.axis([-0.5, 1.5, -0.15, 0.15])
plt.show()