Our goal: sentiment analysis (categorize movie reviews as positive or negative based on the positivity/negativity of the words used in the review)
Let's get a list of all the distinct words (aclImdb/imdb.vocab).
import os

def read_vocab():
    with open(os.path.join('aclImdb', 'imdb.vocab'), 'r', encoding='utf8') as vocab:
        return vocab.read().split()

vocab = read_vocab()
print(vocab[:30])
Let's get a corresponding weight for each word (aclImdb/imdbEr.txt).
Where did these come from? More later in the class when we talk about machine learning :)
def read_weights():
    with open(os.path.join('aclImdb', 'imdbEr.txt'), 'r', encoding='utf8') as weights:
        return [float(w) for w in weights.read().split()]

weights = read_weights()
print(weights[:30])
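A quick sanity check (an aside, assuming these are the standard aclImdb release files): vocab and weights are parallel lists, one weight per word, so their lengths should match before we line them up.

# Sanity check: imdbEr.txt should contain exactly one weight per vocab word
assert len(vocab) == len(weights), (len(vocab), len(weights))
print(len(vocab), "words, each with a weight")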
min_weight = min(weights)
min_index = weights.index(min_weight)
print(min_weight, min_index, vocab[min_index])
for weight_index, weight in enumerate(weights):
    if weight < -4.1:
        print("{}: {}".format(weight, vocab[weight_index]))
max_weight = max(weights)
max_index = weights.index(max_weight)
print(max_weight, max_index, vocab[max_index])
for weight_index, weight in enumerate(weights):
    if weight > 4.1:
        print("{}: {}".format(weight, vocab[weight_index]))
Let's create a dictionary that associates words with weights.
def make_word_weight_dict(vocab, weights):
    return {word: weight for word, weight in zip(vocab, weights)}
vocab_dict = make_word_weight_dict(vocab, weights)
print(vocab_dict['terrible'])
print(vocab_dict['boring'])
print(vocab_dict['great'])
print(vocab_dict['hilarious'])
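Direct indexing raises a KeyError for words that aren't in the vocabulary. As an aside, dict.get with a default of 0 treats unknown words as neutral, which is the behavior the scoring function below relies on (the token here is made up purely to illustrate the default):

# 'zzzznotaword' is a hypothetical token, used only to show the fallback value
print(vocab_dict.get('zzzznotaword', 0))  # prints 0: unknown words count as neutral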
Let's read in all the positive/negative reviews.
import random

def read_all_reviews_from_dir(dir_path):
    reviews = []
    for f in os.listdir(dir_path):  # get a list of all files in a directory
        with open(os.path.join(dir_path, f), 'r', encoding='utf8') as review_file:
            reviews.append((f, review_file.read()))
    return reviews
def read_all_reviews():
    base_path = os.path.join('aclImdb', 'test')
    pos_reviews = read_all_reviews_from_dir(os.path.join(base_path, 'pos'))
    pos_labels = ['positive'] * len(pos_reviews)
    neg_reviews = read_all_reviews_from_dir(os.path.join(base_path, 'neg'))
    neg_labels = ['negative'] * len(neg_reviews)
    all_reviews = pos_reviews + neg_reviews
    all_labels = pos_labels + neg_labels
    reviews_and_labels = list(zip(all_reviews, all_labels))
    random.shuffle(reviews_and_labels)
    return reviews_and_labels
reviews = read_all_reviews()
print(reviews[0])
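Each element of reviews is a nested tuple: ((filename, review_text), label). A quick unpacking sketch makes the double indexing used below easier to read:

# reviews[0] == ((filename, review_text), label)
(filename, review_text), label = reviews[0]
print(filename, label)
print(review_text[:100])  # first 100 characters of the review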
Now let's try to score a review based on the per-word weights (normalized by the number of words).
def score_review(review_text, vocab_dict):
    review_words = review_text.split()
    score = 0
    for word in review_words:
        score += vocab_dict.get(word, 0)  # words not in the vocabulary count as neutral (0)
    return score / len(review_words)
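A quick spot check on a made-up one-line review (the string is hypothetical, not from the dataset); with 'great' and 'hilarious' carrying large positive weights, the score should land well above 0:

print(score_review('a great hilarious movie', vocab_dict))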
def review_summary(review_of_interest, reviews, vocab_dict):
    print(reviews[review_of_interest][0][1])  # the review text
    print()
    print("Rating: {}".format(reviews[review_of_interest][1]))  # the positive/negative label
    print("Score: {}".format(score_review(reviews[review_of_interest][0][1], vocab_dict)))
review_scores = [score_review(review[0][1], vocab_dict) for review in reviews]
min_review = min(review_scores)
print(min_review)
review_summary(review_scores.index(min_review), reviews, vocab_dict)
max_review = max(review_scores)
print(max_review)
review_summary(review_scores.index(max_review), reviews, vocab_dict)
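As a closing sanity check (a sketch that goes one small step beyond the notes above), we can classify each review by the sign of its score and measure how often that matches the label; this assumes a score of exactly 0 counts as negative.

# Predict 'positive' when the average word weight is above 0, else 'negative'
correct = 0
for ((filename, text), label), score in zip(reviews, review_scores):
    predicted = 'positive' if score > 0 else 'negative'
    if predicted == label:
        correct += 1
print("Accuracy: {:.3f}".format(correct / len(reviews)))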