#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Felix Muzny
11/8/2022
DS 2000
Lecture 18 - finalized code for sentiment visualization

We're going to start by creating a visualization where the
y-axis is the line number in the lyrics and the x-axis is the
word number within that line.

We'll start by plotting a point where each word exists.

Example:
It's me, hi
I'm the problem, it's me
It's me, hi
I'm the problem, it's me
It's me, hi
Everybody agrees, everybody agrees

Becomes:
x x x
x x x x x
x x x
x x x x x
x x x
x x x x
"""

import matplotlib.pyplot as plt
# for making custom legend later
import matplotlib.lines as mlines

# all the files that we'll use
FILENAME = "antihero.txt"
# eventually we'll use all of these
STOPWORDS = "stopwords.txt"
# UPDATED: use dashes here instead of underscores
POS_WORDS = "positive-words.txt"
NEG_WORDS = "negative-words.txt"

# Deletes the punctuation we strip from words before a lookup: , ' ( )
_PUNCTUATION_TABLE = str.maketrans("", "", ",'()")


def _clean_word(word):
    """Return `word` lowercased with commas, apostrophes and parentheses removed."""
    return word.lower().translate(_PUNCTUATION_TABLE)


def _word_color(word, stopwords, pos_words, neg_words):
    """Return the plot color for an already-cleaned word.

    Stopwords take priority over positive words, which take priority over
    negative words; anything else is grey.
    """
    if word in stopwords:
        return "k"
    if word in pos_words:
        return "r"
    if word in neg_words:
        return "cyan"
    return "grey"


# we'll use this function to read in the lyrics
def read_data(filename):
    """
    Reads in the data in a given file and stores the values
    in a list of lists of strings. Assumes that spaces separate
    words in the given file.

    Parameters
    ----------
    filename : string
        name of the file

    Returns
    -------
    data : list of lists
        list of lists of words for all lines in the file
        (a blank line becomes an empty list)
    """
    # "with" guarantees the file is closed even if reading raises
    with open(filename, "r") as file:
        # we split by whitespace instead of by comma
        return [line.strip().split() for line in file]


# we'll use this function to read in the word lists
def read_data_singlelist(filename):
    """
    Reads in the data in a given file and stores the values
    in a list of strings, one string per line of the file.

    Parameters
    ----------
    filename : string
        name of the file

    Returns
    -------
    data : list of strings
        list of strings for all lines in the file, with leading and
        trailing whitespace removed
    """
    # "with" guarantees the file is closed even if reading raises
    with open(filename, "r") as file:
        return [line.strip() for line in file]


def make_plot(lyrics, stopwords, pos_words, neg_words,
              source_name=FILENAME, output_file="tswift.pdf"):
    """
    Draws one point per word (x = word number within its line,
    y = line number), colored by word category, then saves and
    shows the figure.

    Colors: black for stopwords, red for positive words, cyan for
    negative words and grey for everything else.

    Parameters
    ----------
    lyrics : list of lists of strings
        words (not cleaned) for each line.
    stopwords : list/set
        collection of stopwords.
    pos_words : list/set
        collection of positive words.
    neg_words : list/set
        collection of negative words.
    source_name : string, optional
        name shown in the plot title. Defaults to FILENAME.
    output_file : string, optional
        path the figure is saved to. Defaults to "tswift.pdf".

    Returns
    -------
    None.
    """
    # build sets once so every membership test below is a constant-time
    # lookup instead of a scan through a (potentially long) list
    stopwords = set(stopwords)
    pos_words = set(pos_words)
    neg_words = set(neg_words)

    # must be before we do our plotting!!!
    plt.figure(figsize=(10, 10))

    for line_num, words in enumerate(lyrics):
        # one x position per word in this line
        x_vals = list(range(len(words)))
        # same len as x_vals, but all the same number
        y_vals = [line_num] * len(words)
        colors = [
            _word_color(_clean_word(word), stopwords, pos_words, neg_words)
            for word in words
        ]
        plt.scatter(x_vals, y_vals, color=colors, alpha=0.7)

    # switch the y-axis so the first line of the lyrics is at the top
    plt.gca().invert_yaxis()
    plt.ylabel("Line Number")
    plt.xlabel("Word number in the line")
    plt.title("Visualization for " + source_name)
    plt.savefig(output_file, bbox_inches="tight")
    plt.show()


def sentiment_line(line, pos_words, neg_words):
    """
    Calculate the sentiment score of a single line

    Parameters
    ----------
    line : list of strings
        list of words (not cleaned).
    pos_words : list/set
        list of positive words.
    neg_words : list/set
        list of negative words.

    Returns
    -------
    score : int
        sentiment score for the line: +1 for every positive word and
        -1 for every negative word. A word found in both collections
        counts as positive.
    """
    score = 0
    for raw_word in line:
        word = _clean_word(raw_word)
        if word in pos_words:
            # sentiment goes up by one
            score += 1
        elif word in neg_words:
            score -= 1
    return score


def main():
    """Read the lyrics and word lists, plot them, and print per-line sentiment."""
    lyrics = read_data(FILENAME)
    print(lyrics[:4])
    print(len(lyrics))
    print()

    stopwords = read_data_singlelist(STOPWORDS)
    print(stopwords[:4])
    print(len(stopwords))
    print()

    pos = read_data_singlelist(POS_WORDS)
    print(pos[:4])
    print(len(pos))
    print()

    neg = read_data_singlelist(NEG_WORDS)
    print(neg[:4])
    print(len(neg))
    print()

    # visualize
    make_plot(lyrics, stopwords, pos, neg)

    # sets make the repeated lookups in the loop below constant-time
    pos_set = set(pos)
    neg_set = set(neg)

    # example measuring the sentiment of every line
    for line in lyrics:
        print(line)
        print(sentiment_line(line, pos_set, neg_set))
        print()


if __name__ == "__main__":
    main()