import re
import string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms


def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean
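
# Illustrative usage of process_tweet (a sketch with a made-up tweet; assumes
# the NLTK 'stopwords' corpus has been downloaded, e.g. via
# nltk.download('stopwords'); exact tokens may vary slightly by NLTK version):
#
#   >>> process_tweet("RT @user: Loving the new #NLP course! https://example.org $AAPL :)")
#   ['love', 'new', 'nlp', 'cours', ':)']
#
# The ticker, retweet marker, URL, handle, stopwords, and punctuation are
# stripped; the remaining tokens are lowercased and stemmed.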
def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
            frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs


# Extract the features for a Tweet
def extract_features(tweet, freqs, process_tweet=process_tweet):
    '''
    Input:
        tweet: a string containing one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple
            (word, label)
    Output:
        x: a feature vector of dimension (1, 3)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)

    # 3 elements for [bias, positive, negative] counts
    x = np.zeros(3)

    # bias term is set to 1
    x[0] = 1

    ### START CODE HERE ###
    # loop through each word in the list of words
    for word in word_l:
        # increment the word count for the positive label 1
        x[1] += freqs[(word, 1.0)] if (word, 1.0) in freqs else 0
        # increment the word count for the negative label 0
        x[2] += freqs[(word, 0.0)] if (word, 0.0) in freqs else 0
    ### END CODE HERE ###

    x = x[None, :]  # adding batch dimension for further processing
    assert x.shape == (1, 3)
    return x


def make_count_matrix(n_plus1_gram_counts, vocabulary):
    # add <e> and <unk> to the vocabulary
    # <s> is omitted since it should not appear as the next word
    vocabulary = vocabulary + ["<e>", "<unk>"]

    # obtain unique n-grams
    n_grams = []
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[0:-1]
        n_grams.append(n_gram)
    n_grams = list(set(n_grams))

    # mapping from n-gram to row
    row_index = {n_gram: i for i, n_gram in enumerate(n_grams)}
    # mapping from next word to column
    col_index = {word: j for j, word in enumerate(vocabulary)}

    nrow = len(n_grams)
    ncol = len(vocabulary)
    count_matrix = np.zeros((nrow, ncol))
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[0:-1]
        word = n_plus1_gram[-1]
        if word not in vocabulary:
            continue
        i = row_index[n_gram]
        j = col_index[word]
        count_matrix[i, j] = count

    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
    return count_matrix


def lookup(freqs, word, label):
    '''
    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears.
    '''
    n = 0  # freqs.get((word, label), 0)

    pair = (word, label)
    if pair in freqs:
        n = freqs[pair]

    return n
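
# Illustrative usage of build_freqs / lookup / extract_features (a sketch with
# made-up tweets and labels; assumes the NLTK 'stopwords' corpus is available):
#
#   tweets = ["I am happy :)", "I am sad :("]
#   ys = np.array([[1.0], [0.0]])
#   freqs = build_freqs(tweets, ys)
#   # freqs -> {('happi', 1.0): 1, (':)', 1.0): 1, ('sad', 0.0): 1, (':(', 0.0): 1}
#   lookup(freqs, 'happi', 1.0)                 # -> 1
#   extract_features("happy happy :)", freqs)   # -> array([[1., 3., 0.]])
#
# For make_count_matrix, keys are (n+1)-gram tuples, e.g.:
#   make_count_matrix({('i', 'like'): 2, ('like', 'nlp'): 1}, ['i', 'like', 'nlp'])
#   # -> a DataFrame with rows ('i',), ('like',) and columns
#   #    ['i', 'like', 'nlp', '<e>', '<unk>']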
def confidence_ellipse(x, y, ax, n_std=3.0, facecolor='none', **kwargs):
    """
    Create a plot of the covariance confidence ellipse of `x` and `y`.

    Parameters
    ----------
    x, y : array_like, shape (n, )
        Input data.
    ax : matplotlib.axes.Axes
        The axes object to draw the ellipse into.
    n_std : float
        The number of standard deviations to determine the ellipse's radii.

    Returns
    -------
    matplotlib.patches.Ellipse

    Other parameters
    ----------------
    kwargs : `~matplotlib.patches.Patch` properties
    """
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])
    # Using a special case to obtain the eigenvalues of this
    # two-dimensional dataset.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0), width=ell_radius_x * 2, height=ell_radius_y * 2,
                      facecolor=facecolor, **kwargs)

    # Calculating the standard deviation of x from
    # the square root of the variance and multiplying
    # with the given number of standard deviations.
    scale_x = np.sqrt(cov[0, 0]) * n_std
    mean_x = np.mean(x)

    # Calculating the standard deviation of y ...
    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_y = np.mean(y)

    transf = transforms.Affine2D() \
        .rotate_deg(45) \
        .scale(scale_x, scale_y) \
        .translate(mean_x, mean_y)

    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)
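
# Illustrative demo of confidence_ellipse (a sketch with synthetic data; the
# variable names below are made up for the demo and are not part of this module):
if __name__ == "__main__":
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(0)
    x_demo = rng.normal(size=500)
    y_demo = 0.8 * x_demo + rng.normal(scale=0.5, size=500)

    fig, ax = plt.subplots()
    ax.scatter(x_demo, y_demo, s=3)
    # Draw the 2-sigma covariance confidence ellipse over the scatter plot.
    confidence_ellipse(x_demo, y_demo, ax, n_std=2.0, edgecolor='red')
    plt.show()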