{ "cells": [ { "cell_type": "markdown", "id": "1e661be8-f1ce-412c-b196-20a1ac4ec172", "metadata": {}, "source": [ "# Basic Sentiment Analysis\n", "\n", "We count 'positive' and 'negative' words and use differences or ratios or windowed averages as a measure of sentiment. This is highly problematic!\n", "\n", "\n", "* This food is not very good! (Negation)\n", "* This lavishly produced movie was blllleeeeccchhh! (Unknown words)\n", "* Professor Muzny is the GOAT. (slang)\n", "* Sure I enjoyed my visit to this air bnb but then again I really enjoy the stacatto thrum of jack hammers at 4am. (Sarcasm)" ] }, { "cell_type": "code", "execution_count": 1, "id": "ca7a8eff-7b1d-4043-908c-57c43526f28f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['support', 'rational', 'unselfish', 'miraculousness', 'nicest', 'tidy', 'regal', 'precious', 'abundance', 'frolic', 'reward', 'sophisticated', 'fortitude', 'intriguing', 'problem-solver', 'easiness', 'faithful', 'glorify', 'loves', 'breakthroughs']\n", "['disrupt', 'condemnable', 'dirts', 'heavyhearted', 'alarmingly', 'farcical-yet-provocative', 'polemize', 'spookiest', 'flout', 'steep', 'fret', 'incite', 'sly', 'helplessness', 'fake', 'beguile', 'mope', 'improbable', 'smuttier', 'unfavorable']\n" ] } ], "source": [ "# Read a list of words\n", "def read_words(filename):\n", " with open(filename, 'r') as file:\n", " lines = file.readlines()\n", " return set([w.strip() for w in lines])\n", "\n", "pos = read_words('positive-words.txt')\n", "neg = read_words('negative-words.txt')\n", "\n", "print(list(pos)[:20])\n", "print(list(neg)[:20])" ] }, { "cell_type": "code", "execution_count": 2, "id": "e21bc1ec-440a-4395-b2e5-98ba8ce8f110", "metadata": {}, "outputs": [], "source": [ "# Read and clean some raw text\n", "def text_to_words(filename):\n", " \"\"\" Convert everything to lower case. Remove punctuation,\n", " new lines, and extra white space \"\"\"\n", "\n", " non_letters = \"0123456789!@#$%^&*()_+-=';:.,>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "import dataproc as dp\n", "\n", "def plot_sentiment(sent1, sent2, labels=[\"1\", \"2\"]):\n", " \n", " sz = 100\n", " mavg1 = dp.moving_average(sent1, window_size=sz)\n", " mavg2 = dp.moving_average(sent2, window_size=sz)\n", " #plt.scatter(range(len(sent)),sent, marker='.')\n", " plt.figure(figsize=(8,4), dpi=200)\n", " plt.plot(mavg1, label=labels[0], color='b')\n", " plt.plot(mavg2, label=labels[1], color='r')\n", " plt.title(\"Sentiment analysis comparison\")\n", " plt.xlabel(\"Word #\")\n", " plt.ylabel(\"Sentiment score\")\n", " plt.ylim(-0.2, 0.2)\n", " # plt.xlim(34000, 35000)\n", " plt.grid()\n", " plt.legend()\n", " plt.savefig('sentiment.png')\n", " plt.show()\n", "\n", "plot_sentiment(sent_obama, sent_trump, labels=['obama', 'trump'])" ] }, { "cell_type": "code", "execution_count": 8, "id": "404ac214", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "obama avg wordsize: 4.5579\n", "trump avg wordsize: 4.19308\n" ] } ], "source": [ "print('obama avg wordsize: ', dp.avg([len(w) for w in obama]))\n", "print('trump avg wordsize: ', dp.avg([len(w) for w in trump]))" ] }, { "cell_type": "code", "execution_count": 11, "id": "93d429a5", "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "obama_len = [len(w) for w in obama]\n", "trump_len = [len(w) for w in trump]\n", "bins = 50\n", "plt.hist(obama_len, bins=bins, color='b')\n", "plt.hist(trump_len, bins=bins, color='r')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 10, "id": "ee805559", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('president', 61), ('americans', 54), ('government', 48), ('challenges', 40), ('community', 38), ('information', 37), ('education', 36), ('responsibility', 32), ('administration', 31), ('guantanamo', 29)]\n", "\n", "[('president', 106), ('something', 61), ('immigration', 58), ('everybody', 57), ('thousands', 44), ('countries', 32), ('terrorism', 31), ('tremendous', 29), ('incredible', 29), ('americans', 29)]\n" ] } ], "source": [ "# counting words\n", "from collections import Counter\n", "\n", "min_length = 9\n", "obama_big = [w for w in obama if len(w) >= min_length]\n", "trump_big = [w for w in trump if len(w) >= min_length]\n", "\n", "obama_count = Counter(obama_big)\n", "trump_count = Counter(trump_big)\n", "\n", "print(obama_count.most_common(10))\n", "print()\n", "print(trump_count.most_common(10))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }