{ "cells": [ { "cell_type": "markdown", "id": "1e661be8-f1ce-412c-b196-20a1ac4ec172", "metadata": {}, "source": [ "# Basic Sentiment Analysis\n", "\n", "We count 'positive' and 'negative' words and use differences, ratios, or windowed averages as a measure of sentiment. This is highly problematic! For example:\n", "\n", "\n", "* This food is not very good! (Negation)\n", "* This lavishly produced movie was blllleeeeccchhh! (Unknown words)\n", "* Dr. Strange is the goat (Slang)\n", "* Sure, I enjoyed my visit to this Airbnb, but then again I really enjoy the staccato thrum of jackhammers at 4am. (Sarcasm)" ] }, { "cell_type": "code", "execution_count": 1, "id": "ca7a8eff-7b1d-4043-908c-57c43526f28f", "metadata": {}, "outputs": [], "source": [ "# Read a list of words\n", "def read_words(filename, encoding=None):\n", "    \"\"\" Read a word list from a text file, one word per line.\n", "\n", "    filename: path of the file to read\n", "    encoding: text encoding handed to open(); None keeps the platform default\n", "\n", "    Returns the whitespace-stripped lines in file order. Blank lines are\n", "    skipped so the list never contains an empty-string entry. \"\"\"\n", "\n", "    with open(filename, 'r', encoding=encoding) as file:\n", "        # Strip first so whitespace-only lines are dropped as well\n", "        stripped = (line.strip() for line in file)\n", "        return [word for word in stripped if word]\n", "\n", "pos = read_words('positive-words.txt')\n", "neg = read_words('negative-words.txt')" ] }, { "cell_type": "code", "execution_count": 2, "id": "e21bc1ec-440a-4395-b2e5-98ba8ce8f110", "metadata": {}, "outputs": [], "source": [ "# Read and clean some raw text\n", "def text_to_words(filename):\n", " \"\"\" Convert everything to lower case. 
Remove punctuation,\n", " new lines, and extra white space \"\"\"\n", "\n", " non_letters = \"0123456789!@#$%^&*()_+-=';:.,>" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "import dataproc as dp\n", "\n", "def plot_sentiment(sent1, sent2, labels=(\"1\", \"2\"), window_size=1000,\n", "                   filename='sentiment.png', ylim=(-0.12, 0.12)):\n", "    \"\"\" Plot the moving averages of two sentiment series on one figure.\n", "\n", "    sent1, sent2: sequences handed to dp.moving_average (presumably\n", "        per-word sentiment scores -- confirm against the caller)\n", "    labels: legend labels for sent1 and sent2, in that order\n", "    window_size: window passed to dp.moving_average\n", "    filename: path the figure is saved to before it is shown\n", "    ylim: (bottom, top) limits of the y axis \"\"\"\n", "\n", "    mavg1 = dp.moving_average(sent1, window_size=window_size)\n", "    mavg2 = dp.moving_average(sent2, window_size=window_size)\n", "\n", "    fig, ax = plt.subplots(figsize=(8, 4), dpi=150)\n", "    ax.plot(mavg1, label=labels[0], color='b')\n", "    ax.plot(mavg2, label=labels[1], color='r')\n", "    ax.set_title(\"Sentiment analysis comparison\")\n", "    ax.set_xlabel(\"Word #\")\n", "    ax.set_ylabel(\"Sentiment score\")\n", "    ax.set_ylim(*ylim)\n", "    ax.grid()\n", "    ax.legend()\n", "    # Save before show(): some backends discard the figure once it is shown\n", "    fig.savefig(filename)\n", "    plt.show()\n", "\n", "plot_sentiment(sent_obama, sent_trump, labels = ['obama', 'trump'])" ] }, { "cell_type": "code", "execution_count": null, "id": "4fc5d37f-5659-4f29-b387-604f504c0223", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.9" } }, "nbformat": 4, "nbformat_minor": 5 }