{ "cells": [ { "cell_type": "markdown", "id": "1e661be8-f1ce-412c-b196-20a1ac4ec172", "metadata": {}, "source": [ "# Basic Sentiment Analysis\n", "\n", "We count 'positive' and 'negative' words and use differences, ratios, or windowed averages of those counts as a measure of sentiment. This is highly problematic! Counting words ignores context, so it fails on sentences like these:\n", "\n", "\n", "* This food is not very good! (Negation)\n", "* This lavishly produced movie was blllleeeeccchhh! (Unknown words)\n", "* Professor Muzny is the GOAT. (Slang)\n", "* Sure, I enjoyed my visit to this Airbnb, but then again I really enjoy the staccato thrum of jackhammers at 4 a.m. (Sarcasm)" ] }, { "cell_type": "code", "execution_count": 1, "id": "ca7a8eff-7b1d-4043-908c-57c43526f28f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['support', 'rational', 'unselfish', 'miraculousness', 'nicest', 'tidy', 'regal', 'precious', 'abundance', 'frolic', 'reward', 'sophisticated', 'fortitude', 'intriguing', 'problem-solver', 'easiness', 'faithful', 'glorify', 'loves', 'breakthroughs']\n", "['disrupt', 'condemnable', 'dirts', 'heavyhearted', 'alarmingly', 'farcical-yet-provocative', 'polemize', 'spookiest', 'flout', 'steep', 'fret', 'incite', 'sly', 'helplessness', 'fake', 'beguile', 'mope', 'improbable', 'smuttier', 'unfavorable']\n" ] } ], "source": [ "# Read a list of words\n", "def read_words(filename):\n", " with open(filename, 'r') as file:\n", " lines = file.readlines()\n", " return set([w.strip() for w in lines])\n", "\n", "pos = read_words('positive-words.txt')\n", "neg = read_words('negative-words.txt')\n", "\n", "print(list(pos)[:20])\n", "print(list(neg)[:20])" ] }, { "cell_type": "code", "execution_count": 2, "id": "e21bc1ec-440a-4395-b2e5-98ba8ce8f110", "metadata": {}, "outputs": [], "source": [ "# Read and clean some raw text\n", "def text_to_words(filename):\n", " \"\"\" Convert everything to lower case. 
Remove punctuation,\n", " new lines, and extra white space \"\"\"\n", "\n", " non_letters = \"0123456789!@#$%^&*()_+-=';:.,>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "import dataproc as dp\n", "\n", "def plot_sentiment(sent1, sent2, labels=(\"1\", \"2\"), window_size=100, outfile='sentiment.png'):\n", "    \"\"\"Plot the moving averages of two sentiment-score sequences on one figure.\n", "\n", "    Both sequences are smoothed with dp.moving_average and drawn as lines\n", "    (sent1 in blue, sent2 in red). The y-axis is limited to [-0.2, 0.2], the\n", "    figure is saved to `outfile`, and then it is shown.\n", "\n", "    Parameters:\n", "        sent1, sent2: sentiment scores to compare -- presumably one score per\n", "            word, since the x-axis is labelled 'Word #' (TODO confirm).\n", "        labels: two legend labels, for sent1 and sent2 respectively. The default\n", "            is a tuple rather than a list so it cannot be mutated between calls.\n", "        window_size: window passed to dp.moving_average (default 100).\n", "        outfile: path the figure is written to before it is shown.\n", "    \"\"\"\n", "    mavg1 = dp.moving_average(sent1, window_size=window_size)\n", "    mavg2 = dp.moving_average(sent2, window_size=window_size)\n", "\n", "    plt.figure(figsize=(8, 4), dpi=200)\n", "    plt.plot(mavg1, label=labels[0], color='b')\n", "    plt.plot(mavg2, label=labels[1], color='r')\n", "    plt.title(\"Sentiment analysis comparison\")\n", "    plt.xlabel(\"Word #\")\n", "    plt.ylabel(\"Sentiment score\")\n", "    plt.ylim(-0.2, 0.2)\n", "    plt.grid()\n", "    plt.legend()\n", "    # Save before show(): saving after show() can produce a blank image.\n", "    plt.savefig(outfile)\n", "    plt.show()\n", "\n", "plot_sentiment(sent_obama, sent_trump, labels=['obama', 'trump'])" ] }, { "cell_type": "code", "execution_count": 8, "id": "404ac214", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "obama avg wordsize: 4.5579\n", "trump avg wordsize: 4.19308\n" ] } ], "source": [ "print('obama avg wordsize: ', dp.avg([len(w) for w in obama]))\n", "print('trump avg wordsize: ', dp.avg([len(w) for w in trump]))" ] }, { "cell_type": "code", "execution_count": 11, "id": "93d429a5", "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAjoAAAGdCAYAAAAbudkLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAk0ElEQVR4nO3dfXBUV/3H8c9Ckm1gkishJstOgYlOpGiw2qCB9AHGQgQbsNax1eAOjgzYKQ+mgG2xKrSjCT+qdEajBfyjD2Nr+oeidayRjMUoQ4BMbJQHqTpiE5AllG5uAoUNTc7vD5prNyl5gE02e/J+zdyZ7L3fPfec7F72w9l7c33GGCMAAAALjUt0BwAAAIYLQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYK2URHcgkbq7u/Xf//5XGRkZ8vl8ie4OAAAYBGOMOjo6FAwGNW5c/3M2Yzro/Pe//9XUqVMT3Q0AAHANWlpadOONN/ZbM6aDTkZGhqQrv6jMzMwE9wYAAAxGe3u7pk6d6n2O92dMB52er6syMzMJOgAAJJnBnHbCycgAAMBaBB0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1kpJdAeAGD7fwDXGDH8/AABWYEYHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGAtgg4AALAWQQcAAFiLoAMAAKzFLSAwONyaAQCQhJjRAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsNeSg86c//UlLlixRMBiUz+fTr371q5jtxhht2bJFwWBQ6enpmj9/vo4ePRpTE41GtXbtWmVnZ2vixIlaunSpTp48GVMTiUQUCoXkOI4cx1EoFFJbW1tMTXNzs5YsWaKJEycqOztb69atU2dn51CHBAAALDXkoHPhwgXdfPPNqqqqes/t27Zt0/bt21VVVaWGhgYFAgEtXLhQHR0dXk15ebl2796t6upq7du3T+fPn1dpaam6urq8mrKyMjU1NammpkY1NTVqampSKBTytnd1demuu+7ShQsXtG/fPlVXV+sXv/iFNmzYMNQhIU58vsH9AWUAAEaMuQ6SzO7du73H3d3dJhAImK1bt3rrLl26ZBzHMTt27DDGGNPW1mZSU1NNdXW1V3Pq1Ckzbtw4U1NTY4wx5tixY0aSOXDggFdTX19vJJnjx48bY4x5+eWXzbhx48ypU6e8mp///OfG7/cb13UH1X/XdY2kQdePaVdu8NDv0vPjcO8HADC2DeXzO67n6Jw4cULhcFglJSXeOr/fr3nz5mn//v2SpMbGRl2+fDmmJhgMqqCgwKupr6+X4zgqKiryaubMmSPHcWJqCgoKFAwGvZpPf/rTikajamxsfM/+RaNRtbe3xywAAMBecQ064XBYkpSbmxuzPjc319sWDoeVlpamSZMm9VuTk5PTp/2cnJyYmt77mTRpktLS0rya3iorK71zfhzH0dSpU69hlAAAIFkMy1VXvl4nahhj+qzrrXfNe9VfS827bdq0Sa7rektLS0u/fQIAAMktrkEnEAhIUp8ZldbWVm/2JRAIqLOzU5FIpN+aM2fO9Gn/7NmzMTW99xOJRHT58uU+Mz09/H6/MjMzYxYAAGCvuAadvLw8BQIB1dbWeus6OztVV1en4uJiSVJhYaFSU1Njak6fPq0jR454NXPnzpXrujp06JBXc/DgQbmuG1Nz5MgRnT592qvZs2eP/H6/CgsL4zksAACQpFKG+oTz58/rX//6l/f4xIkTampqUlZWlqZNm6by8nJVVFQoPz9f+fn5qqio0IQJE1RWViZJchxHK1as0IYNGzR58mRlZWVp48aNmjVrlhYsWCBJmjlzphYtWqSVK1dq586dkqRVq1aptLRUM2bMkCS
VlJTowx/+sEKhkJ544gm9+eab2rhxo1auXMlMDQAAuGKol3Tt3bvXSOqzLF++3Bhz5RLzzZs3m0AgYPx+v7njjjvM4cOHY9q4ePGiWbNmjcnKyjLp6emmtLTUNDc3x9ScO3fOLFu2zGRkZJiMjAyzbNkyE4lEYmpef/11c9ddd5n09HSTlZVl1qxZYy5dujTosXB5+RBweTkAYJQYyue3zxhjEpizEqq9vV2O48h1XWaBBjKIvwTo05W30nW9owbzFwfH7lsWAKChfX5zrysAAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYKyXRHcDI8fkGrjFm+PsBAMBIYUYHAABYi6ADAACsxVdXSIirfY3GN2cAgHhiRgcAAFiLoAMAAKxF0AEAANYi6AAAAGtxMvJY8M6Zv/2d6OvjNGAAgIWY0QEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGvFPei8/fbb+ta3vqW8vDylp6frAx/4gB5//HF1d3d7NcYYbdmyRcFgUOnp6Zo/f76OHj0a0040GtXatWuVnZ2tiRMnaunSpTp58mRMTSQSUSgUkuM4chxHoVBIbW1t8R4SAABIUnEPOv/3f/+nHTt2qKqqSn//+9+1bds2PfHEE/rRj37k1Wzbtk3bt29XVVWVGhoaFAgEtHDhQnV0dHg15eXl2r17t6qrq7Vv3z6dP39epaWl6urq8mrKysrU1NSkmpoa1dTUqKmpSaFQKN5DAgAAycrE2V133WW++tWvxqy75557zJe//GVjjDHd3d0mEAiYrVu3etsvXbpkHMcxO3bsMMYY09bWZlJTU011dbVXc+rUKTNu3DhTU1NjjDHm2LFjRpI5cOCAV1NfX28kmePHjw+qr67rGknGdd1rG2yykAZcen4c1jYG0dxg9gMAGNuG8vkd9xmd2267TX/4wx/0j3/8Q5L017/+Vfv27dNnPvMZSdKJEycUDodVUlLiPcfv92vevHnav3+/JKmxsVGXL1+OqQkGgyooKPBq6uvr5TiOioqKvJo5c+bIcRyvBgAAjG0p8W7w4Ycfluu6uummmzR+/Hh1dXXpe9/7nr70pS9JksLhsCQpNzc35nm5ubl6/fXXvZq0tDRNmjSpT03P88PhsHJycvrsPycnx6vpLRqNKhqNeo/b29uvcZQAACAZxH1G58UXX9TPfvYzvfDCC/rLX/6iZ599Vt///vf17LPPxtT5fL6Yx8aYPut6613zXvX9tVNZWemduOw4jqZOnTrYYQEAgCQU96DzjW98Q4888oi++MUvatasWQqFQnrwwQdVWVkpSQoEApLUZ9altbXVm+UJBALq7OxUJBLpt+bMmTN99n/27Nk+s0U9Nm3aJNd1vaWlpeX6BgsAAEa1uAedt956S+PGxTY7fvx47/LyvLw8BQIB1dbWets7OztVV1en4uJiSVJhYaFSU1Njak6fPq0jR454NXPnzpXrujp06JBXc/DgQbmu69X05vf7lZmZGbMAAAB7xf0cnSVLluh73/uepk2bpo985CN69dVXtX37dn31q1+VdOXrpvLyclVUVCg/P1/5+fmqqKjQhAkTVFZWJklyHEcrVqzQhg0bNHnyZGVlZWnjxo2aNWuWFixYIEmaOXOmFi1apJUrV2rnzp2SpFWrVqm0tFQzZsyI97AAAEASinvQ+dGPfqRvf/vbeuCBB9Ta2qpgMKivfe1r+s53vuPVPPTQQ7p
48aIeeOABRSIRFRUVac+ePcrIyPBqnnzySaWkpOjee+/VxYsXdeedd+qZZ57R+PHjvZrnn39e69at867OWrp0qaqqquI9JAAAkKR8xhiT6E4kSnt7uxzHkeu6dn+NNcBJ3pLk05W3wVXfDfFoYxDNGQ28n0HtAABgraF8fnOvKwAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtYYl6Jw6dUpf/vKXNXnyZE2YMEEf+9jH1NjY6G03xmjLli0KBoNKT0/X/PnzdfTo0Zg2otGo1q5dq+zsbE2cOFFLly7VyZMnY2oikYhCoZAcx5HjOAqFQmpraxuOIQEAgCQU96ATiUR06623KjU1Vb/73e907Ngx/eAHP9D73vc+r2bbtm3avn27qqqq1NDQoEAgoIULF6qjo8OrKS8v1+7du1VdXa19+/bp/PnzKi0tVVdXl1dTVlampqYm1dTUqKamRk1NTQqFQvEeEgAASFYmzh5++GFz2223XXV7d3e3CQQCZuvWrd66S5cuGcdxzI4dO4wxxrS1tZnU1FRTXV3t1Zw6dcqMGzfO1NTUGGOMOXbsmJFkDhw44NXU19cbSeb48eOD6qvrukaScV13SGNMOtKAS8+Pw9rGIJobzH4AAGPbUD6/4z6j89JLL2n27Nn6whe+oJycHH384x/XT3/6U2/7iRMnFA6HVVJS4q3z+/2aN2+e9u/fL0lqbGzU5cuXY2qCwaAKCgq8mvr6ejmOo6KiIq9mzpw5chzHq+ktGo2qvb09ZgEAAPaKe9D597//raeeekr5+fn6/e9/r/vvv1/r1q3Tc889J0kKh8OSpNzc3Jjn5ebmetvC4bDS0tI0adKkfmtycnL67D8nJ8er6a2ystI7n8dxHE2dOvX6BgsAAEa1uAed7u5u3XLLLaqoqNDHP/5xfe1rX9PKlSv11FNPxdT5fL6Yx8aYPut6613zXvX9tbNp0ya5rustLS0tgx0WAABIQnEPOlOmTNGHP/zhmHUzZ85Uc3OzJCkQCEhSn1mX1tZWb5YnEAios7NTkUik35ozZ8702f/Zs2f7zBb18Pv9yszMjFkAAIC94h50br31Vr322msx6/7xj39o+vTpkqS8vDwFAgHV1tZ62zs7O1VXV6fi4mJJUmFhoVJTU2NqTp8+rSNHjng1c+fOleu6OnTokFdz8OBBua7r1QAAgLEtJd4NPvjggyouLlZFRYXuvfdeHTp0SLt27dKuXbskXfm6qby8XBUVFcrPz1d+fr4qKio0YcIElZWVSZIcx9GKFSu0YcMGTZ48WVlZWdq4caNmzZqlBQsWSLoyS7Ro0SKtXLlSO3fulCStWrVKpaWlmjFjRryHBQAAktFwXPb1m9/8xhQUFBi/329uuukms2vXrpjt3d3dZvPmzSYQCBi/32/uuOMOc/jw4ZiaixcvmjVr1pisrCyTnp5uSktLTXNzc0zNuXPnzLJly0xGRobJyMgwy5YtM5FIZND95PJyLi8HACSfoXx++4wxJtFhK1Ha29vlOI5c17X7fJ0BTvKWJJ+uvA2u+m6IRxuDaM5o4P0MagcAAGsN5fObe10BAABrEXQAAIC1CDoAAMBacb/qChgVBjq
niPN8AGBMYEYHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGAtgg4AALAWQQcAAFiLoAMAAKzFLSAwJr37DhHcDQIA7MWMDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWSkl0B4DRzucbuMaY4e8HAGDomNEBAADWIugAAABrEXQAAIC1hj3oVFZWyufzqby83FtnjNGWLVsUDAaVnp6u+fPn6+jRozHPi0ajWrt2rbKzszVx4kQtXbpUJ0+ejKmJRCIKhUJyHEeO4ygUCqmtrW24hwQAAJLEsAadhoYG7dq1Sx/96Edj1m/btk3bt29XVVWVGhoaFAgEtHDhQnV0dHg15eXl2r17t6qrq7Vv3z6dP39epaWl6urq8mrKysrU1NSkmpoa1dTUqKmpSaFQaDiHBAAAkokZJh0dHSY/P9/U1taaefPmma9//evGGGO6u7tNIBAwW7du9WovXbpkHMcxO3bsMMYY09bWZlJTU011dbVXc+rUKTNu3DhTU1NjjDHm2LFjRpI5cOCAV1NfX28kmePHjw+qj67rGknGdd3rHe7oduWioH6Xnh+HtY1BNDeY/cRjzENpLl5dAgDEx1A+v4dtRmf16tW66667tGDBgpj1J06cUDgcVklJibfO7/dr3rx52r9/vySpsbFRly9fjqkJBoMqKCjwaurr6+U4joqKiryaOXPmyHEcr6a3aDSq9vb2mAUAANhrWP6OTnV1tf7yl7+ooaGhz7ZwOCxJys3NjVmfm5ur119/3atJS0vTpEmT+tT0PD8cDisnJ6dP+zk5OV5Nb5WVlXrssceGPiAAAJCU4j6j09LSoq9//ev62c9+phtuuOGqdb5ef4XNGNNnXW+9a96rvr92Nm3aJNd1vaWlpaXf/QEAgOQW96DT2Nio1tZWFRYWKiUlRSkpKaqrq9MPf/hDpaSkeDM5vWddWltbvW2BQECdnZ2KRCL91pw5c6bP/s+ePdtntqiH3+9XZmZmzAIAAOwV96Bz55136vDhw2pqavKW2bNna9myZWpqatIHPvABBQIB1dbWes/p7OxUXV2diouLJUmFhYVKTU2NqTl9+rSOHDni1cydO1eu6+rQoUNezcGDB+W6rlcDAADGtrifo5ORkaGCgoKYdRMnTtTkyZO99eXl5aqoqFB+fr7y8/NVUVGhCRMmqKysTJLkOI5WrFihDRs2aPLkycrKytLGjRs1a9Ys7+TmmTNnatGiRVq5cqV27twpSVq1apVKS0s1Y8aMeA8LAAAkoYTc1POhhx7SxYsX9cADDygSiaioqEh79uxRRkaGV/Pkk08qJSVF9957ry5evKg777xTzzzzjMaPH+/VPP/881q3bp13ddbSpUtVVVU14uMBAACjk8+YsXvf5fb2djmOI9d17T5fZxC33/bpytvgqu+GeLQxiOaM4nSr8AH629PXwTTH3csBYHQZyuc397oCAADWSshXV0BSeGcq56qTXFfdAgAYLZjRAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLa66AkYAf4sHABKDGR0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIt7XQHD6Z2bXPV3Gytfv1sBANeDGR0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIu
gAwAArEXQAQAA1iLoAAAAaxF0AACAtVIS3QEMwOcbuMaY4e8HAABJiBkdAABgLYIOAACwFkEHAABYi6ADAACsxcnIQBIZ6Nx0zksHgFjM6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwVtyDTmVlpT7xiU8oIyNDOTk5uvvuu/Xaa6/F1BhjtGXLFgWDQaWnp2v+/Pk6evRoTE00GtXatWuVnZ2tiRMnaunSpTp58mRMTSQSUSgUkuM4chxHoVBIbW1t8R4SAABIUnEPOnV1dVq9erUOHDig2tpavf322yopKdGFCxe8mm3btmn79u2qqqpSQ0ODAoGAFi5cqI6ODq+mvLxcu3fvVnV1tfbt26fz58+rtLRUXV1dXk1ZWZmamppUU1OjmpoaNTU1KRQKxXtIAAAgWZlh1traaiSZuro6Y4wx3d3dJhAImK1bt3o1ly5dMo7jmB07dhhjjGlrazOpqammurraqzl16pQZN26cqampMcYYc+zYMSPJHDhwwKupr683kszx48cH1TfXdY0k47rudY9z2EgDL3FoY8Cm4tHGIJqLy3gH0d9BNTcCbcTr9zaUNgAg2Q3l83vYz9FxXVeSlJWVJUk6ceKEwuGwSkpKvBq/36958+Zp//79kqTGxkZdvnw5piYYDKqgoMCrqa+vl+M4Kioq8mrmzJkjx3G8GgAAMLalDGfjxhitX79et912mwoKCiRJ4XBYkpSbmxtTm5ubq9dff92rSUtL06RJk/rU9Dw/HA4rJyenzz5zcnK8mt6i0aii0aj3uL29/RpHBgAAksGwzuisWbNGf/vb3/Tzn/+8zzafzxfz2BjTZ11vvWveq76/diorK70Tlx3H0dSpUwczDAAAkKSGLeisXbtWL730kvbu3asbb7zRWx8IBCSpz6xLa2urN8sTCATU2dmpSCTSb82ZM2f67Pfs2bN9Zot6bNq0Sa7rektLS8u1DxAAAIx6cQ86xhitWbNGv/zlL/XKK68oLy8vZnteXp4CgYBqa2u9dZ2dnaqrq1NxcbEkqbCwUKmpqTE1p0+f1pEjR7yauXPnynVdHTp0yKs5ePCgXNf1anrz+/3KzMyMWQAAgL3ifo7O6tWr9cILL+jXv/61MjIyvJkbx3GUnp4un8+n8vJyVVRUKD8/X/n5+aqoqNCECRNUVlbm1a5YsUIbNmzQ5MmTlZWVpY0bN2rWrFlasGCBJGnmzJlatGiRVq5cqZ07d0qSVq1apdLSUs2YMSPewwIAAEko7kHnqaeekiTNnz8/Zv3TTz+tr3zlK5Kkhx56SBcvXtQDDzygSCSioqIi7dmzRxkZGV79k08+qZSUFN177726ePGi7rzzTj3zzDMaP368V/P8889r3bp13tVZS5cuVVVVVbyHBAAAkpTPGGMS3YlEaW9vl+M4cl139H6NNcAJ2pKu/AmV62zDJ9N/U/FoYxDNGcVhvP3toGez/tfGtY45Hm28u504DGtQbQBAshvK5zf3ugIAANYi6AAAAGsRdAAAgLWG9S8jAxgZ3rlN/Z3DM4QTeDgXCIAtmNEBAADWIugAAABrEXQAAIC1CDoW8PkG9+d2AAAYawg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1CDoAAMBa3L0cwBXv+vPaV7s5ue+qWwBgdGJGBwAAWIugAwAArMVXVwCGxUA3mjV8CwZgBDCjAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGAtgg4AALAWQQcAAFgrJdEdAGARn8/70Vyt5KpbACD+CDoARq135aarMuQmAP3gqysAAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AA
AAGtxeTmApGP0ruvOr3YJOtedAxAzOgAAwGIEHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1uLycgBjXjzuks6d1oHRiaADYOx6J51cLX/4rroFQLLgqysAAGCtpA86P/nJT5SXl6cbbrhBhYWF+vOf/5zoLgHANfH5Bl4ADE1SB50XX3xR5eXlevTRR/Xqq6/q9ttv1+LFi9Xc3JzorgEYK95JIEbvvQBILJ8xyXt6XFFRkW655RY99dRT3rqZM2fq7rvvVmVl5YDPb29vl+M4cl1XmZmZw9nVazeI/8L1nEdw1VdytLQxiOYG9cFwPTvo2fyucy+udczxaOPd7ST898b7pO/mkXqfDK6pQbUBjAVD+fxO2pOROzs71djYqEceeSRmfUlJifbv3/+ez4lGo4pGo95j13UlXfmFDQvH6X/7O/u/flf6f33DGB1tDOqpcXm9/tfGtTcXjzb+104y/d7G2nivr7lBtPHOvxX9/YvgqOffq4H3OGL/9AAJ0vO5PZi5mqQNOm+88Ya6urqUm5sbsz43N1fhcPg9n1NZWanHHnusz/qpU6cOSx8HNNC/RoNvKA7NjY42BvXUuPze/tfGtTcXjzb+104y/d7G2nivr7nR8z6JTz+A0aOjo0POAG/opA06PXy95nqNMX3W9di0aZPWr1/vPe7u7tabb76pyZMnX/U516q9vV1Tp05VS0vL6P1abAzgdRgdeB1GB16H0YHX4foZY9TR0aFgMDhgbdIGnezsbI0fP77P7E1ra2ufWZ4efr9ffr8/Zt373ve+4eqiJCkzM5M38ijA6zA68DqMDrwOowOvw/UZaCanR9JedZWWlqbCwkLV1tbGrK+trVVxcXGCegUAAEaTpJ3RkaT169crFApp9uzZmjt3rnbt2qXm5mbdf//9ie4aAAAYBZI66Nx33306d+6cHn/8cZ0+fVoFBQV6+eWXNX369ER3TX6/X5s3b+7zVRlGFq/D6MDrMDrwOowOvA4jK6n/jg4AAEB/kvYcHQAAgIEQdAAAgLUIOgAAwFoEHQAAYC2CzjD4yU9+ory8PN1www0qLCzUn//850R3aUzZsmWLfD5fzBIIBBLdLev96U9/0pIlSxQMBuXz+fSrX/0qZrsxRlu2bFEwGFR6errmz5+vo0ePJqazFhvodfjKV77S5/iYM2dOYjprscrKSn3iE59QRkaGcnJydPfdd+u1116LqeGYGBkEnTh78cUXVV5erkcffVSvvvqqbr/9di1evFjNzc2J7tqY8pGPfESnT5/2lsOHDye6S9a7cOGCbr75ZlVVVb3n9m3btmn79u2qqqpSQ0ODAoGAFi5cqI6OjhHuqd0Geh0kadGiRTHHx8svvzyCPRwb6urqtHr1ah04cEC1tbV6++23VVJSogsXLng1HBMjxCCuPvnJT5r7778/Zt1NN91kHnnkkQT1aOzZvHmzufnmmxPdjTFNktm9e7f3uLu72wQCAbN161Zv3aVLl4zjOGbHjh0J6OHY0Pt1MMaY5cuXm89+9rMJ6c9Y1traaiSZuro6YwzHxEhiRieOOjs71djYqJKSkpj1JSUl2r9/f4J6NTb985//VDAYVF5enr74xS/q3//+d6K7NKadOHFC4XA45tjw+/2aN28ex0YC/PGPf1ROTo4+9KEPaeXKlWptbU10l6znuq4kKSsrSxLHxEgi6MTRG2+8oa6urj43Fc3Nze1z81EMn6KiIj333HP6/e9/r5/+9KcKh8MqLi7WuXPnEt21Mavn/c+xkXiLFy/W888/r1deeUU/+MEP1NDQoE996lOKRqOJ7pq1jDFav369brvtNhUUFEjimBhJSX0LiNHK5/PFPDbG9FmH4bN48WLv51mzZmnu3Ln64Ac/qGeffVbr169PYM/AsZF49913n/dzQUGBZs+erenTp+u3v/2t7rnnngT2zF5r1qzR3/72N+3bt6/PNo6J4ceMThxlZ2dr/PjxfdJ4a2trn9SOkTNx4kT
NmjVL//znPxPdlTGr56o3jo3RZ8qUKZo+fTrHxzBZu3atXnrpJe3du1c33nijt55jYuQQdOIoLS1NhYWFqq2tjVlfW1ur4uLiBPUK0WhUf//73zVlypREd2XMysvLUyAQiDk2Ojs7VVdXx7GRYOfOnVNLSwvHR5wZY7RmzRr98pe/1CuvvKK8vLyY7RwTI4evruJs/fr1CoVCmj17tubOnatdu3apublZ999/f6K7NmZs3LhRS5Ys0bRp09Ta2qrvfve7am9v1/LlyxPdNaudP39e//rXv7zHJ06cUFNTk7KysjRt2jSVl5eroqJC+fn5ys/PV0VFhSZMmKCysrIE9to+/b0OWVlZ2rJliz7/+c9rypQp+s9//qNvfvObys7O1uc+97kE9to+q1ev1gsvvKBf//rXysjI8GZuHMdRenq6fD4fx8RISeg1X5b68Y9/bKZPn27S0tLMLbfc4l1OiJFx3333mSlTppjU1FQTDAbNPffcY44ePZrobllv7969RlKfZfny5caYK5fTbt682QQCAeP3+80dd9xhDh8+nNhOW6i/1+Gtt94yJSUl5v3vf79JTU0106ZNM8uXLzfNzc2J7rZ13us1kGSefvppr4ZjYmT4jDFm5OMVAADA8OMcHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACs9f8J3A/OzIoOkAAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Compare the word-length distributions of the obama and trump word lists\n", "obama_len = [len(w) for w in obama]\n", "trump_len = [len(w) for w in trump]\n", "\n", "# Word lengths are integers, so use one shared set of unit-wide bins centred on\n", "# each length. Letting each hist() call compute its own 50 bins gives the two\n", "# histograms different edges, so their bars are not comparable.\n", "max_len = max(obama_len + trump_len, default=0)\n", "bins = [k - 0.5 for k in range(max_len + 2)]\n", "\n", "# alpha < 1 keeps the first histogram visible where the second overlaps it\n", "plt.hist(obama_len, bins=bins, color='b', alpha=0.5, label='obama')\n", "plt.hist(trump_len, bins=bins, color='r', alpha=0.5, label='trump')\n", "plt.title('Word length distribution')\n", "plt.xlabel('Word length (characters)')\n", "plt.ylabel('Number of words')\n", "plt.legend()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 10, "id": "ee805559", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('president', 61), ('americans', 54), ('government', 48), ('challenges', 40), ('community', 38), ('information', 37), ('education', 36), ('responsibility', 32), ('administration', 31), ('guantanamo', 29)]\n", "\n", "[('president', 106), ('something', 61), ('immigration', 58), ('everybody', 57), ('thousands', 44), ('countries', 32), ('terrorism', 31), ('tremendous', 29), ('incredible', 29), ('americans', 29)]\n" ] } ], "source": [ "# counting words\n", "from collections import Counter\n", "\n", "min_length = 9\n", "obama_big = [w for w in obama if len(w) >= min_length]\n", "trump_big = [w for w in trump if len(w) >= min_length]\n", "\n", "obama_count = Counter(obama_big)\n", "trump_count = Counter(trump_big)\n", "\n", "print(obama_count.most_common(10))\n", "print()\n", "print(trump_count.most_common(10))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }