import re from collections import Counter import numpy as np import pandas as pd from utils import extract_featuers nltk.download('twitter_samples') nltk.download('stopwords') #@title Q1 Process Data def process_data(file_name): """ Input: A file_name which is found in your current directory. You just have to read it in. Output: wordprobs: a dictionary where keys are all the processed lowercase words and the values are the frequency that it occurs in the corpus (text file you read). """ words = [] # return this variable correctly ### START CODE HERE ### #Open the file, read its contents into a string variable data = None # convert all letters to lower case lower_case_data = None # with words as keys, ensure that we have a count of their occurrences words = dict() ### END CODE HERE ### return words #@title Q2: N-gram most likely next words def probable_substitutes(word, probs, maxret = 10): """ Determine the most probable words for a misspelled string that are TWO edits away. The edits that are possible are: * delete_letter: removing a character * switch_letters: switching two adjacent characters * replace_letter: replacing one character by another different one * insert_letter: inserting a character There may be fewer but no more than maxret words. Input: word - The misspelled word probs - A dictionary of word --> prob maxret - Maximum number of words to return Returns: Tuples of the words and their probabilities, ordered by highest frequency. [(word1, prob1), ... ] """ def example_function(word): """ To make your code modular, feel free to add sub-functions (e.g., like the one we've templated below). """ return None return [("hello", 0.5), ("world", 0.3)] #@title Q3: The Minimum Edit Distnance def min_edit_distance(source, target, ins_cost = 1, del_cost = 1, rep_cost = 2): ''' Input: source: starting string target: ending string ins_cost: integer representing insert cost del_cost: integer representing delete cost rep_cost: integer representing replace cost Output: D: matrix of size (len(source)+1 , len(target)+1) with minimum edit distances med: the minimum edit distance required to convert source to target ''' return D, med