import nltk nltk.download('punkt') nltk.download('punkt_tab') nltk.data.path.append('.') class SpecialTokens: """ Class of special tokens """ def __init__(self, start_token = "", end_token = "", unknown_token = ""): self.start_token = start_token self.end_token = end_token self.unknown_token = unknown_token class NGramModel(): """ This class holds your n-gram model and all its parameters, which include: - previous_n_gram: the last n-gram that appeared prior to the current one - n-gram_counts: a dictionary specifying the context words """ def __init__(self, n_gram_counts, n_plus1_gram_counts, vocabulary, special_tokens, k=1): self.n_gram_counts = n_gram_counts self.n_plus1_gram_counts = n_plus1_gram_counts self.vocabulary = vocabulary self.special_tokens = special_tokens self.k = k #@title Question 1 def preprocess_data(filename, count_threshold, special_tokens, sample_delimiter='\n', split_ratio=0.8): """ Ungraded: You do not need to change this function. Preprocess data, i.e., - Find tokens that appear at least N times in the training data. - Replace tokens that appear less than N times by "" . Args: count_threshold: Words whose count is less than this are treated as unknown. Returns: training_data = list of lists denoting tokenized sentence. This looks like the following: [ ["this", "", "example"], ["another", "sentence", "", "right"], ... ] test_data = Same format as above. vocabulary = list of vocabulary words. This looks like the following: ["vocab-word-1", "vocab-word-2", etc.] """ # Create sentences and tokenize the data to create a list of strings. tokenized_data = read_and_tokenize_sentences(filename, sample_delimiter) # Create the training / test splits train_size = int(len(tokenized_data) * split_ratio) train_data = tokenized_data[0:train_size] test_data = tokenized_data[train_size:] # Get the closed vocabulary using the train data vocabulary = get_words_with_nplus_frequency(train_data, count_threshold) # For the train data, replace less common words with unknown token train_data_replaced = replace_oov_words_by_unk( train_data, vocabulary, unknown_token = special_tokens.unknown_token) # For the test data, replace less common words with "" test_data_replaced = replace_oov_words_by_unk( test_data, vocabulary, unknown_token = special_tokens.unknown_token) return train_data_replaced, test_data_replaced, vocabulary def preprocess_data_test(): """ Ungraded: You can use this function to test out preprocess_data. """ tmp_train = "the sky is blue.\nleaves are green.\nsmell all the roses." tmp_test = "roses are red." with open('tmp_data.txt', 'w') as f: f.write(str(tmp_train) + '\n') f.write(str(tmp_test) + '\n') special_tokens = SpecialTokens() count_threshold = 1 tmp_train_repl, tmp_test_repl, tmp_vocab = preprocess_data( "tmp_data.txt", count_threshold, special_tokens, split_ratio = 0.75) assert tmp_test_repl == [['roses', 'are', '', '.']] or \ tmp_test_repl == [[special_tokens.start_token, 'roses', 'are', '', special_tokens.end_token]] or \ tmp_test_repl == [[special_tokens.start_token, 'roses', 'are', '', '.', special_tokens.end_token]], \ print("tmp_test_repl is not correct") assert tmp_train_repl == [['the', 'sky', 'is', 'blue', '.'], ['leaves', 'are', 'green', '.'], ['smell', 'all', 'the', 'roses', '.']] or \ tmp_train_repl == [[special_tokens.start_token, 'the', 'sky', 'is', 'blue', special_tokens.end_token], [special_tokens.start_token, 'leaves', 'are', 'green', special_tokens.end_token], [special_tokens.start_token, 'smell', 'all', 'the', 'roses', special_tokens.end_token]] or \ tmp_train_repl == [[special_tokens.start_token, 'the', 'sky', 'is', 'blue', '.', special_tokens.end_token], [special_tokens.start_token, 'leaves', 'are', 'green', '.', special_tokens.end_token], [special_tokens.start_token, 'smell', 'all', 'the', 'roses', '.', special_tokens.end_token]], \ print("tmp_train_repl is not correct") print("\033[92m Successful test") return #@title Q1.1 Read / Tokenize Data from Sentences def read_and_tokenize_sentences(filename, sample_delimiter="\n"): """ Args: - filename = (e.g., "en_US.twitter.txt") - sample_delimiter = delimits each sample (i.e., each tweet) Example usage: $ read_and_tokenize_sentences(sentences) [['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green'], ['roses', 'are', 'red', '.']]A You can use nltk's tokenize function here. nltk.word_tokenize(sentence) """ # return None def get_words_with_nplus_frequency(train_data, count_threshold): # return None #@title Q1.2 Replace OOV Words with Special Token def replace_oov_words_by_unk(data, vocabulary, unknown_token=""): # return None #@title Q2 Count N-Grams def count_n_grams(data, n, special_tokens): """ Count all n-grams in the data Args: data: List of lists of words n: Number of words in a sequence special_tokens: A structure that contains: - start_token = "" - end_token = "" - unknown_token = "unk" Returns: A dictionary that maps a tuple of n-words to its frequency """ # Initialize dictionary of n-grams and their counts n_grams = {} # return n_grams def count_n_grams_test(): """ Ungraded: You can use this function to test out count_n_grams. """ tmp_data = "i like a cat\nthis dog is like a cat" with open('tmp_data.txt', 'w') as f: f.write(tmp_data + '\n') sentences, _, _ = preprocess_data( "tmp_data.txt", 0, SpecialTokens(), split_ratio = 1.0) received = count_n_grams(sentences, 2, SpecialTokens()) expected = { ('', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', ''): 2, ('', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1} assert received == expected, print("Received: \n", received, "\n\nExpected: \n", expected) print("\033[92m Successful test") return count_n_grams_test() #@title Q3 Estimate the Probabilities def estimate_probabilities(context_tokens, ngram_model): """ Estimate the probabilities of a next word using the n-gram counts with k-smoothing Args: word: next word previous_n_gram: A sequence of words of length n ngram_model: a structure that contains: - n_gram_counts: Dictionary of counts of n-grams - n_plus1_gram_counts: Dictionary of counts of (n+1)-grams - vocabulary_size: number of words - k: positive constant, smoothing parameter Returns: A dictionary mapping from next words to probability """ probabilities = {} # return probabilities def estimate_probabilities_test(): """ Ungraded: You can use this function to test out estimate_probabilities. """ tmp_data = "i like a cat\nthis dog is like a cat" with open('tmp_data.txt', 'w') as f: f.write(tmp_data + '\n') sentences, _, vocabulary = preprocess_data( "tmp_data.txt", 0, SpecialTokens(), split_ratio = 1.0) # unique_words = list(set(sentences[0] + sentences[1])) unigram_counts = count_n_grams(sentences, 1, SpecialTokens()) bigram_counts = count_n_grams(sentences, 2, SpecialTokens()) ngram_model = NGramModel(unigram_counts, bigram_counts, vocabulary, SpecialTokens(), k=1) expected = {'i': 0.09090909090909091, 'like': 0.09090909090909091, 'a': 0.09090909090909091, 'cat': 0.2727272727272727, 'this': 0.09090909090909091, 'dog': 0.09090909090909091, 'is': 0.09090909090909091, '': 0.09090909090909091, '': 0.09090909090909091} assert estimate_probabilities(["a"], ngram_model) == expected, \ print("estimate_probabilities is not correct") print("\033[92m Successful test") #@title Q4 Inference def predict_next_word(sentence_beginning, model): """ Args: sentence_beginning: a string model: an NGramModel object Returns: next_word = a string with the next word that his most likely to appear after the sentence_beginning input based ont he model. (You do not need to add in any top K or random sampling.) probability = corresponding probability of that word """ # return None #@title Q5 Extra Credit class StyleGram: def __init__(self, style_files): """ We will only be passing style_files in. All your processing and training should be done by the time this function retunrs. """ self.style_files = style_files # return def write_in_style_ngram(self, passage): """ Takes a passage in, matches it with a style, given a list of filenames, and predicts the next word that will appear using a bigram model. Args: passage: A string that contains a passage style_file: a list of filenames to be used to determine the style Returns: single word probability associated with the word index of "style" it originated from (e.g., 0 for 1st file) probability associated with the style """ # return word, probability_word, style_file, probability_style # Example Usage if __name__ == "__main__": # Create an instance of your NGramModel here, using your training data special_tokens = SpecialTokens() count_threshold = 10 train_data_replaced, test_data_replaced, vocabulary = preprocess_data( "data/en_US.twitter.txt", count_threshold, special_tokens ) # n=2 unigram_counts = count_n_grams(train_data_replaced, 1, special_tokens) bigram_counts = count_n_grams(train_data_replaced, 2, special_tokens) ngram_model = NGramModel(unigram_counts, bigram_counts, vocabulary, special_tokens, k=1) partial_sentence = "i love" # Example partial sentence predicted_word = predict_next_word(partial_sentence, ngram_model) print(f"The predicted next word for '{partial_sentence}' is: {predicted_word}")