import re

import pandas as pd

# Matches any run of whitespace (spaces, newlines, carriage returns, tabs).
_WHITESPACE_RUN = re.compile(r"\s+")


def _load_split(json_path):
    """Read one JSON dataset file and return it without its ``id`` column."""
    frame = pd.read_json(json_path)
    # Raises KeyError if the file has no 'id' column, same as the original
    # in-place drop did.
    return frame.drop(columns=["id"])


def get_train_test_data(data_dir):
    """Load the train and test splits stored under *data_dir*.

    Reads ``{data_dir}/train.json`` and ``{data_dir}/test.json`` with
    ``pandas.read_json`` and removes the ``id`` column from each.

    Args:
        data_dir: Directory (string or path-like) containing ``train.json``
            and ``test.json``.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames.
    """
    train_data = _load_split(f"{data_dir}/train.json")
    test_data = _load_split(f"{data_dir}/test.json")
    return train_data, test_data


def _preprocess_text(text):
    """Lowercase *text*, normalize its whitespace and wrap it in SOS/EOS tokens.

    Newlines, carriage returns, tabs and repeated spaces are all collapsed to
    a single space, and leading/trailing whitespace is removed. The previous
    ``' '.join(s.split(' '))`` was a no-op and left double spaces in place.
    """
    collapsed = _WHITESPACE_RUN.sub(" ", text.lower()).strip()
    return "[SOS] " + collapsed + " [EOS]"


def preprocess(input_data):
    """Normalize the ``dialogue`` and ``summary`` columns of *input_data*.

    Each value is lowercased, whitespace-normalized and wrapped as
    ``"[SOS] <text> [EOS]"``.

    Note:
        The two columns of *input_data* are overwritten in place, so calling
        this twice on the same frame wraps the text in the tokens twice.

    Args:
        input_data: DataFrame with string columns ``dialogue`` and
            ``summary``.

    Returns:
        A ``(document, summary)`` tuple of Series: the processed ``dialogue``
        column and the processed ``summary`` column, in that order.
    """
    # Series.map works per value; unlike DataFrame.apply(axis=1) it does not
    # build a row object per record and it behaves sanely on an empty frame.
    input_data["summary"] = input_data["summary"].map(_preprocess_text)
    input_data["dialogue"] = input_data["dialogue"].map(_preprocess_text)

    document = input_data["dialogue"]
    summary = input_data["summary"]
    return document, summary