# cleaning the texts
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')


def clean_text(training_data, testing_data):
    ps = PorterStemmer()
    # build the stopword set once instead of rebuilding it for every word
    stop_words = set(stopwords.words('english'))

    def portstem(review):
        # stem each token, dropping English stopwords
        return [ps.stem(word) for word in review if word not in stop_words]

    # clean training and testing reviews together so both get identical processing
    new_data = pd.concat([training_data, testing_data], ignore_index=True)

    # keep letters only, lowercase, tokenize, stem, then rejoin into a single string
    processed_reviews = new_data['Review'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
    processed_reviews = processed_reviews.map(lambda x: x.lower())
    processed_reviews = processed_reviews.map(lambda x: x.split())
    processed_reviews = processed_reviews.map(portstem)
    processed_reviews = processed_reviews.map(lambda x: ' '.join(x))

    # split the cleaned reviews back into the original train/test portions by position
    training_corpus = processed_reviews.iloc[:len(training_data)]
    testing_corpus = processed_reviews.iloc[len(training_data):]
    return training_corpus, testing_corpus
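

# Example usage: a minimal sketch of how clean_text might be called. The file names,
# the tab-separated format, and the read_csv options are assumptions (not confirmed by
# this file); the only requirement implied above is a 'Review' column in each DataFrame.
if __name__ == '__main__':
    # hypothetical dataset paths; replace with the project's actual files
    train_df = pd.read_csv('train_reviews.tsv', delimiter='\t', quoting=3)
    test_df = pd.read_csv('test_reviews.tsv', delimiter='\t', quoting=3)

    train_corpus, test_corpus = clean_text(train_df, test_df)
    print(train_corpus.head())
    print(test_corpus.head())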