Data can be downloaded from -
# Step 1 - Loading required libraries
import os # to check working path
from sklearn.datasets import load_files # load_files automatically labels classes when input data is present in different folders
import re # for regular expressions
import nltk # for nlp
from nltk.stem import WordNetLemmatizer # to use WordNet dataset for stemming
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer # get tf-idf values
from sklearn.model_selection import train_test_split # to split testand train dataset
from sklearn.ensemble import RandomForestClassifier # for classification
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle # to save model
# Step 2 - Loading data
# Load the review corpus from disk; load_files assigns a class label per
# sub-folder of the given directory.
corpus_dir = "C:\\D\\Learning\\Sentiment Analysis usinf sklearn\\txt_sentoken"
movie_data = load_files(corpus_dir)

# X holds the raw documents, y the integer class label of each document.
X = movie_data.data
y = movie_data.target
# Step 3 - Data preprocessing and conversion to tf-idf values (each document becomes a vector holding the tf-idf value of every word in the vocabulary)
# Patterns are compiled once because they are applied to every document.
_NON_WORD = re.compile(r'\W')                # anything that is not a letter, digit or "_"
_SINGLE_LETTER = re.compile(r'\b[a-zA-Z]\b')  # a letter standing alone as a whole token


def _clean_document(raw, lemmatizer):
    """Return one raw review as a lowercase, lemmatised, space-separated string.

    raw        -- a single document, either bytes or str.
    lemmatizer -- object exposing ``lemmatize(word)`` (a WordNetLemmatizer here).
    """
    # Decode bytes properly instead of calling str() on them: str(b"...")
    # yields the "b'...'" repr, which leaks a leading "b" and literal escape
    # sequences such as "\\n" into the text.
    # NOTE(review): assumes UTF-8 compatible input; undecodable bytes are
    # replaced and subsequently treated as separators - confirm for the corpus.
    if isinstance(raw, bytes):
        text = raw.decode("utf-8", errors="replace")
    else:
        text = str(raw)

    text = _NON_WORD.sub(" ", text)       # punctuation / special characters -> space
    text = _SINGLE_LETTER.sub(" ", text)  # drop stand-alone single letters

    # Lowercase BEFORE splitting so the lemmatiser sees normalised tokens
    # (previously the lowercased string was computed and then discarded).
    # split() with no argument also collapses runs of whitespace.
    tokens = text.lower().split()
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)


# One lemmatiser instance is shared by all documents rather than rebuilt per document.
lemmatizer = WordNetLemmatizer()
new_X = [_clean_document(data, lemmatizer) for data in X]

# Fit the tf-idf vocabulary on the cleaned corpus and vectorise it.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(new_X)
# Dense copy kept under the name X_arr because the train/test split reads it.
X_arr = X.toarray()
# Step 4 - Creating the train and test sets and fitting the classifier
# Hold out 20% of the documents for evaluation. The split is seeded so that
# the reported metrics are reproducible from run to run, in line with the
# seeded classifier below.
X_train, X_test, y_train, y_test = train_test_split(
    X_arr, y, test_size=0.2, random_state=0
)

# Random forest with 1000 trees and a fixed seed.
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
# Step 5 - Model evaluation
# Model evaluation on the training data.
y_predicted = classifier.predict(X_train)
cf = confusion_matrix(y_train, y_predicted)
print(cf)  # previously computed but never shown, unlike the test-set matrix
print(classification_report(y_train, y_predicted))

# Model evaluation on the held-out test data.
y_test_predicted = classifier.predict(X_test)
print(confusion_matrix(y_test, y_test_predicted))
print(classification_report(y_test, y_test_predicted))
# Step 6 - Storing the model and loading it again
# Serialise the fitted classifier to the file 'text_classifier'.
# NOTE(review): only the classifier is written here; the fitted vectorizer is
# not persisted by this step - confirm whether it needs saving as well.
with open('text_classifier', 'wb') as sink:
    pickle.dump(classifier, sink)

# Read the classifier back from the same file into `model`.
with open('text_classifier', 'rb') as source:
    model = pickle.load(source)
# Step 7 - Testing on a new document
# Read the new review inside a context manager so the file handle is always
# closed (it was previously left open).
# NOTE(review): readlines() returns one entry per line, so each line of the
# file is vectorised and classified as a separate document - confirm that
# per-line predictions are intended.
with open("nerw_review.txt", "r") as file1:
    data_file = file1.readlines()

# transform (not fit_transform) reuses the vocabulary learned from the training corpus.
# NOTE(review): the new text is not passed through the cleaning/lemmatisation
# applied to the training documents - verify this mismatch is acceptable.
X1 = vectorizer.transform(data_file)

# Score with `model`, the classifier reloaded from 'text_classifier', and
# print the result; ndarray.view() only returned a view and displayed nothing
# when run as a script.
predict_review = model.predict(X1)
print(predict_review)
Comments