Sentiment analysis in text
Get dataset
import os
import urllib.request
import tarfile
import pandas as pd

# Download and extract the IMDb movie review dataset if it is not
# already available under /tmp/aclImdb
source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = '/tmp/aclImdb_v1.tar.gz'

if not os.path.isdir('/tmp/aclImdb') and not os.path.isfile(target):
    urllib.request.urlretrieve(source, target)

if not os.path.isdir('/tmp/aclImdb'):
    with tarfile.open(target, 'r:gz') as tar:
        tar.extractall(path='/tmp/')
# Build dataframe: read each review file and record its text and label
basepath = '/tmp/aclImdb'
labels = {'pos': 1, 'neg': 0}

rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file),
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[l]])

# DataFrame.append was removed in pandas 2.0; collecting the rows in a
# list and building the frame once is also much faster
df = pd.DataFrame(rows, columns=['review', 'sentiment'])
df
# Shuffle the dataset; reset the index so that the label-based slices
# used for the train/test split below follow the shuffled row order
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index)).reset_index(drop=True)
df.to_csv('/tmp/movie_data.csv', index=False, encoding='utf-8')
df.head(3)
Cleaning text with regex
import re

def preprocessor(text):
    # Strip HTML markup (anything between < and >)
    text = re.sub('<[^>]*>', '', text)
    # Capture emoticons: an eye (:, ; or =), an optional nose (-),
    # and a mouth ( ), (, D or P ) -- try it out at https://regexr.com/
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove all non-word characters, lowercase the text, and append
    # the emoticons (with the "nose" stripped) at the end
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text
preprocessor('is seven.<br /><br />Title (Brazil): Not Available')
preprocessor('</a>This :) is :( a test :-)!')
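For reference, given the patterns above, the two calls should come out roughly like this:
# 'is seven title brazil not available'
# 'this is a test :) :( :)'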
df['review'] = df['review'].apply(preprocessor)
df.head(3)
Processing documents into tokens
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# The stop-word list has to be downloaded once
nltk.download('stopwords')

porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

stop = stopwords.words('english')
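As a quick sanity check (not part of the pipeline itself), the two tokenizers and the stop-word list can be tried on a toy sentence; the commented lines show the expected output:
tokenizer('runners like running and thus they run')
# ['runners', 'like', 'running', 'and', 'thus', 'they', 'run']
tokenizer_porter('runners like running and thus they run')
# ['runner', 'like', 'run', 'and', 'thu', 'they', 'run']
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]
# ['runner', 'like', 'run', 'run', 'lot']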
Train logistic regression model
# .loc slicing is inclusive of the end label, so the training split
# stops at 24999 to keep the two halves disjoint (25,000 rows each)
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
# The reviews were already lower-cased and cleaned by `preprocessor`
# above, so the vectorizer's own lowercasing/preprocessing is disabled
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)
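If the tf-idf weighting itself is unfamiliar, here is a minimal sketch on a made-up three-sentence corpus (independent of the pipeline) showing what the vectorizer produces:
demo_docs = ['the sun is shining',
             'the weather is sweet',
             'the sun is shining, the weather is sweet']
demo_vect = TfidfVectorizer()
print(demo_vect.get_feature_names_out())
print(demo_vect.fit_transform(demo_docs).toarray().round(2))
Each row is a document, each column a vocabulary term, and frequent-everywhere words like "the" and "is" get down-weighted relative to rarer ones.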
param_grid = [{'vect__ngram_range': [(1, 1)],                      # 1
               'vect__stop_words': [stop, None],                   # 2
               'vect__tokenizer': [tokenizer, tokenizer_porter],   # 2
               'clf__penalty': ['l1', 'l2'],                       # 2
               'clf__C': [1.0, 10.0, 100.0]},                      # 3
              {'vect__ngram_range': [(1, 1)],                      # 1
               'vect__stop_words': [stop, None],                   # 2
               'vect__tokenizer': [tokenizer, tokenizer_porter],   # 2
               'vect__use_idf': [False],                           # 1
               'vect__norm': [None],                               # 1
               'clf__penalty': ['l1', 'l2'],                       # 2
               'clf__C': [1.0, 10.0, 100.0]},                      # 3
              ]
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=2,
                           n_jobs=-1)
Each of the two sub-grids covers 2 × 2 × 2 × 3 = 24 parameter combinations, so with 5-fold cross-validation there are 2 × 2 × 2 × 3 × 5 + 2 × 2 × 2 × 3 × 5 = 240 models to fit. THIS TAKES SO LONG!!!!
gs_lr_tfidf.fit(X_train, y_train)
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))
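As a usage example (a minimal sketch with a made-up review, not part of the original run), the best estimator can score unseen text directly, since the pipeline handles vectorization internally:
example = ['I went in with low expectations, but this movie was a pleasant surprise']
print('Prediction:', clf.predict(example))           # 1 = positive, 0 = negative
print('Probability:', clf.predict_proba(example))    # class probabilities [neg, pos]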