Sentiment analysis in text

Get dataset

import os
import urllib.request
import tarfile
import pandas as pd

source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = '/tmp/aclImdb_v1.tar.gz'


if not os.path.isdir('/tmp/aclImdb') and not os.path.isfile('/tmp/aclImdb_v1.tar.gz'):
    urllib.request.urlretrieve(source, target)
    
if not os.path.isdir('/tmp/aclImdb'):
    with tarfile.open(target, 'r:gz') as tar:
        tar.extractall(path='/tmp/')
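
As a quick sanity check (assuming the archive extracted cleanly), each split of the aclImdb corpus should contain 12,500 positive and 12,500 negative reviews:

for s in ('train', 'test'):
    for l in ('pos', 'neg'):
        n = len(os.listdir(os.path.join('/tmp/aclImdb', s, l)))
        print(s, l, n)  # expect 12500 files per folder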

Build dataframe

basepath = '/tmp/aclImdb'

labels = {'pos': 1, 'neg': 0}
rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file),
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            # collect rows in a list; DataFrame.append was removed in pandas 2.0
            rows.append([txt, labels[l]])
df = pd.DataFrame(rows, columns=['review', 'sentiment'])
df

Shuffling the dataset

import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('/tmp/movie_data.csv', index=False, encoding='utf-8')
df.head(3)
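
To confirm the shuffled data round-trips through the CSV, it can be read back into a throwaway frame (the name check below is just for illustration):

check = pd.read_csv('/tmp/movie_data.csv', encoding='utf-8')
check.shape  # expected: (50000, 2)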

Cleaning text with regex

import re

def preprocessor(text):
    # strip HTML tags (anything between < and >)
    text = re.sub(r'<[^>]*>', '', text)
    # capture emoticons: eyes (:, ; or =), optional nose (-), mouth ( ), ( or D or P );
    # try the pattern at https://regexr.com/
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # replace runs of non-word characters with a space, lowercase everything,
    # then re-append the emoticons with the nose character removed
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text
preprocessor('is seven.<br /><br />Title (Brazil): Not Available')
preprocessor('</a>This :) is :( a test :-)!')
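
For reference, the two calls above should return roughly 'is seven title brazil not available' and 'this is a test :) :( :)': HTML tags are dropped, punctuation collapses to spaces, and the emoticons are re-appended at the end with the nose character removed.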
df['review'] = df['review'].apply(preprocessor)
df.head(3)

Processing documents into tokens

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

nltk.download('stopwords')  # one-time download of the NLTK stop-word corpus

porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

stop = stopwords.words('english')
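
As a quick illustration of the two tokenizers (the sentence below is just a made-up example), the Porter stemmer reduces words to stems that are not always valid English words:

tokenizer('runners like running and thus they run')
# ['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

tokenizer_porter('runners like running and thus they run')
# ['runner', 'like', 'run', 'and', 'thu', 'they', 'run']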

Train logistic regression model

# position-based split: first 25,000 shuffled reviews for training, remaining 25,000 for testing
X_train = df.iloc[:25000]['review'].values
y_train = df.iloc[:25000]['sentiment'].values
X_test = df.iloc[25000:]['review'].values
y_test = df.iloc[25000:]['sentiment'].values
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,    # reviews were already lowercased by preprocessor()
                        preprocessor=None)  # cleaning was done above, outside the pipeline

param_grid = [{'vect__ngram_range': [(1, 1)],                     # 1 option
               'vect__stop_words': [stop, None],                  # 2 options
               'vect__tokenizer': [tokenizer, tokenizer_porter],  # 2 options
               'clf__penalty': ['l1', 'l2'],                      # 2 options
               'clf__C': [1.0, 10.0, 100.0]},                     # 3 options
              {'vect__ngram_range': [(1, 1)],                     # 1 option
               'vect__stop_words': [stop, None],                  # 2 options
               'vect__tokenizer': [tokenizer, tokenizer_porter],  # 2 options
               'vect__use_idf': [False],                          # 1 option: raw term frequencies
               'vect__norm': [None],                              # 1 option: no normalization
               'clf__penalty': ['l1', 'l2'],                      # 2 options
               'clf__C': [1.0, 10.0, 100.0]},                     # 3 options
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

Each sub-grid has 2 × 2 × 2 × 3 = 24 parameter combinations and each is fit on 5 CV folds, so there are (2 × 2 × 2 × 3 × 5) + (2 × 2 × 2 × 3 × 5) = 240 model fits. THIS TAKES SO LONG!!!!
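
Spelled out as a quick calculation:

n_fits = 2 * (2 * 2 * 2 * 3) * 5  # two sub-grids, 24 combinations each, 5 CV folds
print(n_fits)  # 240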

gs_lr_tfidf.fit(X_train, y_train)
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))
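
Finally, since the best estimator is a full pipeline (vectorizer plus classifier), it can be applied directly to raw review text. The two sentences below are made up for illustration; a reasonably trained model should label the first positive (1) and the second negative (0):

examples = ['This movie was a wonderful surprise, I loved it',
            'Probably the worst film I have ever seen']
print(clf.predict(examples))  # likely output: [1 0]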