Processing documents into tokens (tokenization and stop words)

Tokenization

A simple way to tokenize a document is to split it on whitespace. A second, more aggressive strategy is word stemming, which reduces each word to its root form; here we use the Porter stemmer from NLTK.

from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    # Plain whitespace tokenization
    return text.split()


def tokenizer_porter(text):
    # Whitespace tokenization followed by Porter stemming
    return [porter.stem(word) for word in text.split()]

tokenizer('runners like running and thus they run')
['runner', 'like', 'run', 'and', 'thus', 'they', 'run']

tokenizer_porter('runners like running and thus they run')
['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

Note that stemming can produce tokens that are not real words, such as 'thu' from 'thus'.
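
Whitespace splitting also leaves punctuation attached to tokens ('run!' would not stem to 'run'). A minimal cleanup sketch, assuming we lowercase the text and strip punctuation with the standard-library re module before stemming; the name clean_tokenizer_porter is hypothetical:

import re

def clean_tokenizer_porter(text):
    # Hypothetical helper: lowercase, drop punctuation, then stem
    text = re.sub(r'[^\w\s]', '', text.lower())
    return [porter.stem(word) for word in text.split()]

clean_tokenizer_porter('Runners like running, and thus they run!')
['runner', 'like', 'run', 'and', 'thu', 'they', 'run']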

Stop words

Stop words are words that occur so frequently across texts ('a', 'and', 'is', ...) that they typically carry little useful information; removing them is a common preprocessing step. NLTK's stop-word corpus must be downloaded once before use.

import nltk

# Download the stop-word corpus (only needed once)
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/othrif/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
True
from nltk.corpus import stopwords

# English stop-word list shipped with NLTK
stop = stopwords.words('english')

# Stem the sentence, keep (at most) the last 10 tokens, and drop stop words
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:]
 if w not in stop]
['runner', 'like', 'run', 'run', 'lot']
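
Putting the two steps together, a minimal sketch of a combined preprocessing function; the name preprocess is hypothetical:

def preprocess(text):
    # Hypothetical helper: Porter-stem the text, then drop stop words
    return [w for w in tokenizer_porter(text) if w not in stop]

preprocess('a runner likes running and runs a lot')
['runner', 'like', 'run', 'run', 'lot']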