Tokenization
Split the raw text into tokens; optionally reduce each token to its root form with NLTK's Porter stemmer.

from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    # Plain whitespace tokenization
    return text.split()

def tokenizer_porter(text):
    # Tokenize, then stem each token with the Porter algorithm
    return [porter.stem(word) for word in text.split()]
tokenizer('runners like running and thus they run')
['runners', 'like', 'running', 'and', 'thus', 'they', 'run']
tokenizer_porter('runners like running and thus they run')
['runner', 'like', 'run', 'and', 'thu', 'they', 'run']
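Note that stemming can produce tokens that are not real words ('thus' becomes 'thu'). If valid dictionary words are needed, lemmatization is a slower alternative. Below is a minimal sketch using NLTK's WordNetLemmatizer; the function name tokenizer_lemma and the pos='v' choice are illustrative assumptions, not part of the original notes.

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # one-time corpus download required by the lemmatizer
lemmatizer = WordNetLemmatizer()

def tokenizer_lemma(text):
    # Lemmatize each token as a verb; unlike stemming, the output stays a valid word
    return [lemmatizer.lemmatize(word, pos='v') for word in text.split()]

tokenizer_lemma('runners like running and thus they run')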
Stop words
Stop words are very frequent words ('a', 'and', 'is', ...) that carry little information for distinguishing documents, so they are commonly removed after tokenization.

import nltk
nltk.download('stopwords')  # one-time download of the NLTK stop-word lists
[nltk_data] Downloading package stopwords to
[nltk_data] /Users/othrif/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
True
from nltk.corpus import stopwords

stop = stopwords.words('english')  # list of English stop words

# Stem the sentence, keep the last 10 tokens, and drop any stop words
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:]
 if w not in stop]
['runner', 'like', 'run', 'run', 'lot']
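These pieces are typically combined when building a bag-of-words or TF-IDF representation. A minimal sketch, assuming scikit-learn is available; the example documents and parameter choices are illustrative, not part of the original notes.

from sklearn.feature_extraction.text import TfidfVectorizer

# Plug the stemming tokenizer and the NLTK stop list into a TF-IDF vectorizer
tfidf = TfidfVectorizer(lowercase=True,
                        tokenizer=tokenizer_porter,
                        stop_words=stop)

docs = ['runners like running and thus they run',
        'a runner likes running and runs a lot']
X = tfidf.fit_transform(docs)
print(X.shape)  # (2, n_features), where n_features is the vocabulary size after filtering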