반응형
/*******************************************************************************************************************
-- Title : [Py3.5] Classify SPAM and HAM using Naive Bayes w/ NLTK
-- Reference : clarkgrubb.com
-- Key word : naivebayes naive bayes word_token word tokenizer word tokenization word count nltk
word embedding word feature spam mail ham classification classifier word frequency 스팸 햄
나이브 베이즈 분류기 단어 토큰 토크나이저 토크나이징 단어 빈도수 단어 빈도 단어 임베딩
*******************************************************************************************************************/
■ Scripts
import nltk

# ---------------------------------------------------------------
# Source data: (message text, label) pairs used as the training set
# ---------------------------------------------------------------
data = [
    ('Hi John, I will see you at 9:00pm', 'ham'),
    ('Buy Viagra at low, low prices. One time offer.', 'spam'),
    ('Your Amazon Order is for the book "Harry Potter"', 'ham'),
    ('Earn a million dollars by working at home.', 'spam'),
]

print(type(data))
print(data)
print("... source", "." * 100, "\n")


# ---------------------------------------------------------------
# Tokenization and word-count features
# ---------------------------------------------------------------
def features(s):
    """Return a dict mapping each token of *s* to its occurrence count.

    Tokens are produced by ``nltk.tokenize.word_tokenize``.
    NOTE(review): word_tokenize presumably needs NLTK tokenizer data
    to be installed locally -- confirm for the target environment.
    """
    counts = {}
    for token in nltk.tokenize.word_tokenize(s):
        # First sighting starts at 1; later sightings increment.
        counts[token] = counts.get(token, 0) + 1
    return counts


# Pair every message's feature dict with its label.
processed_data = [(features(text), label) for text, label in data]

print(type(processed_data))
print(processed_data)
print(",,, word_featuring", "," * 100, "\n")

# ---------------------------------------------------------------
# Training and testing
# ---------------------------------------------------------------
# Training input shape: [({'John': 1, ...}, 'ham'), ...]
classifier = nltk.NaiveBayesClassifier.train(processed_data)

# Classify two unseen messages and print the predicted labels.
print(classifier.classify(features("This is good Viagra!")))
print(classifier.classify(features("We will meet you at 7pm. Take care")))
반응형