반응형
/*******************************************************************************************************************
-- Title : [Py3.5] TF-IDF including Stopwords w/ SKLearn
-- Reference : stackoverflow.com/questions/36369870
-- Key word : tf-idf tfidf stopword stop word stopwords stop words scikit-learn scikit learn sklearn
tfidfvectorizer tfidf vectorizer 사이킷런 불용어
*******************************************************************************************************************/
■ Scripts
"""Compare TF-IDF IDF weights with and without a custom stop-word list.

Fits two ``TfidfVectorizer`` instances on the same small corpus -- one with
no stop-word list and one that filters a custom list -- and prints the
term -> IDF mapping learned by each.
"""
# ENGLISH_STOP_WORDS is imported from ``sklearn.feature_extraction.text``:
# the old ``sklearn.feature_extraction.stop_words`` module was deprecated in
# scikit-learn 0.22 and removed in 0.24, while this import path works on both
# old and new releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer


def _feature_names(vectorizer):
    """Return the fitted vocabulary terms of *vectorizer* in column order.

    ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of ``get_feature_names_out``; prefer the new method and
    fall back to the old one on installations that predate it.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# =======================================
# -- Frozen Stopwords
# =======================================
# Built-in English stop-word set shipped with scikit-learn (a frozenset).
print(ENGLISH_STOP_WORDS)
print("... corpus", "." * 100, "\n")

# =======================================
# -- Add stopword in Tfidf vectorizer
# =======================================
corpus = [
    'Saya benci awak aaa',
    'Saya cinta awak aaa',
    'Saya x happy awak bbb',
    'Saya geram awak bbb',
    'Saya taubat awak ccc',
]

# --
# -- Add custom stopwords
# --
# Named ``custom_stop_words`` so the list does not shadow any imported name
# (the original rebound ``stop_words``, hiding the module it had imported).
custom_stop_words = ["aaa", "bbb", "ccc"]

# --
# -- Get TFIDF without Stopwords
# --
# No stop-word list is passed, so "aaa"/"bbb"/"ccc" stay in the vocabulary.
vectorizer = TfidfVectorizer(analyzer='word')
X = vectorizer.fit_transform(corpus)
idf = vectorizer.idf_
print(dict(zip(_feature_names(vectorizer), idf)))
print(",,, tfidf without Stopwords", "," * 100, "\n")

# --
# -- Get TFIDF with Stopwords
# --
# The custom list is passed as ``stop_words`` so those tokens are filtered out.
vectorizer2 = TfidfVectorizer(analyzer='word', stop_words=custom_stop_words)
X2 = vectorizer2.fit_transform(corpus)
idf2 = vectorizer2.idf_
print(dict(zip(_feature_names(vectorizer2), idf2)))
print("::: tfidf with Stopwords", ":" * 100, "\n")
반응형