/*******************************************************************************************************************
-- Title : [Py3.5] TF-IDF including Stopwords w/ SKLearn
-- Reference : stackoverflow.com/questions/36369870
-- Key word : tf-idf tfidf stopword stop word stopwords stop words scikit-learn scikit learn sklearn
                  tfidfvectorizer tfidf vectorizer 사이킷런 불용어 
*******************************************************************************************************************/

■ Scripts

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
 
# =======================================
# -- Frozen Stopwords
# =======================================
# Show the built-in English stop-word collection shipped with scikit-learn.
# NOTE(review): the `stop_words` module imported at the top of this file only
# exists in older scikit-learn releases; newer releases expose the same
# collection as `sklearn.feature_extraction.text.ENGLISH_STOP_WORDS` -- confirm
# against the installed version.
print(stop_words.ENGLISH_STOP_WORDS)
print("... corpus", "." * 100, "\n")

# =======================================
# -- Add stopword in Tfidf vectorizer
# =======================================
# Five short documents; each ends with a filler token (aaa / bbb / ccc) that
# is treated as a custom stop word further down.
corpus = ['Saya benci awak aaa',
          'Saya cinta awak aaa',
          'Saya x happy awak bbb',
          'Saya geram awak bbb',
          'Saya taubat awak ccc']

# --
# -- Add custom stopwords
# --
# Named `custom_stop_words` so the list does not rebind (shadow) the imported
# `stop_words` module used above.
custom_stop_words = ["aaa", "bbb", "ccc"]

# --
# -- Get TFIDF without stop-word filtering (filler tokens stay in the vocabulary)
# --
vectorizer = TfidfVectorizer(analyzer='word')
X = vectorizer.fit_transform(corpus)
idf = vectorizer.idf_

# Map every vocabulary term to its IDF weight.
# NOTE(review): recent scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
print(dict(zip(vectorizer.get_feature_names(), idf)))
print(",,, tfidf without Stopwords", "," * 100, "\n")

# --
# -- Get TFIDF with stop-word filtering (custom stop words are dropped)
# --
vectorizer2 = TfidfVectorizer(analyzer='word', stop_words=custom_stop_words)
X2 = vectorizer2.fit_transform(corpus)
idf2 = vectorizer2.idf_

# Same term -> IDF mapping, now without the custom stop words.
print(dict(zip(vectorizer2.get_feature_names(), idf2)))
print("::: tfidf with Stopwords", ":" * 100, "\n")
cs



저작자 표시 비영리 변경 금지
신고

+ Recent posts

티스토리 툴바