반응형

/*******************************************************************************************************************
-- Title : [Py3.5] Stop words w/ NLTK
-- Reference : pythonprogramming.net
-- Key word : nlp nltk stop word stop words stop_words word_tokenize word tokenize sent_tokenize
                  sent tokenize 자연어 불용어 단어 토큰 문장 토큰 word_token word token
*******************************************************************************************************************/

-- Python

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
def _print_footer(label, width=200):
    """Print a section footer: *label*, a rule of *width* dashes, then a blank line.

    The pieces are joined explicitly. Relying on adjacent string literals
    around ``* 200`` is a SyntaxError (a literal cannot directly follow an
    integer), and implicit concatenation would repeat the label as well.
    """
    print(label + "-" * width + "\n")


# -- Sample sentence
example_sent = "This is a sample sentence, showing off the stop words filtration."
print(example_sent)

_print_footer("[Example Sentence]")

# -- Load the stop words into a set so membership tests are O(1)
stop_words = set(stopwords.words('english'))
print(stop_words)

_print_footer("[Default Stopword]")

# -- Split the sentence into word tokens
word_tokens = word_tokenize(example_sent)
print(word_tokens)

_print_footer("[Word Tokenize]")

# -- Stop word removal, method 1: list comprehension
# NOTE: the membership test is case-sensitive, so a token is dropped only
# when it matches an entry of stop_words exactly.
filtered_sentence1 = [w for w in word_tokens if w not in stop_words]
print(filtered_sentence1)

_print_footer("[Stopword-1]")

# -- Stop word removal, method 2: explicit loop (same result as method 1)
filtered_sentence2 = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence2.append(w)

print(filtered_sentence2)

_print_footer("[Stopword-2]")

# -- List the file ids of the stopwords corpus
# (presumably one per supported language -- confirm against the NLTK docs)
print(stopwords.fileids())
cs

반응형

+ Recent posts