/*******************************************************************************************************************
-- Title : [Py3.5] Stop words w/ NLTK
-- Reference : pythonprogramming.net
-- Key word : nlp nltk stop word stop words stop_words word_tokenize word tokenize sent_tokenize
              sent tokenize natural language stopword word token sentence token word_token word token
*******************************************************************************************************************/
-- Python
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# -- Sample sentence
example_sent = "This is a sample sentence, showing off the stop words filtration."
print(example_sent)
print("[Example Sentence]", "-" * 200, "\n")

# -- Load the English stop word list
stop_words = set(stopwords.words('english'))
print(stop_words)
print("[Default Stopword]", "-" * 200, "\n")

# -- Word tokenization
word_tokens = word_tokenize(example_sent)
print(word_tokens)
print("[Word Tokenize]", "-" * 200, "\n")

# -- Remove stop words, method 1: list comprehension
filtered_sentence1 = [w for w in word_tokens if w not in stop_words]
print(filtered_sentence1)
print("[Stopword-1]", "-" * 200, "\n")

# -- Remove stop words, method 2: explicit loop
filtered_sentence2 = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence2.append(w)
print(filtered_sentence2)
print("[Stopword-2]", "-" * 200, "\n")
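# -- Optional extra step (not in the original snippet): stopwords.words('english')
# -- is all lowercase, so capitalized tokens such as "This" survive the filters
# -- above; lowercasing each token before the membership test avoids that.
filtered_lower = [w for w in word_tokens if w.lower() not in stop_words]
print(filtered_lower)
print("[Stopword-Lowercased]", "-" * 200, "\n")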
# -- Check which languages stop words are provided for
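# -- One possible way to do the check described above (a sketch, not the
# -- original code): stopwords.fileids() returns the list of languages for
# -- which NLTK ships stop word lists.
print(stopwords.fileids())
print("[Stopword Languages]", "-" * 200, "\n")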