/*******************************************************************************************************************
-- Title : [Py3.5] Add and Remove Stop Words w/ NLTK
-- Reference : pythonprogramming.net
-- Key word : nlp nltk stop word stop words stop_words word_tokenize word tokenize sent_tokenize
              sent tokenize natural language stop words word token sentence token update remove
*******************************************************************************************************************/
-- Python
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NOTE: the corpora used below must be downloaded once, e.g.:
#   import nltk; nltk.download('stopwords'); nltk.download('punkt')

# ------------------------------
# -- Sample sentence
# ------------------------------
example_sent = "This is a sample sentence. kaka, yoyo"

# ------------------------------
# -- Set up stop words
# ------------------------------
stop_words = set(stopwords.words('english'))
print(stop_words)
print("[1] " + "-" * 200)

# ------------------------------
# -- Word tokens
# ------------------------------
word_tokens = word_tokenize(example_sent)
#print(word_tokens)
filtered_sentence1 = [w for w in word_tokens if w not in stop_words]
print(filtered_sentence1)
# ['This', 'sample', 'sentence', '.', 'kaka', ',', 'yoyo']
print("[2] " + "-" * 200)

# ------------------------------
# -- Add stop words
# ------------------------------
stop_words.update(('kaka', 'yoyo'))
print(stop_words)
# 'kaka' and 'yoyo' now appear in the set
print("[3] " + "-" * 200)

# ------------------------------
# -- Word tokens (after adding)
# ------------------------------
word_tokens = word_tokenize(example_sent)
#print(word_tokens)
filtered_sentence2 = [w for w in word_tokens if w not in stop_words]
print(filtered_sentence2)
# ['This', 'sample', 'sentence', '.', ',']
print("[4] " + "-" * 200)

# ------------------------------
# -- Remove a stop word
# ------------------------------
stop_words.remove('yoyo')
print(stop_words)
# 'yoyo' has been removed
print("[5] " + "-" * 200)

# ------------------------------
# -- Word tokens (after removing)
# ------------------------------
word_tokens = word_tokenize(example_sent)
#print(word_tokens)
filtered_sentence3 = [w for w in word_tokens if w not in stop_words]
print(filtered_sentence3)
# ['This', 'sample', 'sentence', '.', ',', 'yoyo']
print("[6] " + "-" * 200)
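-- Note : The key words above also mention sent_tokenize, which the listing itself never calls. As a minimal sketch (not part of the original listing), the same sample text can first be split at the sentence level; the expected output comment assumes NLTK's default punkt sentence tokenizer.

from nltk.tokenize import sent_tokenize

# Split the sample text into sentences before any word-level filtering.
example_sent = "This is a sample sentence. kaka, yoyo"
print(sent_tokenize(example_sent))
# Expected (assuming the default punkt model): ['This is a sample sentence.', 'kaka, yoyo']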