/*******************************************************************************************************************
-- Title : [Py3.5] Filtering Words Using Stop Words - dBRang
-- Reference : dBRang
-- Key word : nlp nltk stopword stopwords stopwords.words word_tokenize natural language processing
              word token stop word stop words tokenizing
*******************************************************************************************************************/
■ Implementation on a DataFrame
# -*- coding: utf-8 -*-
import re
from pandas import Series, DataFrame
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.regexp import regexp_tokenize
import pandas as pd

# ------------------------------
# -- Set Dataframe Option
# ------------------------------
pd.set_option('display.height', 1000)       # removed in newer pandas; skip this line if it raises
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# ------------------------------
# -- Corpus
# ------------------------------

# --
# -- Sourcing Corpus
# --
raw_sent = """Hello Mr. Heyden 2488, How are you doing 'today'?
This Hyper-Computer is _great, and Python is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard.
"""
print(raw_sent)
print("... Raw_Corpus", "." * 100, "\n")

# ------------------------------
# -- Tokenizing
# ------------------------------

# --
# -- Tokenize to Sentence
# --
token_sent = re.split(r"[\n\.\?]", raw_sent)
token_sent = [temp for temp in token_sent if temp != '' and temp != ' ']    # drop empty entries
print(token_sent)
print(",,, Token_Sent", "," * 100, "\n")

# --
# -- List to Dataframe
# --
df_sent = DataFrame(token_sent)
df_sent.columns = ["title"]
df_sent.index.name = "idx"
print(df_sent)
print(",,, Sent_List to Dataframe", "," * 100, "\n")

# --
# -- Tokenize to Word
# --
df_sent["token_word"] = df_sent["title"].apply(lambda x: re.compile(r'[a-z]+[\-]?[a-z]+', re.I).findall(x))
print(df_sent["token_word"])
print(",,, Token_Word", "," * 100, "\n")

# ------------------------------
# -- Filtering using Stopwords
# ------------------------------

# --
# -- Add Stopwords
# --
stop_words = stopwords.words("english")

with open('StopWordList.txt', encoding='utf-8') as f:
    for i in f:
        stop_words.append(i.strip())

stop_words = set(stop_words)
print(stop_words)
print(";;; Stop_Words", ";" * 100, "\n")

# --
# -- Filter with Stopwords
# --
df_sent["stop_word"] = df_sent["token_word"].apply(lambda x: [t for t in x if t not in stop_words])
print(df_sent[["token_word", "stop_word"]])
print(";;; Filter by Stopwords", ";" * 100, "\n")
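This script (and the one in the next section) assumes the NLTK data files are already installed and that StopWordList.txt exists in the working directory. If that setup is missing, a one-time sketch like the following should get it in place; the resource names are the standard NLTK ones, and the file contents shown are only an illustration.

# One-time setup sketch: assumes an internet connection; 'stopwords' and 'punkt'
# are the NLTK resources needed by stopwords.words() and word_tokenize().
import nltk

nltk.download('stopwords')
nltk.download('punkt')

# StopWordList.txt is assumed to be a plain UTF-8 file with one stopword per line, e.g.:
#   mr
#   heyden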
■ Adding/Removing Stopwords and Applying Them
# -*- coding: utf-8 -*-
import re
from pandas import Series, DataFrame
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.regexp import regexp_tokenize

# ------------------------------
# -- Sentence Sourcing
# ------------------------------

# --
# -- Source Sentence
# --
raw_corpus = "This sentence for stop_word, hayden, king."
print(raw_corpus)
print("... Raw_Corpus", "." * 100, "\n")

# ------------------------------
# -- Add and Check Stopwords
# ------------------------------

# --
# -- Check the default stopword list
# --
stop_words = set(stopwords.words('english'))    # the set stays fixed within the session
stop_words2 = stopwords.words("english")        # convert later with stop_words2 = set(stop_words2) if needed
print(stop_words)
print(stop_words2)
print(type(stop_words))     # type : set
print(type(stop_words2))    # type : list
print("*** Default Stopword", "*" * 100, "\n")

# --
# -- Add stopwords
# --
stop_words.update(('hayden', 'king'))
stop_words2.append('hayden')
stop_words2.append('king')
print(stop_words)
print(stop_words2)
print("*** Add Stopword", "*" * 100, "\n")

# --
# -- Remove stopwords
# --
stop_words.remove('hayden')
stop_words2.remove('hayden')
print(stop_words)
print(stop_words2)
print("*** Remove Stopword", "*" * 100, "\n")

# ------------------------------
# -- Add Stopwords from a File (code by 최남우)
# ------------------------------
stop_words = stopwords.words("english")

with open('StopWordList.txt', encoding='utf-8') as f:
    for i in f:
        stop_words.append(i.strip())

stop_words = set(stop_words)
print(stop_words)
print("$$$ Add Stopword from File", "$" * 100, "\n")

# ------------------------------
# -- Apply Stopwords to Word Tokens
# ------------------------------
token_word = word_tokenize(raw_corpus)
print("Token_Word : ", token_word)

filter_word = [w for w in token_word if w not in stop_words]
print("Filter_Word : ", filter_word)
print("### Filtered Token_Word", "#" * 100, "\n")
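One caveat: NLTK's English stopword list is all lowercase, so a capitalized token such as "This" in the example above passes the membership test and is not removed. A minimal sketch (variable names are illustrative) that lowercases each token before filtering:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

raw_corpus = "This sentence for stop_word, hayden, king."
stop_words = set(stopwords.words('english'))

# Compare the lowercased token against the stopword set so "This" matches "this"
token_word = word_tokenize(raw_corpus)
filter_word = [w for w in token_word if w.lower() not in stop_words]
print(filter_word)    # 'This' and 'for' are dropped; the remaining tokens and punctuation stay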