/*******************************************************************************************************************
-- Title : [Py3.5] Filtering Words Using Stop Words - dBRang
-- Reference : dBRang
-- Key word : nlp nltk stopword stopwords stopwords.words word_tokenize natural language processing
              word token stop word stop words tokenizing
*******************************************************************************************************************/
■ Implementation on a DataFrame
# -*- coding: utf-8 -*-
import re
from pandas import Series, DataFrame
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.regexp import regexp_tokenize
import pandas as pd

# ------------------------------
# -- Set Dataframe Option
# ------------------------------
pd.set_option('display.height', 1000)       # removed in newer pandas; skip this line if it raises
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# ------------------------------
# -- Corpus
# ------------------------------

# --
# -- Sourcing Corpus
# --
raw_sent = """Hello Mr. Heyden 2488, How are you doing 'today'?
This Hyper-Computer is _great, and Python is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard.
"""
print(raw_sent)
print("... Raw_Corpus", "." * 100, "\n")

# ------------------------------
# -- Tokenizing
# ------------------------------

# --
# -- Tokenize to Sentence
# --
token_sent = re.split(r"[\n\.\?]", raw_sent)
token_sent = [temp for temp in token_sent if temp != '' and temp != ' ']    # drop empty entries
print(token_sent)
print(",,, Token_Sent", "," * 100, "\n")

# --
# -- List to Dataframe
# --
df_sent = DataFrame(token_sent)
df_sent.columns = ["title"]
df_sent.index.name = "idx"
print(df_sent)
print(",,, Sent_List to Dataframe", "," * 100, "\n")

# --
# -- Tokenize to Word
# --
df_sent["token_word"] = df_sent["title"].apply(lambda x: re.compile(r'[a-z]+[\-]?[a-z]+', re.I).findall(x))
print(df_sent["token_word"])
print(",,, Token_Word", "," * 100, "\n")

# ------------------------------
# -- Filtering using Stopwords
# ------------------------------

# --
# -- Add Stopwords
# --
stop_words = stopwords.words("english")

with open('StopWordList.txt', encoding='utf-8') as f:
    for i in f:
        stop_words.append(i.strip())

stop_words = set(stop_words)
print(stop_words)
print(";;; Stop_Words", ";" * 100, "\n")

# --
# -- Filter with Stopwords
# --
df_sent["stop_word"] = df_sent["token_word"].apply(lambda x: [t for t in x if t not in stop_words])
print(df_sent[["token_word", "stop_word"]])
print(";;; Filter by Stopwords", ";" * 100, "\n")
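This script (and the one in the next section) assumes the NLTK data files are already installed and that StopWordList.txt exists in the working directory. If that setup is missing, a one-time sketch like the following should get it in place; the resource names are the standard NLTK ones, and the file contents shown are only an illustration.

# One-time setup sketch: assumes an internet connection; 'stopwords' and 'punkt'
# are the NLTK resources needed by stopwords.words() and word_tokenize().
import nltk

nltk.download('stopwords')
nltk.download('punkt')

# StopWordList.txt is assumed to be a plain UTF-8 file with one stopword per line, e.g.:
#   mr
#   heyden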
■ Adding/Removing Stopwords and Applying Them
# -*- coding: utf-8 -*-
import re
from pandas import Series, DataFrame
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.regexp import regexp_tokenize

# ------------------------------
# -- Sentence Sourcing
# ------------------------------

# --
# -- Source Sentence
# --
raw_corpus = "This sentence for stop_word, hayden, king."
print(raw_corpus)
print("... Raw_Corpus", "." * 100, "\n")

# ------------------------------
# -- Add and Check Stopwords
# ------------------------------

# --
# -- Check the default stopword list
# --
stop_words = set(stopwords.words('english'))    # the set stays fixed within the session
stop_words2 = stopwords.words("english")        # convert later with stop_words2 = set(stop_words2) if needed
print(stop_words)
print(stop_words2)
print(type(stop_words))     # type : set
print(type(stop_words2))    # type : list
print("*** Default Stopword", "*" * 100, "\n")

# --
# -- Add stopwords
# --
stop_words.update(('hayden', 'king'))
stop_words2.append('hayden')
stop_words2.append('king')
print(stop_words)
print(stop_words2)
print("*** Add Stopword", "*" * 100, "\n")

# --
# -- Remove stopwords
# --
stop_words.remove('hayden')
stop_words2.remove('hayden')
print(stop_words)
print(stop_words2)
print("*** Remove Stopword", "*" * 100, "\n")

# ------------------------------
# -- Add Stopwords from a File (code by 최남우)
# ------------------------------
stop_words = stopwords.words("english")

with open('StopWordList.txt', encoding='utf-8') as f:
    for i in f:
        stop_words.append(i.strip())

stop_words = set(stop_words)
print(stop_words)
print("$$$ Add Stopword from File", "$" * 100, "\n")

# ------------------------------
# -- Apply Stopwords to Word Tokens
# ------------------------------
token_word = word_tokenize(raw_corpus)
print("Token_Word : ", token_word)

filter_word = [w for w in token_word if w not in stop_words]
print("Filter_Word : ", filter_word)
print("### Filtered Token_Word", "#" * 100, "\n")
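One caveat: NLTK's English stopword list is all lowercase, so a capitalized token such as "This" in the example above passes the membership test and is not removed. A minimal sketch (variable names are illustrative) that lowercases each token before filtering:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

raw_corpus = "This sentence for stop_word, hayden, king."
stop_words = set(stopwords.words('english'))

# Compare the lowercased token against the stopword set so "This" matches "this"
token_word = word_tokenize(raw_corpus)
filter_word = [w for w in token_word if w.lower() not in stop_words]
print(filter_word)    # 'This' and 'for' are dropped; the remaining tokens and punctuation stay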