[Py3.5] Count Vector & Term-Document Matrix w/ NLTK- ver.dBRang

2017. 5. 17. 18:48

/*******************************************************************************************************************
-- Title : [Py3.5] Count Vector & Term-Document Matrix w/ NLTK- ver.dBRang
-- Key word : nlp nltk count vector list-in-list dataframe sent_tokenize word_tokenize token tokenizing
tokenization mop pos tagging pos tag lemma lemmatize lemmatizing lemmatization
stopword stopwords 자연어 처리 자연어처리 워드 벡터 데이터프레임 데이터 프레임 토큰
토크나이저 품사 태깅 불용어 리스트 term-document matrix termdocumentmatrix tdm
word embedding 워드 임베딩 단어 임베딩 word vectors word vector 워드 벡터 단어 벡터
word features word feature
*******************************************************************************************************************/

■ Scripts

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

# -*- coding: utf-8 -*-
 
import re
import pandas as pd
from pandas import Series, DataFrame
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.regexp import regexp_tokenize
from nltk.tag import pos_tag
 
 
# ------------------------------
# -- Set Dataframe Option
# ------------------------------
# Widen pandas' console output so the wide list/tuple columns printed further
# down are shown in full instead of being truncated.
# NOTE: the legacy 'display.height' option is deliberately not set. It was
# deprecated in favour of 'display.max_rows' and no longer exists in current
# pandas releases, where trying to set it raises OptionError and aborts the
# script before any work is done.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
 
 
# ------------------------------
# -- Create Dataframe with Sample
# ------------------------------
# Eight sample titles (column 'bTitle') keyed by a running integer ('aKey').
dict_sample = {
    'aKey': list(range(8)),
    'bTitle': [
        'Computer and computer system having the bond former function of power-off with the extension system.',
        'Computer and computer system having the bond former function of power-off with the extension system.',
        ('Beautiful computer system having a function of auto power off before connecting with an expansion '
         'system and having push button control thereof.'),
        'The car which is using a electric energy.',
        'DRIVE WHEEL CONVERTING DEVICE OF A CAR.',
        'ALL Computer having a function of auto power off before connecting with an expansion system.',
        'This Computer having a function of auto power off before connecting with an expansion system.',
        'Computer having a function of auto power off before connecting with an expansion system.',
    ],
}

# Build the frame and label its index "id".
df_corpus = pd.DataFrame(dict_sample).rename_axis("id")

# -- Lowercase every title so later matching is case-insensitive
df_corpus["bTitle"] = df_corpus["bTitle"].map(str.lower)

print(df_corpus)
print("... corpus", "." * 100, "\n")
 
 
# ------------------------------
# -- Word Tokenizing
# ------------------------------
# Ref : dbrang.tistory.com/1183
# A token is a run of letters that may contain one interior hyphen
# (e.g. "power-off"). The pattern needs at least two letters, so one-letter
# words such as "a" are dropped.
# The pattern is written as a raw string because "\-" is not a valid escape
# in a normal string literal (it triggers an invalid-escape warning), and it
# is compiled once here instead of inside the per-row callback.
re_word = re.compile(r'[a-z]+[\-]?[a-z]+', re.I)
df_corpus["cWord"] = df_corpus["bTitle"].apply(re_word.findall)

print(df_corpus)
print(",,, word_token", "," * 100, "\n")
 
 
# ------------------------------
# -- POS Tagging
# ------------------------------
# Run NLTK's pos_tag over each row's token list; every row of "dPostag"
# holds whatever pos_tag returns for that list (unpacked later as
# (word, tag) pairs).
df_corpus["dPostag"] = df_corpus["cWord"].map(pos_tag)

lst_col_postag = ["aKey", "cWord", "dPostag"]
print(df_corpus[lst_col_postag])
print(";;; pos_tagging", ";" * 100, "\n")
 
 
# ------------------------------
# -- Lemmatizing
# ------------------------------
# Ref : dbrang.tistory.com/1189

# --
# -- Select which tokens to process
# --
# Only tokens whose tag starts with one of these letters are kept.
# NOTE(review): presumably the noun / adjective / adverb tag families of the
# tagger's tag set -- confirm; any other tag starting with N, J or R passes too.
lst_pos = ['N', 'J', 'R']


def fn_j2a(tag):
    """Return 'a' for the tag letter 'j'; return any other letter unchanged."""
    return 'a' if tag == 'j' else tag


def _fn_select_pos(lst_tagged):
    """Keep (word, tag) pairs whose tag initial is in lst_pos.

    The tag is reduced to its lower-cased first letter and passed
    through fn_j2a.
    """
    lst_kept = []
    for word, pos in lst_tagged:
        if pos[0] in lst_pos:
            lst_kept.append((word, fn_j2a(pos[0].lower())))
    return lst_kept


df_corpus["ePostag2"] = df_corpus["dPostag"].apply(_fn_select_pos)

print(df_corpus[["aKey", "dPostag", "ePostag2"]])
print("!!! pos_tag", "!" * 100, "\n")
 
# --
# -- Lemmatize
# --
lemmatizer = WordNetLemmatizer()


def _fn_lemmatize(lst_pairs):
    """Lemmatize each (word, pos) pair, keeping pos alongside the lemma."""
    lst_lemma = []
    for word, pos in lst_pairs:
        lst_lemma.append((lemmatizer.lemmatize(word, pos), pos))
    return lst_lemma


df_corpus["fLemma"] = df_corpus["ePostag2"].apply(_fn_lemmatize)

lst_col_lemma = ["aKey", "dPostag", "ePostag2", "fLemma"]
print(df_corpus[lst_col_lemma])
print("!!! Lemmatizing", "!" * 100, "\n")
 
 
# ------------------------------
# -- Remove Stopword
# ------------------------------
# Ref : dbrang.tistory.com/1184

# --
# -- Add Stopwords
# --
# Start from NLTK's English stop-word list, then add every (stripped) line
# of the local StopWordList.txt file.
stop_words = set(stopwords.words("english"))
with open('StopWordList.txt', encoding='utf-8') as f:
    stop_words.update(line.strip() for line in f)

stop_words.add('beautiful')   # for testing

print("stop_words:", stop_words)
print("/// list stop_word", "/" * 100, "\n")
 
# --
# -- Filter with Stopwords
# --
# Drop the POS letter from each (lemma, pos) pair and keep only the lemmas
# that are not in stop_words.
df_corpus["gStopword"] = df_corpus["fLemma"].apply(
    lambda lst_pairs: [lemma for lemma, _ in lst_pairs if lemma not in stop_words])

lst_col_stopword = ["aKey", "fLemma", "gStopword"]
print(df_corpus[lst_col_stopword])
print("/// stop_word", "/" * 100, "\n")
 
 
# ------------------------------
# -- Word Extraction and Duplication in List-in-List
# ------------------------------

# --
# -- Remove duplicate words in List-in-List
# --

# -- dataframe to list (one list of words per row)
lst_word = df_corpus["gStopword"].tolist()
print ("리스트 변환:", lst_word)

# -- word de-duplication in list-in-list
# Collect each distinct word once, in first-seen order. The loop variable is
# named `word` (not `list`) so the builtin `list` is not overwritten for the
# rest of the script, and a set is used for the membership test so the pass
# does not rescan the growing result list for every word.
lst_nword = []
set_seen = set()

for sublist in lst_word:
    for word in sublist:
        if word not in set_seen:
            set_seen.add(word)
            lst_nword.append(word)

print ("중복제거 리스트:",lst_nword)
print("^^^ word dup in list-in-list", "^" * 100, "\n")
 
 
# ------------------------------
# -- Count Vector
# ------------------------------

# --
# -- Create CountVector Dataframe
# --
# Empty frame with one column per distinct word; rows are added later.
df_Countvec = pd.DataFrame(columns=lst_nword)
df_Countvec.index.name = "id"

cnt_col_df_Countvec = len(df_Countvec.columns)  # number of df columns

print("df_Countvec 컬럼개수:", cnt_col_df_Countvec)
print(df_Countvec.columns)
print("^^^ col name and col count of df_Countvec ", "^" * 100, "\n")
 
# --
# -- Word Embedding or Term-Document Matrix(tdm)
# --
# For every corpus row, count how often each vocabulary word (lst_nword)
# occurs in that row's filtered word list and store the counts as one row
# of df_Countvec under the same index label.
for idx1, row in df_corpus.iterrows():
    lst_row = row["gStopword"]
    print("[", idx1, "]", lst_row)

    # One count per vocabulary word, in vocabulary order.
    lst_vector = [lst_row.count(val) for val in lst_nword]

    for idx2, (val, cnt) in enumerate(zip(lst_nword, lst_vector)):
        print("[", idx1, "]", ":", idx2, ":", val, ":", cnt)

    print("[", idx1, "]", lst_vector, "\n")

    # -- append the row to df_Countvec
    df_Countvec.loc[idx1] = lst_vector

print(df_Countvec)
print("^^^ result of df_Countvec ", "^" * 100, "\n")
 
 
# ------------------------------
# -- Merge Dataframe
# ------------------------------
# Left-join the count vectors onto the key / word-list columns by index.
df_merged = df_corpus[["aKey", "gStopword"]].merge(
    df_Countvec, how="left", left_index=True, right_index=True)

print(df_merged)
print("*** Merged Dataframe ", "*" * 100, "\n")
 

 

■ Files

StopWordList.txt

다운로드

저작자표시 비영리 변경금지

디비랑[dɪ'bɪraŋ]

[Py3.5] Count Vector & Term-Document Matrix w/ NLTK- ver.dBRang

+ Recent posts

티스토리툴바