[Py3.5] Count Vector & Term-Document Matrix w/ SKLearn - ver.dBRang

2017. 6. 6. 16:33

/*******************************************************************************************************************
-- Title : [Py3.5] Count Vector & Term-Document Matrix w/ SKLearn - ver.dBRang
-- Key word : word count vectorize countvector countvectorize term-document matrix term document matrix
tdm stopwords tfidf tf-idf fit_transform scikit learn sklearn scikit-learn word count vector
word embedding 워드 임베딩 단어 임베딩 word vectors 워드 벡터 단어 벡터 word vector
word features word feature
*******************************************************************************************************************/

■ Scripts

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128

# -*- coding: utf-8 -*-
 
import re
import pandas as pd
from pandas import Series, DataFrame
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.regexp import regexp_tokenize
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
 
# TF-IDF implementation with scikit-learn
# Ref : dbrang.tistory.com/1189
 
# ------------------------------
# -- Set Dataframe Option
# ------------------------------
# Widen pandas' console output so the sample corpus and the term-document
# matrix print without row/column truncation.
# NOTE: the former 'display.height' option is not registered by current
# pandas releases and setting it raises OptionError, so it is not set here;
# 'display.max_rows' already controls how many rows are shown.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
 
 
# ------------------------------
# -- Create Dataframe with Sample
# ------------------------------
# Sample corpus: eight short titles keyed by a running integer.
# Rows 0 and 1 carry the same title text.
dict_sample = dict(
    aKey=list(range(8)),
    bTitle=[
        'Computer and computer system having the bond former function of power-off with the extension system.',
        'Computer and computer system having the bond former function of power-off with the extension system.',
        ('Beautiful computer system having a function of auto power off before connecting with an expansion '
         'system and having push button control thereof.'),
        'The car which is using a electric energy.',
        'DRIVE WHEEL CONVERTING DEVICE OF A CAR.',
        'ALL Computer having a function of auto power off before connecting with an expansion system.',
        'This Computer having a function of auto power off before connecting with an expansion system.',
        'Computer having a function of auto power off before connecting with an expansion system.',
    ],
)

# Build the frame and label its index "id" in one step.
df_corpus = DataFrame(dict_sample).rename_axis("id")

# -- Normalise every title to lower case before tokenising.
df_corpus["bTitle"] = df_corpus.bTitle.str.lower()

print(df_corpus)
print("... corpus", "." * 100, "\n")
 
 
# ------------------------------
# -- Extract words(feature)
# ------------------------------

# --
# -- Declare CountVectorizer()
# --
# Uses the default word-level analyzer.
# By default every document is converted to lower case before tokenising.
# In CountVectorizer(min_df=1), min_df is the minimum frequency threshold
# (only terms at or above it are kept).

# Default vectorizer; a custom tokenizer could be passed here instead.
countvec = CountVectorizer()

print(countvec)
print(",,, allocate_CountVectorizer", "," * 100, "\n")
# Example repr printed by the original run:
#   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
#           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
#           lowercase=True, max_df=1.0, max_features=None, min_df=1,
#           ngram_range=(1, 1), preprocessor=None, stop_words=None,
#           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
#           tokenizer=None, vocabulary=None)

# --
# -- fit_transform()
# --

# Learn the vocabulary and count term occurrences per document.
arr_fittrans = countvec.fit_transform(df_corpus.bTitle)

print(arr_fittrans)
print(",,, countvec.fit_transform", "," * 100, "\n")


# Vocabulary terms, in the column order of arr_fittrans.
# get_feature_names() is gone from recent scikit-learn releases; prefer
# get_feature_names_out() and fall back to the old name on older installs.
# .tolist() keeps word_token a plain list of str, as it was before.
if hasattr(countvec, "get_feature_names_out"):
    word_token = countvec.get_feature_names_out().tolist()
else:
    word_token = countvec.get_feature_names()

print(word_token)
print(",,, word_token(get_feature_names)", "," * 100, "\n")
 
 
# ------------------------------
# -- Remove Stopwords
# ------------------------------
# Ref : dbrang.tistory.com/1205

# --
# -- Add User Stopwords
# --
# Start from NLTK's English list, then add one entry per line of the
# user-maintained file (surrounding whitespace stripped).
stop_words = stopwords.words("english")
with open('StopWordList.txt', encoding='utf-8') as stop_file:
    stop_words.extend(line.strip() for line in stop_file)

stop_words.append('beautiful')  # test entry
stop_words = set(stop_words)    # set: de-duplicated, fast membership tests

print("stop_words:", stop_words)
print(";;; list stop_word", ";" * 100, "\n")

# --
# -- Remove Stopwords in Word_Token
# --
# Keep only the vocabulary terms that are not stop words, preserving order.
word_token2 = list(filter(lambda term: term not in stop_words, word_token))

print("word_token :", word_token)
print("word_token2:", word_token2)
print("^^^ word_token2_remove_stopword", "^" * 100, "\n")
 
 
# ------------------------------
# -- Term-Document Matrix
# ------------------------------
# One row per document, one column per vocabulary term, cells hold counts.
dense_counts = arr_fittrans.toarray()
term_doc_matrix = pd.DataFrame(dense_counts, columns=word_token)
print(term_doc_matrix)

## The stop-word columns still need to be removed from the array...
## It looks possible by passing a custom tokenizer to CountVectorizer()...!!!
Colored by Color Scripter

cs

■ Files

StopWordList.txt

다운로드

저작자표시 비영리 변경금지

디비랑[dɪ'bɪraŋ]

[Py3.5] Count Vector & Term-Document Matrix w/ SKLearn - ver.dBRang

+ Recent posts

티스토리툴바