[R3.3] 비정형텍스트 - 형태소 분석(NLP) 및 워드 클라우드

2016. 12. 9. 00:18
/*********************************************************************************************************
-- Title : [R3.3] 비정형텍스트 - 형태소 분석(NLP) 및 워드 클라우드
-- Reference : hrd-net
-- Key word : R nlp konlp corpus vectorsource sejongdic 세종 사전 단어 구름 wordcloud 자연어처리 자연어
처리 형태소 분석 형태소분석 워드 클라우드 word cloud
*********************************************************************************************************/
-- Chart
-- R
-- (R listing follows; the line-number gutter copied from the web page's code viewer has been removed)
# ********************************************
# -- 비정형 데이터 처리 패키지 설치
# ********************************************
 
# -- 분석 절차
#    1단계 : 토픽분석(단어의 빈도수)
#    2단계 : 연관어 분석(관련 단어 분석) 
#    3단계 : 감성 분석(단어의 긍정/부정 분석) 
 
# -- 패키지 설치 및 로딩
#    1) java install : http://www.oracle.com
#       -> java 프로그램 설치(64비트 환경 - R(64bit) - java(64bit))
 
# -- Package setup
#    rJava lets R call into Java; the script loads it before KoNLP.
#    JAVA_HOME is set before any Java-dependent package is loaded.
#    Adjust the path below to the local 64-bit JRE installation.
Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_112')

# -- Install only the packages that are not yet installed, so re-running
#    the script does not download and reinstall everything each time.
#    installed.packages() is used (rather than requireNamespace()) so that
#    no package is loaded as a side effect of the check.
required_pkgs <- c("rJava", "KoNLP", "tm", "wordcloud")
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# -- Load the packages
library(rJava)                                                  # R <-> Java bridge
library(KoNLP)                                                  # Korean dictionaries / noun extraction
library(tm)                                                     # text pre-processing
library(wordcloud)                                              # word cloud visualisation
 
 
# ********************************************
# -- 단계1 - 토픽분석(텍스트 마이닝) 
#    시각화 : 단어 빈도수에 따른 워드 클라우드
# ********************************************
 
#
# -- 1. Load the text data (facebook_bigdata.txt)
#
# The connection declares the input file as UTF-8.
facebook <- file("C:\\RProject\\Rwork\\Part-II\\facebook_bigdata.txt", encoding="UTF-8")
facebook_data <- readLines(facebook)                            # one element per line
# readLines() opens and closes the stream, but the connection object itself
# stays registered until it is closed explicitly; release it here.
close(facebook)
head(facebook_data)                                             # first 6 lines
str(facebook_data)                                              # original post reported: chr [1:76]
 
#
# -- 2. Corpus: turn the text lines into a document collection (tm package)
# -- 3. Replace missing lines with a blank
#
# The NA replacement is applied to the character vector before the corpus is
# built. is.na() applied to a corpus object looks at the corpus container,
# not at each document, so a corpus-level replacement does not reach the
# document text.
facebook_lines <- facebook_data
facebook_lines[is.na(facebook_lines)] <- " "

facebook_corpus <- Corpus(VectorSource(facebook_lines))
facebook_corpus

# -- Show the documents in the corpus
inspect(facebook_corpus)
 
#
# -- 4. Use the Sejong dictionary and register extra words
#

# Load the Sejong dictionary.
useSejongDic()

# Register words that are not in the Sejong dictionary.
# "ncn" is the noun tag assigned to every added word.
mergeUserDic(
  data.frame(
    c("R 프로그래밍", "페이스북", "소셜네트워크"),
    c("ncn")
  )
)

# Quick noun-extraction check on a sample sentence.
extractNoun("나는 홍길동 입니다, 우리나라 만세 !!")
 
#
# -- 5. User-defined noun extraction helper
#

# exNouns(x): coerce x to character, extract its nouns with extractNoun(),
# and return them joined into a single space-separated string.
exNouns <- function(x) {
  sentence <- as.character(x)
  nouns <- extractNoun(sentence)
  paste(nouns, collapse = " ")
}
 
# -- Extract the nouns of every document with exNouns().
#    vapply() is used instead of sapply() so the result is guaranteed to be
#    a character vector with one string per document, even for an empty corpus.
facebook_nouns <- vapply(facebook_corpus, exNouns, character(1))

# -- Inspect the extraction result
class(facebook_nouns)                                           # "character" vector
facebook_nouns[1]                                               # nouns extracted from the first line
facebook_nouns[2]
 
#
# -- 6. Pre-processing
#

# -- Build a corpus from the extracted nouns
myCorputfacebook <- Corpus(VectorSource(facebook_nouns))

# -- Clean-up transformations
myCorputfacebook <- tm_map(myCorputfacebook, removePunctuation)                 # strip punctuation
myCorputfacebook <- tm_map(myCorputfacebook, removeNumbers)                     # strip digits
# tolower() is a plain character function, not a tm transformation; wrapping it
# in content_transformer() keeps the corpus elements as text documents.
myCorputfacebook <- tm_map(myCorputfacebook, content_transformer(tolower))      # lower-case
myCorputfacebook <- tm_map(myCorputfacebook, removeWords, stopwords("english")) # drop English stop words

# -- Show the stop-word list
stopwords("english")                                                      # original post reported 174 entries

# -- Check the pre-processing result
inspect(myCorputfacebook[1:5])

#
# -- 7. Term selection (terms of length 2 or more)
#

# -- TermDocumentMatrix(): keep only terms with at least 2 characters.
#    The corpus is passed in directly: with tolower wrapped above, the former
#    tm_map(..., PlainTextDocument) conversion step is no longer needed.
myCorputfacebook_txt <- TermDocumentMatrix(myCorputfacebook, control = list(wordLengths = c(2, Inf)))
myCorputfacebook_txt

# -- matrix -> data.frame (used for the frequency analysis)
myTermfacebook.df <- as.data.frame(as.matrix(myCorputfacebook_txt))
dim(myTermfacebook.df)                                                    # original post reported: 876 76
 
#
# -- 8. Word frequencies (row sums, sorted in decreasing order)
#
wordResult <- sort(rowSums(myTermfacebook.df), decreasing = TRUE)
# head() shows at most the top 100 terms and, unlike wordResult[1:100],
# does not pad the output with NA when fewer than 100 terms exist.
head(wordResult, 100)
 
#
# -- 9. Word cloud, before any styling is applied
#

# Minimal two-word example.
wordcloud(words = c("한국", "일본"), freq = c(10, 5))

# The terms are the names of the frequency vector.
myName <- names(wordResult)

# Word cloud of all terms, sized by frequency.
wordcloud(words = myName, freq = wordResult)
 
#
# -- 10. Styled word cloud (frequency threshold, colours, ordering, rotation)
#

# -- data.frame of term and frequency
word.df <- data.frame(word = myName, freq = wordResult)
str(word.df)                                                              # columns: word, freq

# -- Colour palette: 12 colours from "Paired"
#    (alternative: brewer.pal(9, "Set1"); Set1 to Set3 are available)
pal <- brewer.pal(12, "Paired")

# -- Font and plotting window.
#    windowsFonts()/windowsFont() exist only on Windows, so the font
#    registration is guarded; elsewhere the device default family is used.
#    NOTE(review): on non-Windows systems the default font may not contain
#    Korean glyphs; set cloud_family to a suitable installed font if needed.
if (.Platform$OS.type == "windows") {
  windowsFonts(malgun = windowsFont("맑은 고딕"))
  cloud_family <- "malgun"
  x11()                                                                   # open a separate plot window
} else {
  cloud_family <- ""
  dev.new()                                                               # open a new graphics device
}

# -- wordcloud(words, frequencies, size scale 5:1, minimum frequency,
#              most frequent words placed first, share of rotated words,
#              colour palette, font family)
wordcloud(word.df$word, word.df$freq,
          scale = c(5, 1), min.freq = 3, random.order = FALSE,
          rot.per = .1, colors = pal, family = cloud_family)
 
#
# -- 11. Chart visualisation
#

# -- Top 10 topics by frequency
topWord <- head(sort(wordResult, decreasing = TRUE), 10)

# -- One colour per slice; follows the actual number of topics so that
#    fewer than 10 terms still get a matching palette.
slice_cols <- rainbow(length(topWord))

# -- Basic pie chart
pie(topWord, col = slice_cols, radius = 1)

# -- Frequencies as percentages of the top-word total
pct <- round(topWord / sum(topWord) * 100, 1)
names(topWord)

# -- Combine each word with its percentage into one label
lab <- paste(names(topWord), "\n", pct, "%")

# -- Pie chart labelled with word and percentage
pie(topWord, main="SNS 빅데이터 관련 토픽분석", col = slice_cols, cex = 0.8, labels = lab)
 
Colored by Color Scripter
cs
-- Files
facebook_bigdata.txt
저작자표시 비영리 변경금지
디비랑[dɪ'bɪraŋ]

[R3.3] 비정형텍스트 - 형태소 분석(NLP) 및 워드 클라우드

+ Recent posts

티스토리툴바