[R3.3] 비정형텍스트 - 감성 분석 및 시각화

2016. 12. 10. 17:04

/*********************************************************************************************************
-- Title : [R3.3] 비정형텍스트 - 감성 분석 및 시각화
-- Reference : hrd-net
-- Key word : R 비정형 텍스트 감성 분석 긍정 부정 패키지 package
*********************************************************************************************************/

-- Chart

-- R

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# ************************************************
# -- Sentiment analysis (positive/negative word scoring)
#    Visualization: blue/red -> makes dissatisfied customers visible
# ************************************************

# ------------------------------
# -- 1) Load the data ("../Rwork/Part-II/reviews.csv")
# ------------------------------
# file.choose() opens an interactive file picker; pick the reviews CSV.
data <- read.csv(file = file.choose())
head(data, n = 2)
dim(data)   # the original author's run reported: 100 2
str(data)   # per the original notes, columns are company and review (customer interview text)
 
# ------------------------------
# -- 2) Load the word dictionaries and add custom words
# ------------------------------

# -- Positive and negative words are counted so that each text can be
#    summarized as positive/negative.
#    negDic.txt : negative-word dictionary
#    posDic.txt : positive-word dictionary

# -- (1) Read the English positive/negative dictionaries
#        (one entry per line, read from the working directory set below).
setwd("C:\\RProject\\Rwork\\Part-II")
posDic <- readLines(con = "posDic.txt")
negDic <- readLines(con = "negDic.txt")
length(posDic)   # the original author's run reported: 2006
length(negDic)   # the original author's run reported: 4783

# -- (2) Extend each dictionary with one extra word
posDic.final <- append(posDic, "victor")
negDic.final <- append(negDic, "vanquished")

# -- The added words appear at the end of each dictionary
tail(posDic.final)
tail(negDic.final)
 
# ------------------------------
# -- 3) Define the sentiment-analysis function: sentimental
# ------------------------------
 
# -- (1) Load the packages used for string processing
library(plyr)                                                        # provides laply()
library(stringr)                                                     # provides str_split()
 
# -- (2) Define the function used for sentiment analysis
# Score each text as (number of positive-dictionary words) minus
# (number of negative-dictionary words).
#
# Arguments:
#   sentences : character vector (or factor) of texts, one text per element.
#   posDic    : character vector of positive words. Texts are lower-cased
#               before matching, so entries are expected to be lower-case.
#   negDic    : character vector of negative words (same expectation).
#
# Returns:
#   A data.frame with one row per input text and two columns:
#     score : integer, positive hits minus negative hits
#     text  : the input `sentences`, unchanged
#
# Notes:
#   * Uses base R only (vapply/strsplit), so the result type is fixed to
#     integer regardless of input length, including zero-length input.
#   * Control characters (tab, newline, ...) are replaced by a space rather
#     than deleted, so words separated only by a tab/newline are not glued
#     together.
#   * Empty tokens (from leading/trailing whitespace or empty texts) are
#     dropped before matching; otherwise a blank line in a dictionary file
#     would be counted as a sentiment hit.
sentimental <- function(sentences, posDic, negDic) {

  # Score a single text.
  score_one <- function(sentence) {
    cleaned <- gsub("[[:punct:]]", "", sentence)   # strip punctuation
    cleaned <- gsub("[[:cntrl:]]", " ", cleaned)   # control chars act as separators
    cleaned <- gsub("\\d+", "", cleaned)           # strip digits
    cleaned <- tolower(cleaned)                    # dictionaries are lower-case

    # Split on runs of whitespace and discard empty / missing tokens.
    words <- unlist(strsplit(cleaned, "\\s+"))
    words <- words[!is.na(words) & nzchar(words)]

    # sum() over logicals is integer, so the difference is integer too.
    sum(words %in% posDic) - sum(words %in% negDic)
  }

  # as.character() makes factor input behave like plain text;
  # USE.NAMES = FALSE keeps the data.frame's default row names (1..n)
  # instead of using the texts themselves as row names.
  scores <- vapply(as.character(sentences), score_one, integer(1),
                   USE.NAMES = FALSE)

  data.frame(score = scores, text = sentences)
}
 
# ------------------------------
# -- 4) Sentiment analysis: score every record of the second column (review)
# ------------------------------
result <- sentimental(data[, 2], posDic.final, negDic.final)
result
names(result)   # "score" "text"
dim(result)     # the original author's run reported: 100 2
result$text
result$score    # one score per row: positive hits minus negative hits

# -- Derive a color column from the score:
#    score >= 1 -> blue, score == 0 -> green, score < 0 -> red
result$color <- as.character(
  ifelse(result$score >= 1, "blue",
         ifelse(result$score == 0, "green",
                ifelse(result$score < 0, "red", NA)))
)

# -- Chart the sentiment scores
plot(result$score, col = result$color)       # scatter plot colored by sentiment
barplot(result$score, col = result$color,
        main = "감성분석 결과화면")          # bar chart
 
# ------------------------------
# -- 5) Positive/negative breakdown
# ------------------------------

# -- (1) Frequency of each sentiment color
table(result$color)

# -- (2) Recode the score into a label column
#        "긍정" = positive, "중립" = neutral, "부정" = negative
result$remark[result$score >= 1] <- "긍정"
result$remark[result$score == 0] <- "중립"
result$remark[result$score < 0] <- "부정"

sentiment_result <- table(result$remark)
sentiment_result

# -- (3) Pie chart: title, colors, radius
# Colors are looked up by label name instead of by position. table() only
# contains the labels that actually occur, so a positional color vector
# would paint the wrong color on a slice whenever a category is missing.
remark_colors <- c("blue", "red", "green")
names(remark_colors) <- c("긍정", "부정", "중립")

# NOTE(review): the original trailing comment read "-> 1.2"; presumably an
# alternative radius value to try - confirm.
pie(sentiment_result, main = "감성분석 결과",
    col = unname(remark_colors[names(sentiment_result)]), radius = 0.8)
 
 
 

cs

-- Files

negDic.txt

posDic.txt

reviews.csv

저작자표시 비영리 변경금지

디비랑[dɪ'bɪraŋ]

[R3.3] 비정형텍스트 - 감성 분석 및 시각화

+ Recent posts

티스토리툴바