반응형

/*********************************************************************************************************
-- Title : [R3.3] 추론통계분석 - 척도별 기술 통계 및 기술통계량 보고 산출
-- Reference : hrd-net
-- Key word : R 기술 통계량 평균 합계 중위수 최빈수 사분위수 분산 표준편차 최소값 최대값 범위 왜도 척도
                  mean sum median mode quartile var sd min max skewness kurtosis shapiro 귀무가설 대립가설
                  정규분포 알파 추론 통계 분석 summary 
*********************************************************************************************************/

-- Chart


-- Python/R

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# ************************************************
# -- Descriptive statistics by measurement scale
# ************************************************

# ------------------------------
# -- Types of descriptive statistics
# ------------------------------
# Central tendency : mean, sum, median, mode, quartiles, etc.
# Dispersion       : variance, standard deviation, minimum, maximum, range, etc.
# Asymmetry        : skewness, kurtosis

# -- Load the practice file
# NOTE(review): the working directory is hard-coded to the author's machine;
# adjust the path (or drop setwd and use a full file path) when running elsewhere.
setwd("C:\\RProject\\Rwork\\Part-III")
data2 <- read.csv("descriptive.csv", header = TRUE)
head(data2)                      # inspect the first rows of the data set

# -- Overall characteristics of the data
dim(data2)                       # dimensions (rows, columns)
length(data2)                    # number of columns
length(data2$survey)             # number of observations in one column
str(data2)                       # structure of the data frame
summary(data2)                   # summary statistics

# ------------------------------
# -- 1) Descriptive statistics for a nominal-scale variable
# ------------------------------
length(data2$gender)             # gender is a nominal-scale variable
summary(data2$gender)            # numeric summary is not meaningful for nominal codes
table(data2$gender)              # frequency per code; the original note flags 0 and 5 as outliers

# Keep only the valid gender codes (1 and 2); every other code is discarded.
data2 <- subset(data2, data2$gender == 1 | data2$gender == 2)
x <- table(data2$gender)         # store the frequencies
barplot(x)                       # categorical data -> bar chart

prop.table(x)                    # relative frequencies
y <- prop.table(x)
round(y * 100, 2)                # as percentages, rounded to 2 decimal places
 
# ------------------------------
# -- 2) Descriptive statistics for an ordinal-scale variable
# ------------------------------
length(data2[["level"]])         # education level - ordinal
summary(data2[["level"]])        # as with nominal data, the numeric summary is not meaningful
table(data2[["level"]])          # frequency analysis - meaningful

x1 <- table(data2[["level"]])    # store the frequency of each education level
x1
barplot(x1)                      # nominal/ordinal scale -> bar chart

# ------------------------------
# -- 3) Descriptive statistics for an interval-scale variable
# ------------------------------
survey <- data2[["survey"]]
survey

summary(survey)                  # satisfaction (5-point scale) -> mean 2.605 per the original note
x1 <- table(survey)              # frequencies
x1

hist(survey)                     # interval scale -> histogram
 
# ------------------------------
# -- 4) Descriptive statistics for a ratio-scale variable
# ------------------------------
length(data2$cost)
summary(data2$cost)              # summary statistics - meaningful here (mean)
mean(data2$cost)                 # NA per the original note (mean() propagates missing values)
data2$cost

# -- Data cleaning (remove missing values and outliers)
plot(data2$cost)
# Keep costs within [2, 10]; subset() treats an NA condition as FALSE,
# so rows with a missing cost are dropped as well.
data2 <- subset(data2, data2$cost >= 2 & data2$cost <= 10)
data2
x <- data2$cost
x

# -- Central tendency of cost
mean(x)                          # mean : 5.354
median(x)                        # median : 5.4
sort(x)                          # ascending order
sort(x, decreasing = TRUE)       # descending order
quantile(x, 1/4)                 # 1st quartile : 25%, 4.6
quantile(x, 3/4)                 # 3rd quartile : 75%, 6.2

# -- Dispersion of cost
var(x)                           # variance : 1.296826
sd(x)                            # standard deviation = positive square root of the variance : 1.138783
sqrt(var(x))                     # 1.138783
min(x)                           # minimum
max(x)                           # maximum
range(x)                         # range (min ~ max)

# -- Visualizing a continuous (interval/ratio) variable
#    (the original note marks the histogram of raw values as not informative)
table(data2$cost)                # frequency of each cost value
hist(data2$cost)

# -- Continuous -> categorical (recoding) : 1, 2, 3
# Create the column up front so the indexed assignments below always match
# the number of rows, even when the last rows satisfy none of the conditions.
data2$cost2 <- rep(NA_real_, nrow(data2))
data2$cost2[data2$cost >= 1 & data2$cost <= 3] <- 1
data2$cost2[data2$cost >= 4 & data2$cost <= 6] <- 2
data2$cost2[data2$cost >= 7] <- 3
# NOTE(review): values strictly between 3 and 4, or between 6 and 7, match
# none of the conditions and stay NA - confirm whether these gaps are intended.

hist(data2$cost2)
 
# ------------------------------
# -- 5) Descriptive statistics for the ratio-scale variable cost
# ------------------------------
# attach() puts the columns of data2 on the search path, so `cost` below
# resolves to data2$cost until detach() is called.
attach(data2)
length(cost)                     # same as length(data2$cost)
summary(cost)                    # summary statistics - meaningful here (mean)
mean(cost)                       # the most meaningful single statistic
min(cost)
max(cost)
range(cost)                      # min ~ max
sort(cost)                       # ascending order
sort(cost, decreasing = TRUE)    # descending order
detach(data2)                    # undo attach(data2)
 
# -- With an NA present, these statistics all return NA
test <- c(seq_len(5), NA, 10:20)
test
min(test)
max(test)
range(test)
mean(test)

# -- Drop the missing value before computing the statistics
min(test, na.rm = TRUE)
max(test, na.rm = TRUE)
range(test, na.rm = TRUE)
mean(test, na.rm = TRUE)
 
# ------------------------------
# -- 6) Measuring asymmetry
# Asymmetry: how far the distribution leans to one side (skewness) and
# how strongly it concentrates around the centre (kurtosis).
# ------------------------------

# Install the package providing skewness()/kurtosis() only when it is missing,
# instead of reinstalling it on every run.
if (!requireNamespace("moments", quietly = TRUE)) {
  install.packages("moments")
}
library(moments)
cost <- data2$cost

# -- Skewness
#    below 0 -> tail on the left, above 0 -> tail on the right
skewness(cost)                   # -0.297234

# -- Kurtosis
kurtosis(cost)                   # 2.683438 ; the kurtosis of a normal distribution is 3

# ------------------------------
# -- 7) Checking skewness/kurtosis with a histogram
# ------------------------------
hist(cost)
hist(cost, freq = FALSE)         # density scale so the curves below can be overlaid

# -- (1) Density curve and normal-distribution curve
lines(density(cost), col = 'blue')
# NOTE(review): curve() evaluates its expression on its own grid of x values,
# so this assignment mainly mirrors the original tutorial; it does overwrite
# any earlier global `x`.
x <- seq(0, 8, 0.1)
curve(dnorm(x, mean(cost), sd(cost)), col = 'red', add = TRUE)

# -- (2) Q-Q plot
qqnorm(cost, main = 'cost : Q-Q plot')
qqline(cost, col = 'red')
# [Note] Points lying along the diagonal line indicate normality.
# [해설] 점의 분포가 대각선과 일치하면 정규성이다.
 
# ------------------------------
# -- 3. Hypothesis test (Shapiro-Wilk normality test)
#       Null hypothesis        : no difference from a normal distribution (p-value > alpha)
#       Alternative hypothesis : differs from a normal distribution      (p-value < alpha)
# ------------------------------
shapiro.test(cost)               # W = 0.98187, p-value = 0.002959 < alpha (0.05) = significance level

# -- Generate a sample from a normal population
# No seed is set, so the values (and the p-value noted below) vary between runs.
h <- rnorm(1000, mean = 172.5, sd = 2)
h
hist(h)
shapiro.test(h)                  # p-value = 0.1276 > 0.05 in the original author's run
 
 
 
 

cs


-- Files

descriptive.csv



반응형

+ Recent posts