[R3.3] Dataframe - 탐색적 조회 방법 및 결측치, 극단치, 역코딩

■■ Data Language ■■/R
[R3.3] Dataframe - 탐색적 조회 방법 및 결측치, 극단치, 역코딩

디비랑 2016. 11. 26. 17:57
/*********************************************************************************************************
-- Title : [R3.3] Dataframe - 탐색적 조회 방법 및 결측치, 극단치, 역코딩
-- Reference : hrd-net
-- Key word : R 데이터 프레임 데이터프레임 결측치 극단치 역코딩 stem length na.omit ifelse is.na
data frame dataframe NA NULL outlier
*********************************************************************************************************/
-- R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
  
# ********************************************
# -- Exploratory data inspection
# ********************************************

# Switch to the tutorial's data folder and load the CSV; the first row of
# the file is used as the column names.
setwd("C:\\Rwork\\Part-II")
dataset <- read.csv("dataset.csv", header = TRUE)
str(dataset)          # tutorial data: 'data.frame': 300 obs. of 7 variables

# -- Show the whole data set
print(dataset)
View(dataset)         # opens the data viewer window

# -- Quick look at the first / last rows
head(dataset)
tail(dataset)

# -- Structure of the data set
names(dataset)        # variable (column) names
attributes(dataset)   # names, class, row.names
str(dataset)          # compact structure display

# -- Look at individual variables
dataset$age
dataset[4]

dataset$resident
length(dataset$age)   # number of values - 300 for the tutorial data

# Keep two extracted columns in their own variables, then print them.
x <- dataset$gender
y <- dataset$price
x
y

mode(x)
class(x)
str(x)

plot(dataset$price)   # scatter plot: overall distribution of price

# -- Select a variable with [""] instead of $
dataset["gender"]
dataset["price"]

# -- Select by position (index)
dataset[2]            # second column
dataset[6]            # sixth column
dataset[3, ]          # third observation (row), all columns
dataset[, 3]          # third variable (column), all rows

# -- Select two or more columns at once
dataset[c("job", "price")]
dataset[c(2, 6)]

dataset[c(1, 2, 3)]
dataset[1:3]
dataset[c(2, 4:6, 3, 1)]
 
 
# ********************************************
# -- Handling missing values (NA)
# ********************************************
#    Two strategies: remove the missing values, or replace them with
#    another value. Blindly removing them drops every observation that
#    contains an NA, so the information in those rows is lost and removal
#    may not be a good choice. The alternative shown below is to replace
#    NA with 0 or with the mean.

summary(dataset$price)
sum(dataset$price)                     # prints NA: the NAs propagate

# -- Removing missing data, method 1: skip NAs inside the aggregate
sum(dataset$price, na.rm = TRUE)       # 2362.9 for the tutorial data

# -- Removing missing data, method 2: drop NAs from the vector first
price2 <- na.omit(dataset$price)
sum(price2)                            # 2362.9 for the tutorial data
length(price2)                         # 270 -> 30 values removed

# -- Replacing missing data with 0
x <- dataset$price                     # price vector
x[1:30]                                # e.g. 5.1 4.2 4.7 3.5 5.0 ...
dataset$price2 <- ifelse(!is.na(x), x, 0)   # NA -> 0, other values unchanged
dataset$price2[1:30]

# -- Replacing missing data with the mean
x <- dataset$price                     # price vector
x[1:30]                                # e.g. 5.1 4.2 4.7 3.5 5.0 ...

# Mean of the non-missing prices, rounded to 2 decimals, used as the filler.
price_mean <- round(mean(x, na.rm = TRUE), 2)
dataset$price3 <- ifelse(!is.na(x), x, price_mean)   # NA -> mean
dataset$price3[1:30]

# Original column next to both imputed versions for comparison.
dataset[c("price", "price2", "price3")]
 
 
# ********************************************
# -- Detecting and cleaning outliers
# ********************************************

# -- Outliers in a categorical variable
gender <- dataset$gender
gender

# -- Inspect for outliers
#    hist(gender)                      # histogram (left disabled)
table(gender)                          # frequency of each code
pie(table(gender))                     # pie chart

# -- Clean gender: keep only the valid codes 1 and 2.
#    Inside subset() the name `gender` refers to the data frame's own
#    column; rows where the condition is NA are dropped as well.
dataset <- subset(dataset, gender %in% c(1, 2))
dataset                                # data after cleaning gender

length(dataset$gender)                 # 297 - 3 rows cleaned out
pie(table(dataset$gender))

# -- Outliers in a ratio-scale variable
dataset$price                          # look at the raw values
length(dataset$price)                  # row count after the gender filter
                                       # above (NAs are still included)
plot(dataset$price)                    # scatter plot
summary(dataset$price)                 # check the range

# -- Clean price: keep 2 <= price <= 10.
#    subset() treats an NA condition as FALSE, so rows whose price is NA
#    are removed here too.
dataset2 <- subset(dataset, price >= 2 & price <= 10)
length(dataset2$price)                 # 248 for the tutorial data
stem(dataset2$price)                   # stem-and-leaf display
 
 
# ********************************************
# -- Recoding
#    Done to improve readability, to change the measurement scale, or to
#    replace the coding that was originally entered.
# ********************************************

# -- Recoding for readability.
#    Each numeric code is translated through a lookup table; codes that are
#    not in the table (and NA) end up as NA in the new column.
resident_labels <- c("1.서울특별시", "2.인천광역시", "3.대전광역시",
                     "4.대구광역시", "5.시구군")
dataset2$resident2 <- resident_labels[match(dataset2$resident, 1:5)]
dataset2[c("resident", "resident2")]   # show only these two columns

job_labels <- c("공무원", "회사원", "개인사업")
dataset2$job2 <- job_labels[match(dataset2$job, 1:3)]
head(dataset2[c("job", "job2")])

# -- Continuous (ratio) -> categorical.
#    With left.open = TRUE the bins are (-Inf, 30], (30, 55], (55, Inf),
#    numbered 0, 1, 2; adding 1 turns the bin number into a label index.
age_labels <- c("청년층", "중년층", "장년층")
age_bin <- findInterval(dataset2$age, c(30, 55), left.open = TRUE)
dataset2$age2 <- age_labels[age_bin + 1]
head(dataset2)

# -- Reverse coding so that higher means more positive (5..1).
#    5-point scale: 1 = very satisfied ... 5 = very dissatisfied,
#    so 6 - 1 = 5 and 6 - 5 = 1.

dataset2$survey
survey <- dataset2$survey
csurvey <- 6 - survey                  # reverse-coded values
csurvey
survey                                 # original, to compare with the result
dataset2$survey <- csurvey             # overwrite survey with the new coding
head(dataset2)                         # check the survey column
 
 
# ********************************************
# -- Saving the cleaned data
# ********************************************
setwd("C:/Rwork/Part-II")

# -- Write the cleaned data frame without row names.
#    quote = FALSE writes character fields unquoted, so a value containing
#    a comma would break the file; the labels assigned in this script do
#    not contain any.
write.csv(dataset2, "cleanData.csv", quote = FALSE, row.names = FALSE)

# -- Read the saved file back and check it
new_data <- read.csv("cleanData.csv", header = TRUE)
new_data
dim(new_data)                          # tutorial reports: 248  13
# NOTE(review): the 7 original variables plus the 5 columns added in this
# script (price2, price3, resident2, job2, age2) would give 12 columns -
# confirm the expected width against the actual dataset.csv.
 

cs
-- Files
dataset.csv
저작자표시 비영리 변경금지