반응형

/*******************************************************************************************************************
-- Title : [MSR] MSDN - Get Started w/ ScaleR
-- Reference : microsoft
-- Key word : microsoft r scaler rxgetoption sampledatadir rximport rxsummary rxhistogram rxdatastep
                  rxlogit logistic regression 회귀분석 
*******************************************************************************************************************/

-- Chart

-- Microsoft R
-- https://msdn.microsoft.com/en-us/microsoft-r/scaler-getting-started

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Import a file from the sample data directory ----

# Case 1: RevoScaleR, reading from the bundled sample data directory ----

# Check the sampleDataDir path.
rxGetOption("sampleDataDir")
  # sampleDataDir : C:\Program Files\Microsoft SQL Server\130\R_SERVER\library\RevoScaleR\SampleData

# Define the path/file (the closing parenthesis was missing here).
inFile <- file.path(rxGetOption("sampleDataDir"), "AirlineDemoSmall.csv")

# Import the CSV into airExample.xdf; "M" is declared as the missing-value
# marker and any existing output file is overwritten.
airData <- rxImport(
  inData = inFile, outFile = "airExample.xdf",
  stringsAsFactors = TRUE, missingValueString = "M", rowsPerRead = 200000,
  overwrite = TRUE
)

# Inspect the imported data.
head(airData)
 
# Case 2: RevoScaleR, reading from a user-specified path ----

# Set a user-defined path/file. file.path() takes the directory and the file
# name as separate arguments (the comma between them was missing).
inFile2 <- file.path("c:\\", "AirlineDemoSmall.csv")
inFile2

# Import the CSV into airExample2.xdf with the same options as Case 1.
airData2 <- rxImport(
  inData = inFile2, outFile = "airExample2.xdf",
  stringsAsFactors = TRUE, missingValueString = "M", rowsPerRead = 200000,
  overwrite = TRUE
)
head(airData2)
 
# Case 3: base R ----

# Read the same CSV with read.csv(). The rxImport() calls in this script
# declare "M" as the missing-value marker for this file, so use it here too
# (the previous "-" would leave "M" entries as ordinary values).
airData3 <- read.csv(file = "c:\\AirlineDemoSmall.csv", na.strings = "M")
head(airData3)
 
# Inspect the data set ----

# Data set information, including per-variable details.
rxGetInfo(airData, getVarInfo = TRUE)
rxGetVarInfo(airData)

# Basic statistics for the RevoScaleR data (Case 1).
rxSummary(~ ArrDelay, data = airData)
rxSummary(~ CRSDepTime, data = airData)

# Basic statistics for the base R data frame (Case 3).
summary(airData3)

# Distribution of arrival delay, one panel per day of week.
rxHistogram(~ ArrDelay | DayOfWeek, data = airData)
 
 
# Run the statistical analysis ----

# Create a derived field: VeryLate is TRUE when ArrDelay is greater than 120
# or is missing. The result is written to airExample.xdf, overwriting any
# existing file of that name.
airData4 <- rxDataStep(
  inData = airData, outFile = "airExample.xdf",
  transforms = list(VeryLate = (ArrDelay > 120 | is.na(ArrDelay))),
  overwrite = TRUE
)
head(airData4, 10)
 
# Fit a logistic regression.
logitResults <- rxLogit(VeryLate ~ DayOfWeek, data = airData4)  # dependent ~ independent
head(logitResults)

# Regression results (output recorded by the author).
summary(logitResults)
 #                     Estimate Std. Error z value Pr(>|z|)
 # (Intercept)         -3.29095    0.01745 -188.64 2.22e-16 ***
 # DayOfWeek=Monday     0.40086    0.02256   17.77 2.22e-16 ***
 # DayOfWeek=Tuesday    0.84018    0.02192   38.33 2.22e-16 ***           # Tuesday has the largest estimate
 # DayOfWeek=Wednesday  0.36982    0.02378   15.55 2.22e-16 ***
 # DayOfWeek=Thursday   0.29396    0.02400   12.25 2.22e-16 ***
 # DayOfWeek=Friday     0.54427    0.02274   23.93 2.22e-16 ***
 # DayOfWeek=Saturday   0.48319    0.02282   21.18 2.22e-16 ***
 # DayOfWeek=Sunday     Dropped    Dropped Dropped  Dropped

# Same model with the intercept removed ("- 1" in the formula).
logitResults2 <- rxLogit(VeryLate ~ DayOfWeek - 1, data = airData4)
summary(logitResults2)

cs

반응형

+ Recent posts