/*******************************************************************************************************************
-- Title : [MSR] MSDN - Analyzing a Large Data Set w/ ScaleR
-- Reference : microsoft.com
-- Key word : microsoft r scaler revoscaler rxxdfdata rxgetvarinfo rxgetinfo rxreadxdf summary nrow
linear model regression 단순 회귀 분석 회귀분석 회귀모델 회귀 모델 lm system.time
rxlinmod rxreadxdf dataframe data frame data.frame rxlineplot sort coef dccoef sprintf
dclhcoef 선형 모델 선형모델 intercept 절편 기울기 lm linear model rxresultsdf
*******************************************************************************************************************/
-- MSDN
* https://msdn.microsoft.com/en-us/microsoft-r/scaler-getting-started-3-analyze-large-data
-- Chart
-- Airline DS 다운로드
* Download URL : http://packages.revolutionanalytics.com/datasets/
* Airline on-time performance : http://stat-computing.org/dataexpo/2009/
-- Microsoft R
# ***********************************************
# -- Processing the airline dataset
# ***********************************************
# NOTE: this script relies on the RevoScaleR functions (RxXdfData,
# rxGetVarInfo, rxGetInfo, rxReadXdf, rxLinMod, rxResultsDF, rxLinePlot),
# which must be available on the search path.

# ------------------------------
# -- Download the airline dataset
# ------------------------------
# -- Directory that holds the data files
bigDataDir <- "C:\\RProject\\MRS\\Data"

# -- Airline data set downloads
# Download URL : http://packages.revolutionanalytics.com/datasets/
# Airline on-time performance : http://stat-computing.org/dataexpo/2009/

# -- Path of the XDF file
airDataName <- file.path(bigDataDir, "AirOnTime7Pct.xdf")

# -- Create the XDF data source object
bigAirDS <- RxXdfData(airDataName)
head(bigAirDS)

# -- Inspect variable information
rxGetVarInfo(bigAirDS)
rxGetInfo(bigAirDS, getVarInfo = TRUE)

# ------------------------------
# -- Create data frame from the data source
# ------------------------------
# -- Read the XDF data source into an in-memory data frame
myData <- rxReadXdf(bigAirDS)
head(myData)

# -- Export to CSV
write.csv(myData, "e:\\AirOnTime7Pct.csv", row.names = TRUE)

# ------------------------------
# -- Reading a chunk of data
# ------------------------------
# -- Read 1000 rows starting at row 100000, keeping three variables
testDF <- rxReadXdf(
  file = bigAirDS,
  varsToKeep = c("ArrDelay", "DepDelay", "DayOfWeek"),
  startRow = 100000,
  numRows = 1000
)
summary(testDF)
nrow(testDF)

# -- Simple regression sample (as run in Microsoft R)
lmObj <- lm(ArrDelay ~ DayOfWeek, data = testDF)
summary(lmObj)
# "Intercept" in the summary output is the intercept term of the model

# -- Same regression model (as run in plain R)
result.lm <- lm(formula = ArrDelay ~ DayOfWeek, data = testDF)
result.lm

# ***********************************************
# -- Estimating a Linear Model with a Huge Data Set
# ***********************************************
# With very large inputs lm() can fail with a "cannot allocate vector"
# error; in that case use the RevoScaleR functions instead.

# ------------------------------
# -- Timing the model estimation
# ------------------------------
# -- blocksPerRead = 30
system.time(
  delayArr <- rxLinMod(ArrDelay ~ DayOfWeek, data = bigAirDS,
                       cube = TRUE, blocksPerRead = 30)
)
summary(delayArr)

# -- blocksPerRead = 300
system.time(
  delayArr <- rxLinMod(ArrDelay ~ DayOfWeek, data = bigAirDS,
                       cube = TRUE, blocksPerRead = 300)
)
summary(delayArr)

# -- cube = FALSE
# The reported condition number differs from the cube = TRUE runs
# (the reason was not investigated by the original author).
system.time(
  delayArr <- rxLinMod(ArrDelay ~ DayOfWeek, data = bigAirDS,
                       cube = FALSE, blocksPerRead = 300)
)
summary(delayArr)

# ------------------------------
# -- Combining linear models
# ------------------------------
# -- Fit two linear models
delayArr <- rxLinMod(ArrDelay ~ DayOfWeek, data = bigAirDS,
                     cube = TRUE, blocksPerRead = 300)
delayDep <- rxLinMod(DepDelay ~ DayOfWeek, data = bigAirDS,
                     cube = TRUE, blocksPerRead = 30)

# -- Convert the results to a data frame (rxResultsDF) and combine them
# c.f) rxReadXdf
cubeResults <- rxResultsDF(delayArr)
head(cubeResults)
class(cubeResults) # data.frame
cubeResults$DepDelay <- rxResultsDF(delayDep)$DepDelay
head(cubeResults)

# -- Draw the line plot
rxLinePlot(
  ArrDelay + DepDelay ~ DayOfWeek,
  data = cubeResults,
  title = 'Average Arrival and Departure Delay by Day of Week'
)

# ------------------------------
# -- Progress reporting on / off
# ------------------------------
# -- Suppress progress output (reportProgress = 0)
delayDep <- rxLinMod(DepDelay ~ DayOfWeek, data = bigAirDS,
                     cube = TRUE, blocksPerRead = 30, reportProgress = 0)

# -- With progress output (default reportProgress)
delayDep <- rxLinMod(DepDelay ~ DayOfWeek, data = bigAirDS,
                     cube = TRUE, blocksPerRead = 30)

# ***********************************************
# -- Handling Larger Linear Models
# ***********************************************
# -- Fit the linear model
delayCarrier <- rxLinMod(ArrDelay ~ UniqueCarrier, data = bigAirDS,
                         cube = TRUE, blocksPerRead = 30)
summary(delayCarrier)

# -- Sort the coefficient vector
dcCoef <- sort(coef(delayCarrier))
dcCoef

# -- Lowest coefficients (smallest delays)
head(dcCoef, 10)

# -- Highest coefficients (largest delays)
tail(dcCoef, 10)

# -- Print the difference in delay
sprintf("United's additional delay compared with Hawaiian: %f",
        dcCoef["UniqueCarrier=UA"] - dcCoef["UniqueCarrier=HA"])

# ***********************************************
# -- Multiple regression models
# ***********************************************
# -- Several explanatory variables
delayCarrierLoc <- rxLinMod(ArrDelay ~ UniqueCarrier + Origin + Dest,
                            data = bigAirDS, cube = TRUE, blocksPerRead = 30)
dclCoef <- coef(delayCarrierLoc)
sprintf(
  "United's additional delay accounting for dep and arr location: %f",
  dclCoef["UniqueCarrier=UA"] - dclCoef["UniqueCarrier=HA"])
# sum(!is.na(x)) counts the non-missing coefficients; length(!is.na(x))
# would simply return the full vector length, NA entries included.
paste("Number of coefficients estimated: ", sum(!is.na(dclCoef)))

# -- Add the scheduled departure time as a factor
delayCarrierLocHour <- rxLinMod(
  ArrDelay ~ UniqueCarrier + Origin + Dest + F(CRSDepTime),
  data = bigAirDS, cube = TRUE, blocksPerRead = 30
)
dclhCoef <- coef(delayCarrierLocHour)
dclhCoef

# -- Summary of the results
sprintf("United's additional delay compared with Hawaiian: %f",
        dcCoef["UniqueCarrier=UA"] - dcCoef["UniqueCarrier=HA"])
paste("Number of coefficients estimated: ", sum(!is.na(dcCoef)))
sprintf(
  "United's additional delay accounting for dep and arr location: %f",
  dclCoef["UniqueCarrier=UA"] - dclCoef["UniqueCarrier=HA"])
paste("Number of coefficients estimated: ", sum(!is.na(dclCoef)))
sprintf(
  "United's additional delay accounting for location and time: %f",
  dclhCoef["UniqueCarrier=UA"] - dclhCoef["UniqueCarrier=HA"])
paste("Number of coefficients estimated: ", sum(!is.na(dclhCoef)))

# ***********************************************
# -- Predicting airline delay
# ***********************************************
#' Estimate the expected arrival delay for a trip.
#'
#' Builds the four coefficient names ("UniqueCarrier=<carrier>",
#' "Origin=<origin>", "Dest=<dest>", "F_CRSDepTime=<deptime>") and sums the
#' matching entries of the coefficient vector.
#'
#' @param carrier Carrier code used in the "UniqueCarrier=" coefficient name.
#' @param origin Origin code used in the "Origin=" coefficient name.
#' @param dest Destination code used in the "Dest=" coefficient name.
#' @param deptime Departure-time level used in the "F_CRSDepTime=" name.
#' @param coefs Named coefficient vector to look the names up in; defaults
#'   to the global `dclhCoef` computed above, as in the original script.
#' @return The sum of the four selected coefficients. The result is NA if
#'   any of the names is absent from `coefs` or its coefficient is NA.
expectedDelay <- function(carrier = "AA", origin = "SEA", dest = "SFO",
                          deptime = "9", coefs = dclhCoef) {
  coeffNames <- c(
    sprintf("UniqueCarrier=%s", carrier),
    sprintf("Origin=%s", origin),
    sprintf("Dest=%s", dest),
    sprintf("F_CRSDepTime=%s", deptime)
  )
  sum(coefs[coeffNames])
}

# ------------------------------
# -- Comparing trip delays
# ------------------------------
# Go to JFK (New York) from Seattle at 5 in the afternoon on American
expectedDelay("AA", "SEA", "JFK", "17")

# Go to Newark from Seattle at 5 in the afternoon on United
expectedDelay("UA", "SEA", "EWR", "17")

# Or go to Honolulu from Seattle at 7 am on Hawaiian
expectedDelay("HA", "SEA", "HNL", "7")