/*******************************************************************************************************************
-- Title : [Py3.5] Regression - Features, Labels, Training and Test w/ svm.SVR
-- Reference : pythonprogramming.net
-- Key word : quandl sklearn scikit-learn linear_model linear regression svm support vector machine data set
pre-processing label train test training testing svm.svr support vector regression linearregression
               kernel linear regression linear regression model support vector machine training testing to_csv
cross-validation cross validation shift
*******************************************************************************************************************/
■ Scripts
# -*- coding: utf-8 -*-
import pandas as pd
import quandl
import math
import numpy as np
from sklearn import preprocessing, cross_validation, svm   # note: in scikit-learn 0.18+, cross_validation was replaced by model_selection
from sklearn.linear_model import LinearRegression

# ------------------------------
# -- Set Dataframe Option
# ------------------------------
pd.set_option('display.height', 1000)       # 'display.height' is not available in newer pandas versions
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# ------------------------------
# -- Get Dataset
# ------------------------------
# Scenario: use daily stock data to predict the price a few days ahead
df = quandl.get("WIKI/GOOGL")
print(df.head())
print("... get_dataset", "." * 100, "\n")

# ------------------------------
# -- Manipulate Data
# ------------------------------
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
print(df.head())
print(",,, pair_down_data", "," * 100, "\n")

# -- Add Calculated Columns
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
print(df.head())
print(",,, add_calculated_columns", "," * 100, "\n")

# -- Write to .csv
df.to_csv("manipulated_wiki_google.csv")

# -- Add Label Column
forecast_col = 'Adj. Close'
df.fillna(value=-99999, inplace=True)           # replace NaN with -99999
forecast_out = int(math.ceil(0.01 * len(df)))   # len(): df row count, math.ceil: round up to the nearest integer
print("len(df):", len(df))
print("0.01*len(df):", 0.01 * len(df))
print("math.ceil(0.01*len(df)):", math.ceil(0.01 * len(df)))
print("forecast_out:", forecast_out)

# -- shift: +) moves values to later rows, -) moves values to earlier rows
# -- shift the label column so each row is paired with the 'Adj. Close' value forecast_out days ahead
df['label'] = df[forecast_col].shift(-forecast_out)
print(df.head())
print(",,, row_shift_for_label", "," * 100, "\n")

# ------------------------------
# -- Make Training and Testing Set
# ------------------------------
# -- drop NaN rows
df.dropna(inplace=True)

X = np.array(df.drop(['label'], 1))     # feature array: every column except the label
y = np.array(df['label'])               # target array: the label column only
print("X:", X)
print("y:", y)
print(";;; array(train:X, label:y)", ";" * 100, "\n")

# -- Pre-Processing
X = preprocessing.scale(X)              # standardize features (zero mean, unit variance)
y = np.array(df['label'])
print("X:", X)
print("y:", y)
print(";;; pre-processed X,y", ";" * 100, "\n")

# -- Model validation: hold-out train/test split (cross-validation module)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
print("X_train:", X_train[:5], "\n")
print("X_test:", X_test[:5], "\n")
print("y_train:", y_train[:20], "\n")
print("y_test:", y_test[:20])
print(";;; cross-validation", ";" * 100, "\n")

# ------------------------------
# -- Model Fitting
# ------------------------------
# --
# -- SVM.SVR (Support Vector Regression)
# --
clf = svm.SVR()
clf.fit(X_train, y_train)                   # fitting
confidence = clf.score(X_test, y_test)
print(confidence)
print("^^^ try_svm.svr", "^" * 100, "\n")

# --
# -- Linear Regression
# --
# -- Linear Regression (default settings)
clf = LinearRegression()
clf.fit(X_train, y_train)                   # fitting
confidence = clf.score(X_test, y_test)
print(confidence)
print("^^^ try_linear_regression", "^" * 100, "\n")

# -- Linear Regression (n_jobs=-1 uses all CPU cores for parallel processing)
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)                   # fitting
confidence = clf.score(X_test, y_test)
print(confidence)
print("^^^ try_linear_regression with thread", "^" * 100, "\n")

# ------------------------------
# -- Training (Using Kernel)
# ------------------------------
for k in ['linear', 'poly', 'rbf', 'sigmoid']:
    clf = svm.SVR(kernel=k)
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print(k, ":", confidence)
print("*** Training w/ Kernel", "*" * 100, "\n")
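
The script above stops at scoring the models and never produces an actual forecast, because df.dropna() discards the most recent forecast_out rows, whose labels are still unknown. Below is a minimal sketch of how the "Make Training and Testing Set" section can be reordered so those rows are kept aside and predicted, in the direction the pythonprogramming.net series continues. The names X_lately and forecast_set are illustrative and do not appear in the original script; the sketch assumes df is the frame right after the shift step (i.e. before dropna has been called) and that the imports above are in place.

# -- Forecast sketch (assumed continuation, not part of the original script).
# -- Assumes df is the frame right after df['label'] = df[forecast_col].shift(-forecast_out),
# -- i.e. before df.dropna(inplace=True) has run.
X = np.array(df.drop(['label'], 1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]            # most recent rows: label is NaN, these are the days to predict
X = X[:-forecast_out]                   # rows with known labels, used for fitting and scoring

df.dropna(inplace=True)
y = np.array(df['label'])

# train_test_split lives in sklearn.model_selection in scikit-learn 0.18+
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)

forecast_set = clf.predict(X_lately)    # predicted 'Adj. Close' values, forecast_out days ahead
print(forecast_set, clf.score(X_test, y_test), forecast_out)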
■ Files