[Py3.5] Regression - Features, Labels, Training and Test w/ svm.SVR

2017. 6. 4. 10:25

/*******************************************************************************************************************
-- Title : [Py3.5] Regression - Features, Labels, Training and Test w/ svm.SVR
-- Reference : pythonprogramming.net
-- Key word : quandl sklearn scikit-learn linear_model linear regression svm support vector machine data set
pre-processing label train test training testing svm.svr support vector regression linearregression
kernel 선형 회귀 선형 회귀 모델 서포트 벡터 머신 서포트벡터머신 트레이닝 테스팅 to_csv
cross-validation cross validation shift
*******************************************************************************************************************/

■ Scripts

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -*- coding: utf-8 -*-
"""Regression walkthrough on the Quandl WIKI/GOOGL daily price table.

The script derives two percentage features, builds a label by shifting the
adjusted close ``forecast_out`` rows into the future, standardizes the
features, makes a random 80/20 hold-out split, and then scores several
scikit-learn regressors on that split.
"""

import math

import numpy as np
import pandas as pd
import quandl
# model_selection replaces the cross_validation module, which scikit-learn
# deprecated in 0.18 and removed in 0.20.
from sklearn import model_selection, preprocessing, svm
from sklearn.linear_model import LinearRegression

# ------------------------------
# -- Set Dataframe Option
# ------------------------------
# 'display.height' is intentionally not set: the option was deprecated in
# favour of 'display.max_rows' and later removed, so setting it raises
# OptionError on current pandas.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# ------------------------------
# -- Get Dataset
# ------------------------------
# Scenario: use daily stock data to predict the price a number of days ahead.
df = quandl.get("WIKI/GOOGL")

print(df.head())
print("... get_dataset", "." * 100, "\n")


# ------------------------------
# -- Manipulate Data
# ------------------------------
# .copy() gives an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()

print(df.head())
print(",,, pair_down_data", "," * 100, "\n")

# -- Add Calculated Column
# HL_PCT: daily high-low spread as a percentage of the adjusted close.
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
# PCT_change: open-to-close move as a percentage of the adjusted open.
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']].copy()

print(df.head())
print(",,, add_calculated_columns", "," * 100, "\n")

# -- write to .csv
df.to_csv("manipulated_wiki_google.csv")

# -- Add Label Column
forecast_col = 'Adj. Close'
# Replace NaN with a sentinel *before* the label is created, so the only NaN
# rows left for dropna() further down are the ones produced by the shift.
df = df.fillna(value=-99999)
# Forecast horizon: 1% of the row count, rounded up to a whole number of rows.
forecast_out = int(math.ceil(0.01 * len(df)))

print("len(df):", len(df))
print("0.01*len(df):", 0.01 * len(df))
print("math.ceil(0.01*len(df)):", math.ceil(0.01 * len(df)))
print("forecast_out:", forecast_out)

# -- shift: a positive period moves values to later rows, a negative period
# -- moves them to earlier rows.
# -- Shifting by -forecast_out pairs every row with the adjusted close that
# -- occurs forecast_out rows later, which is the value to be predicted.
df['label'] = df[forecast_col].shift(-forecast_out)

print(df.head())
print(",,, row_shift_for_label", "," * 100, "\n")


# ------------------------------
# -- Make Training and Testing Set
# ------------------------------

# -- drop NaN rows (the last forecast_out rows have no future value to label)
df = df.dropna()

# Features: every column except the label. `axis` is passed by keyword because
# pandas 2.0 no longer accepts it positionally.
X = np.array(df.drop(['label'], axis=1))
# Target: the label column only.
y = np.array(df['label'])

print("X:", X)
print("y:", y)
print(";;; array(train:X, label:y)", ";" * 100, "\n")

# -- Pre-Processing
# preprocessing.scale standardizes each feature column (zero mean, unit
# variance); it does not clamp values to a fixed range. y is left untouched.
# NOTE: the scaling statistics are computed over every row, including the rows
# that end up in the test split below.
X = preprocessing.scale(X)

print("X:", X)
print("y:", y)
print(";;; pre-processed X,y", ";" * 100, "\n")

# -- Model validation: random hold-out split, 80% train / 20% test
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

print("X_train:", X_train[:5], "\n")
print("X_test:", X_test[:5], "\n")
print("y_train:", y_train[:20], "\n")
print("y_test:", y_test[:20])
print(";;; cross-validation", ";" * 100, "\n")
 
 
# ------------------------------
# -- Model Fitting
# ------------------------------

# Each entry pairs an estimator with the banner text printed after its score:
#   1. SVM.SVR (Support Vector Regression) with default settings
#   2. Linear Regression with default settings
#   3. Linear Regression with n_jobs=-1 (number of jobs specified)
for clf, banner in (
    (svm.SVR(), "^^^ try_svm.svr"),
    (LinearRegression(), "^^^ try_linear_regression"),
    (LinearRegression(n_jobs=-1), "^^^ try_linear_regression with thread"),
):
    # Fit on the training split, then score on the held-out test split.
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)

    print(confidence)
    print(banner, "^" * 100, "\n")
 
 
# ------------------------------
# -- Training(Using Kernel)
# ------------------------------
# Fit one SVR per kernel on the training split and print its score on the
# test split, so the kernels can be compared side by side.
for kernel_name in ('linear', 'poly', 'rbf', 'sigmoid'):
    clf = svm.SVR(kernel=kernel_name).fit(X_train, y_train)  # fit() returns the estimator
    confidence = clf.score(X_test, y_test)
    print(kernel_name, ":", confidence)

print("*** Training w/ Kernel", "*" * 100, "\n")
 
 

■ Files

manipulated_wiki_google.csv

저작자표시 비영리 변경금지

디비랑[dɪ'bɪraŋ]

[Py3.5] Regression - Features, Labels, Training and Test w/ svm.SVR

+ Recent posts

티스토리툴바