반응형
/*******************************************************************************************************************
-- Title : [Py3.5] Regression - Forecasting and Predicting
-- Reference : pythonprogramming.net
-- Key word : quandl sklearn scikit-learn linear_model linearregression csv datetimeindex fillna math.ceil
preprocessing.scale forecast predict ggplot matplotlib pyplot style 선형모델 선형 모델
선형회귀 선형 회귀 예측 추론 플롯 차트 cross-validation cross validation
*******************************************************************************************************************/
■ Figures
■ Scripts
# -*- coding: utf-8 -*-
"""Linear-regression forecast of a stock's adjusted close price.

Reads daily price data from ``manipulated_wiki_google.csv``, adds a ``label``
column holding the 'Adj. Close' value ``forecast_out`` rows ahead, fits a
``LinearRegression`` on a random 80/20 train/test split, predicts labels for
the most recent ``forecast_out`` rows and plots prices plus predictions.
"""
import datetime
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import quandl  # noqa: F401  (not used below; kept from the original script)
from matplotlib import style
from sklearn import preprocessing, svm  # noqa: F401  (svm not used below; kept)
from sklearn.linear_model import LinearRegression

try:
    # Current location of train_test_split.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Old scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import train_test_split

# ------------------------------
# -- Set Dataframe Option
# ------------------------------
# The former 'display.height' option is no longer set: it is not a valid
# option in current pandas and set_option() raises on unknown keys.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# ------------------------------
# -- Read .CSV
# ------------------------------
# Scenario : use daily stock data to predict the price a few days ahead
# Ref : dbrang.tistory.com/1227
df = pd.read_csv("manipulated_wiki_google.csv")

print(df.head(5))
print("... read_.csv", "." * 100, "\n")

# -- Turn the "Date" column into the index
df["Date"] = pd.DatetimeIndex(df["Date"])  # convert so the index becomes a DatetimeIndex
df = df.set_index("Date")

print(df.head(5))
print("... change_col_to_idx", "." * 100, "\n")

# ------------------------------
# -- Create Label
# ------------------------------
# Ref : dbrang.tistory.com/1227
forecast_col = 'Adj. Close'
df.fillna(value=-99999, inplace=True)  # replace NaN with -99999

# Forecast horizon: 1% of the row count, rounded up to a whole number of rows.
forecast_out = int(math.ceil(0.01 * len(df)))

print("len(df):", len(df))
print("0.01*len(df):", 0.01 * len(df))
print("math.ceil(0.01*len(df)):", math.ceil(0.01 * len(df)))
print("forecast_out:", forecast_out)

# -- Create Label
# Shift rows upward so each row's label is the price forecast_out rows later;
# the last forecast_out rows end up with a NaN label.
df['label'] = df[forecast_col].shift(-forecast_out)

print(df.head())
print(",,, add_label_&_shift_rows", "," * 100, "\n")

# ------------------------------
# -- Make Training and Testing Set
# ------------------------------
# Every column except the label is a feature. The axis is passed by keyword
# because a positional axis argument is rejected by current pandas.
X = np.array(df.drop(['label'], axis=1))
X = preprocessing.scale(X)  # standardize each feature column

X_lately = X[-forecast_out:]  # most recent rows: no label yet, used for prediction
X = X[:-forecast_out]

df.dropna(inplace=True)  # drop the rows whose label is NaN
y = np.array(df['label'])  # target values aligned with X

# -- Model validation (hold out 20% of the rows for scoring)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("X_train:", X_train[:5], "\n")
print("X_test:", X_test[:5], "\n")
print("y_train:", y_train[:20], "\n")
print("y_test:", y_test[:20])
print(";;; cross-validation", ";" * 100, "\n")

# ------------------------------
# -- Model Fitting
# ------------------------------
# -- Linear Regression (plain fit)
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
confidence = clf.score(X_test, y_test)

print(confidence)
print("^^^ linear_regression", "^" * 100, "\n")

# ------------------------------
# -- Forecasting & Predicting
# ------------------------------
forecast_set = clf.predict(X_lately)

print(forecast_set, confidence, forecast_out)
print("*** predictng", "*" * 100, "\n")

# ------------------------------
# -- Visualization
# ------------------------------
style.use('ggplot')

df['Forecast'] = np.nan  # add the column, initially all NaN

# NOTE(review): df no longer contains the last forecast_out rows (dropped
# above), so last_date is the last *labelled* date and the forecast dates
# appear to overlap the dates of the rows behind X_lately - confirm intent.
last_date = df.iloc[-1].name
print("last_date:", last_date)

last_unix = last_date.timestamp()
print("last_unix:", last_unix)

one_day = 86400
next_unix = last_unix + one_day
print("next_unix:", next_unix)

# Build each forecast date by adding whole days to last_date itself. Going
# through datetime.fromtimestamp() would convert in the machine's local time
# zone and shift every forecast row by the local UTC offset.
day_step = datetime.timedelta(days=1)
empty_cells = [np.nan] * (len(df.columns) - 1)  # every column except 'Forecast'
for day_offset, predicted in enumerate(forecast_set, start=1):
    df.loc[last_date + day_offset * day_step] = empty_cells + [predicted]

df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('xlabel_Date')
plt.ylabel('ylabel_Price')
plt.show()
■ Files
반응형