[Py2.7] CSV 및 엑셀 파일 읽고 쓰기

2016. 6. 27. 18:31

/*********************************************************************************************************
-- Title : [Py2.7] CSV 및 엑셀 파일 읽고 쓰기
-- Reference : Python for Data Analysis
-- Key word : .csv csv excel 엑셀 xls xlsx pandas 판다스 dataframe 데이터프레임
*********************************************************************************************************/

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# -*- coding: utf-8 -*-
 
 
import os
import sys
import pandas as pd
 
# ********************************************
# -- Load a .csv file (read_csv)
# ********************************************
print("-" * 100 + "{[0]}")  # ----- #

# read_csv is called with its default options here; the original note
# points to p.225 of the reference book for the available options.
testCSV_path = "C:\\samples\\olive.csv"
olive_oil = pd.read_csv(testCSV_path)

# Single-argument print(...) behaves the same on Python 2.7 and Python 3
# and matches the banner print above.
print(olive_oil.head(3))
 
 
# ********************************************
# -- Rename the index and the columns
# ********************************************
print("-" * 100 + "{[1]}")  # ----- #

# -- Name the index (the original note says this is not used in the
#    exercises that follow).
olive_oil.index.name = "id"

# -- Rename the first column.
# Only rename() is used: writing into `columns.values[...]` mutates the
# backing array of an Index that pandas treats as immutable, and it was
# redundant because rename() sets the final label anyway.
olive_oil.rename(columns={olive_oil.columns[0]: "id_area"}, inplace=True)

print(olive_oil.head(3))
 
 
# ********************************************
# -- Check uniqueness and transform the data
# ********************************************
print("-" * 100 + "{[2]}")  # ----- #

# -- Check uniqueness
print(len(olive_oil["id_area"].unique()))   # unique count (original note: 572)
print(olive_oil.shape)                      # shape (original note: (572, 11))

# -- Re-base id_area (so the numeric prefix lines up with 0, 1, 2, ...)
# The first value is looked up once and reused for the step-by-step preview.
first_id_area = olive_oil["id_area"][0]
first_parts = first_id_area.split(".")
print(first_id_area)
print(first_parts)
print(first_parts[0])
print(first_parts[1])

# -- String conversion (re-based prefix), previewed on the first value.
# NOTE(review): this preview joins with "." while the transformation applied
# below joins with "_" -- confirm which separator is intended.
print(str(int(first_parts[0]) - 1) + "." + first_parts[1])


def _rebase_id_area(value):
    """Return "<n-1>_<name>" for a value shaped like "<n>.<name>".

    Only the first two "."-separated pieces are used, exactly as in the
    preview above; the string is split once instead of twice.
    """
    parts = value.split(".")
    return str(int(parts[0]) - 1) + "_" + parts[1]


olive_oil["id_area"] = olive_oil["id_area"].apply(_rebase_id_area)
print(olive_oil["id_area"].head(3))

# -- Type conversion: dividing by a float literal yields a float column.
# The vectorized Series division replaces the per-element lambda.
olive_oil["palmitoleic"] = olive_oil["palmitoleic"] / 100.00
print(olive_oil["palmitoleic"].head(3))
 
 
# ********************************************
# -- Export to a .csv file (to_csv)
# ********************************************
print("-" * 100 + "{[3]}")  # ----- #

# Backslashes are escaped like every other path in this script. The
# unescaped form only worked because "\s" and "\o" are not recognised
# escape sequences; the resulting path string is unchanged.
olive_oil.to_csv("C:\\samples\\olive_new.csv")
 
 
# ********************************************
# -- Load a file that uses a delimiter other than a comma (sep)
# ********************************************
print("-" * 100 + "{[4]}")  # ----- #

# The fields in this sample are separated by "@", so the delimiter is
# passed explicitly through `sep`.
atfile = "C:\\samples\\surveys_withAt.csv"
csvfile = pd.read_csv(atfile, sep="@")

# Single-argument print(...) behaves the same on Python 2.7 and Python 3.
print(csvfile.head(3))
 
 
# ********************************************
# -- Read a large file in chunks (concat)
# ********************************************
print("-" * 100 + "{[5]}")  # ----- #

# With `chunksize`, read_csv returns an iterator that yields DataFrames of
# up to 500 rows each instead of a single DataFrame.
atfile = "C:\\samples\\bigCSV_File.csv"
csvfile = pd.read_csv(atfile, chunksize=500)

# concat consumes the chunk iterator directly; the intermediate copy
# comprehension is not needed. ignore_index renumbers the rows 0..n-1.
bigCSV = pd.concat(csvfile, ignore_index=True)

print(bigCSV.shape)
print(bigCSV.head(3))
 
 
# ********************************************
# -- Load several .csv files at once (glob, vars())
# ********************************************
print("-" * 100 + "{[6]}")  # ----- #

import glob

# -- Build the list of files with glob
filePathList = glob.glob("C:\\samples\\01\\*.csv")
print(filePathList)

print("-" * 100 + "{[6.1]}")  # ----- #

# -- Assign each file to its own DataFrame variable (using vars())
for filePath in filePathList:
    temp = os.path.basename(filePath)      # file name with extension
    temp1 = os.path.splitext(temp)[0]      # file name without extension

    print("temp:" + temp)
    print("temp1:" + temp1)

    # At module level vars() is the module namespace, so this binds a
    # variable named "dfdata_<file name>" to the loaded DataFrame.
    vars()["dfdata_" + temp1] = pd.read_csv(filePath)

print("-" * 100 + "{[6.2]}")  # ----- #

# These names exist only when 1763.csv, 1764.csv and 1772.csv were found
# by the glob above; otherwise the lookups raise NameError.
print(dfdata_1763.head(3))
print(dfdata_1764.head(3))
print(dfdata_1772.head(3))
 
 
# ********************************************
# -- Combine several .csv files into one DataFrame
# ********************************************
print("-" * 100 + "{[7]}")  # ----- #

filePathList = glob.glob("C:\\samples\\01\\*.csv")
print(filePathList)

print("-" * 100 + "{[7.1]}")  # ----- #

# -- Load every file, then concatenate once.
# Appending to a DataFrame inside the loop re-copies all accumulated rows on
# each iteration; collecting the frames and concatenating once avoids that.
frames = []
for filePath in filePathList:
    # Header-only read (nrows=0): just enough to show the column names.
    print(pd.read_csv(filePath, nrows=0).columns.values)

    # header=None keeps the first line of each file as an ordinary data row.
    # NOTE(review): that puts each file's header line into the combined
    # data -- confirm this is intended.
    frames.append(pd.read_csv(filePath, header=None))

# concat raises on an empty list, so fall back to an empty DataFrame when
# no files matched (the result the append-based loop produced).
data_tot = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print("-" * 100 + "{[7.2]}")  # ----- #

print(data_tot.head(3))
print(data_tot.shape)
 
 
# ********************************************
# -- Read an Excel file (read_excel)
# ********************************************
print("-" * 100 + "{[8]}")  # ----- #

# -- Read the workbook
atfile = "C:\\samples\\surveys_excelformat.xls"

# The sheet is passed positionally: the keyword was spelled `sheetname` in
# older pandas releases and `sheet_name` in newer ones, while the second
# positional argument selects the sheet in both.
excel_data = pd.read_excel(atfile, "Sheet1")
print(excel_data.head(3))
 
 
# ********************************************
# -- Write an Excel file (to_excel)
#    The [xlsxwriter, xlrd] packages must be installed.
# ********************************************
print("-" * 100 + "{[9]}")  # ----- #

# The original note says an .xls target did not work with this setup,
# hence the .xlsx extension.
excelOutPath = "C:\\samples\\surveys_excelformat_new.xlsx"

# The context manager saves and closes the workbook on exit, including when
# to_excel raises, so no explicit writer.save() call is required.
with pd.ExcelWriter(excelOutPath, engine="xlsxwriter") as writer:
    excel_data.to_excel(writer, sheet_name="Sheet1", index=False)
 
 
 
 
 
 
 
 
 
 
 
 

cs