[Py3.5] K-Means 및 기타 클러스터링 알고리즘 구현/소개

2017. 6. 6. 18:35

/*******************************************************************************************************************
-- Title : [Py3.5] K-Means 및 기타 클러스터링 알고리즘 구현/소개
-- Reference : www.kdnuggets.com/2017/03/k-means-clustering-algorithms-intro-python.html
-- Key word : 클러스터링 clustering k-means sklearn scikit-learn networkx matplotlib pyplot cluster 클러스터
spectral clustering model agglomerative clustering model k-means clustering model
affinity propagation clustering model
*******************************************************************************************************************/

■ Figures

■ Scripts

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
from sklearn import cluster
import networkx as nx
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score
 
# Ref : http://www.kdnuggets.com/2017/03/k-means-clustering-algorithms-intro-python.html
 
 
# ------------------------------
# -- Declaration of functions
# ------------------------------
 
def drawCommunities(G, partition, pos):
    """Draw the nodes of ``G`` coloured by community, then draw the edges.

    Everything is drawn onto the current matplotlib axes; nothing is
    returned and showing the figure is left to the caller.

    Args:
        G: Graph in networkx form.
        partition: Dict mapping each node to its community label.
        pos: Node positions, e.g. from ``nx.spring_layout(G)``.
    """
    # Group the nodes by community so each group can get its own colour
    dictList = defaultdict(list)
    for node, com in partition.items():
        dictList[com].append(node)

    # Walk the communities that actually occur (in sorted label order)
    # instead of assuming the labels are exactly 0..size-1; for labels
    # 0..size-1 the colour assignment is the same as before.
    for i, com in enumerate(sorted(dictList)):
        # Cycle through the red, green and blue channels and brighten the
        # chosen channel as the community index grows.
        amplifier = i % 3
        multi = (i / 3) * 0.3

        # Clamp to 1.0: matplotlib rejects colour components above 1
        rgb = [0.0, 0.0, 0.0]
        rgb[amplifier] = min(1.0, 0.1 + multi)

        # Draw Nodes.  The colour is passed as a one-row 2-D sequence so it
        # is always read as a single RGB colour; a bare 3-element list is
        # ambiguous when the community happens to contain exactly 3 nodes.
        nx.draw_networkx_nodes(G, pos,
                               nodelist=dictList[com],
                               node_color=[rgb],
                               node_size=500,
                               alpha=0.8)

    # Draw edges and final plot
    plt.title("Zachary's Karate Club")
    nx.draw_networkx_edges(G, pos, alpha=0.5)
 
def listToDict(list):
    """Return a dict mapping each position in ``list`` to the item there.

    Args:
        list: Any iterable of items (the name is kept for compatibility
            with existing callers, even though it shadows the builtin).

    Returns:
        ``{0: first_item, 1: second_item, ...}``; empty for empty input.
    """
    # enumerate() pairs every item with its index, so no manual indexing
    # is needed and generators work as well as sequences.
    return dict(enumerate(list))
 
def graphToEdgeMatrix(G):
    """Build a square 0/1 matrix (nested lists) describing the edges of ``G``.

    Nodes are used directly as row and column indices.  An entry is 1 when
    the column node is reported by ``G.neighbors`` for the row node, and
    every diagonal entry is 1 as well.

    Args:
        G: Graph supporting ``len()``, iteration over nodes and
            ``neighbors(node)``.

    Returns:
        A ``len(G)`` x ``len(G)`` list of lists of ints.
    """
    count = len(G)

    # Start from an all-zero square matrix with independent rows
    edgeMat = [[0] * count for _ in range(count)]

    for node in G:
        row = edgeMat[node]
        # Mark every neighbour of this node ...
        for neighbor in G.neighbors(node):
            row[neighbor] = 1
        # ... and the node itself on the diagonal
        row[node] = 1

    return edgeMat
 
 
# ------------------------------
# -- Initialization
# ------------------------------
 
# Shared settings and result accumulators used by every model below:
# the number of clusters to ask for, the label lists produced by each model,
# and the two score lists computed from them.
kClusters = 2
results, nmiResults, arsResults = [], [], []

# Zachary's Karate Club graph together with its ground-truth labels
# (one label per node, in node order)
G = nx.karate_club_graph()
groundTruth = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# Matrix form of the graph, which is what the clustering models consume
edgeMat = graphToEdgeMatrix(G)

# Lay the nodes out with networkx's spring layout
# (Fruchterman-Reingold force-directed algorithm) and draw the
# ground-truth communities on that layout.
pos = nx.spring_layout(G)
drawCommunities(G, listToDict(groundTruth), pos)
 
 
# ------------------------------
# -- Fit Clustering Models
# -- (Spectral, Agglomerative, K-means, Affinity Propagation)
# ------------------------------

# Estimator-style models: configure all three first, then fit them one after
# another in the order Spectral -> Agglomerative -> K-means.
spectral = cluster.SpectralClustering(
    n_clusters=kClusters,
    affinity="precomputed",
    n_init=200,
)
agglomerative = cluster.AgglomerativeClustering(
    n_clusters=kClusters,
    linkage="ward",
)
kmeans = cluster.KMeans(
    n_clusters=kClusters,
    n_init=200,
)

for model in (spectral, agglomerative, kmeans):
    model.fit(edgeMat)
    # Keep each model's labels as a plain list in the shared results list
    results.append(list(model.labels_))

# Affinity Propagation is called through the function interface; the labels
# are the second element of what it returns.
affinity = cluster.affinity_propagation(S=edgeMat, max_iter=200, damping=0.6)
results.append(list(affinity[1]))

# Score every model's labels against the ground truth
nmiResults.extend(
    normalized_mutual_info_score(groundTruth, labels) for labels in results
)
arsResults.extend(
    adjusted_rand_score(groundTruth, labels) for labels in results
)
 
 
# ------------------------------
# -- Show models
# ------------------------------
 
# Code for plotting results

# Average of NMI and ARS, one value per entry of the score lists
y = [sum(x) / 2 for x in zip(nmiResults, arsResults)]

xlabels = ['Spectral', 'Agglomerative', 'Kmeans', 'Affinity Propagation']

fig = plt.figure()
ax = fig.add_subplot(111)

# Set parameters for plotting
ind = np.arange(len(y))
width = 0.35

# Create barchart and set the axis limits and titles.
# Bars are explicitly centred on their index (the default since
# matplotlib 2.0) so ticks, labels and value texts can all use `ind`.
ax.bar(ind, y, width, align='center', color='blue',
       error_kw=dict(elinewidth=2, ecolor='red'))
# Half a unit of padding on both sides of the first and last bar
ax.set_xlim(-0.5, len(ind) - 0.5)
ax.set_ylim(0, 2)
ax.set_ylabel('Average Score (NMI,ARS)')
ax.set_title('Score Evaluation')

# Add the xlabels to the chart, directly under the centre of each bar
ax.set_xticks(ind)
xtickNames = ax.set_xticklabels(xlabels)
plt.setp(xtickNames, fontsize=12)

# Add the actual value on top of each bar, centred over it
for i, v in enumerate(y):
    ax.text(i, v, str(round(v, 2)), color='blue', fontweight='bold',
            ha='center', va='bottom')

# Show the final plot
plt.show()
Colored by Color Scripter
cs

저작자표시 비영리 변경금지 (새창열림)

디비랑[dɪ'bɪraŋ]

[Py3.5] K-Means 및 기타 클러스터링 알고리즘 구현/소개

+ Recent posts

티스토리툴바