[Py3.5] Document Similarity Methods

2017. 6. 10. 20:41

/*******************************************************************************************************************
-- Title : [Py3.5] Document Similarity Methods
-- Reference : ratsgo.github.io
-- Key word : document similarity method term-document matrix tdm common features model
simple matching coefficient jaccard similarity cosine similarity 문서 유사도 유사성
word embedding 워드 임베딩 단어 임베딩
*******************************************************************************************************************/

■ Figures

■ Scripts

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# -*- coding: utf-8 -*-
 
import pandas as pd
from math import *
 
# Ref : ratsgo.github.io/from%20frequency%20to%20semantics/2017/04/20/docsim/
 
 
# =======================================
# -- List of Term-Document Matrix
# =======================================
# Numeric rows used for the jaccard / cosine comparison (one list per document).
doc_x1 = [3, 0, 5, 0, 0]
doc_x2 = [0, 0, 3, 2, 1]
doc_x3 = [2, 1, 0, 1, 2]
doc_x4 = [1, 2, 2, 0, 4]

# 0/1 rows used for the presence/absence counting (one list per document).
doc_t1 = [1, 0, 1, 0, 0]
doc_t2 = [0, 0, 1, 1, 1]
doc_t3 = [1, 1, 0, 1, 1]
doc_t4 = [1, 0, 0, 0, 1]

# Stack the per-document rows into matrices: one row per document.
lst_x = [doc_x1, doc_x2, doc_x3, doc_x4]
lst_t = [doc_t1, doc_t2, doc_t3, doc_t4]
df_x = pd.DataFrame(data=lst_x)
df_t = pd.DataFrame(data=lst_t)

print(df_x)
print("... df_x", 100 * ".", "\n")

print(df_t)
print("... df_t", 100 * ".", "\n")

# DataFrame.shape is (row count, column count); unpack both in one step.
t_rows, t_cols = df_t.shape
x_rows, x_cols = df_x.shape

print("x(", x_rows, ",", x_cols, "), y(", t_rows, ",", t_cols, ")")
print("... (행수, 열수)", 100 * ".", "\n")
 
 
# =======================================
# -- Count term presence/absence between document pairs
# =======================================
# For every unordered pair of rows in df_t, count the columns by the
# combination of 0/1 values in the two rows:
#   d11: 1 in both rows          d10: 1 in the first row only
#   d01: 1 in the second only    d00: 0 in both rows
#
# Example table for doc_t1 vs doc_t2:
#
#   Doc_t1 \ Doc_t2  Y(1)     N(0)
#   ---------------  -------  -------
#              Y(1)  1(=d11)  1(=d10)
#              N(0)  2(=d01)  1(=d00)

df_t_model = pd.DataFrame(columns=("doc", "d11", "d10", "d01", "d00"))
row_t_model = 0

for i1 in range(t_rows):
    # Positional row access via .iloc; DataFrame.ix was removed in pandas 1.0.
    first_is_1 = df_t.iloc[i1] == 1
    first_is_0 = df_t.iloc[i1] == 0

    # Starting at i1 + 1 visits each unordered pair exactly once.
    for i2 in range(i1 + 1, t_rows):
        doc_key = "d" + str(i1) + "_d" + str(i2)

        second_is_1 = df_t.iloc[i2] == 1
        second_is_0 = df_t.iloc[i2] == 0

        # Explicit ==1 / ==0 masks: any other value is counted in no cell.
        # int() keeps the stored counts as plain Python ints.
        d11 = int((first_is_1 & second_is_1).sum())
        d10 = int((first_is_1 & second_is_0).sum())
        d01 = int((first_is_0 & second_is_1).sum())
        d00 = int((first_is_0 & second_is_0).sum())

        df_t_model.loc[row_t_model] = [doc_key, d11, d10, d01, d00]
        row_t_model += 1

print(df_t_model)
print(",,, compare_existing_by_doc", "," * 100, "\n")
 
 
# =======================================
# -- Document Similarity Method
# =======================================
# Each measure is computed per row of df_t_model from its d11/d10/d01/d00
# counts and stored as a new float column. Counts are read by column label:
# integer keys on a label-indexed row rely on positional fallback, which
# recent pandas versions deprecate.

# --
# -- common features model(d11 / (d11 + d10 + d01 + d00))
# --
df_t_model["common"] = df_t_model.apply(
    lambda d: float(d["d11"] / (d["d11"] + d["d10"] + d["d01"] + d["d00"])),
    axis=1,
)

print(df_t_model)
print(";;; common_features_model", ";" * 100, "\n")

# --
# -- ratio model(d11 / (d11 + d10 + d01))
# --
df_t_model["ratio"] = df_t_model.apply(
    lambda d: float(d["d11"] / (d["d11"] + d["d10"] + d["d01"])),
    axis=1,
)

print(df_t_model)
print(";;; ratio_model", ";" * 100, "\n")

# --
# -- simple matching coefficient((d11 + d00) / (d11 + d10 + d01 + d00))
# --
df_t_model["simple"] = df_t_model.apply(
    lambda d: float(
        (d["d11"] + d["d00"]) / (d["d11"] + d["d10"] + d["d01"] + d["d00"])
    ),
    axis=1,
)

print(df_t_model)
print(";;; simple_matching_eoefficient", ";" * 100, "\n")
 
 
# --
# -- jaccard & cosine similarity function
# --
 
# Ref : dbrang.tistory.com/1201
def jaccard_similarity(x, y):
    """Return the Jaccard index of the distinct values in ``x`` and ``y``.

    Both inputs are converted to sets, so duplicates and element positions
    are ignored: the result is ``|set(x) & set(y)| / |set(x) | set(y)|``.

    Raises:
        ZeroDivisionError: if both inputs are empty.
    """
    unique_x = set(x)
    unique_y = set(y)

    shared = unique_x & unique_y
    combined = unique_x | unique_y

    return len(shared) / float(len(combined))
 
 
def square_rooted(x):
    """Return the square root of the sum of squares of ``x``, rounded to 3 decimals."""
    sum_of_squares = 0
    for value in x:
        sum_of_squares += value * value

    return round(sqrt(sum_of_squares), 3)
 
 
def cosine_similarity(x, y):
    """Return the cosine similarity of ``x`` and ``y``, rounded to 3 decimals.

    The numerator is the dot product over positions paired by ``zip`` (so
    extra trailing elements of the longer input are ignored there); the
    denominator is the product of the two ``square_rooted`` norms.

    Returns 0.0 when the denominator is zero (for example an all-zero
    vector) instead of raising ``ZeroDivisionError``.
    """
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = square_rooted(x) * square_rooted(y)

    # square_rooted rounds to 3 decimals, so very small norms can also be 0.0.
    if denominator == 0:
        return 0.0

    return round(numerator / float(denominator), 3)
 
 
# --
# -- jaccard & cosine similarity
# --
# For every unordered pair of rows in df_x, store the pair key together with
# the jaccard and cosine similarity of the two rows.
df_x_model = pd.DataFrame(columns=("doc", "jaccard", "cosine"))
row_x_model = 0

for i1 in range(x_rows):
    # Starting at i1 + 1 visits each unordered pair exactly once.
    for i2 in range(i1 + 1, x_rows):
        doc_key = "d" + str(i1) + "_d" + str(i2)

        # Positional row access via .iloc; DataFrame.ix was removed in
        # pandas 1.0.
        # NOTE: this rebinds the module-level name lst_x, which earlier held
        # the list of all document rows.
        lst_x = df_x.iloc[i1].tolist()
        lst_y = df_x.iloc[i2].tolist()

        jac_sim = jaccard_similarity(lst_x, lst_y)
        cos_sim = cosine_similarity(lst_x, lst_y)

        df_x_model.loc[row_x_model] = [doc_key, jac_sim, cos_sim]

        row_x_model += 1

print(df_x_model)
print(",,, jaccard_cosine_similarity", "," * 100, "\n")
Colored by Color Scripter
cs

저작자표시 비영리 변경금지

디비랑[dɪ'bɪraŋ]

[Py3.5] Document Similarity Methods

+ Recent posts

티스토리툴바