-- Title : [Py3.5] Document Similarity Methods
-- Reference : ratsgo.github.io
-- Key word : document similarity method term-document matrix tdm common features model
simple matching coefficient jaccard similarity cosine similarity 문서 유사도 유사성
word embedding 워드 임베딩 단어 임베딩
■ Figures
■ Scripts
# -*- coding: utf-8 -*-
"""Document similarity measures on a small term-document matrix.

Two toy matrices are defined: ``df_x`` holds term counts per document and
``df_t`` holds binary term-presence flags.  For every document pair the
script prints:

* the 2x2 presence/absence contingency counts (d11, d10, d01, d00),
* the common-features, ratio and simple-matching coefficients derived
  from those counts,
* the Jaccard and cosine similarities of the count vectors.
"""
from itertools import combinations
from math import *  # noqa: F401,F403 -- wildcard kept so previously exported names stay available

import pandas as pd


# =======================================
# -- Similarity functions
# =======================================
# Ref : dbrang.tistory.com/1201
def jaccard_similarity(x, y):
    """Return the Jaccard similarity |X & Y| / |X | Y| of two collections.

    Both arguments are converted to sets, so duplicates are ignored.
    When both collections are empty the ratio is undefined; 0.0 is
    returned instead of raising ZeroDivisionError.
    """
    set_x, set_y = set(x), set(y)
    union = set_x | set_y
    if not union:
        return 0.0
    return len(set_x & set_y) / float(len(union))


def square_rooted(x):
    """Return the Euclidean norm of *x*, rounded to 3 decimal places."""
    return round(sqrt(sum(a * a for a in x)), 3)


def cosine_similarity(x, y):
    """Return the cosine similarity of *x* and *y*, rounded to 3 decimals.

    The norms are computed at full precision and only the final result is
    rounded; dividing by pre-rounded norms skews the third decimal and
    turns small-magnitude vectors into a division by zero.  If either
    vector has zero norm, 0.0 is returned.  Extra trailing elements of the
    longer vector are ignored by ``zip``.
    """
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = sqrt(sum(a * a for a in x)) * sqrt(sum(b * b for b in y))
    if denominator == 0:
        return 0.0
    return round(numerator / float(denominator), 3)


def _count_matches(flags_a, flags_b):
    """Return (d11, d10, d01, d00) presence/absence counts for two 0/1 vectors."""
    d11 = d10 = d01 = d00 = 0
    for a, b in zip(flags_a, flags_b):
        if a == 1 and b == 1:
            d11 += 1
        elif a == 1 and b == 0:
            d10 += 1
        elif a == 0 and b == 1:
            d01 += 1
        elif a == 0 and b == 0:
            d00 += 1
    return d11, d10, d01, d00


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is 0."""
    if not denominator:
        return 0.0
    return float(numerator) / float(denominator)


def _term_set(counts):
    """Return the set of term indices whose count in *counts* is non-zero."""
    return {idx for idx, count in enumerate(counts) if count}


# =======================================
# -- List of Term-Document Matrix
# =======================================
doc_x1 = [3, 0, 5, 0, 0]
doc_x2 = [0, 0, 3, 2, 1]
doc_x3 = [2, 1, 0, 1, 2]
doc_x4 = [1, 2, 2, 0, 4]

doc_t1 = [1, 0, 1, 0, 0]
doc_t2 = [0, 0, 1, 1, 1]
doc_t3 = [1, 1, 0, 1, 1]
doc_t4 = [1, 0, 0, 0, 1]

lst_x = [doc_x1, doc_x2, doc_x3, doc_x4]
df_x = pd.DataFrame(lst_x)
print(df_x)
print("... df_x", "." * 100, "\n")

lst_t = [doc_t1, doc_t2, doc_t3, doc_t4]
df_t = pd.DataFrame(lst_t)
print(df_t)
print("... df_t", "." * 100, "\n")

t_rows, t_cols = df_t.shape
x_rows, x_cols = df_x.shape

print("x(", x_rows, ",", x_cols, "), y(", t_rows, ",", t_cols, ")")
print("... (행수, 열수)", "." * 100, "\n")

# =======================================
# -- Count term presence/absence between documents
# =======================================
# Example contingency table for the pair (doc_t1, doc_t2):
#
#   doc_t1 / doc_t2   Y(1)      N(0)
#   ---------------   -------   -------
#   Y(1)              1(=d11)   1(=d10)
#   N(0)              2(=d01)   1(=d00)

# Pull the rows out once as plain lists instead of hitting the DataFrame
# indexer for every single cell.
t_vectors = df_t.values.tolist()

t_records = []
# combinations() yields each unordered pair (i1 < i2) exactly once.
for i1, i2 in combinations(range(t_rows), 2):
    doc_key = "d" + str(i1) + "_d" + str(i2)
    t_records.append([doc_key] + list(_count_matches(t_vectors[i1], t_vectors[i2])))

df_t_model = pd.DataFrame(t_records, columns=["doc", "d11", "d10", "d01", "d00"])

print(df_t_model)
print(",,, compare_existing_by_doc", "," * 100, "\n")

# =======================================
# -- Document Similarity Method
# =======================================
# --
# -- common features model(d11 / (d11 + d10 + d01 + d00))
# --
df_t_model["common"] = [
    _safe_ratio(r.d11, r.d11 + r.d10 + r.d01 + r.d00)
    for r in df_t_model.itertuples(index=False)
]

print(df_t_model)
print(";;; common_features_model", ";" * 100, "\n")

# --
# -- ratio model(d11 / (d11 + d10 + d01))
# --
df_t_model["ratio"] = [
    _safe_ratio(r.d11, r.d11 + r.d10 + r.d01)
    for r in df_t_model.itertuples(index=False)
]

print(df_t_model)
print(";;; ratio_model", ";" * 100, "\n")

# --
# -- simple matching coefficient((d11 + d00) / (d11 + d10 + d01 + d00))
# --
df_t_model["simple"] = [
    _safe_ratio(r.d11 + r.d00, r.d11 + r.d10 + r.d01 + r.d00)
    for r in df_t_model.itertuples(index=False)
]

print(df_t_model)
print(";;; simple_matching_eoefficient", ";" * 100, "\n")

# --
# -- jaccard & cosine similarity
# --
x_vectors = df_x.values.tolist()

x_records = []
for i1, i2 in combinations(range(x_rows), 2):
    doc_key = "d" + str(i1) + "_d" + str(i2)
    vec_a, vec_b = x_vectors[i1], x_vectors[i2]

    # Jaccard compares the sets of terms that occur in each document.
    # Feeding the raw count vectors would instead compare the sets of
    # count values, which says nothing about shared vocabulary.
    jac_sim = jaccard_similarity(_term_set(vec_a), _term_set(vec_b))
    cos_sim = cosine_similarity(vec_a, vec_b)

    x_records.append([doc_key, jac_sim, cos_sim])

df_x_model = pd.DataFrame(x_records, columns=["doc", "jaccard", "cosine"])

print(df_x_model)
print(",,, jaccard_cosine_similarity", "," * 100, "\n")