/*******************************************************************************************************************
-- Title : [Py3.5] 5-most popular similarity measures implementation in python
-- Reference : dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/
-- Key word : 유사성 similarity measure 유사도 거리 euclidean distance manhattan distance cosine similarity
유클리디언 유클리디안 맨하탄 코사인 자카드 jaccard similarity minkowski distance
*******************************************************************************************************************/
■ Similarity Measure
- The similarity measure is the measure of how much alike two data objects are.
- Similarity measure is a distance with dimensions representing features of the objects.
- The similarity is subjective and is highly dependent on the domain and application.
- Similarity is measured in the range 0 to 1.
* Similarity = 1 if X = Y (Where X, Y are two objects)
* Similarity = 0 if X and Y are completely different (X != Y alone only implies Similarity < 1)
■ Euclidean Distance
- The most common use of distance.
- Euclidean distance is also known as simply distance.
- The Euclidean distance between two points is the length of the path connecting them.
- The Pythagorean theorem gives this distance between two points.
from math import sqrt


def euclidean_distance(x, y):
    """Return the Euclidean (straight-line) distance between two points.

    Args:
        x: Sequence of numeric coordinates of the first point.
        y: Sequence of numeric coordinates of the second point.

    Returns:
        The square root of the sum of the squared coordinate differences.

    Raises:
        ValueError: If x and y do not have the same number of coordinates.
    """
    if len(x) != len(y):
        # zip() would silently drop the surplus coordinates and return the
        # distance of a lower-dimensional pair of points instead.
        raise ValueError("x and y must have the same length")
    # The ** operator is used rather than pow(): a wildcard import from math
    # would replace the built-in pow with the float-only math.pow.
    return sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))


print("(A)", euclidean_distance([2, 2], [1, 7]))
print("(B)", euclidean_distance([5, 2], [12, 8]))
print("(C)", euclidean_distance([2, 2], [5, 2]))

"""
(A) 5.0990195135927845
(B) 9.219544457292887
(C) 3.0
"""
■ Manhattan Distance
- The sum of the absolute differences of their Cartesian coordinates.
- It is the total sum of the absolute differences between the x-coordinates and the y-coordinates.
- Manhattan distance = |x1 - x2| + |y1 - y2|
def manhattan_distance(x, y):
    """Return the Manhattan (city-block) distance between two points.

    Args:
        x: Sequence of numeric coordinates of the first point.
        y: Sequence of numeric coordinates of the second point.

    Returns:
        The sum of the absolute coordinate differences.

    Raises:
        ValueError: If x and y do not have the same number of coordinates.
    """
    if len(x) != len(y):
        # zip() would silently ignore the surplus coordinates.
        raise ValueError("x and y must have the same length")
    return sum(abs(a - b) for a, b in zip(x, y))


print("(A)", manhattan_distance([2, 2], [1, 7]))
print("(B)", manhattan_distance([5, 2], [12, 8]))
print("(C)", manhattan_distance([2, 2], [5, 2]))

"""
(A) 6
(B) 13
(C) 3
"""
■ Minkowski Distance
- A generalized metric form of Euclidean distance and Manhattan distance.
- The figure (not reproduced here) shows how distances between two objects with three variables are measured by Minkowski metrics of different orders.
- It is displayed in a coordinate system with x-, y- and z-axes.
from decimal import Decimal


def nth_root(value, n_root):
    """Return the n_root-th root of value, rounded to 3 decimal places.

    Args:
        value: Non-negative number to take the root of.
        n_root: Degree of the root (3 for a cube root, and so on).

    Returns:
        A Decimal quantized to three decimal places.

    Raises:
        ZeroDivisionError: If n_root is 0.
    """
    root_value = 1 / float(n_root)
    return round(Decimal(value) ** Decimal(root_value), 3)


def minkowski_distance(x, y, p_value):
    """Return the Minkowski distance of order p_value between two points.

    p_value is the order p of the metric: the distance is
    (sum(|a - b| ** p)) ** (1 / p), so p_value=1 gives the Manhattan
    distance and p_value=2 the Euclidean distance.

    Args:
        x: Sequence of numeric coordinates of the first point.
        y: Sequence of numeric coordinates of the second point.
        p_value: Order of the metric.

    Returns:
        The distance as a Decimal rounded to three decimal places.

    Raises:
        ValueError: If x and y do not have the same number of coordinates.
        ZeroDivisionError: If p_value is 0.
    """
    if len(x) != len(y):
        # zip() would silently ignore the surplus coordinates.
        raise ValueError("x and y must have the same length")
    # The ** operator is used rather than pow(): a wildcard import from math
    # would replace the built-in pow with the float-only math.pow.
    return nth_root(sum(abs(a - b) ** p_value for a, b in zip(x, y)), p_value)


# The third argument is p_value, the order of the metric (here p = 3).
print("(A)", minkowski_distance([2, 2], [1, 7], 3))
print("(B)", minkowski_distance([5, 2], [12, 8], 3))
print("(C)", minkowski_distance([2, 2], [5, 2], 3))

"""
(A) 5.013
(B) 8.238
(C) 3.000
"""
■ Cosine Similarity
- Cosine similarity metric finds the normalized dot product of the two attributes.
- Two vectors with the same orientation have a cosine similarity of 1,
two vectors at 90° have a similarity of 0,
and two vectors diametrically opposed have a similarity of -1.
from math import sqrt


def square_rooted(x, ndigits=3):
    """Return the Euclidean norm of x, rounded to ndigits decimal places.

    Args:
        x: Iterable of numeric components.
        ndigits: Number of decimal places to round to (3 by default).
            Pass None to get the unrounded norm.

    Returns:
        The square root of the sum of the squared components.
    """
    norm = sqrt(sum(a * a for a in x))
    return norm if ndigits is None else round(norm, ndigits)


def cosine_similarity(x, y):
    """Return the cosine similarity of two vectors, rounded to 3 decimals.

    Args:
        x: Sequence of numeric components of the first vector.
        y: Sequence of numeric components of the second vector.

    Returns:
        The dot product divided by the product of the two norms, rounded
        to three decimal places.

    Raises:
        ZeroDivisionError: If either vector has a norm of zero.
    """
    numerator = sum(a * b for a, b in zip(x, y))
    # Use the unrounded norms: rounding them first shifts the third decimal
    # of the result (0.975 instead of 0.974 for [2, 2] vs [8, 5]) and turns
    # the norm of a small vector into 0.0.
    denominator = square_rooted(x, ndigits=None) * square_rooted(y, ndigits=None)
    return round(numerator / denominator, 3)


print("(A)", cosine_similarity([2, 2], [1, 7]))
print("(B)", cosine_similarity([5, 2], [6, 6]))
print("(C)", cosine_similarity([2, 2], [8, 5]))

"""
(A) 0.8
(B) 0.919
(C) 0.974
"""
■ Jaccard Similarity
- So far the objects have been points or vectors; for Jaccard similarity the objects are sets.
- Sets:
The cardinality of a set A, denoted |A|, counts how many elements are in A.
- Intersection:
The intersection between two sets A and B is denoted A ∩ B and reveals all items which are in both sets A,B.
- Union:
Union between two sets A and B is denoted A ∪ B and reveals all items which are in either set.
def jaccard_similarity(x, y):
    """Return the Jaccard similarity of two collections treated as sets.

    Args:
        x: Iterable of hashable items; duplicates are ignored.
        y: Iterable of hashable items; duplicates are ignored.

    Returns:
        |X ∩ Y| / |X ∪ Y| as a float between 0.0 and 1.0. Two empty
        collections are treated as identical and give 1.0.
    """
    set_x, set_y = set(x), set(y)
    union_cardinality = len(set_x | set_y)
    if union_cardinality == 0:
        # Both sets are empty: by the usual convention they are identical,
        # and this avoids dividing by zero.
        return 1.0
    intersection_cardinality = len(set_x & set_y)
    return intersection_cardinality / float(union_cardinality)


print(jaccard_similarity([0, 1, 2, 5, 6], [0, 2, 3, 5, 7, 9]))

"""
0.375
"""
■ Similarity Class
########################################
### similaritymeasures.py
########################################
from math import sqrt
from decimal import Decimal


class Similarity:
    """Five similarity measures: Euclidean, Manhattan and Minkowski distance,
    cosine similarity and Jaccard similarity."""

    @staticmethod
    def _require_same_length(x, y):
        """Raise ValueError unless x and y have the same number of items."""
        if len(x) != len(y):
            # zip() would silently ignore the surplus coordinates.
            raise ValueError("x and y must have the same length")

    # euclidean distance
    def euclidean_distance(self, x, y):
        """Return the Euclidean distance between two lists.

        Raises:
            ValueError: If x and y differ in length.
        """
        self._require_same_length(x, y)
        # ** rather than pow(): a wildcard import from math would replace
        # the built-in pow with the float-only math.pow.
        return sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))

    # manhattan distance
    def manhattan_distance(self, x, y):
        """Return the Manhattan distance between two lists.

        Raises:
            ValueError: If x and y differ in length.
        """
        self._require_same_length(x, y)
        return sum(abs(a - b) for a, b in zip(x, y))

    # minkowski distance
    def nth_root(self, value, n_root):
        """Return the n_root-th root of value as a Decimal rounded to 3 places.

        Raises:
            ZeroDivisionError: If n_root is 0.
        """
        root_value = 1 / float(n_root)
        return round(Decimal(value) ** Decimal(root_value), 3)

    def minkowski_distance(self, x, y, p_value):
        """Return the Minkowski distance of order p_value between two lists.

        p_value=1 gives the Manhattan distance and p_value=2 the Euclidean
        distance. The result is a Decimal rounded to three decimal places.

        Raises:
            ValueError: If x and y differ in length.
            ZeroDivisionError: If p_value is 0.
        """
        self._require_same_length(x, y)
        total = sum(abs(a - b) ** p_value for a, b in zip(x, y))
        return self.nth_root(total, p_value)

    # cosine similarity
    def square_rooted(self, x, ndigits=3):
        """Return the Euclidean norm of x, rounded to ndigits decimal places.

        Pass ndigits=None to get the unrounded norm.
        """
        norm = sqrt(sum(a * a for a in x))
        return norm if ndigits is None else round(norm, ndigits)

    def cosine_similarity(self, x, y):
        """Return the cosine similarity between two lists, rounded to 3 places.

        Raises:
            ZeroDivisionError: If either list has a norm of zero.
        """
        numerator = sum(a * b for a, b in zip(x, y))
        # Use the unrounded norms: rounding them first shifts the third
        # decimal of the result and zeroes the norm of a small vector.
        denominator = (self.square_rooted(x, ndigits=None)
                       * self.square_rooted(y, ndigits=None))
        return round(numerator / denominator, 3)

    # jaccard similarity
    def jaccard_similarity(self, x, y):
        """Return the Jaccard similarity between two lists treated as sets.

        Two empty lists are treated as identical and give 1.0.
        """
        set_x, set_y = set(x), set(y)
        union_cardinality = len(set_x | set_y)
        if union_cardinality == 0:
            # Both sets are empty; avoid dividing by zero.
            return 1.0
        intersection_cardinality = len(set_x & set_y)
        return intersection_cardinality / float(union_cardinality)


########################################
### similarity.py
########################################
def main():
    """Create a Similarity instance and try out two of its measures."""
    # Imported inside main() so that loading this listing for the class
    # above does not require similaritymeasures.py to be on the path.
    from similaritymeasures import Similarity

    measures = Similarity()

    print(measures.euclidean_distance([0, 3, 4, 5], [7, 6, 3, -1]))
    print(measures.jaccard_similarity([0, 1, 2, 5, 6], [0, 2, 3, 5, 7, 9]))


if __name__ == "__main__":
    main()