반응형

/*******************************************************************************************************************
-- Title : [Py3.5] 5-most popular similarity measures implementation in python
-- Reference : dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/
-- Key word : 유사성 similarity measure 유사도 거리 euclidean distance manhattan distance cosine similarity
                  유클리디언 유클리디안 맨하탄 코사인 자카드 jaccard similarity minkowski distance
*******************************************************************************************************************/

■ Similarity Measure
    - The similarity measure is the measure of how much alike two data objects are.
    - Similarity measure is a distance with dimensions representing features of the objects.
    - The similarity is subjective and is highly dependent on the domain and application.
    - Similarity is measured in the range 0 to 1.
      * Similarity = 1 if X = Y    (Where X, Y are two objects)
      * Similarity = 0 if X != Y

      



■ Euclidean Distance
    - The most common use of distance.
    - Euclidean distance is also known as simply distance.
    - The Euclidean distance between two points is the length of the path connecting them.
    - The Pythagorean theorem gives this distance between two points.


1
2
3
4
5
6
7
8
9
10
11
12
13
from math import*
 
def euclidean_distance(x, y):
    """Return the Euclidean distance between two coordinate sequences.

    Coordinates are paired with ``zip``, so if the sequences differ in
    length the extra trailing values of the longer one are ignored.
    Two empty sequences give ``0.0``.
    """
    # Square each per-axis difference, add the squares, then take the root.
    return sqrt(sum(pow(a - b, 2) for a, b in zip(x, y)))
 
# Demo: print the Euclidean distance for three labelled point pairs.
for label, first, second in (
    ("(A)", [2, 2], [1, 7]),
    ("(B)", [5, 2], [12, 8]),
    ("(C)", [2, 2], [5, 2]),
):
    print(label, euclidean_distance(first, second))
# Expected output:
#   (A) 5.0990195135927845
#   (B) 9.219544457292887
#   (C) 3.0
 
 


■ Manhattan Distance
    - The sum of the absolute differences of their Cartesian coordinates.
    - It is the total of the absolute difference between the x-coordinates and the absolute difference between the y-coordinates.
    - Manhattan distance = |x1 - x2| + |y1 - y2|

1
2
3
4
5
6
7
8
9
10
11
12
13
from math import *
 
def manhattan_distance(x, y):
    """Return the Manhattan (city-block) distance between two sequences.

    Coordinates are paired with ``zip``; the result is the running total of
    the absolute per-axis differences, starting from 0.
    """
    total = 0
    for a, b in zip(x, y):
        total += abs(a - b)
    return total
 
# Demo: print the Manhattan distance for three labelled point pairs.
for label, first, second in (
    ("(A)", [2, 2], [1, 7]),
    ("(B)", [5, 2], [12, 8]),
    ("(C)", [2, 2], [5, 2]),
):
    print(label, manhattan_distance(first, second))
# Expected output:
#   (A) 6
#   (B) 13
#   (C) 3
 
 


■ Minkowski Distance
    - A generalized metric form of Euclidean distance and Manhattan distance.
    - Minkowski metrics of different orders measure the distance between two objects in different ways; the example considered here is two objects with three variables.
    - In the original article's image, this is displayed in a coordinate system with x-, y- and z-axes.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from math import *
from decimal import Decimal
 
def nth_root(value, n_root):
    """Return the ``n_root``-th root of ``value``, rounded to 3 decimal places.

    The exponent ``1 / n_root`` is computed as a float and then converted to
    ``Decimal``; the result is a ``Decimal``.
    """
    reciprocal = 1 / float(n_root)
    base = Decimal(value)
    return round(base ** Decimal(reciprocal), 3)
 
def minkowski_distance(x, y, p_value):
    """Return the Minkowski distance of order ``p_value`` between two sequences.

    Each absolute per-axis difference is raised to ``p_value``, the powers are
    summed, and ``nth_root`` takes the ``p_value``-th root of that sum.
    """
    powered_gaps = (pow(abs(a - b), p_value) for a, b in zip(x, y))
    return nth_root(sum(powered_gaps), p_value)
 
# Demo: order-3 Minkowski distance for three labelled point pairs.
# p_value is the order of the metric: it is both the power applied to each
# absolute difference and the root taken of their sum.
for label, first, second in (
    ("(A)", [2, 2], [1, 7]),
    ("(B)", [5, 2], [12, 8]),
    ("(C)", [2, 2], [5, 2]),
):
    print(label, minkowski_distance(first, second, 3))
# Expected output:
#   (A) 5.013
#   (B) 8.238
#   (C) 3.000
 
 




■ Cosine Similarity

    - Cosine similarity metric finds the normalized dot product of the two attributes.
    - Two vectors with the same orientation have a cosine similarity of 1
        two vectors at 90° have a similarity of 0
        and two vectors diametrically opposed have a similarity of -1.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
from math import *
 
def square_rooted(x):
    """Return the square root of the sum of squares of ``x``, rounded to 3 places."""
    sum_of_squares = sum(a * a for a in x)
    return round(sqrt(sum_of_squares), 3)
 
def cosine_similarity(x, y):
    """Return the cosine similarity of two sequences, rounded to 3 places.

    The dot product of the zipped pairs is divided by the product of the two
    ``square_rooted`` magnitudes. A zero magnitude makes the division raise
    ``ZeroDivisionError``.
    """
    dot_product = 0
    for a, b in zip(x, y):
        dot_product += a * b

    magnitude_product = square_rooted(x) * square_rooted(y)
    return round(dot_product / float(magnitude_product), 3)
 
# Demo: print the cosine similarity for three labelled vector pairs.
print ("(A)", cosine_similarity([2,2], [1,7]))
print ("(B)", cosine_similarity([5,2], [6,6]))
print ("(C)", cosine_similarity([2,2], [8,5]))
# Expected output (each magnitude is rounded to 3 places before dividing):
#   (A) 0.8      16 / (2.828 * 7.071)
#   (B) 0.919    42 / (5.385 * 8.485)
#   (C) 0.975    26 / (2.828 * 9.434)
 
 



■ Jaccard Similarity

    - Whereas the objects considered so far are points or vectors, for Jaccard similarity the objects are sets.
    - Sets (cardinality):
        The cardinality of a set A, denoted |A|, counts how many elements are in A.
    - Intersection:
        The intersection between two sets A and B is denoted A ∩ B and reveals all items which are in both sets A,B.
    - Union:
        Union between two sets A and B is denoted A ∪ B and reveals all items which are in either set.


1
2
3
4
5
6
7
8
9
10
11
from math import *
 
def jaccard_similarity(x, y):
    """Return the Jaccard similarity of two iterables treated as sets.

    The result is ``len(A & B) / len(A | B)`` as a float, where ``A`` and
    ``B`` are the sets of distinct items in ``x`` and ``y``.

    Raises:
        ZeroDivisionError: if both inputs are empty (the union is empty).
    """
    # Build each set exactly once so one-shot iterators (generators, iter())
    # are not already exhausted when the union is computed.
    set_x, set_y = set(x), set(y)
    intersection_cardinality = len(set_x & set_y)
    union_cardinality = len(set_x | set_y)
    return intersection_cardinality / float(union_cardinality)
 
# Demo: the two lists share 3 of their 8 distinct values.
score = jaccard_similarity([0, 1, 2, 5, 6], [0, 2, 3, 5, 7, 9])
print(score)
# Expected output:
#   0.375
 
 



■ Similarity Class

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
########################################
### similaritymeasures.py
########################################
from math import *
from decimal import Decimal
 
 
class Similarity():
    """Five similarity / distance measures grouped as methods.

    The methods read no instance state. Sequence arguments are paired with
    ``zip``, so extra trailing values of a longer sequence are ignored.
    """

    # euclidean distance
    def euclidean_distance(self, x, y):
        """Return the Euclidean distance between two sequences."""
        # Square each per-axis difference, add the squares, take the root.
        return sqrt(sum(pow(a - b, 2) for a, b in zip(x, y)))

    # manhattan distance
    def manhattan_distance(self, x, y):
        """Return the Manhattan distance between two sequences."""
        return sum(abs(a - b) for a, b in zip(x, y))

    # minkowski distance
    def nth_root(self, value, n_root):
        """Return the ``n_root``-th root of ``value`` as a Decimal rounded to 3 places."""
        root_value = 1 / float(n_root)
        return round(Decimal(value) ** Decimal(root_value), 3)

    def minkowski_distance(self, x, y, p_value):
        """Return the Minkowski distance of order ``p_value`` between two sequences.

        The result comes from ``nth_root`` and is therefore a Decimal rounded
        to 3 places.
        """
        powered_sum = sum(pow(abs(a - b), p_value) for a, b in zip(x, y))
        return self.nth_root(powered_sum, p_value)

    # cosine similarity
    def square_rooted(self, x):
        """Return the square root of the sum of squares of ``x``, rounded to 3 places."""
        return round(sqrt(sum(a * a for a in x)), 3)

    def cosine_similarity(self, x, y):
        """Return the cosine similarity of two sequences, rounded to 3 places.

        Raises:
            ZeroDivisionError: if either rounded magnitude is zero.
        """
        numerator = sum(a * b for a, b in zip(x, y))
        denominator = self.square_rooted(x) * self.square_rooted(y)
        return round(numerator / float(denominator), 3)

    # jaccard similarity
    def jaccard_similarity(self, x, y):
        """Return the Jaccard similarity of two iterables treated as sets.

        Raises:
            ZeroDivisionError: if both inputs are empty (the union is empty).
        """
        # Build each set exactly once so one-shot iterators are not already
        # exhausted when the union is computed.
        set_x, set_y = set(x), set(y)
        intersection_cardinality = len(set_x & set_y)
        union_cardinality = len(set_x | set_y)
        return intersection_cardinality / float(union_cardinality)
    
    
########################################
### similarity.py
########################################
from similaritymeasures import Similarity
 
def main():
    """Create a Similarity instance and print two sample measures."""

    measures = Similarity()

    # Two four-dimensional points; prints sqrt(95), about 9.7468.
    print(measures.euclidean_distance([0, 3, 4, 5], [7, 6, 3, -1]))
    # The lists share 3 of their 8 distinct values; prints 0.375.
    print(measures.jaccard_similarity([0, 1, 2, 5, 6], [0, 2, 3, 5, 7, 9]))


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
 
 


반응형

+ Recent posts