-
Notifications
You must be signed in to change notification settings - Fork 9
/
similarity.go
109 lines (90 loc) · 2.28 KB
/
similarity.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
package simhash
import (
"hash/fnv"
"fmt"
"strings"
)
// SIMILAR_DISTANCE is the exported constant 3. None of the code visible in
// this file reads it.
// NOTE(review): presumably the largest Hamming distance at which two
// fingerprints are still treated as similar — confirm against callers.
const SIMILAR_DISTANCE = 3
// WordWeight pairs a word with the weight it contributes to a fingerprint.
type WordWeight struct {
	// Word is the text that simhashFingerPrint hashes via strHashBitCode.
	Word string
	// Weight is the amount added to (hash bit is 1) or subtracted from
	// (hash bit is 0) each fingerprint position for this word.
	Weight float64
}
// SimHashSimilar computes a fingerprint for each of the two weighted word
// lists with simhashFingerPrint and returns the Hamming distance between
// them, i.e. the number of fingerprint positions at which the two differ.
//
// Parameters:
//   - srcWordWeighs:  weighted words of the first input.
//   - dstWordWeights: weighted words of the second input.
//
// Returns the distance and a nil error on success. If either fingerprint
// cannot be computed, the error from simhashFingerPrint is returned as-is and
// distance is 0.
//
// The function writes nothing to stdout; earlier versions printed both
// fingerprints on every call, which is debug output a library must not emit.
func SimHashSimilar(srcWordWeighs, dstWordWeights []WordWeight) (distance int, err error) {
	srcFingerPrint, err := simhashFingerPrint(srcWordWeighs)
	if err != nil {
		return 0, err
	}

	dstFingerPrint, err := simhashFingerPrint(dstWordWeights)
	if err != nil {
		return 0, err
	}

	return hammingDistance(srcFingerPrint, dstFingerPrint), nil
}
// simhashFingerPrint reduces a list of weighted words to a 32-position
// fingerprint made of the strings "1" and "0".
//
// For every word it obtains the word's bit string from strHashBitCode, turns
// it into per-position signed weights with calcWithWeight, and adds those
// element-wise into a running total. A position whose final total is strictly
// positive becomes "1"; every other position becomes "0". An empty input
// therefore yields 32 "0" entries.
//
// If sliceInnerPlus reports an error, that error is returned together with a
// nil fingerprint.
func simhashFingerPrint(wordWeights []WordWeight) (fingerPrint []string, err error) {
	totals := make([]float64, 32)
	for i := range wordWeights {
		entry := wordWeights[i]
		// Array holding the product of each bit of the hash with the weight.
		contribution := calcWithWeight(strHashBitCode(entry.Word), entry.Weight)
		totals, err = sliceInnerPlus(totals, contribution)
		if err != nil {
			return nil, err
		}
	}

	bits := make([]string, len(totals))
	for i, total := range totals {
		bits[i] = "0"
		if total > 0 {
			bits[i] = "1"
		}
	}
	return bits, nil
}
// strHashBitCode hashes str with 32-bit FNV-1a and returns the hash written
// in base 2, left-padded with zeros to exactly 32 characters (most
// significant bit first).
func strHashBitCode(str string) string {
	hasher := fnv.New32a()
	// Per the hash.Hash contract, Write never returns an error.
	_, _ = hasher.Write([]byte(str))
	return fmt.Sprintf("%032b", hasher.Sum32())
}
// calcWithWeight converts a bit string into signed weights, one per
// character of bitHash: a "0" character yields -weight, and any other
// character yields +weight. The result has the same number of entries as
// bitHash has characters; an empty bitHash yields an empty slice.
func calcWithWeight(bitHash string, weight float64) []float64 {
	chars := strings.Split(bitHash, "")
	signed := make([]float64, len(chars))
	for i := range chars {
		switch chars[i] {
		case "0":
			signed[i] = float64(-1) * weight
		default:
			signed[i] = weight
		}
	}
	return signed
}
// sliceInnerPlus returns the element-wise sum of arr1 and arr2 as a newly
// allocated slice; neither input is modified.
//
// It returns an error, and a nil result, when either slice is nil or when the
// two slices differ in length. Inputs are validated before anything is
// allocated, so a failed call never hands back a partially meaningful slice
// alongside the error.
func sliceInnerPlus(arr1, arr2 []float64) (dstArr []float64, err error) {
	if arr1 == nil || arr2 == nil {
		return nil, fmt.Errorf("sliceInnerPlus array nil")
	}
	if len(arr1) != len(arr2) {
		return nil, fmt.Errorf("sliceInnerPlus array Length NOT match, %v != %v", len(arr1), len(arr2))
	}

	dstArr = make([]float64, len(arr1))
	for i := range arr1 {
		dstArr[i] = arr1[i] + arr2[i]
	}
	return dstArr, nil
}
// hammingDistance returns the number of positions at which arr1 and arr2
// hold different strings.
//
// When the slices differ in length, every position that exists in only one of
// them counts as a difference, so the result is symmetric in its arguments
// and the function never indexes past the end of the shorter slice. For
// equal-length inputs this is the ordinary Hamming distance.
func hammingDistance(arr1, arr2 []string) int {
	shorter, longer := arr1, arr2
	if len(shorter) > len(longer) {
		shorter, longer = longer, shorter
	}

	// Trailing positions of the longer slice have no counterpart to match.
	count := len(longer) - len(shorter)
	for i, v := range shorter {
		if v != longer[i] {
			count++
		}
	}
	return count
}