The following is a SimHash algorithm implemented in Python. My problem is that I can't understand parts of the code below.
Could an expert please help by explaining the places I have marked?
#!/usr/bin/python
# coding=utf-8
class simhash:
    """SimHash fingerprint of a sequence of string tokens.

    Every token is hashed to a ``hashbits``-wide integer.  For each bit
    position the token hashes "vote": a set bit counts +1, a clear bit
    counts -1.  The fingerprint has a 1 in every position whose vote total
    is non-negative.  Token lists that share most of their tokens therefore
    end up with fingerprints that differ in only a few bits.

    Attributes:
        hashbits: Width of the fingerprint in bits.
        hash: The fingerprint, a non-negative integer.
    """

    def __init__(self, tokens="", hashbits=128):
        """Compute and store the fingerprint of *tokens*.

        Args:
            tokens: Iterable of strings.  A plain string is iterated
                character by character.  With no tokens at all, every vote
                total is 0, so every bit of the fingerprint is set.
            hashbits: Number of bits in the fingerprint.
        """
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def __str__(self):
        """Return the fingerprint as a decimal string."""
        return str(self.hash)

    def simhash(self, tokens):
        """Return the SimHash fingerprint of *tokens* as an integer.

        Args:
            tokens: Iterable of strings to fingerprint.

        Returns:
            An integer in which bit ``i`` is 1 when at least as many token
            hashes have bit ``i`` set as have it clear (ties count as 1).
        """
        # votes[i] accumulates the "opinion" of all tokens about bit i.
        votes = [0] * self.hashbits
        for token in tokens:
            token_hash = self._string_hash(token)
            for i in range(self.hashbits):
                # Test bit i of this token's hash: +1 if set, -1 if clear.
                if token_hash & (1 << i):
                    votes[i] += 1
                else:
                    votes[i] -= 1

        # Collapse the vote totals into the final bit pattern.
        fingerprint = 0
        for i, vote in enumerate(votes):
            if vote >= 0:
                fingerprint |= 1 << i
        return fingerprint

    def hamming_distance(self, other):
        """Return the number of bit positions where the fingerprints differ.

        Only the lowest ``self.hashbits`` bits are compared.

        Args:
            other: Another object with an integer ``hash`` attribute.
        """
        # XOR leaves a 1 exactly where the two fingerprints disagree.
        diff = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        return bin(diff).count("1")

    def similarity(self, other):
        """Return the ratio of the smaller fingerprint value to the larger.

        The result lies in ``[0.0, 1.0]``; identical fingerprints (including
        two zero fingerprints) give ``1.0``.

        NOTE: this compares the fingerprints as magnitudes, not bit by bit;
        use ``hamming_distance`` for a bit-level comparison.

        Args:
            other: Another object with an integer ``hash`` attribute.
        """
        a = float(self.hash)
        b = float(other.hash)
        if a == b:
            # Also covers a == b == 0, which would otherwise divide by zero.
            return 1.0
        return min(a, b) / max(a, b)

    def _string_hash(self, source):
        """Hash the string *source* to a non-negative integer.

        The scheme (seed ``ord(first char) << 7``, multiplier 1000003, final
        XOR with the length) appears to be modeled on the classic CPython 2
        string hash, with the running value truncated to ``hashbits`` bits
        instead of a machine word.

        Args:
            source: The token to hash.

        Returns:
            0 for the empty string, otherwise the computed hash.
        """
        if source == "":
            return 0
        # Seed with the first character shifted left so that short strings
        # already spread over more than the low 7 bits.
        x = ord(source[0]) << 7
        multiplier = 1000003
        mask = 2 ** self.hashbits - 1
        for c in source:
            # Multiply-then-XOR mixes each character in; the mask keeps the
            # running value within hashbits bits.
            x = ((x * multiplier) ^ ord(c)) & mask
        # Mixing in the length separates strings that otherwise collide.
        # After masking x is never negative, so no remapping of -1 (an error
        # sentinel in C hash functions) is needed here.
        x ^= len(source)
        return x
if __name__ == "__main__":
    # Demo: fingerprint three sentences and compare the first with the others.
    s = "This is a test string for testing"
    hash1 = simhash(s.split())  # tokens are the whitespace-separated words

    # Differs from the first sentence by one extra word.
    s = "This is a test string for testing also"
    hash2 = simhash(s.split())

    # Shares no words with the first sentence.
    s = "nai nai ge xiong cao"
    hash3 = simhash(s.split())

    print(hash1, hash2, hash3)
    print(hash1.hamming_distance(hash2), " ", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), " ", hash1.similarity(hash3))