Ingen beskrivning
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

pymmh3.py 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
'''
pymmh3 was written by Fredrik Kihlander and enhanced by Swapnil Gusani, and is placed in the public
domain. The authors hereby disclaim copyright to this source code.

Pure Python implementation of the murmur3 hash algorithm:
https://code.google.com/p/smhasher/wiki/MurmurHash3

This was written for the times when you do not want to compile C code and install modules,
and you only want a drop-in murmur3 implementation.

As this is pure Python it is FAR from performant, and if performance is needed at all,
a proper C module is suggested!

This module is written to have the same format as the mmh3 Python package, found here, for simple conversions:
https://pypi.python.org/pypi/mmh3/2.3.1
'''
  13. import sys as _sys
  14. if (_sys.version_info > (3, 0)):
  15. def xrange( a, b, c ):
  16. return range( a, b, c )
  17. def xencode(x):
  18. if isinstance(x, bytes) or isinstance(x, bytearray):
  19. return x
  20. else:
  21. return x.encode()
  22. else:
  23. def xencode(x):
  24. return x
  25. del _sys
  26. def hash( key, seed = 0x0 ):
  27. ''' Implements 32bit murmur3 hash. '''
  28. key = bytearray( xencode(key) )
  29. def fmix( h ):
  30. h ^= h >> 16
  31. h = ( h * 0x85ebca6b ) & 0xFFFFFFFF
  32. h ^= h >> 13
  33. h = ( h * 0xc2b2ae35 ) & 0xFFFFFFFF
  34. h ^= h >> 16
  35. return h
  36. length = len( key )
  37. nblocks = int( length / 4 )
  38. h1 = seed
  39. c1 = 0xcc9e2d51
  40. c2 = 0x1b873593
  41. # body
  42. for block_start in xrange( 0, nblocks * 4, 4 ):
  43. # ??? big endian?
  44. k1 = key[ block_start + 3 ] << 24 | \
  45. key[ block_start + 2 ] << 16 | \
  46. key[ block_start + 1 ] << 8 | \
  47. key[ block_start + 0 ]
  48. k1 = ( c1 * k1 ) & 0xFFFFFFFF
  49. k1 = ( k1 << 15 | k1 >> 17 ) & 0xFFFFFFFF # inlined ROTL32
  50. k1 = ( c2 * k1 ) & 0xFFFFFFFF
  51. h1 ^= k1
  52. h1 = ( h1 << 13 | h1 >> 19 ) & 0xFFFFFFFF # inlined ROTL32
  53. h1 = ( h1 * 5 + 0xe6546b64 ) & 0xFFFFFFFF
  54. # tail
  55. tail_index = nblocks * 4
  56. k1 = 0
  57. tail_size = length & 3
  58. if tail_size >= 3:
  59. k1 ^= key[ tail_index + 2 ] << 16
  60. if tail_size >= 2:
  61. k1 ^= key[ tail_index + 1 ] << 8
  62. if tail_size >= 1:
  63. k1 ^= key[ tail_index + 0 ]
  64. if tail_size > 0:
  65. k1 = ( k1 * c1 ) & 0xFFFFFFFF
  66. k1 = ( k1 << 15 | k1 >> 17 ) & 0xFFFFFFFF # inlined ROTL32
  67. k1 = ( k1 * c2 ) & 0xFFFFFFFF
  68. h1 ^= k1
  69. #finalization
  70. unsigned_val = fmix( h1 ^ length )
  71. if unsigned_val & 0x80000000 == 0:
  72. return unsigned_val
  73. else:
  74. return -( (unsigned_val ^ 0xFFFFFFFF) + 1 )
  75. def hash128( key, seed = 0x0, x64arch = True ):
  76. ''' Implements 128bit murmur3 hash. '''
  77. def hash128_x64( key, seed ):
  78. ''' Implements 128bit murmur3 hash for x64. '''
  79. def fmix( k ):
  80. k ^= k >> 33
  81. k = ( k * 0xff51afd7ed558ccd ) & 0xFFFFFFFFFFFFFFFF
  82. k ^= k >> 33
  83. k = ( k * 0xc4ceb9fe1a85ec53 ) & 0xFFFFFFFFFFFFFFFF
  84. k ^= k >> 33
  85. return k
  86. length = len( key )
  87. nblocks = int( length / 16 )
  88. h1 = seed
  89. h2 = seed
  90. c1 = 0x87c37b91114253d5
  91. c2 = 0x4cf5ad432745937f
  92. #body
  93. for block_start in xrange( 0, nblocks * 8, 8 ):
  94. # ??? big endian?
  95. k1 = key[ 2 * block_start + 7 ] << 56 | \
  96. key[ 2 * block_start + 6 ] << 48 | \
  97. key[ 2 * block_start + 5 ] << 40 | \
  98. key[ 2 * block_start + 4 ] << 32 | \
  99. key[ 2 * block_start + 3 ] << 24 | \
  100. key[ 2 * block_start + 2 ] << 16 | \
  101. key[ 2 * block_start + 1 ] << 8 | \
  102. key[ 2 * block_start + 0 ]
  103. k2 = key[ 2 * block_start + 15 ] << 56 | \
  104. key[ 2 * block_start + 14 ] << 48 | \
  105. key[ 2 * block_start + 13 ] << 40 | \
  106. key[ 2 * block_start + 12 ] << 32 | \
  107. key[ 2 * block_start + 11 ] << 24 | \
  108. key[ 2 * block_start + 10 ] << 16 | \
  109. key[ 2 * block_start + 9 ] << 8 | \
  110. key[ 2 * block_start + 8 ]
  111. k1 = ( c1 * k1 ) & 0xFFFFFFFFFFFFFFFF
  112. k1 = ( k1 << 31 | k1 >> 33 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
  113. k1 = ( c2 * k1 ) & 0xFFFFFFFFFFFFFFFF
  114. h1 ^= k1
  115. h1 = ( h1 << 27 | h1 >> 37 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
  116. h1 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
  117. h1 = ( h1 * 5 + 0x52dce729 ) & 0xFFFFFFFFFFFFFFFF
  118. k2 = ( c2 * k2 ) & 0xFFFFFFFFFFFFFFFF
  119. k2 = ( k2 << 33 | k2 >> 31 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
  120. k2 = ( c1 * k2 ) & 0xFFFFFFFFFFFFFFFF
  121. h2 ^= k2
  122. h2 = ( h2 << 31 | h2 >> 33 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
  123. h2 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
  124. h2 = ( h2 * 5 + 0x38495ab5 ) & 0xFFFFFFFFFFFFFFFF
  125. #tail
  126. tail_index = nblocks * 16
  127. k1 = 0
  128. k2 = 0
  129. tail_size = length & 15
  130. if tail_size >= 15:
  131. k2 ^= key[ tail_index + 14 ] << 48
  132. if tail_size >= 14:
  133. k2 ^= key[ tail_index + 13 ] << 40
  134. if tail_size >= 13:
  135. k2 ^= key[ tail_index + 12 ] << 32
  136. if tail_size >= 12:
  137. k2 ^= key[ tail_index + 11 ] << 24
  138. if tail_size >= 11:
  139. k2 ^= key[ tail_index + 10 ] << 16
  140. if tail_size >= 10:
  141. k2 ^= key[ tail_index + 9 ] << 8
  142. if tail_size >= 9:
  143. k2 ^= key[ tail_index + 8 ]
  144. if tail_size > 8:
  145. k2 = ( k2 * c2 ) & 0xFFFFFFFFFFFFFFFF
  146. k2 = ( k2 << 33 | k2 >> 31 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
  147. k2 = ( k2 * c1 ) & 0xFFFFFFFFFFFFFFFF
  148. h2 ^= k2
  149. if tail_size >= 8:
  150. k1 ^= key[ tail_index + 7 ] << 56
  151. if tail_size >= 7:
  152. k1 ^= key[ tail_index + 6 ] << 48
  153. if tail_size >= 6:
  154. k1 ^= key[ tail_index + 5 ] << 40
  155. if tail_size >= 5:
  156. k1 ^= key[ tail_index + 4 ] << 32
  157. if tail_size >= 4:
  158. k1 ^= key[ tail_index + 3 ] << 24
  159. if tail_size >= 3:
  160. k1 ^= key[ tail_index + 2 ] << 16
  161. if tail_size >= 2:
  162. k1 ^= key[ tail_index + 1 ] << 8
  163. if tail_size >= 1:
  164. k1 ^= key[ tail_index + 0 ]
  165. if tail_size > 0:
  166. k1 = ( k1 * c1 ) & 0xFFFFFFFFFFFFFFFF
  167. k1 = ( k1 << 31 | k1 >> 33 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
  168. k1 = ( k1 * c2 ) & 0xFFFFFFFFFFFFFFFF
  169. h1 ^= k1
  170. #finalization
  171. h1 ^= length
  172. h2 ^= length
  173. h1 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
  174. h2 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
  175. h1 = fmix( h1 )
  176. h2 = fmix( h2 )
  177. h1 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
  178. h2 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
  179. return ( h2 << 64 | h1 )
  180. def hash128_x86( key, seed ):
  181. ''' Implements 128bit murmur3 hash for x86. '''
  182. def fmix( h ):
  183. h ^= h >> 16
  184. h = ( h * 0x85ebca6b ) & 0xFFFFFFFF
  185. h ^= h >> 13
  186. h = ( h * 0xc2b2ae35 ) & 0xFFFFFFFF
  187. h ^= h >> 16
  188. return h
  189. length = len( key )
  190. nblocks = int( length / 16 )
  191. h1 = seed
  192. h2 = seed
  193. h3 = seed
  194. h4 = seed
  195. c1 = 0x239b961b
  196. c2 = 0xab0e9789
  197. c3 = 0x38b34ae5
  198. c4 = 0xa1e38b93
  199. #body
  200. for block_start in xrange( 0, nblocks * 16, 16 ):
  201. k1 = key[ block_start + 3 ] << 24 | \
  202. key[ block_start + 2 ] << 16 | \
  203. key[ block_start + 1 ] << 8 | \
  204. key[ block_start + 0 ]
  205. k2 = key[ block_start + 7 ] << 24 | \
  206. key[ block_start + 6 ] << 16 | \
  207. key[ block_start + 5 ] << 8 | \
  208. key[ block_start + 4 ]
  209. k3 = key[ block_start + 11 ] << 24 | \
  210. key[ block_start + 10 ] << 16 | \
  211. key[ block_start + 9 ] << 8 | \
  212. key[ block_start + 8 ]
  213. k4 = key[ block_start + 15 ] << 24 | \
  214. key[ block_start + 14 ] << 16 | \
  215. key[ block_start + 13 ] << 8 | \
  216. key[ block_start + 12 ]
  217. k1 = ( c1 * k1 ) & 0xFFFFFFFF
  218. k1 = ( k1 << 15 | k1 >> 17 ) & 0xFFFFFFFF # inlined ROTL32
  219. k1 = ( c2 * k1 ) & 0xFFFFFFFF
  220. h1 ^= k1
  221. h1 = ( h1 << 19 | h1 >> 13 ) & 0xFFFFFFFF # inlined ROTL32
  222. h1 = ( h1 + h2 ) & 0xFFFFFFFF
  223. h1 = ( h1 * 5 + 0x561ccd1b ) & 0xFFFFFFFF
  224. k2 = ( c2 * k2 ) & 0xFFFFFFFF
  225. k2 = ( k2 << 16 | k2 >> 16 ) & 0xFFFFFFFF # inlined ROTL32
  226. k2 = ( c3 * k2 ) & 0xFFFFFFFF
  227. h2 ^= k2
  228. h2 = ( h2 << 17 | h2 >> 15 ) & 0xFFFFFFFF # inlined ROTL32
  229. h2 = ( h2 + h3 ) & 0xFFFFFFFF
  230. h2 = ( h2 * 5 + 0x0bcaa747 ) & 0xFFFFFFFF
  231. k3 = ( c3 * k3 ) & 0xFFFFFFFF
  232. k3 = ( k3 << 17 | k3 >> 15 ) & 0xFFFFFFFF # inlined ROTL32
  233. k3 = ( c4 * k3 ) & 0xFFFFFFFF
  234. h3 ^= k3
  235. h3 = ( h3 << 15 | h3 >> 17 ) & 0xFFFFFFFF # inlined ROTL32
  236. h3 = ( h3 + h4 ) & 0xFFFFFFFF
  237. h3 = ( h3 * 5 + 0x96cd1c35 ) & 0xFFFFFFFF
  238. k4 = ( c4 * k4 ) & 0xFFFFFFFF
  239. k4 = ( k4 << 18 | k4 >> 14 ) & 0xFFFFFFFF # inlined ROTL32
  240. k4 = ( c1 * k4 ) & 0xFFFFFFFF
  241. h4 ^= k4
  242. h4 = ( h4 << 13 | h4 >> 19 ) & 0xFFFFFFFF # inlined ROTL32
  243. h4 = ( h1 + h4 ) & 0xFFFFFFFF
  244. h4 = ( h4 * 5 + 0x32ac3b17 ) & 0xFFFFFFFF
  245. #tail
  246. tail_index = nblocks * 16
  247. k1 = 0
  248. k2 = 0
  249. k3 = 0
  250. k4 = 0
  251. tail_size = length & 15
  252. if tail_size >= 15:
  253. k4 ^= key[ tail_index + 14 ] << 16
  254. if tail_size >= 14:
  255. k4 ^= key[ tail_index + 13 ] << 8
  256. if tail_size >= 13:
  257. k4 ^= key[ tail_index + 12 ]
  258. if tail_size > 12:
  259. k4 = ( k4 * c4 ) & 0xFFFFFFFF
  260. k4 = ( k4 << 18 | k4 >> 14 ) & 0xFFFFFFFF # inlined ROTL32
  261. k4 = ( k4 * c1 ) & 0xFFFFFFFF
  262. h4 ^= k4
  263. if tail_size >= 12:
  264. k3 ^= key[ tail_index + 11 ] << 24
  265. if tail_size >= 11:
  266. k3 ^= key[ tail_index + 10 ] << 16
  267. if tail_size >= 10:
  268. k3 ^= key[ tail_index + 9 ] << 8
  269. if tail_size >= 9:
  270. k3 ^= key[ tail_index + 8 ]
  271. if tail_size > 8:
  272. k3 = ( k3 * c3 ) & 0xFFFFFFFF
  273. k3 = ( k3 << 17 | k3 >> 15 ) & 0xFFFFFFFF # inlined ROTL32
  274. k3 = ( k3 * c4 ) & 0xFFFFFFFF
  275. h3 ^= k3
  276. if tail_size >= 8:
  277. k2 ^= key[ tail_index + 7 ] << 24
  278. if tail_size >= 7:
  279. k2 ^= key[ tail_index + 6 ] << 16
  280. if tail_size >= 6:
  281. k2 ^= key[ tail_index + 5 ] << 8
  282. if tail_size >= 5:
  283. k2 ^= key[ tail_index + 4 ]
  284. if tail_size > 4:
  285. k2 = ( k2 * c2 ) & 0xFFFFFFFF
  286. k2 = ( k2 << 16 | k2 >> 16 ) & 0xFFFFFFFF # inlined ROTL32
  287. k2 = ( k2 * c3 ) & 0xFFFFFFFF
  288. h2 ^= k2
  289. if tail_size >= 4:
  290. k1 ^= key[ tail_index + 3 ] << 24
  291. if tail_size >= 3:
  292. k1 ^= key[ tail_index + 2 ] << 16
  293. if tail_size >= 2:
  294. k1 ^= key[ tail_index + 1 ] << 8
  295. if tail_size >= 1:
  296. k1 ^= key[ tail_index + 0 ]
  297. if tail_size > 0:
  298. k1 = ( k1 * c1 ) & 0xFFFFFFFF
  299. k1 = ( k1 << 15 | k1 >> 17 ) & 0xFFFFFFFF # inlined ROTL32
  300. k1 = ( k1 * c2 ) & 0xFFFFFFFF
  301. h1 ^= k1
  302. #finalization
  303. h1 ^= length
  304. h2 ^= length
  305. h3 ^= length
  306. h4 ^= length
  307. h1 = ( h1 + h2 ) & 0xFFFFFFFF
  308. h1 = ( h1 + h3 ) & 0xFFFFFFFF
  309. h1 = ( h1 + h4 ) & 0xFFFFFFFF
  310. h2 = ( h1 + h2 ) & 0xFFFFFFFF
  311. h3 = ( h1 + h3 ) & 0xFFFFFFFF
  312. h4 = ( h1 + h4 ) & 0xFFFFFFFF
  313. h1 = fmix( h1 )
  314. h2 = fmix( h2 )
  315. h3 = fmix( h3 )
  316. h4 = fmix( h4 )
  317. h1 = ( h1 + h2 ) & 0xFFFFFFFF
  318. h1 = ( h1 + h3 ) & 0xFFFFFFFF
  319. h1 = ( h1 + h4 ) & 0xFFFFFFFF
  320. h2 = ( h1 + h2 ) & 0xFFFFFFFF
  321. h3 = ( h1 + h3 ) & 0xFFFFFFFF
  322. h4 = ( h1 + h4 ) & 0xFFFFFFFF
  323. return ( h4 << 96 | h3 << 64 | h2 << 32 | h1 )
  324. key = bytearray( xencode(key) )
  325. if x64arch:
  326. return hash128_x64( key, seed )
  327. else:
  328. return hash128_x86( key, seed )
  329. def hash64( key, seed = 0x0, x64arch = True ):
  330. ''' Implements 64bit murmur3 hash. Returns a tuple. '''
  331. hash_128 = hash128( key, seed, x64arch )
  332. unsigned_val1 = hash_128 & 0xFFFFFFFFFFFFFFFF
  333. if unsigned_val1 & 0x8000000000000000 == 0:
  334. signed_val1 = unsigned_val1
  335. else:
  336. signed_val1 = -( (unsigned_val1 ^ 0xFFFFFFFFFFFFFFFF) + 1 )
  337. unsigned_val2 = ( hash_128 >> 64 ) & 0xFFFFFFFFFFFFFFFF
  338. if unsigned_val2 & 0x8000000000000000 == 0:
  339. signed_val2 = unsigned_val2
  340. else:
  341. signed_val2 = -( (unsigned_val2 ^ 0xFFFFFFFFFFFFFFFF) + 1 )
  342. return ( int( signed_val1 ), int( signed_val2 ) )
  343. def hash_bytes( key, seed = 0x0, x64arch = True ):
  344. ''' Implements 128bit murmur3 hash. Returns a byte string. '''
  345. hash_128 = hash128( key, seed, x64arch )
  346. bytestring = ''
  347. for i in xrange(0, 16, 1):
  348. lsbyte = hash_128 & 0xFF
  349. bytestring = bytestring + str( chr( lsbyte ) )
  350. hash_128 = hash_128 >> 8
  351. return bytestring
  352. if __name__ == "__main__":
  353. import argparse
  354. parser = argparse.ArgumentParser( 'pymurmur3', 'pymurmur [options] "string to hash"' )
  355. parser.add_argument( '--seed', type = int, default = 0 )
  356. parser.add_argument( 'strings', default = [], nargs='+')
  357. opts = parser.parse_args()
  358. for str_to_hash in opts.strings:
  359. sys.stdout.write( '"%s" = 0x%08X\n' % ( str_to_hash, hash( str_to_hash ) ) )