123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451 |
- '''
- pymmh3 was written by Fredrik Kihlander and enhanced by Swapnil Gusani, and is placed in the public
- domain. The authors hereby disclaim copyright to this source code.
-
- pure python implementation of the murmur3 hash algorithm
-
- https://code.google.com/p/smhasher/wiki/MurmurHash3
-
- This was written for the times when you do not want to compile c-code and install modules,
- and you only want a drop-in murmur3 implementation.
-
- As this is purely python it is FAR from performant and if performance is anything that is needed
- a proper c-module is suggested!
-
- This module is written to have the same format as mmh3 python package found here for simple conversions:
-
- https://pypi.python.org/pypi/mmh3/2.3.1
- '''
-
- import sys as _sys
- if (_sys.version_info > (3, 0)):
- def xrange( a, b, c ):
- return range( a, b, c )
- def xencode(x):
- if isinstance(x, bytes) or isinstance(x, bytearray):
- return x
- else:
- return x.encode()
- else:
- def xencode(x):
- return x
- del _sys
-
- def hash( key, seed = 0x0 ):
- ''' Implements 32bit murmur3 hash. '''
-
- key = bytearray( xencode(key) )
-
- def fmix( h ):
- h ^= h >> 16
- h = ( h * 0x85ebca6b ) & 0xFFFFFFFF
- h ^= h >> 13
- h = ( h * 0xc2b2ae35 ) & 0xFFFFFFFF
- h ^= h >> 16
- return h
-
- length = len( key )
- nblocks = int( length / 4 )
-
- h1 = seed
-
- c1 = 0xcc9e2d51
- c2 = 0x1b873593
-
- # body
- for block_start in xrange( 0, nblocks * 4, 4 ):
- # ??? big endian?
- k1 = key[ block_start + 3 ] << 24 | \
- key[ block_start + 2 ] << 16 | \
- key[ block_start + 1 ] << 8 | \
- key[ block_start + 0 ]
-
- k1 = ( c1 * k1 ) & 0xFFFFFFFF
- k1 = ( k1 << 15 | k1 >> 17 ) & 0xFFFFFFFF # inlined ROTL32
- k1 = ( c2 * k1 ) & 0xFFFFFFFF
-
- h1 ^= k1
- h1 = ( h1 << 13 | h1 >> 19 ) & 0xFFFFFFFF # inlined ROTL32
- h1 = ( h1 * 5 + 0xe6546b64 ) & 0xFFFFFFFF
-
- # tail
- tail_index = nblocks * 4
- k1 = 0
- tail_size = length & 3
-
- if tail_size >= 3:
- k1 ^= key[ tail_index + 2 ] << 16
- if tail_size >= 2:
- k1 ^= key[ tail_index + 1 ] << 8
- if tail_size >= 1:
- k1 ^= key[ tail_index + 0 ]
-
- if tail_size > 0:
- k1 = ( k1 * c1 ) & 0xFFFFFFFF
- k1 = ( k1 << 15 | k1 >> 17 ) & 0xFFFFFFFF # inlined ROTL32
- k1 = ( k1 * c2 ) & 0xFFFFFFFF
- h1 ^= k1
-
- #finalization
- unsigned_val = fmix( h1 ^ length )
- if unsigned_val & 0x80000000 == 0:
- return unsigned_val
- else:
- return -( (unsigned_val ^ 0xFFFFFFFF) + 1 )
-
-
- def hash128( key, seed = 0x0, x64arch = True ):
- ''' Implements 128bit murmur3 hash. '''
- def hash128_x64( key, seed ):
- ''' Implements 128bit murmur3 hash for x64. '''
-
- def fmix( k ):
- k ^= k >> 33
- k = ( k * 0xff51afd7ed558ccd ) & 0xFFFFFFFFFFFFFFFF
- k ^= k >> 33
- k = ( k * 0xc4ceb9fe1a85ec53 ) & 0xFFFFFFFFFFFFFFFF
- k ^= k >> 33
- return k
-
- length = len( key )
- nblocks = int( length / 16 )
-
- h1 = seed
- h2 = seed
-
- c1 = 0x87c37b91114253d5
- c2 = 0x4cf5ad432745937f
-
- #body
- for block_start in xrange( 0, nblocks * 8, 8 ):
- # ??? big endian?
- k1 = key[ 2 * block_start + 7 ] << 56 | \
- key[ 2 * block_start + 6 ] << 48 | \
- key[ 2 * block_start + 5 ] << 40 | \
- key[ 2 * block_start + 4 ] << 32 | \
- key[ 2 * block_start + 3 ] << 24 | \
- key[ 2 * block_start + 2 ] << 16 | \
- key[ 2 * block_start + 1 ] << 8 | \
- key[ 2 * block_start + 0 ]
-
- k2 = key[ 2 * block_start + 15 ] << 56 | \
- key[ 2 * block_start + 14 ] << 48 | \
- key[ 2 * block_start + 13 ] << 40 | \
- key[ 2 * block_start + 12 ] << 32 | \
- key[ 2 * block_start + 11 ] << 24 | \
- key[ 2 * block_start + 10 ] << 16 | \
- key[ 2 * block_start + 9 ] << 8 | \
- key[ 2 * block_start + 8 ]
-
- k1 = ( c1 * k1 ) & 0xFFFFFFFFFFFFFFFF
- k1 = ( k1 << 31 | k1 >> 33 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
- k1 = ( c2 * k1 ) & 0xFFFFFFFFFFFFFFFF
- h1 ^= k1
-
- h1 = ( h1 << 27 | h1 >> 37 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
- h1 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
- h1 = ( h1 * 5 + 0x52dce729 ) & 0xFFFFFFFFFFFFFFFF
-
- k2 = ( c2 * k2 ) & 0xFFFFFFFFFFFFFFFF
- k2 = ( k2 << 33 | k2 >> 31 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
- k2 = ( c1 * k2 ) & 0xFFFFFFFFFFFFFFFF
- h2 ^= k2
-
- h2 = ( h2 << 31 | h2 >> 33 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
- h2 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
- h2 = ( h2 * 5 + 0x38495ab5 ) & 0xFFFFFFFFFFFFFFFF
-
- #tail
- tail_index = nblocks * 16
- k1 = 0
- k2 = 0
- tail_size = length & 15
-
- if tail_size >= 15:
- k2 ^= key[ tail_index + 14 ] << 48
- if tail_size >= 14:
- k2 ^= key[ tail_index + 13 ] << 40
- if tail_size >= 13:
- k2 ^= key[ tail_index + 12 ] << 32
- if tail_size >= 12:
- k2 ^= key[ tail_index + 11 ] << 24
- if tail_size >= 11:
- k2 ^= key[ tail_index + 10 ] << 16
- if tail_size >= 10:
- k2 ^= key[ tail_index + 9 ] << 8
- if tail_size >= 9:
- k2 ^= key[ tail_index + 8 ]
-
- if tail_size > 8:
- k2 = ( k2 * c2 ) & 0xFFFFFFFFFFFFFFFF
- k2 = ( k2 << 33 | k2 >> 31 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
- k2 = ( k2 * c1 ) & 0xFFFFFFFFFFFFFFFF
- h2 ^= k2
-
- if tail_size >= 8:
- k1 ^= key[ tail_index + 7 ] << 56
- if tail_size >= 7:
- k1 ^= key[ tail_index + 6 ] << 48
- if tail_size >= 6:
- k1 ^= key[ tail_index + 5 ] << 40
- if tail_size >= 5:
- k1 ^= key[ tail_index + 4 ] << 32
- if tail_size >= 4:
- k1 ^= key[ tail_index + 3 ] << 24
- if tail_size >= 3:
- k1 ^= key[ tail_index + 2 ] << 16
- if tail_size >= 2:
- k1 ^= key[ tail_index + 1 ] << 8
- if tail_size >= 1:
- k1 ^= key[ tail_index + 0 ]
-
- if tail_size > 0:
- k1 = ( k1 * c1 ) & 0xFFFFFFFFFFFFFFFF
- k1 = ( k1 << 31 | k1 >> 33 ) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
- k1 = ( k1 * c2 ) & 0xFFFFFFFFFFFFFFFF
- h1 ^= k1
-
- #finalization
- h1 ^= length
- h2 ^= length
-
- h1 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
- h2 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
-
- h1 = fmix( h1 )
- h2 = fmix( h2 )
-
- h1 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
- h2 = ( h1 + h2 ) & 0xFFFFFFFFFFFFFFFF
-
- return ( h2 << 64 | h1 )
-
- def hash128_x86( key, seed ):
- ''' Implements 128bit murmur3 hash for x86. '''
-
- def fmix( h ):
- h ^= h >> 16
- h = ( h * 0x85ebca6b ) & 0xFFFFFFFF
- h ^= h >> 13
- h = ( h * 0xc2b2ae35 ) & 0xFFFFFFFF
- h ^= h >> 16
- return h
-
- length = len( key )
- nblocks = int( length / 16 )
-
- h1 = seed
- h2 = seed
- h3 = seed
- h4 = seed
-
- c1 = 0x239b961b
- c2 = 0xab0e9789
- c3 = 0x38b34ae5
- c4 = 0xa1e38b93
-
- #body
- for block_start in xrange( 0, nblocks * 16, 16 ):
- k1 = key[ block_start + 3 ] << 24 | \
- key[ block_start + 2 ] << 16 | \
- key[ block_start + 1 ] << 8 | \
- key[ block_start + 0 ]
-
- k2 = key[ block_start + 7 ] << 24 | \
- key[ block_start + 6 ] << 16 | \
- key[ block_start + 5 ] << 8 | \
- key[ block_start + 4 ]
-
- k3 = key[ block_start + 11 ] << 24 | \
- key[ block_start + 10 ] << 16 | \
- key[ block_start + 9 ] << 8 | \
- key[ block_start + 8 ]
-
- k4 = key[ block_start + 15 ] << 24 | \
- key[ block_start + 14 ] << 16 | \
- key[ block_start + 13 ] << 8 | \
- key[ block_start + 12 ]
-
- k1 = ( c1 * k1 ) & 0xFFFFFFFF
- k1 = ( k1 << 15 | k1 >> 17 ) & 0xFFFFFFFF # inlined ROTL32
- k1 = ( c2 * k1 ) & 0xFFFFFFFF
- h1 ^= k1
-
- h1 = ( h1 << 19 | h1 >> 13 ) & 0xFFFFFFFF # inlined ROTL32
- h1 = ( h1 + h2 ) & 0xFFFFFFFF
- h1 = ( h1 * 5 + 0x561ccd1b ) & 0xFFFFFFFF
-
- k2 = ( c2 * k2 ) & 0xFFFFFFFF
- k2 = ( k2 << 16 | k2 >> 16 ) & 0xFFFFFFFF # inlined ROTL32
- k2 = ( c3 * k2 ) & 0xFFFFFFFF
- h2 ^= k2
-
- h2 = ( h2 << 17 | h2 >> 15 ) & 0xFFFFFFFF # inlined ROTL32
- h2 = ( h2 + h3 ) & 0xFFFFFFFF
- h2 = ( h2 * 5 + 0x0bcaa747 ) & 0xFFFFFFFF
-
- k3 = ( c3 * k3 ) & 0xFFFFFFFF
- k3 = ( k3 << 17 | k3 >> 15 ) & 0xFFFFFFFF # inlined ROTL32
- k3 = ( c4 * k3 ) & 0xFFFFFFFF
- h3 ^= k3
-
- h3 = ( h3 << 15 | h3 >> 17 ) & 0xFFFFFFFF # inlined ROTL32
- h3 = ( h3 + h4 ) & 0xFFFFFFFF
- h3 = ( h3 * 5 + 0x96cd1c35 ) & 0xFFFFFFFF
-
- k4 = ( c4 * k4 ) & 0xFFFFFFFF
- k4 = ( k4 << 18 | k4 >> 14 ) & 0xFFFFFFFF # inlined ROTL32
- k4 = ( c1 * k4 ) & 0xFFFFFFFF
- h4 ^= k4
-
- h4 = ( h4 << 13 | h4 >> 19 ) & 0xFFFFFFFF # inlined ROTL32
- h4 = ( h1 + h4 ) & 0xFFFFFFFF
- h4 = ( h4 * 5 + 0x32ac3b17 ) & 0xFFFFFFFF
-
- #tail
- tail_index = nblocks * 16
- k1 = 0
- k2 = 0
- k3 = 0
- k4 = 0
- tail_size = length & 15
-
- if tail_size >= 15:
- k4 ^= key[ tail_index + 14 ] << 16
- if tail_size >= 14:
- k4 ^= key[ tail_index + 13 ] << 8
- if tail_size >= 13:
- k4 ^= key[ tail_index + 12 ]
-
- if tail_size > 12:
- k4 = ( k4 * c4 ) & 0xFFFFFFFF
- k4 = ( k4 << 18 | k4 >> 14 ) & 0xFFFFFFFF # inlined ROTL32
- k4 = ( k4 * c1 ) & 0xFFFFFFFF
- h4 ^= k4
-
- if tail_size >= 12:
- k3 ^= key[ tail_index + 11 ] << 24
- if tail_size >= 11:
- k3 ^= key[ tail_index + 10 ] << 16
- if tail_size >= 10:
- k3 ^= key[ tail_index + 9 ] << 8
- if tail_size >= 9:
- k3 ^= key[ tail_index + 8 ]
-
- if tail_size > 8:
- k3 = ( k3 * c3 ) & 0xFFFFFFFF
- k3 = ( k3 << 17 | k3 >> 15 ) & 0xFFFFFFFF # inlined ROTL32
- k3 = ( k3 * c4 ) & 0xFFFFFFFF
- h3 ^= k3
-
- if tail_size >= 8:
- k2 ^= key[ tail_index + 7 ] << 24
- if tail_size >= 7:
- k2 ^= key[ tail_index + 6 ] << 16
- if tail_size >= 6:
- k2 ^= key[ tail_index + 5 ] << 8
- if tail_size >= 5:
- k2 ^= key[ tail_index + 4 ]
-
- if tail_size > 4:
- k2 = ( k2 * c2 ) & 0xFFFFFFFF
- k2 = ( k2 << 16 | k2 >> 16 ) & 0xFFFFFFFF # inlined ROTL32
- k2 = ( k2 * c3 ) & 0xFFFFFFFF
- h2 ^= k2
-
- if tail_size >= 4:
- k1 ^= key[ tail_index + 3 ] << 24
- if tail_size >= 3:
- k1 ^= key[ tail_index + 2 ] << 16
- if tail_size >= 2:
- k1 ^= key[ tail_index + 1 ] << 8
- if tail_size >= 1:
- k1 ^= key[ tail_index + 0 ]
-
- if tail_size > 0:
- k1 = ( k1 * c1 ) & 0xFFFFFFFF
- k1 = ( k1 << 15 | k1 >> 17 ) & 0xFFFFFFFF # inlined ROTL32
- k1 = ( k1 * c2 ) & 0xFFFFFFFF
- h1 ^= k1
-
- #finalization
- h1 ^= length
- h2 ^= length
- h3 ^= length
- h4 ^= length
-
- h1 = ( h1 + h2 ) & 0xFFFFFFFF
- h1 = ( h1 + h3 ) & 0xFFFFFFFF
- h1 = ( h1 + h4 ) & 0xFFFFFFFF
- h2 = ( h1 + h2 ) & 0xFFFFFFFF
- h3 = ( h1 + h3 ) & 0xFFFFFFFF
- h4 = ( h1 + h4 ) & 0xFFFFFFFF
-
- h1 = fmix( h1 )
- h2 = fmix( h2 )
- h3 = fmix( h3 )
- h4 = fmix( h4 )
-
- h1 = ( h1 + h2 ) & 0xFFFFFFFF
- h1 = ( h1 + h3 ) & 0xFFFFFFFF
- h1 = ( h1 + h4 ) & 0xFFFFFFFF
- h2 = ( h1 + h2 ) & 0xFFFFFFFF
- h3 = ( h1 + h3 ) & 0xFFFFFFFF
- h4 = ( h1 + h4 ) & 0xFFFFFFFF
-
- return ( h4 << 96 | h3 << 64 | h2 << 32 | h1 )
-
- key = bytearray( xencode(key) )
-
- if x64arch:
- return hash128_x64( key, seed )
- else:
- return hash128_x86( key, seed )
-
-
- def hash64( key, seed = 0x0, x64arch = True ):
- ''' Implements 64bit murmur3 hash. Returns a tuple. '''
-
- hash_128 = hash128( key, seed, x64arch )
-
- unsigned_val1 = hash_128 & 0xFFFFFFFFFFFFFFFF
- if unsigned_val1 & 0x8000000000000000 == 0:
- signed_val1 = unsigned_val1
- else:
- signed_val1 = -( (unsigned_val1 ^ 0xFFFFFFFFFFFFFFFF) + 1 )
-
- unsigned_val2 = ( hash_128 >> 64 ) & 0xFFFFFFFFFFFFFFFF
- if unsigned_val2 & 0x8000000000000000 == 0:
- signed_val2 = unsigned_val2
- else:
- signed_val2 = -( (unsigned_val2 ^ 0xFFFFFFFFFFFFFFFF) + 1 )
-
- return ( int( signed_val1 ), int( signed_val2 ) )
-
-
- def hash_bytes( key, seed = 0x0, x64arch = True ):
- ''' Implements 128bit murmur3 hash. Returns a byte string. '''
-
- hash_128 = hash128( key, seed, x64arch )
-
- bytestring = ''
-
- for i in xrange(0, 16, 1):
- lsbyte = hash_128 & 0xFF
- bytestring = bytestring + str( chr( lsbyte ) )
- hash_128 = hash_128 >> 8
-
- return bytestring
-
-
- if __name__ == "__main__":
- import argparse
-
- parser = argparse.ArgumentParser( 'pymurmur3', 'pymurmur [options] "string to hash"' )
- parser.add_argument( '--seed', type = int, default = 0 )
- parser.add_argument( 'strings', default = [], nargs='+')
-
- opts = parser.parse_args()
-
- for str_to_hash in opts.strings:
- sys.stdout.write( '"%s" = 0x%08X\n' % ( str_to_hash, hash( str_to_hash ) ) )
|