一个好的散列函数
1 public static int hash(String key, int tableSize) 2 { 3 int hashVal = 0; 4 5 for (int i = 0; i < key.length(); i++) 6 hashVal = 37 * hashVal + key.charAt(i); 7 8 hashVal %= tableSize; 9 if (hashVal < 0) 10 hashVal += tableSize; 11 12 return hashVal; 13 }
如果当一个元素被插入时与一个已经插入的元素散列到相同的值,那么就产生一个冲突,这个冲突需要消除。解决这种冲突的方法有几种,其中最简单的两种是分离链接法和开放定址法。
分离链接法
import java.util.LinkedList;
import java.util.List;

/**
 * Hash table that resolves collisions by separate chaining: every bucket
 * holds a linked list of the elements that hash to it.
 *
 * The bucket array always has a prime length. When the number of stored
 * elements exceeds the number of buckets, the table is rebuilt with roughly
 * twice as many buckets.
 *
 * Elements are located with {@code hashCode()} and compared with
 * {@code equals()}; {@code null} elements are not supported because
 * {@code hashCode()} is invoked on them directly.
 *
 * @param <AnyType> the element type
 */
public class SeparateChainingHashTable<AnyType>
{
    private static final int DEFAULT_TABLE_SIZE = 101;

    /** One chain per bucket; never contains a null chain. */
    private List<AnyType>[] theLists;

    /** Number of elements currently stored across all chains. */
    private int currentSize;

    /** Creates a table with the default number of buckets. */
    public SeparateChainingHashTable()
    { this(DEFAULT_TABLE_SIZE); }

    /**
     * Creates a table with at least {@code size} buckets.
     *
     * @param size requested bucket count; rounded up to a prime
     */
    public SeparateChainingHashTable(int size)
    {
        theLists = allocateLists(nextPrime(size));
    }

    /**
     * Tests whether an element is stored in the table.
     *
     * @param x the element to look for
     * @return {@code true} if an equal element is present
     */
    public boolean contains(AnyType x)
    {
        return theLists[myhash(x)].contains(x);
    }

    /**
     * Adds an element unless an equal one is already present, growing the
     * table when the element count exceeds the bucket count.
     *
     * @param x the element to add
     */
    public void insert(AnyType x)
    {
        List<AnyType> whichList = theLists[myhash(x)];
        if (whichList.contains(x))
            return;

        whichList.add(x);
        if (++currentSize > theLists.length)
            rehash();
    }

    /**
     * Removes an element if it is present.
     *
     * @param x the element to remove
     */
    public void remove(AnyType x)
    {
        // List.remove(Object) reports whether something was removed, so the
        // chain is walked only once.
        if (theLists[myhash(x)].remove(x))
            currentSize--;
    }

    /** Removes every element while keeping the current bucket array. */
    public void makeEmpty()
    {
        for (List<AnyType> chain : theLists)
            chain.clear();

        currentSize = 0;
    }

    /** Builds a bucket array of the given length filled with empty chains. */
    @SuppressWarnings("unchecked") // generic array creation; only List<T> values are stored
    private static <T> List<T>[] allocateLists(int count)
    {
        List<T>[] lists = new List[count];
        for (int i = 0; i < lists.length; i++)
            lists[i] = new LinkedList<>();
        return lists;
    }

    /** Rebuilds the table with about twice as many buckets. */
    private void rehash()
    {
        List<AnyType>[] oldLists = theLists;

        theLists = allocateLists(nextPrime(2 * oldLists.length));

        // insert() recounts the elements as it re-adds them.
        currentSize = 0;
        for (List<AnyType> chain : oldLists)
            for (AnyType item : chain)
                insert(item);
    }

    /** Maps an element to a bucket index in {@code [0, theLists.length)}. */
    private int myhash(AnyType x)
    {
        int hashVal = x.hashCode();

        hashVal %= theLists.length;
        if (hashVal < 0) // Java's % keeps the sign of the dividend
            hashVal += theLists.length;

        return hashVal;
    }

    /**
     * Tests primality by trial division with odd divisors.
     *
     * @param n the number to test
     * @return {@code true} if {@code n} is prime
     */
    private static boolean isPrime(int n)
    {
        if (n == 2 || n == 3)
            return true;
        if (n < 2 || (n & 1) == 0)
            return false;
        // The bound must be inclusive (i <= sqrt(n)); otherwise squares of
        // primes such as 9 or 25 pass. "i <= n / i" avoids overflow of i * i.
        for (int i = 3; i <= n / i; i += 2)
            if (n % i == 0)
                return false;

        return true;
    }

    /**
     * Finds the first odd prime that is not smaller than {@code n}.
     *
     * @param n lower bound for the search
     * @return an odd prime {@code >= n}
     */
    private static int nextPrime(int n)
    {
        if (n % 2 == 0)
            n++;
        while (!isPrime(n))
            n += 2;
        return n;
    }
}
5.4 不用链表的散列表
5.4.1 线性探测法
只要表足够大,总能够找到一个自由单元,但是如此花费的时间是相当多的。更糟的是,即使表相对较空,这样占据的单元也会开始形成一些区块,其结果称为一次聚集,就是说,散列到区块中的任何关键字都需要多次试选单元才能够解决冲突,然后该关键字被添加到相应的区块中。
5.4.2 平方探测法
平方探测是消除线性探测中一次聚集问题的冲突解决方法。虽然平方探测排除了一次聚集,但是散列到同一位置上的那些元素将探测相同的备选单元,这叫作二次聚集。
/**
 * Open-addressing hash table that resolves collisions with quadratic
 * probing and deletes lazily (removed entries stay in place, flagged
 * inactive).
 *
 * The backing array always has a prime length and is rebuilt at about twice
 * the size once more than half of its cells are occupied. Keeping the table
 * at most half full is what lets the probe loop in {@code findPos} always
 * reach an empty cell.
 *
 * Elements are located with {@code hashCode()} and compared with
 * {@code equals()}; {@code null} elements are not supported because
 * {@code hashCode()} is invoked on them directly.
 *
 * @param <AnyType> the element type
 */
public class QuadraticProbingHashTable<AnyType> {
    private static final int DEFAULT_TABLE_SIZE = 101;

    /** Cells are null (never used), active, or inactive (lazily deleted). */
    private HashEntry<AnyType>[] array;

    /** Number of non-null cells, i.e. active plus lazily deleted entries. */
    private int currentSize;

    /** Creates a table with the default capacity. */
    public QuadraticProbingHashTable() {
        this(DEFAULT_TABLE_SIZE);
    }

    /**
     * Creates a table with at least {@code size} cells.
     *
     * @param size requested capacity; rounded up to a prime
     */
    public QuadraticProbingHashTable(int size) {
        allocateArray(size);
        currentSize = 0;
    }

    /** Removes every entry, including lazily deleted ones. */
    public void makeEmpty() {
        currentSize = 0;
        for (int i = 0; i < array.length; i++)
            array[i] = null;
    }

    /**
     * Tests whether an element is stored and not deleted.
     *
     * @param x the element to look for
     * @return {@code true} if an equal, active element is present
     */
    public boolean contains(AnyType x) {
        return isActive(findPos(x));
    }

    /**
     * Adds an element unless an equal, active one is already present. The
     * table is rebuilt when more than half of its cells become occupied.
     *
     * @param x the element to add
     */
    public void insert(AnyType x) {
        int currentPos = findPos(x);
        if (isActive(currentPos))
            return;

        // Re-activating a lazily deleted cell does not occupy a new cell.
        if (array[currentPos] == null)
            currentSize++;
        array[currentPos] = new HashEntry<>(x, true);

        // The trigger is the number of occupied cells, not the probe
        // position: the position says nothing about how full the table is.
        if (currentSize > array.length / 2)
            rehash();
    }

    /**
     * Marks an element as deleted if it is present.
     *
     * @param x the element to remove
     */
    public void remove(AnyType x) {
        int currentPos = findPos(x);
        if (isActive(currentPos))
            array[currentPos].isActive = false;
    }

    /** A stored element together with its lazy-deletion flag. */
    private static class HashEntry<AnyType> {
        public AnyType element;
        public boolean isActive; // false once the element has been removed

        public HashEntry(AnyType e) {
            this(e, true);
        }

        public HashEntry(AnyType e, boolean i) {
            element = e;
            isActive = i;
        }
    }

    /** Replaces the backing array with an empty one of prime length. */
    @SuppressWarnings("unchecked") // generic array creation; only HashEntry<AnyType> values are stored
    private void allocateArray(int arraySize) {
        array = new HashEntry[nextPrime(arraySize)];
    }

    /**
     * Probes for {@code x}.
     *
     * @param x the element to locate
     * @return the index of the cell holding an element equal to {@code x}
     *         (active or not), or of the first empty cell on its probe path
     */
    private int findPos(AnyType x) {
        int offset = 1;
        int currentPos = myhash(x);

        while (array[currentPos] != null && !array[currentPos].element.equals(x)) {
            // Adding 1, 3, 5, ... visits home + 1, home + 4, home + 9, ...
            currentPos += offset;
            offset += 2;
            if (currentPos >= array.length)
                currentPos -= array.length;
        }
        return currentPos;
    }

    /** Tells whether the cell at {@code currentPos} holds a live element. */
    private boolean isActive(int currentPos) {
        return array[currentPos] != null && array[currentPos].isActive;
    }

    /** Rebuilds the table at about twice the size, dropping deleted entries. */
    private void rehash() {
        HashEntry<AnyType>[] oldArray = array;
        allocateArray(2 * oldArray.length);

        // insert() recounts the occupied cells as it re-adds the elements.
        currentSize = 0;
        for (HashEntry<AnyType> entry : oldArray)
            if (entry != null && entry.isActive)
                insert(entry.element);
    }

    /** Maps an element to its home index in {@code [0, array.length)}. */
    private int myhash(AnyType x) {
        int hashVal = x.hashCode();

        hashVal %= array.length;
        if (hashVal < 0) // Java's % keeps the sign of the dividend
            hashVal += array.length;

        return hashVal;
    }

    /**
     * Finds the first odd prime that is not smaller than {@code n}.
     *
     * @param n lower bound for the search
     * @return an odd prime {@code >= n}
     */
    private static int nextPrime(int n) {
        if ((n & 1) == 0)
            n++;

        while (!isPrime(n))
            n += 2;

        return n;
    }

    /**
     * Tests primality by trial division with odd divisors.
     *
     * @param n the number to test
     * @return {@code true} if {@code n} is prime
     */
    private static boolean isPrime(int n) {
        if (n == 2 || n == 3)
            return true;
        if (n < 2 || (n & 1) == 0)
            return false;
        // "i <= n / i" is i * i <= n without the risk of int overflow.
        for (int i = 3; i <= n / i; i += 2)
            if (n % i == 0)
                return false;

        return true;
    }
}
5.4.3 双散列 最后一个冲突解决方法是双散列:探测函数为 f(i) = i · hash₂(x),即用第二个散列函数来决定探测的步长。
5.5 再散列 对于使用平方探测的开放定址散列法,如果散列表填得太满,那么操作的运行时间将开始消耗过长,且插入操作可能失败。这可能发生在有太多的移除和插入混合的场合。此时,一个解决方法是建立另外一个大约两倍大的表(而且使用一个相关的新散列函数),扫描整个原始散列表,计算每个(未删除)元素的新散列值并将其插入到新表中。
5.6 标准库中的散列表
HashSet和HashMap通常是用分离链接散列实现的。
5.7.2 布谷鸟散列
1 import java.util.Random; 2 3 public class CuckooHashTable<AnyType> 4 { 5 public CuckooHashTable(HashFamily<? super AnyType>hf) { this(hf, DEFAULLT_TABLE_SIZE); } 6 7 public CuckooHashTable(HashFamily<? super AnyType>hf, int size) 8 { 9 allocateArray(nextPrime(size)); 10 doClear(); 11 hashFunctions = hf; 12 numHashFunctions = hf.getNumberOfFunctions(); 13 } 14 15 private Random r = new Random(); 16 17 private static final double MAX_LOAD = 0.4; 18 private static final int ALLOWED_REHASHES = 1; 19 20 private int rehashes = 0; 21 22 private boolean insertHelper1(AnyType x) 23 { 24 final int COUNT_LINIT = 100; 25 26 while (true) 27 { 28 int lastPos = 1; 29 int pos; 30 for (int count = 0; count < COUNT_LINIT; count++) 31 { 32 for (int i = 0; i < numHashFunctions; i++) 33 { 34 pos = myhash(x, i); 35 if (array[pos] == null) 36 { 37 array[pos] = x; 38 currentSize++; 39 return true; 40 } 41 } 42 int i = 0; 43 do 44 { 45 pos = myhash(x, r.nextInt(numHashFunctions)); 46 }while (pos == lastPos && i++ < 5); 47 48 AnyType tmp = array[lastPos = pos]; 49 array[pos] = x; 50 x = tmp; 51 } 52 if (++rehashes > ALLOWED_REHASHES) 53 { 54 expand(); 55 rehashes = 0; 56 } 57 else 58 rehash(); 59 } 60 } 61 62 63 public boolean insert(AnyType x) 64 { 65 if (contains(x)) 66 return false; 67 if (currentSize >= array.length / MAX_LOAD) 68 expand(); 69 return insertHelper1(x); 70 } 71 72 private int myhash(AnyType x, int which) 73 { 74 int hashVal = hashFunctions.hash(x, which); 75 76 hashVal %= array.length; 77 if (hashVal < 0) 78 hashVal += array.length; 79 80 return hashVal; 81 } 82 83 private void expand(){ rehash((int)(array.length / MAX_LOAD));} 84 85 private void rehash() 86 { 87 hashFunctions.generateNewFunctions(); 88 rehash(array.length); 89 } 90 91 private void rehash(int newLength) 92 { 93 AnyType[] oldArray = array; 94 allocateArray(nextPrime(newLength)); 95 96 currentSize = 0; 97 98 for (AnyType str : oldArray) 99 if (str != null) 100 insert(str); 101 } 102 103 public int size(){ 
return currentSize; } 104 105 public int capacity(){ return array.length; } 106 107 private int findPos(AnyType x) 108 { 109 for (int i = 0; i < numHashFunctions; i++) 110 { 111 int pos = myhash(x, i); 112 if (array[pos] != null && array[pos].equals(x)) 113 return pos; 114 } 115 return -1; 116 } 117 118 public boolean remove(AnyType x) 119 { 120 int pos = findPos(x); 121 if (pos != -1) 122 { 123 array[pos] = null; 124 currentSize--; 125 } 126 return pos != -1; 127 } 128 129 public boolean contains(AnyType x){ return findPos(x) != -1;} 130 131 public void makeEmpty(){ doClear(); } 132 133 private void doClear() 134 { 135 currentSize = 0; 136 for (int i = 0; i < array.length; i++) 137 array[i] = null; 138 } 139 140 private static final int DEFAULLT_TABLE_SIZE = 101; 141 142 private final HashFamily<? super AnyType>hashFunctions; 143 private final int numHashFunctions; 144 private AnyType[] array; 145 private int currentSize; 146 147 private void allocateArray(int arraySize) { array = (AnyType[])new Object[arraySize]; } 148 149 protected static int nextPrime(int n) 150 { 151 if ((n&1) == 0) 152 n++; 153 for (; !isPime(n); n += 2) 154 ; 155 return n; 156 } 157 158 private static boolean isPime(int n) 159 { 160 if (n == 2 || n == 3) 161 return true; 162 if (n == 1 || (n&1) == 0) 163 return false; 164 for (int i = 3; i * i <= n; i += 2) 165 if (n % i == 0) 166 return false; 167 168 return true; 169 } 170 }
5.7.3 跳房子散列的思路是,用事先确定的、对计算机的底层体系结构而言是最优的一个常数,给探测序列的最大长度加个上界。这样做可以给出常数级的最坏查询时间,并且与布谷鸟散列一样,查询可以并行化,以同时检查可用位置的有限集。