[Javaの小枝] 週刊簡潔データ構造を作る ③

Last updated at 2016-02-26Posted at 2016-02-26

最小完全ハッシュ関数の実装

前回までで最小完全ハッシュ関数を作るための処理を全て説明し終えた。

以下に実装を示すが、初回に説明したように実用性を追求せず、簡素なコードを目指して書いてあることには注意して欲しい。

SuccinctDataStructures.java

	public static class MinimalPerfectHashFunction implements HashFunction {

		public static class CyclicCheckException extends RuntimeException {private static final long serialVersionUID = -1843676951903920769L;} // 枝(edge)がからまって頂点と一対一対応ができないときにスローされる

		public static HashFunction makeMinimalPerfectHashFunction(byte[][] keys, int pertite, double factor, int retryLimit) {
			for (int i = 0; i < retryLimit; i ++) { // マッピングステップが完了するまでハッシュ関数を替えながらリトライ
				try {
					List<Integer> primeListShuffled = new ArrayList<>(primeList); Collections.shuffle(primeListShuffled);
					List<HashFunction> hashFunctions = new ArrayList<>();
					for (int j = 0; j < pertite; j ++) hashFunctions.add(new SimpleHashFunction(primeListShuffled.get(j)));
					return new MinimalPerfectHashFunction(keys, hashFunctions, factor);
				} catch (CyclicCheckException e) {/* nothing to do */}
			}
			throw new CyclicCheckException();
		}
		
		private final byte[] g;
		private final List<HashFunction> hashFunctions;
		private final boolean[] dictionary;
		
		private MinimalPerfectHashFunction(byte[][] keys, List<HashFunction> hashFunctions_, double factor) {
			hashFunctions = hashFunctions_; 
			int[] degreeOfVertexArray = new int[(((int) (keys.length * factor / hashFunctions.size())) + 1) * hashFunctions.size()];
			
			List<int[]> edges = new ArrayList<>(); 
			for (int i = 0; i < keys.length; i ++) edges.add(makeEdge(keys[i], degreeOfVertexArray.length));
			for (int[] edge : edges) changeDegreeOfVertex(degreeOfVertexArray, edge, 1); // 全てのエッジを数えあげて degree of vertex を算出(degree:頂点から生えている枝の数)
			
			List<int[]> edgesRemoved = new ArrayList<>(); // 一つだけ枝をはやした頂点から生えている枝を見つけたら取り除き、こちらに加える。(元の edges から全てのエッジが取り除かれたら acyclic であることの証明が終了
			AcyclicCheck : while (edges.size() > 0) {
				for (int[] edge : edges) {
					if (! canBeRemoved(degreeOfVertexArray, edge)) continue;
					edges.remove(edge); edgesRemoved.add(edge); // エッジを取り除き、関連する vertex の degree も一つ減らす
					changeDegreeOfVertex(degreeOfVertexArray, edge, -1);
					continue AcyclicCheck;
				}
				throw new CyclicCheckException();
			}
			
			g = new byte[degreeOfVertexArray.length];
			for (int i = 0; i < g.length; i ++) g[i] = (byte) hashFunctions.size(); // アサイン済みのものとそうでないものを区別するため、あらかじめ頂点グループIDの最大値+1を入れておく
			for (ListIterator<int[]> it = edgesRemoved.listIterator(edgesRemoved.size()); it.hasPrevious();) assign(g, it.previous());
			
			dictionary = new boolean[degreeOfVertexArray.length]; // each item is initialized to false in Java
			for (int i = 0; i < keys.length; i ++) dictionary[phfHash(keys[i])] = true;
		}
		
		// データに対応する枝(edge)を生成するメソッド
		public int[] makeEdge(byte[] key, int nodeSize) {
			int pertiteSize = nodeSize / hashFunctions.size();
			int[] edge = new int[hashFunctions.size()];
			for (int i = 0; i < edge.length; i ++) edge[i] = hashFunctions.get(i).folded(key, pertiteSize * i, pertiteSize * (i + 1));
			return edge;
		}
		
		// 頂点から生えている枝の数(=degree)を増減させるメソッド
		public void changeDegreeOfVertex(int[] degreeOfVertex, int[] edge, int diff) {for (int e : edge) degreeOfVertex[e] += diff;}

		public boolean canBeRemoved(int[] degreeOfVertex, int[] edge) {
			for (int e : edge) if (degreeOfVertex[e] < 2) return true; 
			return false;
		}
		
		// g[] の中で未定義な箇所を探索するメソッド
		public static byte findNotDefinedPertite(byte[] g, int[] edge) {
			for (byte i = 0; i < edge.length; i ++) if (g[edge[i]] == edge.length) return i;
			throw new InternalError(); // cyclic check 通過しているのでこちらには来ないはず
		}

		// g[] の中で未定義な箇所に値を挿入するメソッド
		public static void assignValueToNotDefined(byte[] g, int[] edge, byte value) {
			for (int e : edge) if (g[e] == edge.length) {g[e] = value; value = 0;}
		}
		
		// g[] の中で定義なものだけを積算するメソッド
		public static byte sumDefined(byte[] g, int[] edge) {
			byte sum = 0;
			for (int e : edge) if (g[e] != edge.length) sum += g[e];
			return sum;
		}

		// わりあてステップのメインロジック		
		public static void assign(byte[] g, int[] edge) {
			byte pertite = findNotDefinedPertite(g, edge);
			byte definedAlready = sumDefined(g, edge);
			byte valueToBeAssigned = (byte) (pertite - definedAlready); 
			while (valueToBeAssigned < 0) valueToBeAssigned += edge.length;
			
			assignValueToNotDefined(g, edge, valueToBeAssigned);
		}

		private int phfHash(byte[] src) {
			int partiteSize = g.length / hashFunctions.size();
			int idx = 0;
			for (int i = 0; i < hashFunctions.size(); i ++) idx += g[hashFunctions.get(i).folded(src, partiteSize * i, partiteSize * (i + 1))];
			idx %= hashFunctions.size();
			return hashFunctions.get(idx).folded(src, partiteSize * idx, partiteSize * (idx + 1));
		}

		@Override public int hash(byte[] src) {return rank(dictionary, phfHash(src), true);}
	}

以下、ランダムな20文字の文字列に対して最小完全ハッシュ関数を実行し、ハッシュ値を生成するメソッドを示す。

SuccinctDataStructures.java

	private static char alphaNumericChar() {
		final String src = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
		return src.charAt((int)(Math.random() * src.length()));
	}
	
	public static String alphaNumericString(int length) {
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < length; i ++) sb.append(alphaNumericChar());
		return sb.toString();
	}
	
	public static void main(String[] args) {
		
		List<String> keyList = new ArrayList<>();
		for (int i = 0; i < 20; i ++) keyList.add(alphaNumericString(20));
		
		byte[][] keys = new byte[keyList.size()][];
		for (int i = 0; i < keyList.size(); i ++) keys[i] = keyList.get(i).getBytes();
		
		HashFunction mphf = MinimalPerfectHashFunction.makeMinimalPerfectHashFunction(keys, 3, 1.3, 100);
		
		for (int i = 0; i < keyList.size(); i ++) System.out.println(keyList.get(i) + " -> " + mphf.hash(keyList.get(i).getBytes()));
	}

実行結果は以下の通り。

mGxM0cKFpn8ubAK1wJib -> 10
4b6ytsxfhYwrj27BjlLd -> 4
nOUNLb3MO6vBwlzQNNdp -> 16
9kzughsHq1RSOLKmZB1o -> 18
lgjApjjNlVQzgwiTZpje -> 13
VcDYtpqZ1NsdiImOdkvr -> 7
ccZqgTB7eNonLVsz2iE9 -> 6
2dKygq4EZaUUVIsSX1hN -> 17
M1ZjMTbKdHNAJTwzuNjr -> 19
4QHSLO9K90QFCQQzta3c -> 9
7dPddSJq7NuNUxqjZN9L -> 1
yYFTwwjy5mN6Gzwa5MMG -> 12
VmLngOomJS1PYMkzHlHm -> 14
LQ7bCNRcLL2T62wJB28B -> 8
KrnsP6cuQnXQ9pmYltOf -> 15
4xBApXcwI4fuHsIwPCia -> 0
nwwObYmvmpao5QdkxSgV -> 3
Fen2AftgWuq4UPrJ4KG2 -> 5
OrPQe1w7IIulSqzOnrmu -> 11
Adt92XOnAuZdeweoyUz5 -> 2

重複なくハッシュ値がわりあてられていることがわかる。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up

[Javaの小枝] 週刊 簡潔データ構造を作る ③

最小完全ハッシュ関数の実装

[Javaの小枝] 週刊簡潔データ構造を作る ③