Java 文字列をバイト数で切り捨てる

Java

Last updated at 2016-03-10 / Posted at 2016-02-11

Javaで文字列をバイト数で切り捨てる効率の良い方法がパッと思い浮かばなかったので、
調べたり悩んだりしてみました。

文字列をバイト数で切り捨てるコード

最終的に書いたのはこれです。
（そこそこ効率良いと思ってます。）

	/**
	 * Truncates {@code s} to the longest prefix whose encoded form fits in
	 * {@code maxBytes} bytes, without cutting through an encoded character.
	 *
	 * <p>The string is encoded into a bounded scratch buffer; when the buffer
	 * overflows, the number of characters the encoder consumed so far is the
	 * length of the prefix to keep. Malformed or unmappable characters are
	 * counted as the encoder's replacement bytes.
	 *
	 * @param s        the string to truncate
	 * @param charset  the charset used to measure the encoded size
	 * @param maxBytes the maximum encoded size in bytes; must not be negative
	 * @return {@code s} itself when it fits, otherwise a prefix of {@code s}
	 * @throws IllegalArgumentException if {@code maxBytes} is negative
	 */
	public static String truncateBytes(String s, Charset charset, int maxBytes) {
		if (maxBytes < 0) {
			throw new IllegalArgumentException("maxBytes must not be negative: " + maxBytes);
		}
		CharsetEncoder encoder = charset.newEncoder()
				.onMalformedInput(CodingErrorAction.REPLACE)
				.onUnmappableCharacter(CodingErrorAction.REPLACE);
		// The encoder never produces more than length * maxBytesPerChar bytes, so
		// there is no need to allocate the full limit for a short input (a huge
		// maxBytes would otherwise allocate a huge, unused buffer).
		long worstCase = (long) Math.ceil((double) s.length() * encoder.maxBytesPerChar());
		ByteBuffer bb = ByteBuffer.allocate((int) Math.min((long) maxBytes, worstCase));
		CharBuffer cb = CharBuffer.wrap(s);
		CoderResult cr = encoder.encode(cb, bb, true);
		if (!cr.isOverflow()) {
			return s;
		}
		// The buffer position is the count of chars that were fully encoded
		// before the output ran out of room.
		return s.substring(0, cb.position());
	}

CharsetEncoder#encodeでバイト配列に変換してみるけど、バイト配列に収まらないところでストップするので、
このとき進んだchar配列分を文字列に変換する。ってイメージ。

この前に思いついたパターン１

CharBufferとかよくわからないとき思いついたのはこれです。

	/**
	 * Truncates {@code s} to fit in {@code maxBytes} encoded bytes by encoding
	 * one code point at a time and summing the sizes.
	 *
	 * <p>The walk advances by code point rather than by {@code char}, so a
	 * surrogate pair is measured (and kept or dropped) as a single unit.
	 *
	 * <p>NOTE(review): each code point is encoded with a separate
	 * {@code getBytes} call, so charsets that emit per-call prefixes (for
	 * example a byte-order mark) are presumably over-counted — confirm before
	 * using with such charsets.
	 *
	 * @param s        the string to truncate
	 * @param charset  the charset used to measure the encoded size
	 * @param maxBytes the maximum encoded size in bytes
	 * @return {@code s} itself when it fits, otherwise a prefix of {@code s}
	 */
	public static String truncateBytes1(String s, Charset charset, int maxBytes) {
		int bytes = 0;
		int i = 0;
		while (i < s.length()) {
			// End index of the code point starting at i (2 chars for a surrogate pair).
			int next = i + Character.charCount(s.codePointAt(i));
			bytes += s.substring(i, next).getBytes(charset).length;
			if (bytes > maxBytes) {
				return s.substring(0, i);
			}
			i = next;
		}
		return s;
	}

これは頭から順番にcharのbyte数を足していってmaxに到達したら終了するイメージです。
ただこれだと、切り捨てたいバイト数が大きくなるととても非効率です。

この前に思いついたパターン２

上記の問題を解決しようと思って思いついたパターン

	/**
	 * Truncates {@code s} to fit in {@code maxBytes} encoded bytes by encoding
	 * the whole string, cutting the byte array, and decoding the cut bytes
	 * back to find where the text stops matching the original.
	 *
	 * <p>The returned value is always taken from {@code s} itself, so
	 * characters the charset cannot encode are returned unchanged instead of
	 * being swapped for the charset's replacement character.
	 *
	 * <p>{@code maxBytes} is expected to be non-negative.
	 *
	 * @param s        the string to truncate
	 * @param charset  the charset used to measure the encoded size
	 * @param maxBytes the maximum encoded size in bytes
	 * @return {@code s} itself when it fits, otherwise a prefix of {@code s}
	 */
	public static String truncateBytes2(String s, Charset charset, int maxBytes) {
		byte[] bs = s.getBytes(charset);
		if (bs.length <= maxBytes) {
			return s;
		}
		// Decoding a byte array that was cut mid-character yields garbage at the tail.
		String result = new String(bs, 0, maxBytes, charset);
		int index = Math.min(s.length(), result.length()) - 1;
		// Garbled-tail check: step back while the decoded char differs from the original.
		while (index >= 0 && result.charAt(index) != s.charAt(index)) {
			index--;
		}
		// index == -1 means nothing matched, which yields the empty prefix.
		return s.substring(0, index + 1);
	}

とりあえずbyte配列にしてみて、目的の長さのバイト配列だけコピー。
コピーしたバイト配列を文字列に変換。
文字化けを排除して文字列を再構築して返すイメージ。
ただ今度は、切り捨てたいバイト数が小さくて与えられた元の文字列が長い場合に、
一度フルのバイト配列を作ってしまうので非効率なのです。

計測

それぞれの結果が下記。100万回ずつ実行を5回やった平均。
左の数値が切り捨てたいバイト数がそこそこ（31バイト）のパターンのミリ秒。
右の数値が切り捨てたいバイト数が少ない（3バイト）で元の文字列がかなり長いパターンのミリ秒。

truncateBytes	:370	106
truncateBytes1	:2147	426
truncateBytes2	:564	2147

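最初に記載したコードが一番効率がいい。
（なお表中の 2147 という値は、計測時にナノ秒の平均値を int にキャストしていたため Integer.MAX_VALUE ナノ秒（約2147ミリ秒）で頭打ちになったもので、実際の所要時間は 2147 ミリ秒以上です。）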

計測用テストコードは下記。

	// Charsets passed to the implementations by the speed tests below.
	private static final Charset MS932 = Charset.forName("MS932");
	private static final Charset UTF_8 = StandardCharsets.UTF_8;

	/**
	 * Common shape of the truncation implementations, so they can be passed
	 * around as method references.
	 */
	@FunctionalInterface
	interface TruncateByteFn {
		/**
		 * Invokes the truncation implementation.
		 *
		 * @param s        the string to truncate
		 * @param charset  the charset used to measure the encoded size
		 * @param maxBytes the maximum encoded size in bytes
		 * @return the truncated string
		 */
		String test(String s, Charset charset, int maxBytes);
	}

	/** Pairs a truncation implementation with the label used when reporting it. */
	private static class TestObj {
		private final String name;
		private final TruncateByteFn fn;

		/**
		 * @param fn   the implementation to measure
		 * @param name the label for this implementation
		 */
		public TestObj(TruncateByteFn fn, String name) {
			this.name = name;
			this.fn = fn;
		}
	}

	// Implementations to measure, listed in the order they are reported.
	private static final TestObj[] TESTS = {
			new TestObj(TruncateByte::truncateBytes, "truncateBytes"),
			new TestObj(TruncateByte::truncateBytes1, "truncateBytes1"),
			new TestObj(TruncateByte::truncateBytes2, "truncateBytes2"),
	};
	/**
	 * Runs both speed tests for every implementation several times and prints,
	 * per implementation, the average elapsed milliseconds of each speed test.
	 * The first round is excluded from the averages.
	 */
	@Test
	public void test() {
		int testTimes = 6;
		long[][] times1 = new long[TESTS.length][testTimes];
		long[][] times2 = new long[TESTS.length][testTimes];
		for (int t = 0; t < testTimes; t++) {
			for (int i = 0; i < TESTS.length; i++) {
				TestObj obj = TESTS[i];
				times1[i][t] = this.speedTest1(obj.fn);
				times2[i][t] = this.speedTest2(obj.fn);
			}
		}
		for (int i = 0; i < TESTS.length; i++) {
			String name = TESTS[i].name;
			// The average is in nanoseconds; narrow to long, not int, because an
			// int saturates at Integer.MAX_VALUE ns (about 2147 ms) and would cap
			// every slower result at that value.
			Arrays.stream(times1[i])
					.skip(1)
					.average()
					.ifPresent(time -> System.out
							.print(name + "\t:" + TimeUnit.NANOSECONDS.toMillis((long) time)));
			Arrays.stream(times2[i])
					.skip(1)
					.average()
					.ifPresent(time -> System.out.println("\t" + TimeUnit.NANOSECONDS.toMillis((long) time)));
		}
	}

	/**
	 * Times one million calls that truncate a 52-character mixed half-width and
	 * full-width alphabet string to 31 bytes in MS932.
	 *
	 * @param test the implementation to measure
	 * @return the elapsed time in nanoseconds
	 */
	private long speedTest1(TruncateByteFn test) {
		final String input = "ABCDEFGHIJKLMNOPQRSTUVWXYZＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ";
		final int iterations = 1000000;
		final long startedAt = System.nanoTime();
		for (int remaining = iterations; remaining > 0; remaining--) {
			test.test(input, MS932, 31);
		}
		return System.nanoTime() - startedAt;
	}

	/**
	 * Times one million calls that truncate a long string (the base alphabet
	 * string repeated 32 times) to 3 bytes in UTF-8.
	 *
	 * @param test the implementation to measure
	 * @return the elapsed time in nanoseconds
	 */
	private long speedTest2(TruncateByteFn test) {
		final String base = "ABCDEFGHIJKLMNOPQRSTUVWXYZＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ";
		// Five doublings of the base string amount to 2^5 = 32 copies.
		final int copies = 1 << 5;
		StringBuilder repeated = new StringBuilder(base.length() * copies);
		for (int copy = 0; copy < copies; copy++) {
			repeated.append(base);
		}
		final String input = repeated.toString();
		final int iterations = 1000000;
		final long startedAt = System.nanoTime();
		for (int remaining = iterations; remaining > 0; remaining--) {
			test.test(input, UTF_8, 3);
		}
		return System.nanoTime() - startedAt;
	}

テストコード

一応普通のテストコード

	// Charsets exercised by the assertions below.
	private static final Charset MS932 = Charset.forName("MS932");
	private static final Charset UTF_8 = StandardCharsets.UTF_8;

	/**
	 * Common shape of the truncation implementations, so they can be passed
	 * around as method references.
	 */
	@FunctionalInterface
	interface TruncateByteFn {
		/**
		 * Invokes the truncation implementation.
		 *
		 * @param s        the string to truncate
		 * @param charset  the charset used to measure the encoded size
		 * @param maxBytes the maximum encoded size in bytes
		 * @return the truncated string
		 */
		String test(String s, Charset charset, int maxBytes);
	}

	// Every implementation is checked against the same expectations.
	private static final TruncateByteFn[] TESTS = {
			TruncateByte::truncateBytes1,
			TruncateByte::truncateBytes2,
			TruncateByte::truncateBytes,
	};

	/**
	 * Checks every implementation against the same table of
	 * (input, charset, byte limit, expected result) rows, in order.
	 */
	@Test
	public void test() {
		// Each row: { input, charset, maxBytes, expected }.
		final Object[][] cases = {
				{ "ABCDEFG", MS932, 0, "" },
				{ "ABCDEFG", MS932, 1, "A" },
				{ "ABCDEFG", MS932, 2, "AB" },
				{ "ABCDEFG", MS932, 3, "ABC" },

				{ "あいうえお", MS932, 0, "" },
				{ "あいうえお", MS932, 1, "" },
				{ "あいうえお", MS932, 2, "あ" },
				{ "あいうえお", MS932, 3, "あ" },
				{ "あいうえお", MS932, 4, "あい" },
				{ "あいうえお", MS932, 5, "あい" },
				{ "あいうえお", MS932, 6, "あいう" },

				{ "ABCDEFG", MS932, 6, "ABCDEF" },
				{ "ABCDEFG", MS932, 7, "ABCDEFG" },
				{ "ABCDEFG", MS932, 8, "ABCDEFG" },
				{ "ABCDEFG", MS932, 9, "ABCDEFG" },
				{ "ABCDEFG", MS932, 10, "ABCDEFG" },
				{ "ABCDEFG", MS932, 100, "ABCDEFG" },

				{ "ABCDEFG", UTF_8, 0, "" },
				{ "ABCDEFG", UTF_8, 1, "A" },
				{ "ABCDEFG", UTF_8, 2, "AB" },
				{ "ABCDEFG", UTF_8, 3, "ABC" },

				{ "あいうえお", UTF_8, 0, "" },
				{ "あいうえお", UTF_8, 1, "" },
				{ "あいうえお", UTF_8, 2, "" },
				{ "あいうえお", UTF_8, 3, "あ" },
				{ "あいうえお", UTF_8, 4, "あ" },
				{ "あいうえお", UTF_8, 5, "あ" },
				{ "あいうえお", UTF_8, 6, "あい" },

				{ "", UTF_8, 0, "" },
		};
		for (TruncateByteFn fn : TESTS) {
			for (Object[] row : cases) {
				String input = (String) row[0];
				Charset charset = (Charset) row[1];
				int maxBytes = (Integer) row[2];
				String expected = (String) row[3];
				assertThat(fn.test(input, charset, maxBytes), is(expected));
			}
		}
	}

参考

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up