15
11

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

javascriptでUTF-8/UTF-16変換

Last updated at Posted at 2016-02-12

javascriptでUTF-8/UTF-16変換

javascriptの文字列をUTF-8/UTF-16(LE,BE)へ変換・逆変換する

  • 文字列からUTF-8/UTF-16バイト列への変換
    1. スカラー値の取得
    2. 符号化
  • UTF-8/UTF-16バイト列から文字列への変換

文字列からUTF-8/UTF-16バイト列への変換

スカラー値の取得

String.prototype.charCodeAt(i) でインデックスiのUTF-16コードユニット(文字コード)を取得できる。ただしJavaScriptの文字列はUTF-16なので、スカラー値を得るにはサロゲートペアを考慮する必要がある。

"use strict";
/**
 * Helpers for converting JavaScript strings (sequences of UTF-16 code units)
 * into numeric code arrays.
 */
var Utf = {
	/**
	 * Lists the UTF-16 code units of a string.
	 *
	 * @param {string} str - Source string.
	 * @returns {number[]} One entry per string index, as returned by charCodeAt.
	 */
	charCodes: function(str) {
		var codes = [], i, len;
		for (i = 0, len = str.length; i < len; i++) {
			codes.push(str.charCodeAt(i));
		}
		return codes;
	},
	/**
	 * Converts UTF-16 code units into Unicode scalar values.
	 *
	 * A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
	 * (0xDC00-0xDFFF) is merged into a single supplementary code point. Any
	 * surrogate that is not part of such a pair is not a scalar value and is
	 * replaced by U+FFFD; the unit after an unpaired high surrogate is kept and
	 * processed normally.
	 *
	 * @param {string|number[]} arr - A string, or an array of UTF-16 code units.
	 * @returns {number[]} Scalar values.
	 */
	scalarValues: function(arr) {
		var scalars = [], i, len, c, next;
		if (typeof arr === 'string') {arr = Utf.charCodes(arr);}
		for (i = 0, len = arr.length; i < len; i++) {
			c = arr[i];
			if (c >= 0xd800 && c <= 0xdbff) {
				// Past the end `next` is undefined, so the range test fails.
				next = arr[i + 1];
				if (next >= 0xdc00 && next <= 0xdfff) {
					// Adding 64 before the shift adds the 0x10000 offset.
					scalars.push(((c & 1023) + 64) << 10 | next & 1023);
					i++; // the low surrogate has been consumed
				} else {
					scalars.push(0xfffd);
				}
			} else if (c >= 0xdc00 && c <= 0xdfff) {
				scalars.push(0xfffd);
			} else {
				scalars.push(c);
			}
		}
		return scalars;
	}
};

符号化

UTF-8はスカラー値から変換する。 UTF-16は文字コードからバイト列化する(スカラー値からだと2度手間)。

/**
 * Encodes a string as UTF-8.
 *
 * The input is first converted with Utf.scalarValues. Every value is then
 * written as 1-4 bytes. A value that is not a Unicode scalar value (a
 * surrogate code point, or anything outside 0..0x10FFFF) has no UTF-8 form
 * and is encoded as U+FFFD (EF BF BD) instead of being emitted as ill-formed
 * bytes or silently dropped.
 *
 * @param {string|number[]} str - Anything accepted by Utf.scalarValues.
 * @returns {number[]} UTF-8 bytes (0-255), without BOM.
 */
Utf.encodeUTF8 = function(str) {
	var scalars = Utf.scalarValues(str), bytes = [], i, len, c;
	for (i = 0, len = scalars.length; i < len; i++) {
		c = scalars[i];
		if (!(c >= 0 && c <= 0x10ffff) || (c >= 0xd800 && c <= 0xdfff)) {
			c = 0xfffd;
		}
		if (c <= 0x7f) { // 1 byte
			bytes.push(c);
		} else if (c <= 0x7ff) { // 2 bytes
			bytes.push(0xc0 | c >>> 6, 0x80 | c & 0x3f);
		} else if (c <= 0xffff) { // 3 bytes
			bytes.push(0xe0 | c >>> 12, 0x80 | c >>> 6 & 0x3f, 0x80 | c & 0x3f);
		} else { // 4 bytes
			bytes.push(
				0xf0 | c >>> 18,
				0x80 | c >>> 12 & 0x3f,
				0x80 | c >>> 6 & 0x3f,
				0x80 | c & 0x3f
			);
		}
	}
	return bytes;
};
/**
 * Serializes the UTF-16 code units of a string into bytes.
 *
 * @param {string} str - Source string; its code units come from Utf.charCodes.
 * @param {string} [endian] - 'LE' writes little-endian output prefixed with
 *   the BOM FF FE. Any other truthy value writes big-endian output prefixed
 *   with the BOM FE FF. A falsy value writes big-endian output without a BOM.
 * @returns {number[]} Two bytes per code unit, after the optional BOM.
 */
Utf.encodeUTF16 = function(str, endian) {
	var units = Utf.charCodes(str);
	var isLittle = endian === 'LE';
	var bytes = [];
	var idx = 0;
	var count = units.length;
	var unit, high, low;

	if (endian) {
		// A BOM is written only when the caller named a byte order.
		if (isLittle) {
			bytes.push(0xff, 0xfe);
		} else {
			bytes.push(0xfe, 0xff);
		}
	}

	while (idx < count) {
		unit = units[idx];
		high = unit >>> 8 & 0xff;
		low = unit & 0xff;
		if (isLittle) {
			bytes.push(low, high);
		} else {
			bytes.push(high, low);
		}
		idx += 1;
	}
	return bytes;
};

また、UTF-8への変換ではスカラー値を求めず、文字コード(UTF-16コードユニット)から直接サロゲートペアを考慮して符号化しても良い。

/**
 * Encodes a string as UTF-8 directly from its UTF-16 code units, without
 * first computing scalar values.
 *
 * A high surrogate followed by a low surrogate is written as one 4-byte
 * sequence. A surrogate that is not part of such a pair cannot be represented
 * in UTF-8 and is encoded as U+FFFD (EF BF BD); the unit after an unpaired
 * high surrogate is still encoded normally.
 *
 * @param {string} str - Source string; its code units come from Utf.charCodes.
 * @returns {number[]} UTF-8 bytes (0-255), without BOM.
 */
Utf.encodeUTF8 = function(str) {
	var units = Utf.charCodes(str), bytes = [], i, len, c, low;
	for (i = 0, len = units.length; i < len; i++) {
		c = units[i];
		if (c >= 0xd800 && c <= 0xdbff) {
			// Past the end `low` is undefined, so the range test fails.
			low = units[i + 1];
			if (low >= 0xdc00 && low <= 0xdfff) { // 4 bytes
				// utf16  110110wwwwxxxxxx  110111yyyyyyyyyy
				// utf8   11110uuu  10uuxxxx  10xxyyyy  10yyyyyy  (uuuuu = wwww + 1)
				c = (c & 1023) + 64; // 11 bits: uuuuu xxxxxx
				low &= 1023;         // 10 bits: yyyyyyyyyy
				bytes.push(
					0xf0 | c >>> 8,
					0x80 | c >>> 2 & 63,
					0x80 | (c & 3) << 4 | low >>> 6,
					0x80 | low & 63
				);
				i++; // the low surrogate has been consumed
				continue;
			}
			c = 0xfffd; // unpaired high surrogate
		} else if (c >= 0xdc00 && c <= 0xdfff) {
			c = 0xfffd; // unpaired low surrogate
		}
		if (c <= 0x7f) { // 1 byte
			bytes.push(c);
		} else if (c <= 0x7ff) { // 2 bytes
			bytes.push(0xc0 | c >>> 6, 0x80 | c & 0x3f);
		} else if (c <= 0xffff) { // 3 bytes
			bytes.push(0xe0 | c >>> 12, 0x80 | c >>> 6 & 0x3f, 0x80 | c & 0x3f);
		}
	}
	return bytes;
};

UTF-8/UTF-16バイト列から文字列への変換

上記と逆の処理を行えばよい

/**
 * Decodes an array of UTF-8 bytes into a JavaScript string.
 *
 * Only well-formed sequences are accepted. Each lead byte fixes how many
 * continuation bytes must follow and which range the first of them may lie
 * in; the narrowed ranges for E0, ED, F0 and F4 exclude overlong forms,
 * surrogate code points and values above U+10FFFF. When a sequence is invalid
 * or truncated, one U+FFFD is emitted and decoding resumes at the offending
 * byte, so a valid byte after a broken sequence is not swallowed.
 * Supplementary code points are returned as a surrogate pair.
 *
 * @param {number[]} arr - Bytes (0-255). A leading BOM is not removed.
 * @returns {string} Decoded text.
 */
Utf.decodeUTF8 = function(arr) {
	var fromCharCode = String.fromCharCode;
	var str = "", len = arr.length, i = 0;
	var lead, needed, cp, lower, upper, b;
	while (i < len) {
		lead = arr[i++];
		if (lead <= 0x7f) { // 1 byte
			str += fromCharCode(lead);
			continue;
		}
		lower = 0x80;
		upper = 0xbf;
		if (lead >= 0xc2 && lead <= 0xdf) { // 2 bytes (C0/C1 would be overlong)
			needed = 1;
			cp = lead & 0x1f;
		} else if (lead >= 0xe0 && lead <= 0xef) { // 3 bytes
			needed = 2;
			cp = lead & 0x0f;
			if (lead === 0xe0) {lower = 0xa0;}      // below would be overlong
			else if (lead === 0xed) {upper = 0x9f;} // above would be a surrogate
		} else if (lead >= 0xf0 && lead <= 0xf4) { // 4 bytes
			needed = 3;
			cp = lead & 0x07;
			if (lead === 0xf0) {lower = 0x90;}      // below would be overlong
			else if (lead === 0xf4) {upper = 0x8f;} // above would exceed U+10FFFF
		} else { // stray continuation byte or invalid lead byte
			str += fromCharCode(0xfffd);
			continue;
		}
		while (needed > 0) {
			b = arr[i]; // undefined past the end, which fails the range test
			if (!(b >= lower && b <= upper)) {break;}
			cp = cp << 6 | b & 0x3f;
			// Only the first continuation byte has a narrowed range.
			lower = 0x80;
			upper = 0xbf;
			i++;
			needed--;
		}
		if (needed > 0) {
			// Leave arr[i] unconsumed so it is decoded on the next iteration.
			str += fromCharCode(0xfffd);
		} else if (cp > 0xffff) {
			cp -= 0x10000;
			str += fromCharCode(0xd800 | cp >>> 10, 0xdc00 | cp & 0x3ff);
		} else {
			str += fromCharCode(cp);
		}
	}
	return str;
};
/**
 * Decodes an array of UTF-16 bytes into a JavaScript string.
 *
 * A leading BOM selects the byte order and is removed: FF FE means
 * little-endian, FE FF means big-endian. Without a BOM the input is read as
 * big-endian. Code units are copied as-is, so surrogate pairs are preserved
 * but not validated. If the input ends with a single dangling byte, U+FFFD is
 * appended rather than inventing a code unit from half of one.
 *
 * @param {number[]} arr - Bytes (0-255).
 * @returns {string} Decoded text.
 */
Utf.decodeUTF16 = function(arr) {
	var littleEndian = false, i = 0, len = arr.length, str = "";
	if (len >= 2) { // remove BOM
		if (arr[0] === 0xff && arr[1] === 0xfe) {
			littleEndian = true;
			i = 2;
		} else if (arr[0] === 0xfe && arr[1] === 0xff) {
			i = 2;
		}
	}
	// Stop before a trailing odd byte so both halves of every unit exist.
	for (; i + 1 < len; i += 2) {
		str += String.fromCharCode(
			littleEndian ? (arr[i + 1] << 8 | arr[i]) : (arr[i] << 8 | arr[i + 1])
		);
	}
	if (i < len) {str += String.fromCharCode(0xfffd);}
	return str;
};
15
11
2

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
15
11

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?