時々 Emacs が UTF-8 を SJIS と判断する。
そこで、SJIS の検出処理に
- CP932 未定義文字の調査
- UTF-8 としても成立している場合に、全角文字と半角カナの出現数を利用した判定
を追加してみる。
Emacs-26.3用パッチ
emacs-26.3-sjis.patch
--- emacs-26.3/src/coding.c.orig 2020-04-02 18:20:40.368317678 +0900
+++ emacs-26.3/src/coding.c 2020-04-06 06:49:33.253716262 +0900
@@ -4599,0 +4600,23 @@
+static const unsigned char detect_coding_sjis_valid_table1[64] = /* Indexed by (SJIS lead byte & 0x3f); each value is the row of detect_coding_sjis_valid_table2 to consult.  */
+ { /* Value 0 selects row 0 of table2, which has no bits set, so every trail byte fails the bit test.  */
+ 0, 1, 2, 3, 4, 0, 0, 5, 6, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 0, 0, 7, 10, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 11, 0, 0, 0,
+ };
+static const unsigned int detect_coding_sjis_valid_table2[12][8] = /* Per-row bitmap of accepted trail bytes: word [c >> 5], bit (c & 0x1f) for trail byte c.  */
+ { /* A clear bit makes the UTF-8-aware scan reject SJIS; presumably set bits are the code points defined in CP932 -- verify against the CP932 table.  */
+ { 0,0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* Row 0: nothing accepted.  */
+ { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xff001fff, 0xfc007f00, 0x10ff01ff }, /* Words 0 and 1 (trail bytes below 0x40) are zero in every row.  */
+ { 0,0, 0x01ff8000, 0x03ffffff, 0x87fffffe, 0xffffffff, 0xffffffff, 0x0003ffff },
+ { 0,0, 0xffffffff, 0x7fffffff, 0x807fffff, 0x807fffff, 0x007fffff, 0x00000000 },
+ { 0,0, 0xffffffff, 0x7fff0001, 0x8003ffff, 0x7fffffff, 0x00000000, 0x00000000 },
+ { 0,0, 0xbfffffff, 0x403fffff, 0x1fffffff, 0x00000000, 0x00000000, 0x00000000 },
+ { 0,0, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+ { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fffffff },
+ { 0,0, 0xffffffff, 0x0007ffff, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+ { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0x0000001f, 0x00000000, 0x00000000 },
+ { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fff9fff },
+ { 0,0, 0x00000fff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+ };
+
@@ -4612,0 +4636,10 @@
+ int utf8_mode = detect_coding_sjis_check_utf_8; /* Starts from the user option; cleared once the bytes stop being consistent with UTF-8.  */
+ int utf8_mask = 0; /* Shifted right and masked with utf8_top before each byte; nonzero means a UTF-8 continuation byte is expected.  */
+ int utf8_top = 0; /* Most recent byte accepted as a possible UTF-8 lead byte.  */
+ int hankaku_count = 0; /* Single-byte characters counted at the hankaku label (bytes 0xA1..0xDF).  */
+ int zenkaku_count = 0; /* Two-byte characters accepted.  */
+ int zenhan_count = 0; /* Times a hankaku byte followed exactly one two-byte character while utf8_mask was nonzero.  */
+ int zenzen_count = 0; /* Completed runs of three consecutive two-byte characters.  */
+ int zenkaku_seqlen = 0; /* Length of the current run of two-byte characters.  */
+ int sjis_high, sjis_hpos; /* Lead byte of the current two-byte character, and its row in detect_coding_sjis_valid_table2.  */
+
@@ -4620,0 +4654,3 @@
+ if (utf8_mode) /* Option enabled: scan with the UTF-8-aware loop at check_utf_8_mode instead.  */
+ goto check_utf_8_mode;
+
@@ -4643,0 +4680,8 @@
+ if (utf8_mode && zenkaku_count && /* Input stayed UTF-8-consistent and contained at least one two-byte character.  */
+ ((zenkaku_count == zenzen_count*3+zenhan_count) || /* Every two-byte character is accounted for by a counted triple or a counted zenkaku+hankaku pair.  */
+ (hankaku_count && zenkaku_count == zenhan_count)))
+ {
+ detect_info->rejected |= CATEGORY_MASK_SJIS; /* Do not report the text as SJIS.  */
+ return 0;
+ }
+
@@ -4650,0 +4695,108 @@
+
+ check_utf_8_mode: /* UTF-8-aware scan, entered when detect_coding_sjis_check_utf_8 is set; counts SJIS characters while tracking whether the bytes also read as UTF-8.  */
+ while (1)
+ {
+ src_base = src;
+
+ utf8_mask = (utf8_mask >> 1) & utf8_top; /* Nonzero here means the next byte is expected to be a UTF-8 continuation byte.  */
+ ONE_MORE_BYTE (c);
+ if (c < 0x80)
+ {
+ ascii:
+ zenkaku_seqlen = 0;
+ if (utf8_mask) /* A continuation byte was expected: no longer UTF-8.  */
+ utf8_mode = 0;
+ continue;
+ }
+ if (c == 0x80) /* 0x80 is never accepted: break out and reject SJIS.  */
+ break;
+ if (c <= 0xBF)
+ {
+ if (!utf8_mask) /* Byte in 0x81..0xBF where no continuation byte was expected.  */
+ utf8_mode = 0;
+ if (c <= 0x9F) /* 0x81..0x9F starts a two-byte character.  */
+ goto second_byte;
+ if (c == 0xA0)
+ {
+ if (!detect_coding_sjis_exclude_nbsp_if_check_utf_8) /* Option off: handle 0xA0 through the ascii path.  */
+ goto ascii;
+ break;
+ }
+ goto hankaku; /* 0xA1..0xBF is a single-byte character.  */
+ }
+ if (c <= 0xDF) /* 0xC0..0xDF: single byte in SJIS, possible UTF-8 lead byte.  */
+ {
+ if (utf8_mask)
+ utf8_mode = 0;
+ else
+ {
+ utf8_mask = 0x80;
+ utf8_top = c;
+ }
+ hankaku:
+ hankaku_count++;
+ if (utf8_mask && zenkaku_seqlen == 1) /* Exactly one two-byte character followed by this single byte.  */
+ zenhan_count++;
+ zenkaku_seqlen = 0;
+ found = CATEGORY_MASK_SJIS;
+ continue;
+ }
+ if (c > max_first_byte_of_2_byte_code)
+ break;
+ if (c >= 0xFC || utf8_mask)
+ utf8_mode = 0;
+ else
+ {
+ utf8_mask = 0x80;
+ utf8_top = c;
+ }
+
+ second_byte: /* c is the lead byte; read and validate the trail byte.  */
+ sjis_high = c;
+
+ utf8_mask = (utf8_mask >> 1) & utf8_top;
+ ONE_MORE_BYTE (c);
+ if (c < 0x40) /* Trail bytes below 0x40, equal to 0x7F, or above 0xFC are rejected.  */
+ break;
+ if (c < 0x80)
+ {
+ if (c == 0x7F)
+ break;
+ if (utf8_mask)
+ utf8_mode = 0;
+ }
+ else if (c <= 0xBF)
+ {
+ if (!utf8_mask)
+ utf8_mode = 0;
+ }
+ else if (c > 0xFC)
+ break;
+ else if (c == 0xFC || utf8_mask)
+ utf8_mode = 0;
+ else
+ {
+ utf8_mask = 0x80;
+ utf8_top = c;
+ }
+
+ if (detect_coding_sjis_exclude_undefined_if_check_utf_8)
+ {
+ sjis_hpos = detect_coding_sjis_valid_table1[sjis_high & 0x3f];
+ if (!(detect_coding_sjis_valid_table2[sjis_hpos][(c >> 5)] & (1u << (c & 0x1f)))) /* 1u: the shift count can be 31 (e.g. c == 0x5F); shifting a signed 1 that far is undefined.  */
+ break;
+ }
+
+ zenkaku_count++;
+ if (!utf8_mode || !utf8_mask)
+ zenkaku_seqlen = 0;
+ else if (++zenkaku_seqlen == 3) /* Three two-byte characters in a row while still UTF-8-consistent.  */
+ {
+ zenzen_count++;
+ zenkaku_seqlen = 0;
+ }
+
+ found = CATEGORY_MASK_SJIS;
+ }
+ detect_info->rejected |= CATEGORY_MASK_SJIS; /* Reached by break: the input is not accepted as SJIS.  */
+ return 0;
@@ -11277,0 +11430,18 @@
+
+ DEFVAR_BOOL ("detect-coding-sjis-check-utf-8", detect_coding_sjis_check_utf_8,
+ doc: /* If non-nil, try to avoid confusion of UTF-8 in SJIS detection. */);
+ detect_coding_sjis_check_utf_8 = 0; /* Off by default, so detect_coding_sjis does not jump to check_utf_8_mode unless the user enables it.  */
+
+ DEFVAR_BOOL ("detect-coding-sjis-exclude-nbsp-if-check-utf-8",
+ detect_coding_sjis_exclude_nbsp_if_check_utf_8,
+ doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include NBSP in SJIS. */);
+ detect_coding_sjis_exclude_nbsp_if_check_utf_8 = !0; /* On by default: byte 0xA0 makes the UTF-8-aware scan reject SJIS.  */
+
+ DEFVAR_BOOL ("detect-coding-sjis-exclude-undefined-if-check-utf-8",
+ detect_coding_sjis_exclude_undefined_if_check_utf_8,
+ doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include undefined character in SJIS. */);
+ detect_coding_sjis_exclude_undefined_if_check_utf_8 = !0; /* On by default: two-byte codes are checked against detect_coding_sjis_valid_table1/2.  */
Emacs (git: 1467b04f5cf586c0f44b7df00591986fa8d40c66) 用パッチ
diff --git a/src/coding.c b/src/coding.c
index c16598d275..c4d21308b3 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -4571,6 +4571,29 @@ encode_coding_iso_2022 (struct coding_system *coding)
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
Return true if a text is encoded in SJIS. */
+static const unsigned char detect_coding_sjis_valid_table1[64] = /* Indexed by (SJIS lead byte & 0x3f); each value is the row of detect_coding_sjis_valid_table2 to consult.  */
+ { /* Value 0 selects row 0 of table2, which has no bits set, so every trail byte fails the bit test.  */
+ 0, 1, 2, 3, 4, 0, 0, 5, 6, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 0, 0, 7, 10, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 11, 0, 0, 0,
+ };
+static const unsigned int detect_coding_sjis_valid_table2[12][8] = /* Per-row bitmap of accepted trail bytes: word [c >> 5], bit (c & 0x1f) for trail byte c.  */
+ { /* A clear bit makes the UTF-8-aware scan reject SJIS; presumably set bits are the code points defined in CP932 -- verify against the CP932 table.  */
+ { 0,0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* Row 0: nothing accepted.  */
+ { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xff001fff, 0xfc007f00, 0x10ff01ff }, /* Words 0 and 1 (trail bytes below 0x40) are zero in every row.  */
+ { 0,0, 0x01ff8000, 0x03ffffff, 0x87fffffe, 0xffffffff, 0xffffffff, 0x0003ffff },
+ { 0,0, 0xffffffff, 0x7fffffff, 0x807fffff, 0x807fffff, 0x007fffff, 0x00000000 },
+ { 0,0, 0xffffffff, 0x7fff0001, 0x8003ffff, 0x7fffffff, 0x00000000, 0x00000000 },
+ { 0,0, 0xbfffffff, 0x403fffff, 0x1fffffff, 0x00000000, 0x00000000, 0x00000000 },
+ { 0,0, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+ { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fffffff },
+ { 0,0, 0xffffffff, 0x0007ffff, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+ { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0x0000001f, 0x00000000, 0x00000000 },
+ { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fff9fff },
+ { 0,0, 0x00000fff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+ };
+
static bool
detect_coding_sjis (struct coding_system *coding,
struct coding_detection_info *detect_info)
@@ -4584,6 +4607,16 @@ detect_coding_sjis (struct coding_system *coding,
Lisp_Object attrs, charset_list;
int max_first_byte_of_2_byte_code;
+ int utf8_mode = detect_coding_sjis_check_utf_8; /* Starts from the user option; cleared once the bytes stop being consistent with UTF-8.  */
+ int utf8_mask = 0; /* Shifted right and masked with utf8_top before each byte; nonzero means a UTF-8 continuation byte is expected.  */
+ int utf8_top = 0; /* Most recent byte accepted as a possible UTF-8 lead byte.  */
+ int hankaku_count = 0; /* Single-byte characters counted at the hankaku label (bytes 0xA1..0xDF).  */
+ int zenkaku_count = 0; /* Two-byte characters accepted.  */
+ int zenhan_count = 0; /* Times a hankaku byte followed exactly one two-byte character while utf8_mask was nonzero.  */
+ int zenzen_count = 0; /* Completed runs of three consecutive two-byte characters.  */
+ int zenkaku_seqlen = 0; /* Length of the current run of two-byte characters.  */
+ int sjis_high, sjis_hpos; /* Lead byte of the current two-byte character, and its row in detect_coding_sjis_valid_table2.  */
+
CODING_GET_INFO (coding, attrs, charset_list);
max_first_byte_of_2_byte_code = list_length (charset_list) <= 3 ? 0xEF : 0xFC;
@@ -4591,6 +4624,9 @@ detect_coding_sjis (struct coding_system *coding,
/* A coding system of this category is always ASCII compatible. */
src += coding->head_ascii;
+ if (utf8_mode) /* Option enabled: skip the loop below and scan with the UTF-8-aware loop at check_utf_8_mode.  */
+ goto check_utf_8_mode;
+
while (1)
{
src_base = src;
@@ -4614,6 +4650,14 @@ detect_coding_sjis (struct coding_system *coding,
return 0;
no_more_source:
+ if (utf8_mode && zenkaku_count && /* Input stayed UTF-8-consistent and contained at least one two-byte character.  */
+ ((zenkaku_count == zenzen_count*3+zenhan_count) || /* Every two-byte character is accounted for by a counted triple or a counted zenkaku+hankaku pair.  */
+ (hankaku_count && zenkaku_count == zenhan_count)))
+ {
+ detect_info->rejected |= CATEGORY_MASK_SJIS; /* Do not report the text as SJIS.  */
+ return 0;
+ }
+
if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
{
detect_info->rejected |= CATEGORY_MASK_SJIS;
@@ -4621,6 +4665,114 @@ detect_coding_sjis (struct coding_system *coding,
}
detect_info->found |= found;
return 1;
+
+ check_utf_8_mode: /* UTF-8-aware scan, entered when detect_coding_sjis_check_utf_8 is set; counts SJIS characters while tracking whether the bytes also read as UTF-8.  */
+ while (1)
+ {
+ src_base = src;
+
+ utf8_mask = (utf8_mask >> 1) & utf8_top; /* Nonzero here means the next byte is expected to be a UTF-8 continuation byte.  */
+ ONE_MORE_BYTE (c);
+ if (c < 0x80)
+ {
+ ascii:
+ zenkaku_seqlen = 0;
+ if (utf8_mask) /* A continuation byte was expected: no longer UTF-8.  */
+ utf8_mode = 0;
+ continue;
+ }
+ if (c == 0x80) /* 0x80 is never accepted: break out and reject SJIS.  */
+ break;
+ if (c <= 0xBF)
+ {
+ if (!utf8_mask) /* Byte in 0x81..0xBF where no continuation byte was expected.  */
+ utf8_mode = 0;
+ if (c <= 0x9F) /* 0x81..0x9F starts a two-byte character.  */
+ goto second_byte;
+ if (c == 0xA0)
+ {
+ if (!detect_coding_sjis_exclude_nbsp_if_check_utf_8) /* Option off: handle 0xA0 through the ascii path.  */
+ goto ascii;
+ break;
+ }
+ goto hankaku; /* 0xA1..0xBF is a single-byte character.  */
+ }
+ if (c <= 0xDF) /* 0xC0..0xDF: single byte in SJIS, possible UTF-8 lead byte.  */
+ {
+ if (utf8_mask)
+ utf8_mode = 0;
+ else
+ {
+ utf8_mask = 0x80;
+ utf8_top = c;
+ }
+ hankaku:
+ hankaku_count++;
+ if (utf8_mask && zenkaku_seqlen == 1) /* Exactly one two-byte character followed by this single byte.  */
+ zenhan_count++;
+ zenkaku_seqlen = 0;
+ found = CATEGORY_MASK_SJIS;
+ continue;
+ }
+ if (c > max_first_byte_of_2_byte_code)
+ break;
+ if (c >= 0xFC || utf8_mask)
+ utf8_mode = 0;
+ else
+ {
+ utf8_mask = 0x80;
+ utf8_top = c;
+ }
+
+ second_byte: /* c is the lead byte; read and validate the trail byte.  */
+ sjis_high = c;
+
+ utf8_mask = (utf8_mask >> 1) & utf8_top;
+ ONE_MORE_BYTE (c);
+ if (c < 0x40) /* Trail bytes below 0x40, equal to 0x7F, or above 0xFC are rejected.  */
+ break;
+ if (c < 0x80)
+ {
+ if (c == 0x7F)
+ break;
+ if (utf8_mask)
+ utf8_mode = 0;
+ }
+ else if (c <= 0xBF)
+ {
+ if (!utf8_mask)
+ utf8_mode = 0;
+ }
+ else if (c > 0xFC)
+ break;
+ else if (c == 0xFC || utf8_mask)
+ utf8_mode = 0;
+ else
+ {
+ utf8_mask = 0x80;
+ utf8_top = c;
+ }
+
+ if (detect_coding_sjis_exclude_undefined_if_check_utf_8)
+ {
+ sjis_hpos = detect_coding_sjis_valid_table1[sjis_high & 0x3f];
+ if (!(detect_coding_sjis_valid_table2[sjis_hpos][(c >> 5)] & (1u << (c & 0x1f)))) /* 1u: the shift count can be 31 (e.g. c == 0x5F); shifting a signed 1 that far is undefined.  */
+ break;
+ }
+
+ zenkaku_count++;
+ if (!utf8_mode || !utf8_mask)
+ zenkaku_seqlen = 0;
+ else if (++zenkaku_seqlen == 3) /* Three two-byte characters in a row while still UTF-8-consistent.  */
+ {
+ zenzen_count++;
+ zenkaku_seqlen = 0;
+ }
+
+ found = CATEGORY_MASK_SJIS;
+ }
+ detect_info->rejected |= CATEGORY_MASK_SJIS; /* Reached by break: the input is not accepted as SJIS.  */
+ return 0;
}
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
@@ -12158,6 +12310,24 @@ system (e.g. `iso-2022-7bit').
decode text as usual. */);
inhibit_null_byte_detection = 0;
+ DEFVAR_BOOL ("detect-coding-sjis-check-utf-8", detect_coding_sjis_check_utf_8,
+ doc: /* If non-nil, try to avoid confusion of UTF-8 in SJIS detection. */);
+ detect_coding_sjis_check_utf_8 = 0; /* Off by default, so detect_coding_sjis does not jump to check_utf_8_mode unless the user enables it.  */
+
+ DEFVAR_BOOL ("detect-coding-sjis-exclude-nbsp-if-check-utf-8",
+ detect_coding_sjis_exclude_nbsp_if_check_utf_8,
+ doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include NBSP in SJIS. */);
+ detect_coding_sjis_exclude_nbsp_if_check_utf_8 = !0; /* On by default: byte 0xA0 makes the UTF-8-aware scan reject SJIS.  */
+
+ DEFVAR_BOOL ("detect-coding-sjis-exclude-undefined-if-check-utf-8",
+ detect_coding_sjis_exclude_undefined_if_check_utf_8,
+ doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include undefined character in SJIS. */);
+ detect_coding_sjis_exclude_undefined_if_check_utf_8 = !0; /* On by default: two-byte codes are checked against detect_coding_sjis_valid_table1/2.  */
+
DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
Internal use only. Remove after the experimental optimizer becomes stable. */);
処理を有効にするには
EMACS-LISP
(setq detect-coding-sjis-check-utf-8 t) ; nil は従来の動作(デフォルト)
とします。
追記:テキストの文字コード判定が困難(面倒)な文字列 の UTF-8/SJIS ファイルは、UTF-8 側の認識になります。