1
1

More than 1 year has passed since last update.

Emacs の SJIS 文字コード検出に UTF-8 回避(努力)のパッチをあてる

Last updated at Posted at 2020-04-05

時々 Emacs が UTF-8 を SJIS と判断する。

そこで、SJIS の検出処理に

  • CP932 未定義文字の調査
  • バイト列が UTF-8 としても成立する場合に、全角文字と半角カナの出現数を利用した判定

を追加してみる。

Emacs-26.3用パッチ
emacs-26.3-sjis.patch
--- emacs-26.3/src/coding.c.orig	2020-04-02 18:20:40.368317678 +0900
+++ emacs-26.3/src/coding.c	2020-04-06 06:49:33.253716262 +0900
@@ -4599,0 +4600,23 @@
+static const unsigned char detect_coding_sjis_valid_table1[64] = /* Row selector for detect_coding_sjis_valid_table2, indexed by (first byte & 0x3f).  */
+  { /* An entry of 0 selects row 0, whose bitmaps are all zero, so no second byte is accepted.  */
+   0, 1, 2, 3, 4, 0, 0, 5, 6, 7, 7, 7, 7, 7, 7, 7, /* first bytes 0x80-0x8F */
+   7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 7, /* first bytes 0x90-0x9F */
+   7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 0, 0, 7, 10, 0, /* first bytes 0xE0-0xEF */
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 11, 0, 0, 0, /* first bytes 0xF0-0xFF */
+  };
+static const unsigned int detect_coding_sjis_valid_table2[12][8] = /* Bitmaps of accepted second bytes; the row is chosen via detect_coding_sjis_valid_table1.  */
+  { /* For second byte c, bit (c & 0x1f) of word [c >> 5] is tested when detect_coding_sjis_exclude_undefined_if_check_utf_8 is set; a clear bit rejects SJIS.  */
+   { 0,0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* row 0: nothing accepted */
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xff001fff, 0xfc007f00, 0x10ff01ff }, /* words 0-1 are always 0: c < 0x40 is rejected before the lookup */
+   { 0,0, 0x01ff8000, 0x03ffffff, 0x87fffffe, 0xffffffff, 0xffffffff, 0x0003ffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0x807fffff, 0x807fffff, 0x007fffff, 0x00000000 },
+   { 0,0, 0xffffffff, 0x7fff0001, 0x8003ffff, 0x7fffffff, 0x00000000, 0x00000000 },
+   { 0,0, 0xbfffffff, 0x403fffff, 0x1fffffff, 0x00000000, 0x00000000, 0x00000000 },
+   { 0,0, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x0007ffff, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0x0000001f, 0x00000000, 0x00000000 },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fff9fff },
+   { 0,0, 0x00000fff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+  }; /* NOTE(review): presumably the set bits are the CP932-defined code points -- confirm.  The lookup uses `1 << (c & 0x1f)', a signed shift that is undefined for bit 31; `1u <<' would be safer.  */
+
@@ -4612,0 +4636,10 @@
+  int utf8_mode = detect_coding_sjis_check_utf_8;
+  int utf8_mask = 0;
+  int utf8_top = 0;
+  int hankaku_count = 0;
+  int zenkaku_count = 0;
+  int zenhan_count = 0;
+  int zenzen_count = 0;
+  int zenkaku_seqlen = 0;
+  int sjis_high, sjis_hpos;
+
@@ -4620,0 +4654,3 @@
+  if (utf8_mode)
+    goto check_utf_8_mode;
+
@@ -4643,0 +4680,8 @@
+  if (utf8_mode && zenkaku_count &&
+      ((zenkaku_count == zenzen_count*3+zenhan_count) ||
+       (hankaku_count && zenkaku_count == zenhan_count)))
+    {
+      detect_info->rejected |= CATEGORY_MASK_SJIS;
+      return 0;
+    }
+
@@ -4650,0 +4695,108 @@
+
+ check_utf_8_mode:
+  while (1)
+    {
+      src_base = src;
+
+      utf8_mask = (utf8_mask >> 1) & utf8_top;
+      ONE_MORE_BYTE (c);
+      if (c < 0x80)
+	{
+	ascii:
+	  zenkaku_seqlen = 0;
+	  if (utf8_mask)
+	    utf8_mode = 0;
+	  continue;
+	}
+      if (c == 0x80)
+	break;
+      if (c <= 0xBF)
+	{
+	  if (!utf8_mask)
+	    utf8_mode = 0;
+	  if (c <= 0x9F)
+	    goto second_byte;
+	  if (c == 0xA0)
+	    {
+	      if (!detect_coding_sjis_exclude_nbsp_if_check_utf_8)
+		goto ascii;
+	      break;
+	    }
+	  goto hankaku;
+	}
+      if (c <= 0xDF)
+	{
+	  if (utf8_mask)
+	      utf8_mode = 0;
+	  else
+	    {
+	      utf8_mask = 0x80;
+	      utf8_top = c;
+	    }
+	hankaku:
+	  hankaku_count++;
+	  if (utf8_mask && zenkaku_seqlen == 1)
+	    zenhan_count++;
+	  zenkaku_seqlen = 0;
+	  found = CATEGORY_MASK_SJIS;
+	  continue;
+	}
+      if (c > max_first_byte_of_2_byte_code)
+	break;
+      if (c >= 0xFC || utf8_mask)
+	utf8_mode = 0;
+      else
+	{
+	  utf8_mask = 0x80;
+	  utf8_top = c;
+	}
+
+    second_byte:
+      sjis_high = c;
+
+      utf8_mask = (utf8_mask >> 1) & utf8_top;
+      ONE_MORE_BYTE (c);
+      if (c < 0x40)
+	break;
+      if (c < 0x80)
+	{
+	  if (c == 0x7F)
+	    break;
+	  if (utf8_mask)
+	    utf8_mode = 0;
+	}
+      else if (c <= 0xBF)
+	{
+	  if (!utf8_mask)
+	    utf8_mode = 0;
+	}
+      else if (c > 0xFC)
+	break;
+      else if (c == 0xFC || utf8_mask)
+	utf8_mode = 0;
+      else
+	{
+	  utf8_mask = 0x80;
+	  utf8_top = c;
+	}
+
+      if (detect_coding_sjis_exclude_undefined_if_check_utf_8)
+	{
+	  sjis_hpos = detect_coding_sjis_valid_table1[sjis_high & 0x3f];
+	  if (!(detect_coding_sjis_valid_table2[sjis_hpos][(c >> 5)] & (1 << (c & 0x1f))))
+	    break;
+	}
+
+      zenkaku_count++;
+      if (!utf8_mode || !utf8_mask)
+	zenkaku_seqlen = 0;
+      else if (++zenkaku_seqlen == 3)
+	{
+	  zenzen_count++;
+	  zenkaku_seqlen = 0;
+	}
+
+      found = CATEGORY_MASK_SJIS;
+    }
+  detect_info->rejected |= CATEGORY_MASK_SJIS;
+  return 0;
@@ -11277,0 +11430,18 @@
+
+  DEFVAR_BOOL ("detect-coding-sjis-check-utf-8", detect_coding_sjis_check_utf_8,
+	       doc: /* If non-nil, try to avoid confusion of UTF-8 in SJIS detection.  */);
+  detect_coding_sjis_check_utf_8 = 0;
+
+  DEFVAR_BOOL ("detect-coding-sjis-exclude-nbsp-if-check-utf-8",
+	       detect_coding_sjis_exclude_nbsp_if_check_utf_8,
+	       doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include NBSP in SJIS.  */);
+  detect_coding_sjis_exclude_nbsp_if_check_utf_8 = !0;
+
+  DEFVAR_BOOL ("detect-coding-sjis-exclude-undefined-if-check-utf-8",
+	       detect_coding_sjis_exclude_undefined_if_check_utf_8,
+	       doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include undefined character in SJIS.  */);
+  detect_coding_sjis_exclude_undefined_if_check_utf_8 = !0;
Emacs (git: 1467b04f5cf586c0f44b7df00591986fa8d40c66) 用パッチ
diff --git a/src/coding.c b/src/coding.c
index c16598d275..c4d21308b3 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -4571,6 +4571,29 @@ encode_coding_iso_2022 (struct coding_system *coding)
 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
    Return true if a text is encoded in SJIS.  */
 
+static const unsigned char detect_coding_sjis_valid_table1[64] = /* Row selector for detect_coding_sjis_valid_table2, indexed by (first byte & 0x3f).  */
+  { /* An entry of 0 selects row 0, whose bitmaps are all zero, so no second byte is accepted.  */
+   0, 1, 2, 3, 4, 0, 0, 5, 6, 7, 7, 7, 7, 7, 7, 7, /* first bytes 0x80-0x8F */
+   7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 7, /* first bytes 0x90-0x9F */
+   7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 0, 0, 7, 10, 0, /* first bytes 0xE0-0xEF */
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 11, 0, 0, 0, /* first bytes 0xF0-0xFF */
+  };
+static const unsigned int detect_coding_sjis_valid_table2[12][8] = /* Bitmaps of accepted second bytes; the row is chosen via detect_coding_sjis_valid_table1.  */
+  { /* For second byte c, bit (c & 0x1f) of word [c >> 5] is tested when detect_coding_sjis_exclude_undefined_if_check_utf_8 is set; a clear bit rejects SJIS.  */
+   { 0,0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* row 0: nothing accepted */
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xff001fff, 0xfc007f00, 0x10ff01ff }, /* words 0-1 are always 0: c < 0x40 is rejected before the lookup */
+   { 0,0, 0x01ff8000, 0x03ffffff, 0x87fffffe, 0xffffffff, 0xffffffff, 0x0003ffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0x807fffff, 0x807fffff, 0x007fffff, 0x00000000 },
+   { 0,0, 0xffffffff, 0x7fff0001, 0x8003ffff, 0x7fffffff, 0x00000000, 0x00000000 },
+   { 0,0, 0xbfffffff, 0x403fffff, 0x1fffffff, 0x00000000, 0x00000000, 0x00000000 },
+   { 0,0, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x0007ffff, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0x0000001f, 0x00000000, 0x00000000 },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fff9fff },
+   { 0,0, 0x00000fff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+  }; /* NOTE(review): presumably the set bits are the CP932-defined code points -- confirm.  The lookup uses `1 << (c & 0x1f)', a signed shift that is undefined for bit 31; `1u <<' would be safer.  */
+
 static bool
 detect_coding_sjis (struct coding_system *coding,
 		    struct coding_detection_info *detect_info)
@@ -4584,6 +4607,16 @@ detect_coding_sjis (struct coding_system *coding,
   Lisp_Object attrs, charset_list;
   int max_first_byte_of_2_byte_code;
 
+  int utf8_mode = detect_coding_sjis_check_utf_8;
+  int utf8_mask = 0;
+  int utf8_top = 0;
+  int hankaku_count = 0;
+  int zenkaku_count = 0;
+  int zenhan_count = 0;
+  int zenzen_count = 0;
+  int zenkaku_seqlen = 0;
+  int sjis_high, sjis_hpos;
+
   CODING_GET_INFO (coding, attrs, charset_list);
   max_first_byte_of_2_byte_code = list_length (charset_list) <= 3 ? 0xEF : 0xFC;
 
@@ -4591,6 +4624,9 @@ detect_coding_sjis (struct coding_system *coding,
   /* A coding system of this category is always ASCII compatible.  */
   src += coding->head_ascii;
 
+  if (utf8_mode)
+    goto check_utf_8_mode;
+
   while (1)
     {
       src_base = src;
@@ -4614,6 +4650,14 @@ detect_coding_sjis (struct coding_system *coding,
   return 0;
 
  no_more_source:
+  if (utf8_mode && zenkaku_count &&
+      ((zenkaku_count == zenzen_count*3+zenhan_count) ||
+       (hankaku_count && zenkaku_count == zenhan_count)))
+    {
+      detect_info->rejected |= CATEGORY_MASK_SJIS;
+      return 0;
+    }
+
   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
     {
       detect_info->rejected |= CATEGORY_MASK_SJIS;
@@ -4621,6 +4665,114 @@ detect_coding_sjis (struct coding_system *coding,
     }
   detect_info->found |= found;
   return 1;
+
+ check_utf_8_mode:
+  while (1)
+    {
+      src_base = src;
+
+      utf8_mask = (utf8_mask >> 1) & utf8_top;
+      ONE_MORE_BYTE (c);
+      if (c < 0x80)
+	{
+	ascii:
+	  zenkaku_seqlen = 0;
+	  if (utf8_mask)
+	    utf8_mode = 0;
+	  continue;
+	}
+      if (c == 0x80)
+	break;
+      if (c <= 0xBF)
+	{
+	  if (!utf8_mask)
+	    utf8_mode = 0;
+	  if (c <= 0x9F)
+	    goto second_byte;
+	  if (c == 0xA0)
+	    {
+	      if (!detect_coding_sjis_exclude_nbsp_if_check_utf_8)
+		goto ascii;
+	      break;
+	    }
+	  goto hankaku;
+	}
+      if (c <= 0xDF)
+	{
+	  if (utf8_mask)
+	      utf8_mode = 0;
+	  else
+	    {
+	      utf8_mask = 0x80;
+	      utf8_top = c;
+	    }
+	hankaku:
+	  hankaku_count++;
+	  if (utf8_mask && zenkaku_seqlen == 1)
+	    zenhan_count++;
+	  zenkaku_seqlen = 0;
+	  found = CATEGORY_MASK_SJIS;
+	  continue;
+	}
+      if (c > max_first_byte_of_2_byte_code)
+	break;
+      if (c >= 0xFC || utf8_mask)
+	utf8_mode = 0;
+      else
+	{
+	  utf8_mask = 0x80;
+	  utf8_top = c;
+	}
+
+    second_byte:
+      sjis_high = c;
+
+      utf8_mask = (utf8_mask >> 1) & utf8_top;
+      ONE_MORE_BYTE (c);
+      if (c < 0x40)
+	break;
+      if (c < 0x80)
+	{
+	  if (c == 0x7F)
+	    break;
+	  if (utf8_mask)
+	    utf8_mode = 0;
+	}
+      else if (c <= 0xBF)
+	{
+	  if (!utf8_mask)
+	    utf8_mode = 0;
+	}
+      else if (c > 0xFC)
+	break;
+      else if (c == 0xFC || utf8_mask)
+	utf8_mode = 0;
+      else
+	{
+	  utf8_mask = 0x80;
+	  utf8_top = c;
+	}
+
+      if (detect_coding_sjis_exclude_undefined_if_check_utf_8)
+	{
+	  sjis_hpos = detect_coding_sjis_valid_table1[sjis_high & 0x3f];
+	  if (!(detect_coding_sjis_valid_table2[sjis_hpos][(c >> 5)] & (1 << (c & 0x1f))))
+	    break;
+	}
+
+      zenkaku_count++;
+      if (!utf8_mode || !utf8_mask)
+	zenkaku_seqlen = 0;
+      else if (++zenkaku_seqlen == 3)
+	{
+	  zenzen_count++;
+	  zenkaku_seqlen = 0;
+	}
+
+      found = CATEGORY_MASK_SJIS;
+    }
+  detect_info->rejected |= CATEGORY_MASK_SJIS;
+  return 0;
 }
 
 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
@@ -12158,6 +12310,24 @@ system (e.g. `iso-2022-7bit').
 decode text as usual.  */);
   inhibit_null_byte_detection = 0;
 
+  DEFVAR_BOOL ("detect-coding-sjis-check-utf-8", detect_coding_sjis_check_utf_8,
+	       doc: /* If non-nil, try to avoid confusion of UTF-8 in SJIS detection.  */);
+  detect_coding_sjis_check_utf_8 = 0;
+
+  DEFVAR_BOOL ("detect-coding-sjis-exclude-nbsp-if-check-utf-8",
+	       detect_coding_sjis_exclude_nbsp_if_check_utf_8,
+	       doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include NBSP in SJIS.  */);
+  detect_coding_sjis_exclude_nbsp_if_check_utf_8 = !0;
+
+  DEFVAR_BOOL ("detect-coding-sjis-exclude-undefined-if-check-utf-8",
+	       detect_coding_sjis_exclude_undefined_if_check_utf_8,
+	       doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include undefined character in SJIS.  */);
+  detect_coding_sjis_exclude_undefined_if_check_utf_8 = !0;
+
   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
 	       doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
 Internal use only.  Remove after the experimental optimizer becomes stable.  */);

処理を有効にするには

EMACS-LISP
(setq detect-coding-sjis-check-utf-8 t) ; nil は従来の動作(デフォルト)

とします。

追記:「テキストの文字コード判定が困難(面倒)な文字列」の UTF-8/SJIS ファイルは、UTF-8 側の認識になります。

1
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
1