More than 3 years have passed since last update.

Emacs の SJIS 文字コード検出に UTF-8 回避(努力)のパッチをあてる

Last updated at 2022-03-19 / Posted at 2020-04-05

時々 Emacs が UTF-8 のテキストを SJIS と誤判定することがある。

そこで、SJIS の検出処理に

CP932 未定義文字の調査
バイト列が UTF-8 としても成立している場合に、SJIS として解釈したときの全角文字と半角カナの数を利用した判定

を追加してみる。

Emacs-26.3用パッチ

emacs-26.3-sjis.patch

--- emacs-26.3/src/coding.c.orig	2020-04-02 18:20:40.368317678 +0900
+++ emacs-26.3/src/coding.c	2020-04-06 06:49:33.253716262 +0900
@@ -4599,0 +4600,23 @@
+static const unsigned char detect_coding_sjis_valid_table1[64] =
+  {
+   0, 1, 2, 3, 4, 0, 0, 5, 6, 7, 7, 7, 7, 7, 7, 7,
+   7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 7,
+   7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 0, 0, 7, 10, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 11, 0, 0, 0,
+  };
+static const unsigned int detect_coding_sjis_valid_table2[12][8] =
+  {
+   { 0,0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xff001fff, 0xfc007f00, 0x10ff01ff },
+   { 0,0, 0x01ff8000, 0x03ffffff, 0x87fffffe, 0xffffffff, 0xffffffff, 0x0003ffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0x807fffff, 0x807fffff, 0x007fffff, 0x00000000 },
+   { 0,0, 0xffffffff, 0x7fff0001, 0x8003ffff, 0x7fffffff, 0x00000000, 0x00000000 },
+   { 0,0, 0xbfffffff, 0x403fffff, 0x1fffffff, 0x00000000, 0x00000000, 0x00000000 },
+   { 0,0, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x0007ffff, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0x0000001f, 0x00000000, 0x00000000 },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fff9fff },
+   { 0,0, 0x00000fff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+  };
+
@@ -4612,0 +4636,10 @@
+  int utf8_mode = detect_coding_sjis_check_utf_8;
+  int utf8_mask = 0;
+  int utf8_top = 0;
+  int hankaku_count = 0;
+  int zenkaku_count = 0;
+  int zenhan_count = 0;
+  int zenzen_count = 0;
+  int zenkaku_seqlen = 0;
+  int sjis_high, sjis_hpos;
+
@@ -4620,0 +4654,3 @@
+  if (utf8_mode)
+    goto check_utf_8_mode;
+
@@ -4643,0 +4680,8 @@
+  if (utf8_mode && zenkaku_count &&
+      ((zenkaku_count == zenzen_count*3+zenhan_count) ||
+       (hankaku_count && zenkaku_count == zenhan_count)))
+    {
+      detect_info->rejected |= CATEGORY_MASK_SJIS;
+      return 0;
+    }
+
@@ -4650,0 +4695,108 @@
+
+ check_utf_8_mode:
+  while (1)
+    {
+      src_base = src;
+
+      utf8_mask = (utf8_mask >> 1) & utf8_top;
+      ONE_MORE_BYTE (c);
+      if (c < 0x80)
+	{
+	ascii:
+	  zenkaku_seqlen = 0;
+	  if (utf8_mask)
+	    utf8_mode = 0;
+	  continue;
+	}
+      if (c == 0x80)
+	break;
+      if (c <= 0xBF)
+	{
+	  if (!utf8_mask)
+	    utf8_mode = 0;
+	  if (c <= 0x9F)
+	    goto second_byte;
+	  if (c == 0xA0)
+	    {
+	      if (!detect_coding_sjis_exclude_nbsp_if_check_utf_8)
+		goto ascii;
+	      break;
+	    }
+	  goto hankaku;
+	}
+      if (c <= 0xDF)
+	{
+	  if (utf8_mask)
+	      utf8_mode = 0;
+	  else
+	    {
+	      utf8_mask = 0x80;
+	      utf8_top = c;
+	    }
+	hankaku:
+	  hankaku_count++;
+	  if (utf8_mask && zenkaku_seqlen == 1)
+	    zenhan_count++;
+	  zenkaku_seqlen = 0;
+	  found = CATEGORY_MASK_SJIS;
+	  continue;
+	}
+      if (c > max_first_byte_of_2_byte_code)
+	break;
+      if (c >= 0xFC || utf8_mask)
+	utf8_mode = 0;
+      else
+	{
+	  utf8_mask = 0x80;
+	  utf8_top = c;
+	}
+
+    second_byte:
+      sjis_high = c;
+
+      utf8_mask = (utf8_mask >> 1) & utf8_top;
+      ONE_MORE_BYTE (c);
+      if (c < 0x40)
+	break;
+      if (c < 0x80)
+	{
+	  if (c == 0x7F)
+	    break;
+	  if (utf8_mask)
+	    utf8_mode = 0;
+	}
+      else if (c <= 0xBF)
+	{
+	  if (!utf8_mask)
+	    utf8_mode = 0;
+	}
+      else if (c > 0xFC)
+	break;
+      else if (c == 0xFC || utf8_mask)
+	utf8_mode = 0;
+      else
+	{
+	  utf8_mask = 0x80;
+	  utf8_top = c;
+	}
+
+      if (detect_coding_sjis_exclude_undefined_if_check_utf_8)
+	{
+	  sjis_hpos = detect_coding_sjis_valid_table1[sjis_high & 0x3f];
+	  if (!(detect_coding_sjis_valid_table2[sjis_hpos][(c >> 5)] & (1 << (c & 0x1f))))
+	    break;
+	}
+
+      zenkaku_count++;
+      if (!utf8_mode || !utf8_mask)
+	zenkaku_seqlen = 0;
+      else if (++zenkaku_seqlen == 3)
+	{
+	  zenzen_count++;
+	  zenkaku_seqlen = 0;
+	}
+
+      found = CATEGORY_MASK_SJIS;
+    }
+  detect_info->rejected |= CATEGORY_MASK_SJIS;
+  return 0;
@@ -11277,0 +11430,18 @@
+
+  DEFVAR_BOOL ("detect-coding-sjis-check-utf-8", detect_coding_sjis_check_utf_8,
+	       doc: /* If non-nil, try to avoid confusion of UTF-8 in SJIS detection.  */);
+  detect_coding_sjis_check_utf_8 = 0;
+
+  DEFVAR_BOOL ("detect-coding-sjis-exclude-nbsp-if-check-utf-8",
+	       detect_coding_sjis_exclude_nbsp_if_check_utf_8,
+	       doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include NBSP in SJIS.  */);
+  detect_coding_sjis_exclude_nbsp_if_check_utf_8 = !0;
+
+  DEFVAR_BOOL ("detect-coding-sjis-exclude-undefined-if-check-utf-8",
+	       detect_coding_sjis_exclude_undefined_if_check_utf_8,
+	       doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include undefined character in SJIS.  */);
+  detect_coding_sjis_exclude_undefined_if_check_utf_8 = !0;

Emacs (git: 1467b04f5cf586c0f44b7df00591986fa8d40c66) 用パッチ

diff --git a/src/coding.c b/src/coding.c
index c16598d275..c4d21308b3 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -4571,6 +4571,29 @@ encode_coding_iso_2022 (struct coding_system *coding)
 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
    Return true if a text is encoded in SJIS.  */
 
+static const unsigned char detect_coding_sjis_valid_table1[64] =
+  {
+   0, 1, 2, 3, 4, 0, 0, 5, 6, 7, 7, 7, 7, 7, 7, 7,
+   7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 7,
+   7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 0, 0, 7, 10, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 11, 0, 0, 0,
+  };
+static const unsigned int detect_coding_sjis_valid_table2[12][8] =
+  {
+   { 0,0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xff001fff, 0xfc007f00, 0x10ff01ff },
+   { 0,0, 0x01ff8000, 0x03ffffff, 0x87fffffe, 0xffffffff, 0xffffffff, 0x0003ffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0x807fffff, 0x807fffff, 0x007fffff, 0x00000000 },
+   { 0,0, 0xffffffff, 0x7fff0001, 0x8003ffff, 0x7fffffff, 0x00000000, 0x00000000 },
+   { 0,0, 0xbfffffff, 0x403fffff, 0x1fffffff, 0x00000000, 0x00000000, 0x00000000 },
+   { 0,0, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x0007ffff, 0x80000000, 0xffffffff, 0xffffffff, 0x1fffffff },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0x0000001f, 0x00000000, 0x00000000 },
+   { 0,0, 0xffffffff, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x1fff9fff },
+   { 0,0, 0x00000fff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+  };
+
 static bool
 detect_coding_sjis (struct coding_system *coding,
 		    struct coding_detection_info *detect_info)
@@ -4584,6 +4607,16 @@ detect_coding_sjis (struct coding_system *coding,
   Lisp_Object attrs, charset_list;
   int max_first_byte_of_2_byte_code;
 
+  int utf8_mode = detect_coding_sjis_check_utf_8;
+  int utf8_mask = 0;
+  int utf8_top = 0;
+  int hankaku_count = 0;
+  int zenkaku_count = 0;
+  int zenhan_count = 0;
+  int zenzen_count = 0;
+  int zenkaku_seqlen = 0;
+  int sjis_high, sjis_hpos;
+
   CODING_GET_INFO (coding, attrs, charset_list);
   max_first_byte_of_2_byte_code = list_length (charset_list) <= 3 ? 0xEF : 0xFC;
 
@@ -4591,6 +4624,9 @@ detect_coding_sjis (struct coding_system *coding,
   /* A coding system of this category is always ASCII compatible.  */
   src += coding->head_ascii;
 
+  if (utf8_mode)
+    goto check_utf_8_mode;
+
   while (1)
     {
       src_base = src;
@@ -4614,6 +4650,14 @@ detect_coding_sjis (struct coding_system *coding,
   return 0;
 
  no_more_source:
+  if (utf8_mode && zenkaku_count &&
+      ((zenkaku_count == zenzen_count*3+zenhan_count) ||
+       (hankaku_count && zenkaku_count == zenhan_count)))
+    {
+      detect_info->rejected |= CATEGORY_MASK_SJIS;
+      return 0;
+    }
+
   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
     {
       detect_info->rejected |= CATEGORY_MASK_SJIS;
@@ -4621,6 +4665,114 @@ detect_coding_sjis (struct coding_system *coding,
     }
   detect_info->found |= found;
   return 1;
+
+ check_utf_8_mode:
+  while (1)
+    {
+      src_base = src;
+
+      utf8_mask = (utf8_mask >> 1) & utf8_top;
+      ONE_MORE_BYTE (c);
+      if (c < 0x80)
+	{
+	ascii:
+	  zenkaku_seqlen = 0;
+	  if (utf8_mask)
+	    utf8_mode = 0;
+	  continue;
+	}
+      if (c == 0x80)
+	break;
+      if (c <= 0xBF)
+	{
+	  if (!utf8_mask)
+	    utf8_mode = 0;
+	  if (c <= 0x9F)
+	    goto second_byte;
+	  if (c == 0xA0)
+	    {
+	      if (!detect_coding_sjis_exclude_nbsp_if_check_utf_8)
+		goto ascii;
+	      break;
+	    }
+	  goto hankaku;
+	}
+      if (c <= 0xDF)
+	{
+	  if (utf8_mask)
+	      utf8_mode = 0;
+	  else
+	    {
+	      utf8_mask = 0x80;
+	      utf8_top = c;
+	    }
+	hankaku:
+	  hankaku_count++;
+	  if (utf8_mask && zenkaku_seqlen == 1)
+	    zenhan_count++;
+	  zenkaku_seqlen = 0;
+	  found = CATEGORY_MASK_SJIS;
+	  continue;
+	}
+      if (c > max_first_byte_of_2_byte_code)
+	break;
+      if (c >= 0xFC || utf8_mask)
+	utf8_mode = 0;
+      else
+	{
+	  utf8_mask = 0x80;
+	  utf8_top = c;
+	}
+
+    second_byte:
+      sjis_high = c;
+
+      utf8_mask = (utf8_mask >> 1) & utf8_top;
+      ONE_MORE_BYTE (c);
+      if (c < 0x40)
+	break;
+      if (c < 0x80)
+	{
+	  if (c == 0x7F)
+	    break;
+	  if (utf8_mask)
+	    utf8_mode = 0;
+	}
+      else if (c <= 0xBF)
+	{
+	  if (!utf8_mask)
+	    utf8_mode = 0;
+	}
+      else if (c > 0xFC)
+	break;
+      else if (c == 0xFC || utf8_mask)
+	utf8_mode = 0;
+      else
+	{
+	  utf8_mask = 0x80;
+	  utf8_top = c;
+	}
+
+      if (detect_coding_sjis_exclude_undefined_if_check_utf_8)
+	{
+	  sjis_hpos = detect_coding_sjis_valid_table1[sjis_high & 0x3f];
+	  if (!(detect_coding_sjis_valid_table2[sjis_hpos][(c >> 5)] & (1 << (c & 0x1f))))
+	    break;
+	}
+
+      zenkaku_count++;
+      if (!utf8_mode || !utf8_mask)
+	zenkaku_seqlen = 0;
+      else if (++zenkaku_seqlen == 3)
+	{
+	  zenzen_count++;
+	  zenkaku_seqlen = 0;
+	}
+
+      found = CATEGORY_MASK_SJIS;
+    }
+  detect_info->rejected |= CATEGORY_MASK_SJIS;
+  return 0;
 }
 
 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
@@ -12158,6 +12310,24 @@ system (e.g. `iso-2022-7bit').
 decode text as usual.  */);
   inhibit_null_byte_detection = 0;
 
+  DEFVAR_BOOL ("detect-coding-sjis-check-utf-8", detect_coding_sjis_check_utf_8,
+	       doc: /* If non-nil, try to avoid confusion of UTF-8 in SJIS detection.  */);
+  detect_coding_sjis_check_utf_8 = 0;
+
+  DEFVAR_BOOL ("detect-coding-sjis-exclude-nbsp-if-check-utf-8",
+	       detect_coding_sjis_exclude_nbsp_if_check_utf_8,
+	       doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include NBSP in SJIS.  */);
+  detect_coding_sjis_exclude_nbsp_if_check_utf_8 = !0;
+
+  DEFVAR_BOOL ("detect-coding-sjis-exclude-undefined-if-check-utf-8",
+	       detect_coding_sjis_exclude_undefined_if_check_utf_8,
+	       doc: /*
+Option for `detect-coding-sjis-check-utf-8'.
+If non-nil, does not include undefined character in SJIS.  */);
+  detect_coding_sjis_exclude_undefined_if_check_utf_8 = !0;
+
   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
 	       doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
 Internal use only.  Remove after the experimental optimizer becomes stable.  */);

処理を有効にするには

EMACS-LISP

(setq detect-coding-sjis-check-utf-8 t) ; nil は従来の動作(デフォルト)

とします。

追記：UTF-8 か SJIS かの文字コード判定が困難(面倒)な文字列からなるファイルは、UTF-8 として認識されます。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up