LoginSignup
6

More than 5 years have passed since last update.

JSONやその他ログ等の日本語マルチバイト文字デコード術

Last updated at Posted at 2016-12-11

Emacsで扱うマルチバイト文字のデコードについて書きます
Emacs Advent Calendar 2016

エンコードされた日本語等を含むテキストをEmacsでデコードします。

バッファ内Unicodeの置換

Decode (\u3044)

;; Select a region, then M-x my-decode-backslash-u-string-in-region-and-replace
;; \U3044\U30fc\U307e\U3063\U304f\U3059 -> いーまっくす
(defun my-decode-backslash-u-string-in-region-and-replace ($beg $end)
  "Decode backslash-u escapes in the region between $BEG and $END.
Every \\uXXXX or \\UXXXX sequence, where XXXX is exactly four
hexadecimal digits, is replaced by the character with that code
point.  A high surrogate escape immediately followed by a low
surrogate escape (the way JSON writes characters above U+FFFF) is
combined into a single character.  Backslash-u sequences that are
not followed by four hexadecimal digits are left untouched.

The decoding is done in a temporary buffer; the region is then
replaced by the decoded text."
  (interactive "r")
  (let (($bufstr (buffer-substring-no-properties $beg $end))
        $replaced)
    (with-temp-buffer
      (insert $bufstr)
      (goto-char (point-min))
      ;; Only hexadecimal digits qualify, so text such as "C:\users"
      ;; is not mistaken for an escape.
      (while (re-search-forward "\\\\[uU]\\([0-9a-fA-F]\\{4\\}\\)" nil t)
        (let (($start (match-beginning 0))
              ($code (string-to-number (match-string 1) 16)))
          ;; Point is now just after the escape.  If it was a high
          ;; surrogate and a low surrogate escape follows, merge them.
          (when (and (>= $code #xD800)
                     (<= $code #xDBFF)
                     (looking-at
                      "\\\\[uU]\\([dD][c-fC-F][0-9a-fA-F]\\{2\\}\\)"))
            (setq $code (+ #x10000
                           (* (- $code #xD800) #x400)
                           (- (string-to-number (match-string 1) 16)
                              #xDC00)))
            (goto-char (match-end 0)))
          (delete-region $start (point))
          (insert (string $code))))
      (setq $replaced (buffer-string)))
    (delete-region $beg $end)
    (insert $replaced)))

Decode (&#x3044)

;; Select a region, then M-x my-decode-ampersand-sharp-string-in-region-and-replace
;; &#x3044;&#x30fc;&#x307e;&#x3063;&#x304f;&#x3059; -> いーまっくす
(defun my-decode-ampersand-sharp-string-in-region-and-replace ($beg $end)
  "Decode hexadecimal character references in the region $BEG to $END.
Every reference of the form &#xH...; (one or more hexadecimal
digits between \"&#x\" and \";\") is replaced by the character
with that code point.  References whose payload is not
hexadecimal, or whose value is not a valid Emacs character, are
left untouched.

The decoding is done in a temporary buffer; the region is then
replaced by the decoded text."
  (interactive "r")
  (let (($bufstr (buffer-substring-no-properties $beg $end))
        $replaced)
    (with-temp-buffer
      (insert $bufstr)
      (goto-char (point-min))
      ;; Accept any number of hex digits so that both short (&#x41;)
      ;; and non-BMP (&#x1F600;) references are handled.
      (while (re-search-forward "&#x\\([0-9a-fA-F]+\\);" nil t)
        (let (($code (string-to-number (match-string 1) 16)))
          ;; Skip values that cannot be represented as a character
          ;; instead of signalling an error in the middle of the region.
          (when (characterp $code)
            (replace-match (string $code) t t))))
      (setq $replaced (buffer-string)))
    (delete-region $beg $end)
    (insert $replaced)))

URLの変換

日本語(マルチバイト)ドメインには別途Punycode変換が必要なため、emacs-w3mの関数(w3m-puny-encode-url / w3m-puny-decode-url)が定義されている場合はそれを利用しています。
Emacs25.2以降はデフォルトでpuny変換用の関数が搭載されるようです。

Encode

(my-encode-url-string "https://www.google.co.jp/search?q=いーまっくす")
;; => https://www.google.co.jp/search?q=%E3%81%84%E3%83%BC%E3%81%BE%E3%81%A3%E3%81%8F%E3%81%99
(defun my-encode-url-string ($url)
  "Return $URL percent-encoded by `url-encode-url'.
When `w3m-puny-encode-url' is defined, $URL is passed through it
first and the result is what gets percent-encoded."
  (url-encode-url
   (if (fboundp 'w3m-puny-encode-url)
       (w3m-puny-encode-url $url)
     $url)))

Decode

(my-decode-url-string "https://www.google.co.jp/search?q=%E3%81%84%E3%83%BC%E3%81%BE%E3%81%A3%E3%81%8F%E3%81%99")
;; => https://www.google.co.jp/search?q=いーまっくす
(defun my-decode-url-string ($url)
  "Return $URL with its percent-escapes decoded as UTF-8 text.
When `w3m-puny-decode-url' is defined, $URL is passed through it
before the percent-escapes are expanded by `url-unhex-string'."
  (let (($source (if (fboundp 'w3m-puny-decode-url)
                     (w3m-puny-decode-url $url)
                   $url)))
    (decode-coding-string (url-unhex-string $source) 'utf-8)))

Multi-byte (\345\215\203)

バックスラッシュに続く3桁の8進数で各バイトを表した文字列(ここではUTF-8のバイト列)を扱います。

Encode

(my-encode-as-octet-string "emacsを始めよう!")
;; => emacs\343\202\222\345\247\213\343\202\201\343\202\210\343\201\206\357\274\201
(defun my-encode-as-octet-string ($str)
  "Return $STR with its characters encoded to UTF-8 bytes.
The text is inserted into a temporary buffer, encoded in place
with `encode-coding-region', and the buffer contents are returned."
  (with-temp-buffer
    (insert $str)
    (let (($start (point-min))
          ($stop (point-max)))
      (encode-coding-region $start $stop 'utf-8))
    (buffer-substring (point-min) (point-max))))

Decode

(my-decode-octet-string "emacs\343\202\222\345\247\213\343\202\201\343\202\210\343\201\206\357\274\201")
;; => emacsを始めよう!
(defun my-decode-octet-string ($str)
  "Return $STR with its bytes decoded as UTF-8 text.
The string is inserted into a temporary buffer, decoded in place
with `decode-coding-region', and the buffer contents are returned."
  (with-temp-buffer
    (insert $str)
    (let (($start (point-min))
          ($stop (point-max)))
      (decode-coding-region $start $stop 'utf-8))
    (buffer-substring (point-min) (point-max))))

その他

;; NOTE(review): `string-as-unibyte' and `string-as-multibyte' are, as far
;; as I know, marked obsolete since Emacs 26.1 in favour of
;; `encode-coding-string' / `decode-coding-string' -- confirm before
;; relying on them in new code.
(string-as-unibyte "千") ;; => \345\215\203
(string-as-multibyte "\345\215\203") ;; => 千
(toggle-enable-multibyte-characters) ;; toggle multibyte-character handling for the current buffer

;; String -> Character
(string-to-char "あ") ;;=> 12354
?あ ;; => 12354

;; Character -> String
(char-to-string 12354) ;; => あ
(format "%c" 12354) ;; => あ

;; Unicode encode/decode
;; Usage examples; evaluate them after the two functions below are defined.
(my-char-to-entity ?あ) ;; => %3042
(my-entity-to-char "%3042" t)  ; => あ
(defun my-char-to-entity ($char)
  "Return the code point of $CHAR as \"%\" plus uppercase hexadecimal digits.
$CHAR is a character, or a string whose first character is used.
When called interactively, read a character and also show the
conversion in the echo area."
  (interactive "cChar: ")
  (let* (($code (if (stringp $char)
                    (string-to-char $char)
                  $char))
         ($entity (format "%%%X" $code)))
    (if (called-interactively-p 'any)
        (message " %c -> %s " $code $entity))
    $entity))

(defun my-entity-to-char ($entity &optional $to-string)
  "Convert $ENTITY, a string such as \"%3042\", to the character it names.
$ENTITY must start with \"%\" followed by at least one
hexadecimal digit; the leading run of hexadecimal digits is
taken as the code point.  Return nil when $ENTITY does not have
that form.

Return the character as an integer, or as a one-character string
when $TO-STRING is non-nil.  When called interactively, read
$ENTITY from the minibuffer and show the conversion in the echo
area."
  ;; The argument spec is required: with a bare (interactive) the
  ;; command would be invoked without its mandatory $ENTITY argument.
  (interactive "sEntity: ")
  ;; \\` anchors at the start of the string only (^ would also match
  ;; after an embedded newline), and the hex class rejects payloads
  ;; that `string-to-number' would silently turn into 0.
  (when (string-match "\\`%\\([0-9a-fA-F]+\\)" $entity)
    (let (($c (string-to-number (match-string 1 $entity) 16)))
      (when (called-interactively-p 'any)
        (message " %s -> %c " $entity $c))
      (if $to-string
          (format "%c" $c)
        $c))))

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
6