LoginSignup
6
5

More than 5 years have passed since last update.

Ruby で改行・タブ・空白以外の ASCII 制御文字を削除するやつ

Last updated at Posted at 2016-06-08

どうやるのが適当なんだろう?
以下は正規表現 /(?![\r\n\t ])[[:cntrl:]]/ でマッチさせる方法。

# Print every ASCII code point (0..127) next to what is left of that single
# character once control characters other than \r, \n and \t are removed.
(0..127).each do |code|
  cleaned = code.chr.gsub(/(?![\r\n\t ])[[:cntrl:]]/, '')
  puts "#{code}\t#{cleaned.inspect}"
end

__END__
0       ""
1       ""
2       ""
3       ""
4       ""
5       ""
6       ""
7       ""
8       ""
9       "\t"
10      "\n"
11      ""
12      ""
13      "\r"
14      ""
15      ""
16      ""
17      ""
18      ""
19      ""
20      ""
21      ""
22      ""
23      ""
24      ""
25      ""
26      ""
27      ""
28      ""
29      ""
30      ""
31      ""
32      " "
33      "!"
34      "\""
35      "#"
36      "$"
37      "%"
38      "&"
39      "'"
40      "("
41      ")"
42      "*"
43      "+"
44      ","
45      "-"
46      "."
47      "/"
48      "0"
49      "1"
50      "2"
51      "3"
52      "4"
53      "5"
54      "6"
55      "7"
56      "8"
57      "9"
58      ":"
59      ";"
60      "<"
61      "="
62      ">"
63      "?"
64      "@"
65      "A"
66      "B"
67      "C"
68      "D"
69      "E"
70      "F"
71      "G"
72      "H"
73      "I"
74      "J"
75      "K"
76      "L"
77      "M"
78      "N"
79      "O"
80      "P"
81      "Q"
82      "R"
83      "S"
84      "T"
85      "U"
86      "V"
87      "W"
88      "X"
89      "Y"
90      "Z"
91      "["
92      "\\"
93      "]"
94      "^"
95      "_"
96      "`"
97      "a"
98      "b"
99      "c"
100     "d"
101     "e"
102     "f"
103     "g"
104     "h"
105     "i"
106     "j"
107     "k"
108     "l"
109     "m"
110     "n"
111     "o"
112     "p"
113     "q"
114     "r"
115     "s"
116     "t"
117     "u"
118     "v"
119     "w"
120     "x"
121     "y"
122     "z"
123     "{"
124     "|"
125     "}"
126     "~"
127     ""

delete, tr, gsub, remove を比較

String#tr や String#delete を使っておいた方が、素直で速いコードかもしれない。

require 'benchmark'

# Benchmark input: every ASCII code point (0..127) once, plus that same string
# repeated one million times for the single-call "LONG" measurements.
string = 0.upto(127).map(&:chr).join.freeze
long_string = (string * 1_000_000).freeze

# Different ways of naming the characters to strip: 0x00-0x1F and 0x7F,
# leaving out \t, \n and \r.
explicit_chars = "\x00\x01\x02\x03\x04\x05\x06\a\b\v\f\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\e\x1C\x1D\x1E\x1F\x7F"
delete_ranges = "\x00-\x08\x0b\x0c\x0e-\x1f\x7f"
# String#delete removes the intersection of its arguments, so the second
# (negated) set keeps \t, \n and \r out of the removal.
delete_intersection = ["\x00-\x1F\x7F", "^\t\n\r"]
tr_ranges = "\x00-\x08\x0b\x0c\x0e-\x1f\x7f"
range_regexp = /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/
cntrl_regexp = /(?![\r\n\t ])[[:cntrl:]]/

# Report label => callable that strips the control characters from its argument.
# NOTE: String#remove is not core Ruby (it comes from ActiveSupport); the
# recorded run of this script used `rails r`.
candidates = {
  'String#delete("\x00\x01\x02\x03\x04\x05\x06\a\b\v\f\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\e\x1C\x1D\x1E\x1F\x7F")' => ->(str) { str.delete(explicit_chars) },
  'String#delete("\x00-\x08\x0b\x0c\x0e-\x1f\x7f")' => ->(str) { str.delete(delete_ranges) },
  'String#delete("\x00-\x1F\x7F", "^\t\n\r")' => ->(str) { str.delete(*delete_intersection) },
  'String#tr("\x00-\x08\x0b\x0c\x0e-\x1f\x7f")' => ->(str) { str.tr(tr_ranges, '') },
  'String#gsub(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/, "")' => ->(str) { str.gsub(range_regexp, '') },
  'String#gsub(/(?![\r\n\t ])[[:cntrl:]]/, "")' => ->(str) { str.gsub(cntrl_regexp, '') },
  'String#remove(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/)' => ->(str) { str.remove(range_regexp) },
  'String#remove(/(?![\r\n\t ])[[:cntrl:]]/)' => ->(str) { str.remove(cntrl_regexp) }
}

Benchmark.bm(140) do |bench|
  # LOOP: one million calls on the short 128-character string.
  candidates.each_pair do |label, strip|
    bench.report("LOOP: #{label}") do
      1_000_000.times { strip.call(string) }
    end
  end

  # LONG: a single call on the long string.
  candidates.each_pair do |label, strip|
    bench.report("LONG: #{label}") { strip.call(long_string) }
  end
end

__END__

$ rails r benchmark-delete-control-character.rb
                                                                                                                                                   user     system      total        real
LOOP: String#delete("\x00\x01\x02\x03\x04\x05\x06\a\b\v\f\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\e\x1C\x1D\x1E\x1F\x7F")          1.680000   0.050000   1.730000 (  1.740850)
LOOP: String#delete("\x00-\x08\x0b\x0c\x0e-\x1f\x7f")                                                                                          1.390000   0.010000   1.400000 (  1.407325)
LOOP: String#delete("\x00-\x1F\x7F", "^\t\n\r")                                                                                                1.560000   0.010000   1.570000 (  1.577371)
LOOP: String#tr("\x00-\x08\x0b\x0c\x0e-\x1f\x7f")                                                                                              1.370000   0.010000   1.380000 (  1.383961)
LOOP: String#gsub(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/, "")                                                                                      8.250000   0.050000   8.300000 (  8.532821)
LOOP: String#gsub(/(?![\r\n\t ])[[:cntrl:]]/, "")                                                                                              8.730000   0.040000   8.770000 (  8.924944)
LOOP: String#remove(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/)                                                                                        9.830000   0.030000   9.860000 (  9.874570)
LOOP: String#remove(/(?![\r\n\t ])[[:cntrl:]]/)                                                                                               10.390000   0.030000  10.420000 ( 10.434877)
LONG: String#delete("\x00\x01\x02\x03\x04\x05\x06\a\b\v\f\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\e\x1C\x1D\x1E\x1F\x7F")          0.270000   0.040000   0.310000 (  0.301718)
LONG: String#delete("\x00-\x08\x0b\x0c\x0e-\x1f\x7f")                                                                                          0.240000   0.030000   0.270000 (  0.287336)
LONG: String#delete("\x00-\x1F\x7F", "^\t\n\r")                                                                                                0.310000   0.050000   0.360000 (  0.355947)
LONG: String#tr("\x00-\x08\x0b\x0c\x0e-\x1f\x7f")                                                                                              0.290000   0.040000   0.330000 (  0.327342)
LONG: String#gsub(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/, "")                                                                                      6.050000   0.010000   6.060000 (  6.065576)
LONG: String#gsub(/(?![\r\n\t ])[[:cntrl:]]/, "")                                                                                              6.410000   0.030000   6.440000 (  6.438239)
LONG: String#remove(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/)                                                                                        6.180000   0.070000   6.250000 (  6.276844)
LONG: String#remove(/(?![\r\n\t ])[[:cntrl:]]/)                                                                                                6.410000   0.030000   6.440000 (  6.443963)
6
5
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
6
5