Ruby

Ruby で改行・タブ・空白以外の ASCII 制御文字を削除するやつ

More than 1 year has passed since last update.

どうやるのが適当なんだろう?

以下は正規表現 /(?![\r\n\t ])[[:cntrl:]]/ でマッチさせる方法。

# For every ASCII code point 0..127, print the code, a tab, and the inspected
# form of its character after removing control characters. The negative
# lookahead keeps CR, LF, TAB and space from being matched by [[:cntrl:]].
(0..127).each do |code|
  remaining = code.chr.gsub(/(?![\r\n\t ])[[:cntrl:]]/, '')
  puts "#{code}\t#{remaining.inspect}"
end

__END__
0 ""
1 ""
2 ""
3 ""
4 ""
5 ""
6 ""
7 ""
8 ""
9 "\t"
10 "\n"
11 ""
12 ""
13 "\r"
14 ""
15 ""
16 ""
17 ""
18 ""
19 ""
20 ""
21 ""
22 ""
23 ""
24 ""
25 ""
26 ""
27 ""
28 ""
29 ""
30 ""
31 ""
32 " "
33 "!"
34 "\""
35 "#"
36 "$"
37 "%"
38 "&"
39 "'"
40 "("
41 ")"
42 "*"
43 "+"
44 ","
45 "-"
46 "."
47 "/"
48 "0"
49 "1"
50 "2"
51 "3"
52 "4"
53 "5"
54 "6"
55 "7"
56 "8"
57 "9"
58 ":"
59 ";"
60 "<"
61 "="
62 ">"
63 "?"
64 "@"
65 "A"
66 "B"
67 "C"
68 "D"
69 "E"
70 "F"
71 "G"
72 "H"
73 "I"
74 "J"
75 "K"
76 "L"
77 "M"
78 "N"
79 "O"
80 "P"
81 "Q"
82 "R"
83 "S"
84 "T"
85 "U"
86 "V"
87 "W"
88 "X"
89 "Y"
90 "Z"
91 "["
92 "\\"
93 "]"
94 "^"
95 "_"
96 "`"
97 "a"
98 "b"
99 "c"
100 "d"
101 "e"
102 "f"
103 "g"
104 "h"
105 "i"
106 "j"
107 "k"
108 "l"
109 "m"
110 "n"
111 "o"
112 "p"
113 "q"
114 "r"
115 "s"
116 "t"
117 "u"
118 "v"
119 "w"
120 "x"
121 "y"
122 "z"
123 "{"
124 "|"
125 "}"
126 "~"
127 ""


delete, tr, gsub, remove を比較

String#tr や String#delete を使っておいた方が、素直で速いコードかもしれない。

require 'benchmark'

# Benchmark of several ways to strip ASCII control characters (everything in
# 0x00-0x1F plus 0x7F except TAB, LF and CR) from a string.
#
# Two workloads are timed for every candidate:
#   LOOP - one million calls on a 128-character string
#   LONG - a single call on that string repeated one million times

# Every ASCII character exactly once, and a very long repetition of it.
short_input = (0..127).map(&:chr).join.freeze
long_input = (short_input * 1_000_000).freeze

# Character sets / patterns are built once, outside the timed lambdas.
explicit_chars = "\x00\x01\x02\x03\x04\x05\x06\a\b\v\f\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\e\x1C\x1D\x1E\x1F\x7F"
delete_ranges = "\x00-\x08\x0b\x0c\x0e-\x1f\x7f"
# Passed as two arguments to String#delete, which removes the intersection
# of the sets: control characters that are not TAB, LF or CR.
delete_intersection = ["\x00-\x1F\x7F", "^\t\n\r"]
tr_ranges = "\x00-\x08\x0b\x0c\x0e-\x1f\x7f"
range_regexp = /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/
cntrl_regexp = /(?![\r\n\t ])[[:cntrl:]]/

# Report label => callable that takes a string and returns the cleaned copy.
# NOTE(review): String#remove is not core Ruby; presumably the ActiveSupport
# extension, since the script is run through `rails r` - confirm.
candidates = {
  'String#delete("\x00\x01\x02\x03\x04\x05\x06\a\b\v\f\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\e\x1C\x1D\x1E\x1F\x7F")' =>
    ->(str) { str.delete(explicit_chars) },
  'String#delete("\x00-\x08\x0b\x0c\x0e-\x1f\x7f")' =>
    ->(str) { str.delete(delete_ranges) },
  'String#delete("\x00-\x1F\x7F", "^\t\n\r")' =>
    ->(str) { str.delete(*delete_intersection) },
  'String#tr("\x00-\x08\x0b\x0c\x0e-\x1f\x7f")' =>
    ->(str) { str.tr(tr_ranges, '') },
  'String#gsub(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/, "")' =>
    ->(str) { str.gsub(range_regexp, '') },
  'String#gsub(/(?![\r\n\t ])[[:cntrl:]]/, "")' =>
    ->(str) { str.gsub(cntrl_regexp, '') },
  'String#remove(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/)' =>
    ->(str) { str.remove(range_regexp) },
  'String#remove(/(?![\r\n\t ])[[:cntrl:]]/)' =>
    ->(str) { str.remove(cntrl_regexp) },
}

# 140 is the label column width; the labels above are long.
Benchmark.bm(140) do |bench|
  # Many calls on the short input.
  candidates.each_pair do |label, remover|
    bench.report("LOOP: #{label}") do
      1_000_000.times { remover.call(short_input) }
    end
  end

  # One call on the long input.
  candidates.each_pair do |label, remover|
    bench.report("LONG: #{label}") { remover.call(long_input) }
  end
end

__END__

$ rails r benchmark-delete-control-character.rb
user system total real
LOOP: String#delete("\x00\x01\x02\x03\x04\x05\x06\a\b\v\f\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\e\x1C\x1D\x1E\x1F\x7F") 1.680000 0.050000 1.730000 ( 1.740850)
LOOP: String#delete("\x00-\x08\x0b\x0c\x0e-\x1f\x7f") 1.390000 0.010000 1.400000 ( 1.407325)
LOOP: String#delete("\x00-\x1F\x7F", "^\t\n\r") 1.560000 0.010000 1.570000 ( 1.577371)
LOOP: String#tr("\x00-\x08\x0b\x0c\x0e-\x1f\x7f") 1.370000 0.010000 1.380000 ( 1.383961)
LOOP: String#gsub(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/, "") 8.250000 0.050000 8.300000 ( 8.532821)
LOOP: String#gsub(/(?![\r\n\t ])[[:cntrl:]]/, "") 8.730000 0.040000 8.770000 ( 8.924944)
LOOP: String#remove(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/) 9.830000 0.030000 9.860000 ( 9.874570)
LOOP: String#remove(/(?![\r\n\t ])[[:cntrl:]]/) 10.390000 0.030000 10.420000 ( 10.434877)
LONG: String#delete("\x00\x01\x02\x03\x04\x05\x06\a\b\v\f\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\e\x1C\x1D\x1E\x1F\x7F") 0.270000 0.040000 0.310000 ( 0.301718)
LONG: String#delete("\x00-\x08\x0b\x0c\x0e-\x1f\x7f") 0.240000 0.030000 0.270000 ( 0.287336)
LONG: String#delete("\x00-\x1F\x7F", "^\t\n\r") 0.310000 0.050000 0.360000 ( 0.355947)
LONG: String#tr("\x00-\x08\x0b\x0c\x0e-\x1f\x7f") 0.290000 0.040000 0.330000 ( 0.327342)
LONG: String#gsub(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/, "") 6.050000 0.010000 6.060000 ( 6.065576)
LONG: String#gsub(/(?![\r\n\t ])[[:cntrl:]]/, "") 6.410000 0.030000 6.440000 ( 6.438239)
LONG: String#remove(/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/) 6.180000 0.070000 6.250000 ( 6.276844)
LONG: String#remove(/(?![\r\n\t ])[[:cntrl:]]/) 6.410000 0.030000 6.440000 ( 6.443963)