v1.16.1 / 2024-02-03
Fixed
- [CRuby] XML::Reader defaults the encoding to UTF-8 if it's not specified in either the document or as a method parameter. Previously non-ASCII characters were serialized as NCRs in this case. [#2891] (@flavorjones)
これが気になったので、まず既存のXMLパースの書き方と、問題になっているXML::Readerを使う書き方を比べながら、ライブラリのソースコードを見に行ってみる。
# frozen_string_literal: true
require 'nokogiri'
require "debug"
raw_xml = File.read('atom10.xml') # load the whole feed into a String
binding.break # pause so the Nokogiri(...) call on the next line can be stepped into
ret = Nokogiri raw_xml # String input: Nokogiri.parse -> Nokogiri.XML -> XML::Document.parse -> read_memory
binding.break # pause again before the Reader call
ret2 = Nokogiri::XML::Reader('atom10.xml') # NOTE(review): passes the literal string "atom10.xml", not raw_xml; a String has no #read, so it is handed to Reader.from_memory as the document itself -- confirm this is intended
# pp ret
pp ret.at("feed/id") # print whatever node "feed/id" matches in the parsed document
pp ret2 # print the Reader object
[3, 11] in a.rb
3| require 'nokogiri'
4| require "debug"
5|
6| raw_xml = File.read('atom10.xml')
7| binding.break
=> 8| ret = Nokogiri raw_xml
9| binding.break
10| # pp ret
11| pp ret.at("feed/id")
=>#0 <main> at a.rb:8
(rdbg) step # command
[105, 114] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri.rb
105| # Parse a document contained in +args+. Nokogiri will try to guess what type of document you are
106| # attempting to parse. For more information, see Nokogiri.parse
107| #
108| # To specify the type of document, use {Nokogiri.XML}, {Nokogiri.HTML4}, or {Nokogiri.HTML5}.
109| def Nokogiri(*args, &block)
=> 110| if block
111| Nokogiri::HTML4::Builder.new(&block).doc.root
112| else
113| Nokogiri.parse(*args)
114| end
=>#0 Object#Nokogiri(args=["<?xml version=\"1.0\" encoding=\"utf-8\..., block=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri.rb:110
#1 <main> at a.rb:8
(rdbg) next # command
[108, 117] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri.rb
108| # To specify the type of document, use {Nokogiri.XML}, {Nokogiri.HTML4}, or {Nokogiri.HTML5}.
109| def Nokogiri(*args, &block)
110| if block
111| Nokogiri::HTML4::Builder.new(&block).doc.root
112| else
=> 113| Nokogiri.parse(*args)
114| end
115| end
116|
117| require_relative "nokogiri/version"
=>#0 Object#Nokogiri(args=["<?xml version=\"1.0\" encoding=\"utf-8\..., block=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri.rb:113
#1 <main> at a.rb:8
(rdbg) step # command
[40, 49] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri.rb
40| module Nokogiri
41| class << self
42| ###
43| # Parse an HTML or XML document. +string+ contains the document.
44| def parse(string, url = nil, encoding = nil, options = nil)
=> 45| if string.respond_to?(:read) ||
46| /^\s*<(?:!DOCTYPE\s+)?html[\s>]/i.match?(string[0, 512])
47| # Expect an HTML indicator to appear within the first 512
48| # characters of a document. (<?xml ?> + <?xml-stylesheet ?>
49| # shouldn't be that long)
=>#0 Nokogiri.parse(string="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri.rb:45
#1 Object#Nokogiri(args=["<?xml version=\"1.0\" encoding=\"utf-8\..., block=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri.rb:113
# and 1 frames (use `bt' command for all frames)
(rdbg) next # command
[48, 57] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri.rb
48| # characters of a document. (<?xml ?> + <?xml-stylesheet ?>
49| # shouldn't be that long)
50| Nokogiri.HTML4(string, url, encoding,
51| options || XML::ParseOptions::DEFAULT_HTML)
52| else
=> 53| Nokogiri.XML(string, url, encoding,
54| options || XML::ParseOptions::DEFAULT_XML)
55| end.tap do |doc|
56| yield doc if block_given?
57| end
=>#0 Nokogiri.parse(string="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri.rb:53
#1 Object#Nokogiri(args=["<?xml version=\"1.0\" encoding=\"utf-8\..., block=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri.rb:113
# and 1 frames (use `bt' command for all frames)
(rdbg) step # command
[3, 12] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb
3| module Nokogiri
4| class << self
5| ###
6| # Parse XML. Convenience method for Nokogiri::XML::Document.parse
7| def XML(thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_XML, &block)
=> 8| Nokogiri::XML::Document.parse(thing, url, encoding, options, &block)
9| end
10| end
11|
12| module XML
=>#0 Nokogiri.XML(thing="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=4196353, block=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:8
#1 Nokogiri.parse(string="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri.rb:53
# and 2 frames (use `bt' command for all frames)
(rdbg) step # command
[44, 53] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb
44| # set) by default.
45| #
46| # Nokogiri.XML() is a convenience method which will call this method.
47| #
48| def self.parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML)
=> 49| options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
50| yield options if block_given?
51|
52| url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
53|
=>#0 Nokogiri::XML::Document.parse(string_or_io="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=4196353) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb:49
#1 Nokogiri.XML(thing="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=4196353, block=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:8
# and 3 frames (use `bt' command for all frames)
[47, 56] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb
47| #
48| def self.parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML)
49| options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
50| yield options if block_given?
51|
=> 52| url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
53|
54| if empty_doc?(string_or_io)
55| if options.strict?
56| raise Nokogiri::XML::SyntaxError, "Empty document"
=>#0 Nokogiri::XML::Document.parse(string_or_io="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=#<Nokogiri::XML::ParseOptions:0x00007fe4a...) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb:52
#1 Nokogiri.XML(thing="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=4196353, block=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:8
# and 3 frames (use `bt' command for all frames)
(rdbg) next # command
[49, 58] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb
49| options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
50| yield options if block_given?
51|
52| url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
53|
=> 54| if empty_doc?(string_or_io)
55| if options.strict?
56| raise Nokogiri::XML::SyntaxError, "Empty document"
57| else
58| return encoding ? new.tap { |i| i.encoding = encoding } : new
=>#0 Nokogiri::XML::Document.parse(string_or_io="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=#<Nokogiri::XML::ParseOptions:0x00007fe4a...) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb:54
#1 Nokogiri.XML(thing="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=4196353, block=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:8
# and 3 frames (use `bt' command for all frames)
(rdbg) next # command
[57, 66] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb
57| else
58| return encoding ? new.tap { |i| i.encoding = encoding } : new
59| end
60| end
61|
=> 62| doc = if string_or_io.respond_to?(:read)
63| if string_or_io.is_a?(Pathname)
64| # resolve the Pathname to the file and open it as an IO object, see #2110
65| string_or_io = string_or_io.expand_path.open
66| url ||= string_or_io.path
=>#0 Nokogiri::XML::Document.parse(string_or_io="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=#<Nokogiri::XML::ParseOptions:0x00007fe4a...) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb:62
#1 Nokogiri.XML(thing="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=4196353, block=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:8
# and 3 frames (use `bt' command for all frames)
(rdbg) next # command
[67, 76] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb
67| end
68|
69| read_io(string_or_io, url, encoding, options.to_i)
70| else
71| # read_memory pukes on empty docs
=> 72| read_memory(string_or_io, url, encoding, options.to_i)
73| end
74|
75| # do xinclude processing
76| doc.do_xinclude(options) if options.xinclude?
=>#0 Nokogiri::XML::Document.parse(string_or_io="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=#<Nokogiri::XML::ParseOptions:0x00007fe4a...) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb:72
#1 Nokogiri.XML(thing="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=4196353, block=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:8
# and 3 frames (use `bt' command for all frames)
(rdbg) next # command
[71, 80] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb
71| # read_memory pukes on empty docs
72| read_memory(string_or_io, url, encoding, options.to_i)
73| end
74|
75| # do xinclude processing
=> 76| doc.do_xinclude(options) if options.xinclude?
77|
78| doc
79| end
80|
=>#0 Nokogiri::XML::Document.parse(string_or_io="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=#<Nokogiri::XML::ParseOptions:0x00007fe4a...) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/document.rb:76
#1 Nokogiri.XML(thing="<?xml version=\"1.0\" encoding=\"utf-8\"..., url=nil, encoding=nil, options=4196353, block=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:8
# and 3 frames (use `bt' command for all frames)
最終的に `Nokogiri::XML::Document.read_memory` の呼び出しにたどり着く。
[5, 13] in a.rb
5|
6| raw_xml = File.read('atom10.xml')
7| binding.break
8| ret = Nokogiri raw_xml
9| binding.break
=> 10| ret2 = Nokogiri::XML::Reader('atom10.xml')
11| # pp ret
12| pp ret.at("feed/id")
13| pp ret2
=>#0 <main> at a.rb:10
(rdbg) step # command
[19, 28] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb
19| class << self
20| ###
21| # Parse an XML document using the Nokogiri::XML::Reader API. See
22| # Nokogiri::XML::Reader for mor information
23| def Reader(string_or_io, url = nil, encoding = nil, options = ParseOptions::STRICT)
=> 24| options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
25| yield options if block_given?
26|
27| if string_or_io.respond_to?(:read)
28| return Reader.from_io(string_or_io, url, encoding, options.to_i)
=>#0 Nokogiri::XML.Reader(string_or_io="atom10.xml", url=nil, encoding=nil, options=0) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:24
#1 <main> at a.rb:10
(rdbg) next # command
[20, 29] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb
20| ###
21| # Parse an XML document using the Nokogiri::XML::Reader API. See
22| # Nokogiri::XML::Reader for mor information
23| def Reader(string_or_io, url = nil, encoding = nil, options = ParseOptions::STRICT)
24| options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
=> 25| yield options if block_given?
26|
27| if string_or_io.respond_to?(:read)
28| return Reader.from_io(string_or_io, url, encoding, options.to_i)
29| end
=>#0 Nokogiri::XML.Reader(string_or_io="atom10.xml", url=nil, encoding=nil, options=#<Nokogiri::XML::ParseOptions:0x00007fe3d...) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:25
#1 <main> at a.rb:10
(rdbg) next # command
[22, 31] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb
22| # Nokogiri::XML::Reader for mor information
23| def Reader(string_or_io, url = nil, encoding = nil, options = ParseOptions::STRICT)
24| options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
25| yield options if block_given?
26|
=> 27| if string_or_io.respond_to?(:read)
28| return Reader.from_io(string_or_io, url, encoding, options.to_i)
29| end
30|
31| Reader.from_memory(string_or_io, url, encoding, options.to_i)
=>#0 Nokogiri::XML.Reader(string_or_io="atom10.xml", url=nil, encoding=nil, options=#<Nokogiri::XML::ParseOptions:0x00007fe3d...) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:27
#1 <main> at a.rb:10
(rdbg) next # command
[26, 35] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb
26|
27| if string_or_io.respond_to?(:read)
28| return Reader.from_io(string_or_io, url, encoding, options.to_i)
29| end
30|
=> 31| Reader.from_memory(string_or_io, url, encoding, options.to_i)
32| end
33|
34| ###
35| # Parse XML. Convenience method for Nokogiri::XML::Document.parse
=>#0 Nokogiri::XML.Reader(string_or_io="atom10.xml", url=nil, encoding=nil, options=#<Nokogiri::XML::ParseOptions:0x00007fe3d...) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:31
#1 <main> at a.rb:10
(rdbg) step # command
[75, 84] in ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/reader.rb
75| attr_reader :source
76|
77| alias_method :self_closing?, :empty_element?
78|
79| def initialize(source, url = nil, encoding = nil) # :nodoc:
=> 80| @source = source
81| @errors = []
82| @encoding = encoding
83| end
84| private :initialize
=>#0 Nokogiri::XML::Reader#initialize(source="atom10.xml", url=nil, encoding=nil) at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml/reader.rb:80
#1 [C] Nokogiri::XML::Reader.from_memory at ~/.rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/nokogiri-1.13.10/lib/nokogiri/xml.rb:31
# and 2 frames (use `bt' command for all frames)
最終的に `Nokogiri::XML::Reader.from_memory`(バックトレース上で [C] と表示される、Cで実装されたメソッド)にたどり着く。
というわけで、全然別物のlibxml2のメソッドを呼び出すところに帰結している。
問題を改修するPRを見てみる。
すると手を出しているのはReaderの方だけにみえる。
ここでもともとのIssueをよく読んでみることにする。
`Nokogiri::XML::Reader#inner_xml` returns NCR encoded attributes even if the encoding is set to `utf-8` in `#from_memory` call.
It does not happen if the XML input sets the encoding with `<?xml version="1.0" encoding="UTF-8"?>`.
It only happens to attributes, elements and text nodes are correctly encoded.
NCR encodedとは、`&#9834;`(♪)のような数値文字参照(Numeric Character Reference)でエンコードされた状態のことで、結局このIssueが言っているのは、attributesの値に含まれる非ASCII文字が数値文字参照に変換されてしまう、という話。
#! /usr/bin/env ruby
# Stand-alone script comparing Document#to_xml output with and without an
# explicit encoding argument. The inline gemfile pins nokogiri to 1.13.8.
require 'bundler/inline'
gemfile do
source 'https://rubygems.org'
gem 'nokogiri', '1.13.8'
end
require 'nokogiri'
# Sample document: non-ASCII characters in the element name, the attribute
# value and the text node, and no XML declaration (so no declared encoding).
xml = <<~XML
<test><anotación tipo="inspiración">(inspiración)</anotación></test>
XML
# Case 1: no encoding argument; the serialized declaration has no encoding.
# NOTE(review): apart from the declaration, this recorded output is identical
# to case 2 as shown here. The non-ASCII characters were presumably numeric
# character references (e.g. &#xF3;) before the page was rendered -- confirm
# against the original issue.
Nokogiri::XML::Document.parse(xml).to_xml
# => "<?xml version=\"1.0\"?>\n" +
# "<test>\n" +
# " <anotación tipo=\"inspiración\">(inspiración)</anotación>\n" +
# "</test>\n"
# Case 2: encoding passed as the third argument; the declaration now carries
# encoding="UTF-8" and the characters are emitted as-is.
Nokogiri::XML::Document.parse(xml, nil, "UTF-8").to_xml
# => "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
# "<test>\n" +
# " <anotación tipo=\"inspiración\">(inspiración)</anotación>\n" +
# "</test>\n"
というわけで、attributesの値に非ASCII文字を含めていて、かつXML中(XML宣言)でもメソッド引数でもエンコードを指定していない場合に影響を受けそう。