Fetching content from a URL seemed uncomplicated at first, until I actually got into it. What started as a simple function has grown into a big one, and I still run into new errors as time goes by. Even now I cannot remember why I wrote some of these lines, but I am sure I had a serious reason for every one of them.
I will probably write a detailed article explaining the evolution of this function. Stay tuned! For now, here it is in its current state:
# encoding: utf-8
require "net/http"
require "openssl"
require "timeout"
require "addressable/uri"
require "nokogiri"
require "active_support/core_ext/object/blank"     # provides blank?
require "active_support/core_ext/object/inclusion" # provides in?

# Assumed value in seconds; the real constant lives elsewhere in my app.
TIMEOUT = 10

def fetch_url url, limit = 5
  # Stop once the redirect budget runs out.
  return nil if limit.zero?

  uri = Addressable::URI.parse(url).normalize
  uri.path = "/" if uri.path.blank?
  request = Net::HTTP.new uri.host, uri.port
  request.use_ssl = uri.scheme == "https"

  # Probe with a cheap HEAD first; some servers reject HEAD,
  # so fall back to a GET guarded by a timeout.
  response = request.head uri.request_uri
  if response.is_a? Net::HTTPMethodNotAllowed
    Timeout.timeout(TIMEOUT) do
      response = request.get uri.request_uri
    end
  end

  case response
  when Net::HTTPSuccess
    if ["text/html", "application/xhtml+xml"].include? response.content_type
      # Fetch the full body, this time with a browser-like User-Agent,
      # because some sites serve different content to plain clients.
      http = Net::HTTP.new uri.host, uri.port
      http.use_ssl = uri.scheme == "https"
      req = Net::HTTP::Get.new uri.request_uri,
        {"User-Agent" => "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"}
      res = http.request req
      if res.is_a? Net::HTTPRedirection
        fetch_url res["location"], limit.pred
      else
        res.uri = uri.to_s
        # Record the declared charset only when one is actually present;
        # blindly splitting on "charset=" would return the whole
        # content-type string when the header has no charset parameter.
        charset = response.header["content-type"].to_s[/charset=([^;\s]+)/i, 1]
        res["charset"] = charset if charset
        res
      end
    else
      nil
    end
  when Net::HTTPRedirection
    fetch_url response["location"], limit.pred
  when Net::HTTPClientError, Net::HTTPServerError
    nil
  end
rescue SocketError, TypeError, URI::InvalidURIError, NoMethodError,
       OpenSSL::SSL::SSLError, Timeout::Error
  nil
end
def nokogiri_doc response
  return unless response
  if response["charset"].in? ["euc-jp", "EUC-JP"]
    # Nokogiri guesses most encodings on its own, but EUC-JP pages
    # come out garbled unless the charset is passed explicitly.
    Nokogiri::HTML.parse response.body, nil, response["charset"]
  else
    Nokogiri::HTML.parse response.body
  end
end
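And to tie the two helpers together, a quick usage sketch; the URL and the CSS selector are only illustrations:

# Placeholder URL again; nokogiri_doc returns nil when the fetch failed.
doc = nokogiri_doc fetch_url("http://example.com/")
puts doc.at_css("title").text if doc && doc.at_css("title")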