4
3

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

More than 5 years have passed since the last update.

プチ業務改善 ファイルのエンコーディングを気にせず読み込む便利クラス

Last updated at Posted at 2018-06-08

ちょっとしたツールを作る際にエンコーディングを気にしなくて済むと便利。
いつもWindows環境で仕事をしているんだけど、お目にかかるファイルのエンコーディングはShiftJISとUTF8、UTF16くらい。
ユニコードエンコーディングの場合は基本的にBOMがついているので、エンコーディングを自動判定するクラスを作ってみた。
備忘録として残しておく(^^)

FileHelper.cs
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;

namespace GeoLibrary.IO
{
    public class FileHelper
    {
        /// <summary>
        /// Opens <paramref name="target"/> as text, with the encoding selected by
        /// inspecting the first bytes of the file.
        /// </summary>
        /// <param name="target">The file to open.</param>
        /// <returns>A reader over the file. The caller owns it and must dispose it.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="target"/> is null.</exception>
        /// <exception cref="ArgumentException">The file does not exist.</exception>
        public static StreamReader OpenReader(FileInfo target)
        {
            var opener = new StreamReaderOpener(target);
            return opener.Open();
        }

        /// <summary>
        /// Reads the whole of <paramref name="target"/> and returns its lines.
        /// </summary>
        /// <remarks>
        /// The file is read completely and closed before this method returns; only the
        /// trimming of a trailing carriage return from each line is deferred until the
        /// result is enumerated. The text is split on '\n', so content that ends with a
        /// line feed produces a final empty string.
        /// </remarks>
        /// <param name="target">The file to read.</param>
        /// <returns>The lines of the file without their line terminators.</returns>
        public static IEnumerable<string> OpenAndReadAllLines(FileInfo target)
        {
            string content;
            using (var reader = OpenReader(target))
            {
                content = reader.ReadToEnd();
            }

            var rawLines = content.Split('\n');
            return rawLines.Select(line => line.TrimEnd('\r'));
        }

        /// <summary>
        /// Opens a file and wraps it in a <see cref="StreamReader"/> whose encoding is
        /// chosen by <c>FileEncodingDetector</c>.
        /// </summary>
        private class StreamReaderOpener
        {
            private readonly FileInfo _target;

            /// <summary>Creates an opener for <paramref name="target"/>.</summary>
            /// <param name="target">The file to open later via <see cref="Open"/>.</param>
            /// <exception cref="ArgumentNullException"><paramref name="target"/> is null.</exception>
            internal StreamReaderOpener(FileInfo target)
            {
                if (target == null)
                {
                    throw new ArgumentNullException(nameof(target));
                }
                _target = target;
            }

            /// <summary>
            /// Opens the target file for reading and returns a reader over it.
            /// </summary>
            /// <returns>A reader that owns the underlying file stream.</returns>
            /// <exception cref="ArgumentException">The file does not exist.</exception>
            internal StreamReader Open()
            {
                if (!File.Exists(_target.FullName))
                {
                    var message = $"File does not exists => {_target.FullName}";
                    throw new ArgumentException(message);
                }

                var stream = _target.OpenRead();
                try
                {
                    var encodingDetector = new FileEncodingDetector(() => stream);
                    return new StreamReader(stream, encodingDetector.Detect());
                }
                catch
                {
                    // Until the StreamReader exists nobody else owns the stream, so a
                    // failure here would otherwise leave the file handle open.
                    stream.Dispose();
                    throw;
                }
            }
        }

        /// <summary>
        /// Chooses a text <see cref="Encoding"/> for a file stream from the byte order
        /// mark found at the start of the stream.
        /// </summary>
        private class FileEncodingDetector
        {
            // The table never changes, so one shared instance is enough.
            private static readonly IDictionary<ByteOrderMarker.BomType, Encoding> EncodingDictionary
                = new Dictionary<ByteOrderMarker.BomType, Encoding>()
            {
                { ByteOrderMarker.BomType.Utf8, Encoding.UTF8 },
                { ByteOrderMarker.BomType.Unicode, Encoding.Unicode },
                { ByteOrderMarker.BomType.BigEndianUnicode, Encoding.BigEndianUnicode },
                { ByteOrderMarker.BomType.Utf32, Encoding.UTF32 },
            };

            private readonly Func<FileStream> _streamGetter;

            /// <summary>Creates a detector that works on the stream supplied by <paramref name="streamGetter"/>.</summary>
            /// <param name="streamGetter">Callback returning the stream to inspect.</param>
            /// <exception cref="ArgumentNullException"><paramref name="streamGetter"/> is null.</exception>
            internal FileEncodingDetector(Func<FileStream> streamGetter)
            {
                if (streamGetter == null)
                {
                    throw new ArgumentNullException(nameof(streamGetter));
                }
                _streamGetter = streamGetter;
            }

            /// <summary>
            /// Reads up to four bytes from the start of the stream, maps them to an
            /// encoding, and puts the stream back at the position it had on entry.
            /// </summary>
            /// <returns>
            /// The encoding registered for the recognised byte order mark, or
            /// <see cref="Encoding.Default"/> when no mark is recognised.
            /// </returns>
            internal Encoding Detect()
            {
                var stream = _streamGetter();
                Debug.Assert(stream != null);

                var position = stream.Position;
                ByteOrderMarker.BomType bomType;
                try
                {
                    bomType = new ByteOrderMarker(TryReadByteFromTop(stream, 4)).ToType();
                }
                finally
                {
                    // Restore the caller's position even when probing fails.
                    stream.Seek(position, SeekOrigin.Begin);
                }

                Encoding encoding;
                return EncodingDictionary.TryGetValue(bomType, out encoding)
                    ? encoding
                    : Encoding.Default;
            }

            /// <summary>
            /// Rewinds <paramref name="stream"/> to its start and reads at most
            /// <paramref name="count"/> bytes, stopping early at end of stream.
            /// </summary>
            private static IEnumerable<byte> TryReadByteFromTop(FileStream stream, int count)
            {
                stream.Seek(0, SeekOrigin.Begin);
                var bytes = new List<byte>();
                for (var i = 0; i < count; ++i)
                {
                    // ReadByte reports end of stream as -1; never store that as 0xFF.
                    var value = stream.ReadByte();
                    if (value < 0)
                    {
                        break;
                    }
                    bytes.Add((byte)value);
                }
                return bytes;
            }
        }

        /// <summary>
        /// Classifies the leading bytes of a file by the byte order mark they start with.
        /// </summary>
        private class ByteOrderMarker
        {
            /// <summary>Byte order marks this class can recognise.</summary>
            internal enum BomType
            {
                Utf8,
                Unicode,
                BigEndianUnicode,
                Utf32,
                Unknown,
            }

            private readonly byte[] _bom;

            /// <summary>Wraps the leading bytes of a file.</summary>
            /// <param name="bom">
            /// Bytes read from the start of the file. May be longer than the mark itself;
            /// null is treated as an empty sequence.
            /// </param>
            internal ByteOrderMarker(IEnumerable<byte> bom)
            {
                // Materialise once so a lazy sequence is not re-enumerated per signature.
                _bom = bom == null ? new byte[0] : bom.ToArray();
            }

            /// <summary>
            /// Determines which known byte order mark the wrapped bytes start with.
            /// </summary>
            /// <returns>The matching <see cref="BomType"/>, or <see cref="BomType.Unknown"/>.</returns>
            internal BomType ToType()
            {
                // Ordered longest first: the UTF-32 LE mark (FF FE 00 00) begins with the
                // UTF-16 LE mark (FF FE), so the longer signature must be tried first.
                var signatures = new[] {
                    new { Bom = new byte[] { 0xFF, 0xFE, 0x00, 0x00 }, Type = BomType.Utf32 },
                    new { Bom = new byte[] { 0xEF, 0xBB, 0xBF }, Type = BomType.Utf8 },
                    new { Bom = new byte[] { 0xFF, 0xFE }, Type = BomType.Unicode },
                    new { Bom = new byte[] { 0xFE, 0xFF }, Type = BomType.BigEndianUnicode },
                };

                foreach (var signature in signatures)
                {
                    // Compare only the prefix: the probe usually holds content bytes
                    // after the mark, so a whole-sequence comparison would never match.
                    if (_bom.Length >= signature.Bom.Length &&
                        _bom.Take(signature.Bom.Length).SequenceEqual(signature.Bom))
                    {
                        return signature.Type;
                    }
                }
                return BomType.Unknown;
            }
        }
    }
}

4
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
4
3

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?