0
1

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 1 year has passed since last update.

【ElasticSearch】WordデータをElasticSearchに投入

Last updated at Posted at 2022-01-21

はじめに

ElasticSearchにWordのデータをどうにか投入できないかということで、ElasticSearchのingest-attachmentプラグインを用いたファイルの取り込みを検証しました。

↓参考の記事
https://qiita.com/tsgkdt/items/6dec64a8e98dfbf8ab24

投入するデータ

下記のような職務経歴書のテンプレートを投入してみようと思います
image.png

ingest-attachmentプラグインのインストール

下記コマンドをコマンドプロンプトで実行

elasticsearch-plugin.bat install ingest-attachment

C#のソース

ElasticSearch.cs
// Base URL of the ElasticSearch endpoint; RegistWordData wraps it in a Uri to build the client.
public static string ApiUrl = "http://localhost:9200";

// Imports a Word file into ElasticSearch by way of the ingest-attachment plugin.
// Steps, in order: create the index mapping, register the ingest pipeline,
// then index the file under document id "1".
public static void RegistWordData()
{
    var endpoint = new Uri(ApiUrl);
    var definition = new IndexDefinition(endpoint);
    var wordFile = new FileInfo(@"任意のwordファイル.doc");

    // Each step returns the same IndexDefinition, so the calls are simply run one after another.
    var withMapping = definition.CreateMapping();
    var withPipeline = withMapping.PutPipeline();
    withPipeline.AddAttachment("1", wordFile);
}
IndexDefinition.cs(ほとんどコピペ)
using ConsoleElasticSearchSample.Model;
using Nest;
using System;
using System.IO;
using System.Net.Http;

namespace ConsoleElasticSearchSample
{
    /// <summary>
    /// Wraps a NEST <see cref="ElasticClient"/> to create the document index, register the
    /// attachment ingest pipeline, and index files through that pipeline.
    /// Every public step returns <c>this</c> so the calls can be chained.
    /// </summary>
    class IndexDefinition
    {
        /// <summary>Name of the index that receives the documents.</summary>
        private const string DocumentsIndex = "worddata";

        /// <summary>Id of the ingest pipeline registered by <see cref="PutPipeline"/> and used by <see cref="Index"/>.</summary>
        private const string AttachmentPipelineId = "attachments";

        // One shared instance for all downloads: creating an HttpClient per call leaves
        // undisposed handlers/sockets behind.
        private static readonly HttpClient SharedHttpClient = new HttpClient();

        private readonly ElasticClient _client;

        /// <summary>
        /// Creates the underlying <see cref="ElasticClient"/>.
        /// Specify the ElasticSearch endpoint to connect to (e.g. <c>new Uri("http://192.168.1.1:9200/")</c>).
        /// </summary>
        /// <param name="uri">ElasticSearch endpoint; passed to <see cref="ConnectionSettings"/> unchanged (may be null).</param>
        public IndexDefinition(Uri uri = null)
        {
            var connectionSettings = new ConnectionSettings(uri)
                .DefaultMappingFor<IndexModel>(m => m
                    .IndexName(DocumentsIndex)
                )
                // Keeps request/response bodies available so DebugInformation can print them.
                .DisableDirectStreaming();

            _client = new ElasticClient(connectionSettings);
        }

        /// <summary>
        /// Creates the index together with its analysis settings and field mappings.
        /// A failed response is written to the console instead of being thrown.
        /// </summary>
        /// <returns>This instance, for chaining.</returns>
        public IndexDefinition CreateMapping()
        {
            var indexResponse = _client.Indices.Create(DocumentsIndex, c => c
                .Settings(s => s
                    .Analysis(a => a
                        .Analyzers(ad => ad
                            .Custom("windows_path_hierarchy_analyzer", ca => ca
                                .Tokenizer("windows_path_hierarchy_tokenizer")
                            )
                        )
                        .Tokenizers(t => t
                            // Splits the stored path on backslashes (Windows-style separators).
                            .PathHierarchy("windows_path_hierarchy_tokenizer", ph => ph
                                .Delimiter('\\')
                            )
                        )
                    )
                )
                .Mappings(m => m
                    .Map<IndexModel>(mp => mp
                        // NOTE(review): the _all field is reportedly gone in Elasticsearch 7+ -
                        // confirm whether this setting still has any effect on the target version.
                        .AllField(all => all
                            .Enabled(false)
                        )
                        .Properties(ps => ps
                            .Number(n => n
                                .Name(nn => nn.id)
                            )
                            .Text(s => s
                                .Name(n => n.path)
                                .Analyzer("windows_path_hierarchy_analyzer")
                            )
                            .Object<Attachment>(a => a
                                .Name(n => n.Attachment)
                                .Properties(p => p
                                    .Text(t => t
                                        .Name(n => n.Name)
                                    )
                                    .Text(t => t
                                        .Name(n => n.Content)
                                    )
                                    .Text(t => t
                                        .Name(n => n.ContentType)
                                    )
                                    .Number(n => n
                                        .Name(nn => nn.ContentLength)
                                    )
                                    .Date(d => d
                                        .Name(n => n.Date)
                                    )
                                    .Text(t => t
                                        .Name(n => n.Author)
                                    )
                                    .Text(t => t
                                        .Name(n => n.Title)
                                    )
                                    .Text(t => t
                                        .Name(n => n.Keywords)
                                    )
                                )
                            )
                        )
                    )
                )
            );

            // Previously the response was discarded, so a rejected request went unnoticed.
            // Report it but keep going, so the chained calls behave as before.
            if (!indexResponse.IsValid)
            {
                Console.WriteLine(indexResponse.DebugInformation);
            }

            return this;
        }

        /// <summary>
        /// Registers the ingest pipeline used for attachments: an attachment processor that reads
        /// the <c>Content</c> field into <c>Attachment</c>, followed by a remove processor that
        /// drops the <c>Content</c> field.
        /// A failed response is written to the console instead of being thrown.
        /// </summary>
        /// <returns>This instance, for chaining.</returns>
        public IndexDefinition PutPipeline()
        {
            var pipelineResponse = _client.Ingest.PutPipeline(AttachmentPipelineId, p => p
                .Description("Document attachment pipeline")
                .Processors(pr => pr
                    .Attachment<IndexModel>(a => a
                        .Field(f => f.Content)
                        .TargetField(f => f.Attachment)
                        .IndexedCharacters(-1)
                    )
                    .Remove<IndexModel>(r => r
                        .Field(ff => ff
                        .Field(f => f.Content))
                    )
                )
            );

            if (!pipelineResponse.IsValid)
            {
                Console.WriteLine(pipelineResponse.DebugInformation);
            }

            return this;
        }

        /// <summary>
        /// Indexes a local file.
        /// </summary>
        /// <param name="id">Document id.</param>
        /// <param name="file">File to index; its full path is stored as the document path.</param>
        /// <returns>This instance, for chaining.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
        public IndexDefinition AddAttachment(string id, FileInfo file)
        {
            if (file == null)
            {
                throw new ArgumentNullException(nameof(file));
            }

            var bytes = File.ReadAllBytes(file.FullName);
            Index(bytes, id, file.FullName);
            return this;
        }

        /// <summary>
        /// Downloads a file from the given URL and indexes it.
        /// </summary>
        /// <param name="id">Document id.</param>
        /// <param name="uri">URL of the file; its absolute path is stored as the document path.</param>
        /// <returns>This instance, for chaining.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
        public IndexDefinition AddAttachment(string id, Uri uri)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            // Blocking on .Result is kept on purpose: the method stays synchronous and
            // download failures surface wrapped in AggregateException exactly as before.
            var bytes = SharedHttpClient.GetByteArrayAsync(uri).Result;
            Index(bytes, id, uri.AbsolutePath);
            return this;
        }

        /// <summary>
        /// Sends one document to ElasticSearch through the attachment pipeline and prints the
        /// response's debug information to the console.
        /// </summary>
        /// <param name="content">Raw file content; sent Base64-encoded in the <c>Content</c> field.</param>
        /// <param name="id">Id to register the document under.</param>
        /// <param name="path">Value stored in the document's <c>path</c> field.</param>
        private void Index(byte[] content, string id, string path)
        {
            var base64File = Convert.ToBase64String(content);
            var res = _client.Index(new IndexModel
            {
                id = id,
                path = path,
                Content = base64File
            }, i => i.Pipeline(AttachmentPipelineId));

            Console.WriteLine(res.DebugInformation);
        }
    }
}

kibanaでデータ確認

下記コマンドを実行

get /worddata/_search

データが登録されていることを確認します。
image.png

0
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
1

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?