More than 5 years have passed since last update.

言語処理100本ノック with Elixir 第1章: 準備運動

Posted at 2019-07-02

Qiita初投稿です。
Elixirを勉強中で、自然言語処理も勉強中なので言語処理100本ノックをElixirでやってみます。
ご指摘・ツッコミお待ちしております。

1章1モジュール1ファイルで書いていきます。

全体のコード

chapter_one.ex

defmodule ChapterOne do
  @moduledoc """
  Solutions for chapter 1 ("warm-up") of the NLP 100 Exercises.

  Every exercise is exposed as a public function; `run/0` calls each of them
  with the input given in the exercise text and prints the result.
  """

  # 00. Reversed string
  @doc "Returns `str` with its characters in reverse order."
  def reverse(str), do: String.reverse(str)

  # 01. "パタトクカシーー"
  @doc "Joins the 1st, 3rd, 5th, ... code points of `str` into a new string."
  def pick_odd(str) do
    str
    |> String.codepoints()
    |> Enum.take_every(2)
    |> Enum.join()
  end

  # 02. "パトカー" + "タクシー" = "パタトクカシーー"
  @doc """
  Interleaves the code points of `str1` and `str2`, starting with `str1`.

  `Enum.zip/1` stops at the shorter input, so surplus code points of the
  longer string are dropped.
  """
  def merge_alternately(str1, str2) do
    pairs =
      [str1, str2]
      |> Enum.map(&String.codepoints/1)
      |> Enum.zip()

    pairs
    |> Enum.map(&Tuple.to_list/1)
    |> Enum.concat()
    |> Enum.join()
  end

  # 03. Pi
  @doc """
  Returns the length of every whitespace-separated word of `sentence`, in
  order of appearance. Commas and periods are removed before counting.
  """
  def num_of_characters_by_word(sentence) do
    sentence
    |> String.replace(~r/[,.]/, "")
    |> String.split()
    |> Enum.map(&String.length/1)
  end

  # 04. Chemical element symbols
  @doc """
  Maps the first character of every word of `sentence` to the 1-based
  position of that word. A later word overwrites an earlier one that starts
  with the same character.
  """
  def take_first_char_of_words(sentence) do
    sentence
    |> String.split()
    |> Enum.map(&String.first/1)
    |> Enum.with_index(1)
    |> Enum.into(%{})
  end

  @doc """
  Maps the leading characters of every word of `sentence` to the 1-based
  position of that word.

  `nums_of_char` gives, per word, how many leading characters to keep (each
  entry must be a non-negative integer). `Enum.zip/2` stops at the shorter
  of the two lists, so words without a matching entry are ignored.
  """
  def take_first_char_of_words(sentence, nums_of_char) do
    sentence
    |> String.split()
    |> Enum.zip(nums_of_char)
    # An explicit length (instead of the range 0..num-1) keeps a count of 0
    # meaning "no characters"; the range 0..-1 would select the whole word.
    |> Enum.map(fn {word, num} -> String.slice(word, 0, num) end)
    |> Enum.with_index(1)
    |> Enum.into(%{})
  end

  @doc """
  Replaces the element at each (0-based) index of the second argument with
  `value` and returns the resulting list.
  """
  def repeat_replace(list, [], _value), do: list

  def repeat_replace(list, [index | rest], value) do
    list
    |> List.replace_at(index, value)
    |> repeat_replace(rest, value)
  end

  # 05. n-gram
  @doc """
  Builds the n-grams of `str`.

  With `:word` the result is a list of `n`-element word lists; with `:char`
  it is a list of `n`-code-point strings. Trailing chunks shorter than `n`
  are discarded.
  """
  def n_gram(str, n, :word) do
    str
    |> String.split()
    |> Enum.chunk_every(n, 1, :discard)
  end

  def n_gram(str, n, :char) do
    str
    |> String.codepoints()
    |> Enum.chunk_every(n, 1, :discard)
    |> Enum.map(&List.to_string/1)
  end

  # 06. Sets
  @doc "Returns the union of the `MapSet`s `a` and `b`."
  def get_union(a, b), do: MapSet.union(a, b)

  @doc "Returns the intersection of the `MapSet`s `a` and `b`."
  def get_intersection(a, b), do: MapSet.intersection(a, b)

  @doc "Returns the elements of the `MapSet` `a` that are not in `b`."
  def get_difference(a, b), do: MapSet.difference(a, b)

  # 07. Sentence generation from a template
  @doc "Interpolates `x`, `y` and `z` into the template \"x時のyはz\"."
  def template(x, y, z), do: "#{x}時の#{y}は#{z}"

  # 08. Cipher
  @doc """
  Replaces every lowercase ASCII letter of `str` by the character with code
  `219 - code` and leaves all other characters untouched. Applying the
  function twice yields the original string.
  """
  def cipher(str) do
    str
    |> String.to_charlist()
    |> Enum.map(&_cipher/1)
    |> List.to_string()
  end

  # Maps one code point: ?a..?z (97..122) is mirrored, anything else is kept.
  def _cipher(int) when int >= ?a and int <= ?z, do: 219 - int
  def _cipher(int), do: int

  # 09. Typoglycemia
  @doc """
  Randomly reorders the inner characters of every word longer than four
  characters, keeping the first and the last character in place.

  Punctuation marks are split off into their own tokens before shuffling and
  re-attached afterwards; runs of whitespace collapse into a single space.
  """
  def make_typoglycemia(str) do
    str
    |> insert_spaces_before_marks()
    |> String.split()
    |> Enum.map(&String.codepoints/1)
    |> Enum.map(&shuffle_inside/1)
    |> Enum.map(&Enum.join/1)
    |> Enum.join(" ")
    |> remove_spaces_before_marks()
  end

  @doc """
  Shuffles all elements of `list` except the first and the last one.
  Lists with four or fewer elements are returned unchanged.
  """
  def shuffle_inside([first | rest] = list) when length(list) > 4 do
    # Split off the last element without the descending range 1..-2, which
    # modern Elixir treats as a negative-step range and warns about.
    {inside, [last]} = Enum.split(rest, -1)
    Enum.concat([[first], Enum.shuffle(inside), [last]])
  end

  def shuffle_inside(list), do: list

  @doc "Inserts a space in front of every `,`, `.`, `:`, `;`, `!` and `?`."
  def insert_spaces_before_marks(str) do
    Enum.reduce([",", ".", ":", ";", "!", "?"], str, fn mark, acc ->
      String.replace(acc, mark, " " <> mark)
    end)
  end

  @doc "Removes one space in front of every `,`, `.`, `:`, `;`, `!` and `?`."
  def remove_spaces_before_marks(str) do
    Enum.reduce([",", ".", ":", ";", "!", "?"], str, fn mark, acc ->
      String.replace(acc, " " <> mark, mark)
    end)
  end

  # Run everything
  @doc """
  Executes every exercise with the input from the exercise text and prints
  each result via `IO.inspect/2`. Returns the sentence produced by
  exercise 09.
  """
  def run() do
    reverse("stressed")
    |> IO.inspect(label: "00. 文字列の逆順")

    pick_odd("パタトクカシーー")
    |> IO.inspect(label: "01. 「パタトクカシーー」")

    merge_alternately("パトカー", "タクシー")
    |> IO.inspect(label: "02. 「パトカー」＋「タクシー」＝「パタトクカシーー」")

    num_of_characters_by_word("Now I need a drink, alcoholic of course, after the heavy lectures involving quantum mechanics.")
    |> IO.inspect(label: "03. 円周率")

    # The exercise numbers words from 1; List.replace_at/3 is 0-based.
    indexes_in_1 = [1, 5, 6, 7, 8, 9, 15, 16, 19] |> Enum.map(&(&1 - 1))
    nums_of_char = List.duplicate(2, 20) |> repeat_replace(indexes_in_1, 1)
    take_first_char_of_words("Hi He Lied Because Boron Could Not Oxidize Fluorine. New Nations Might Also Sign Peace Security Clause. Arthur King Can.", nums_of_char)
    |> IO.inspect(label: "04. 元素記号")

    n_gram("I am an NLPer", 2, :word)
    |> IO.inspect(label: "05. n-gram 単語")
    n_gram("I am an NLPer", 2, :char)
    |> IO.inspect(label: "05. n-gram 文字")

    x = n_gram("paraparaparadise", 2, :char) |> MapSet.new()
    y = n_gram("paragraph", 2, :char) |> MapSet.new()
    # The exercise also asks whether the bi-gram "se" occurs in X and in Y.
    MapSet.member?(x, "se")
    |> IO.inspect(label: "06. 集合 Xに'se'を含む")
    MapSet.member?(y, "se")
    |> IO.inspect(label: "06. 集合 Yに'se'を含む")
    get_union(x, y)
    |> IO.inspect(label: "06. 集合 和")
    get_intersection(x, y)
    |> IO.inspect(label: "06. 集合 積")
    get_difference(x, y)
    |> IO.inspect(label: "06. 集合 差")

    x = 12
    y = "気温"
    z = 22.4
    template(x, y, z)
    |> IO.inspect(label: "07. テンプレートによる文生成")

    cipher("Elixir is a dynamic, functional language designed for building scalable and maintainable applications.")
    |> IO.inspect(label: "08. 暗号文 暗号化")
    |> cipher()
    |> IO.inspect(label: "08. 暗号文 復号化")

    make_typoglycemia("I couldn't believe that I could actually understand what I was reading : the phenomenal power of the human mind .")
    |> IO.inspect(label: "09, Typoglycemia")
  end
end

00. 文字列の逆順

文字列"stressed"の文字を逆に（末尾から先頭に向かって）並べた文字列を得よ．

String.reverse/1 を使うだけ

  # Returns `str` with its characters in reverse order.
  def reverse(str) do
    String.reverse(str)
  end

  # Prints the reversed form of "stressed" under the exercise label.
  def run() do
    reversed = reverse("stressed")
    IO.inspect(reversed, label: "00. 文字列の逆順")
  end

01. 「パタトクカシーー」

「パタトクカシーー」という文字列の1,3,5,7文字目を取り出して連結した文字列を得よ．

Enumで扱うためにString.codepoints/1で文字配列に変換し、Enum.take_every/2とEnum.join/2で処理する。

  # Joins the 1st, 3rd, 5th, ... code points of `str` into a new string.
  def pick_odd(str) do
    str
    |> String.codepoints()
    |> Enum.take_every(2)
    |> Enum.join()
  end

  # Prints the odd-positioned characters of the exercise string.
  def run() do
    picked = pick_odd("パタトクカシーー")
    IO.inspect(picked, label: "01. 「パタトクカシーー」")
  end

02. 「パトカー」＋「タクシー」＝「パタトクカシーー」

「パトカー」＋「タクシー」の文字を先頭から交互に連結して文字列「パタトクカシーー」を得よ．

複数のEnumerableの要素をまとめるにはEnum.zip/1を使う。
Enum.zip/1はTupleのListを返す。TupleはEnumerableではないので、Tuple.to_list/1でListに変換したのちEnum.concat/1とEnum.join/1で処理する。

  # Interleaves the code points of `str1` and `str2`, starting with `str1`.
  # Enum.zip/1 stops at the shorter input, so surplus code points of the
  # longer string are dropped.
  def merge_alternately(str1, str2) do
    pairs =
      [str1, str2]
      |> Enum.map(&String.codepoints/1)
      |> Enum.zip()

    pairs
    |> Enum.map(&Tuple.to_list/1)
    |> Enum.concat()
    |> Enum.join()
  end

  # Prints the interleaving of the two exercise words.
  def run() do
    merged = merge_alternately("パトカー", "タクシー")
    IO.inspect(merged, label: "02. 「パトカー」＋「タクシー」＝「パタトクカシーー」")
  end

03. 円周率

"Now I need a drink, alcoholic of course, after the heavy lectures involving quantum mechanics."という文を単語に分解し，各単語の（アルファベットの）文字数を先頭から出現順に並べたリストを作成せよ．

String.split/1を使用し、文を空白で分割する。アルファベットのみをカウントするので、分割前にString.replace/4でコンマとピリオドを取り除く。
各単語の文字数はString.length/1で取得する。

  # Returns the length of every whitespace-separated word of `sentence`, in
  # order of appearance. Commas and periods are removed before counting.
  def num_of_characters_by_word(sentence) do
    words =
      sentence
      |> String.replace(~r/[,.]/, "")
      |> String.split()

    Enum.map(words, &String.length/1)
  end

  # Prints the per-word character counts of the exercise sentence.
  def run() do
    sentence = "Now I need a drink, alcoholic of course, after the heavy lectures involving quantum mechanics."

    sentence
    |> num_of_characters_by_word()
    |> IO.inspect(label: "03. 円周率")
  end

04. 元素記号

"Hi He Lied Because Boron Could Not Oxidize Fluorine. New Nations Might Also Sign Peace Security Clause. Arthur King Can."という文を単語に分解し，1, 5, 6, 7, 8, 9, 15, 16, 19番目の単語は先頭の1文字，それ以外の単語は先頭に2文字を取り出し，取り出した文字列から単語の位置（先頭から何番目の単語か）への連想配列（辞書型もしくはマップ型）を作成せよ．

単語の先頭何文字取るのかを指定するリストを入力するようにした。このリストを作成するために関数repeat_replace/3を作成した。
String.split/1で文を単語に分割したあとEnum.zip/2で文字数指定リストと組み合わせ、String.slice/3を使って先頭の文字を取り出す。
Enum.with_index/2でindexをつけたあと、Enum.into/2でMapに変換する。

  # Maps the first character of every word of `sentence` to the 1-based
  # position of that word. A later word overwrites an earlier one that
  # starts with the same character.
  def take_first_char_of_words(sentence) do
    initials =
      sentence
      |> String.split()
      |> Enum.map(&String.first/1)

    initials
    |> Enum.with_index(1)
    |> Enum.into(%{})
  end
  # Maps the leading characters of every word of `sentence` to the 1-based
  # position of that word. `nums_of_char` gives, per word, how many leading
  # characters to keep (each entry must be a non-negative integer).
  # Enum.zip/2 stops at the shorter of the two lists, so words without a
  # matching entry are ignored.
  def take_first_char_of_words(sentence, nums_of_char) do
    sentence
    |> String.split()
    |> Enum.zip(nums_of_char)
    # An explicit length (instead of the range 0..num-1) keeps a count of 0
    # meaning "no characters"; the range 0..-1 would select the whole word.
    |> Enum.map(fn {word, num} -> String.slice(word, 0, num) end)
    |> Enum.with_index(1)
    |> Enum.into(%{})
  end

  # Replaces the element at each (0-based) index of the second argument with
  # `value`, one index per recursive step, and returns the resulting list.
  def repeat_replace(list, [index | rest], value) do
    updated = List.replace_at(list, index, value)
    repeat_replace(updated, rest, value)
  end
  def repeat_replace(list, [], _value), do: list

  # Builds the per-word character counts (1 for the listed word positions,
  # 2 for all others) and prints the resulting symbol-to-position map.
  def run() do
    # The exercise numbers words from 1; List.replace_at/3 is 0-based.
    indexes_in_1 = Enum.map([1, 5, 6, 7, 8, 9, 15, 16, 19], &(&1 - 1))
    all_twos = List.duplicate(2, 20)
    nums_of_char = repeat_replace(all_twos, indexes_in_1, 1)
    sentence = "Hi He Lied Because Boron Could Not Oxidize Fluorine. New Nations Might Also Sign Peace Security Clause. Arthur King Can."

    sentence
    |> take_first_char_of_words(nums_of_char)
    |> IO.inspect(label: "04. 元素記号")
  end

05. n-gram

与えられたシーケンス（文字列やリストなど）からn-gramを作る関数を作成せよ．この関数を用い，"I am an NLPer"という文から単語bi-gram，文字bi-gramを得よ．

単語n-gramか文字n-gramかをatomで指定するようにした。単語の場合はString.split/1、文字の場合はString.codepoints/1でListに変換し、Enum.chunk_every/4を利用して指定した要素数ごとのListを得る。文字の場合はList.to_string/1でStringに戻す。

  # Builds the n-grams of `str`. With :word the result is a list of
  # n-element word lists; with :char it is a list of n-code-point strings.
  # Trailing chunks shorter than `n` are discarded.
  def n_gram(str, n, :word) do
    words = String.split(str)
    Enum.chunk_every(words, n, 1, :discard)
  end
  def n_gram(str, n, :char) do
    chunks =
      str
      |> String.codepoints()
      |> Enum.chunk_every(n, 1, :discard)

    Enum.map(chunks, &List.to_string/1)
  end

  # Prints the word bi-grams and then the character bi-grams of the
  # exercise sentence.
  def run() do
    sentence = "I am an NLPer"

    sentence
    |> n_gram(2, :word)
    |> IO.inspect(label: "05. n-gram 単語")

    sentence
    |> n_gram(2, :char)
    |> IO.inspect(label: "05. n-gram 文字")
  end

06. 集合

"paraparaparadise"と"paragraph"に含まれる文字bi-gramの集合を，それぞれ, XとYとして求め，XとYの和集合，積集合，差集合を求めよ．さらに，'se'というbi-gramがXおよびYに含まれるかどうかを調べよ．

集合を扱うためのMapSetモジュールを利用する。ずばりそのままの関数MapSet.union/2、MapSet.intersection/2、MapSet.difference/2があるのでそれを使うだけ。

  # Union of the MapSets `a` and `b`.
  def get_union(a, b) do
    MapSet.union(a, b)
  end

  # Elements contained in both `a` and `b`.
  def get_intersection(a, b) do
    MapSet.intersection(a, b)
  end

  # Elements of `a` that are not contained in `b`.
  def get_difference(a, b) do
    MapSet.difference(a, b)
  end

  # Builds the character bi-gram sets X and Y of the two exercise words and
  # prints the "se" membership checks, then their union, intersection and
  # difference.
  def run() do
    x = n_gram("paraparaparadise", 2, :char) |> MapSet.new()
    y = n_gram("paragraph", 2, :char) |> MapSet.new()
    # The exercise also asks whether the bi-gram "se" occurs in X and in Y.
    MapSet.member?(x, "se")
    |> IO.inspect(label: "06. 集合 Xに'se'を含む")
    MapSet.member?(y, "se")
    |> IO.inspect(label: "06. 集合 Yに'se'を含む")
    get_union(x, y)
    |> IO.inspect(label: "06. 集合 和")
    get_intersection(x, y)
    |> IO.inspect(label: "06. 集合 積")
    get_difference(x, y)
    |> IO.inspect(label: "06. 集合 差")
  end

07. テンプレートによる文生成

引数x, y, zを受け取り「x時のyはz」という文字列を返す関数を実装せよ．さらに，x=12, y="気温", z=22.4として，実行結果を確認せよ．

文字列""の中で#{}を使うとその中の式を評価し、文字列として結合してくれる。
https://elixir-lang.org/getting-started/basic-types.html#strings

  # Interpolates `x`, `y` and `z` into the template "x時のyはz".
  def template(x, y, z) do
    "#{x}時の#{y}は#{z}"
  end

  # Prints the template filled with the values from the exercise text.
  def run() do
    x = 12
    y = "気温"
    z = 22.4
    template(x, y, z)
    |> IO.inspect(label: "07. テンプレートによる文生成")
  end

08. 暗号文

与えられた文字列の各文字を，以下の仕様で変換する関数cipherを実装せよ．

英小文字ならば(219 - 文字コード)の文字に置換

その他の文字はそのまま出力

この関数を用い，英語のメッセージを暗号化・復号化せよ．

英小文字のASCIIコードは97~122。https://ja.wikipedia.org/wiki/ASCII

文字列をASCIIコードとして扱うためにString.to_charlist/1を使う。変換後はList.to_string/1で文字列に戻す。
String.to_charlistはStringをcharlist（各文字のUnicodeコードポイントを表す整数(Integer)のList）にして返すが、String.codepointsはStringのListを返す。

"パトカー" |> String.codepoints # ["パ", "ト", "カ", "ー"]
"パトカー" |> String.to_charlist # [12497, 12488, 12459, 12540]

Elixirでは""（ダブルクォーテーション）で囲むとString、''（シングルクォーテーション）で囲むとcharlistになる。
https://elixir-lang.org/getting-started/binaries-strings-and-char-lists.html

  # Converts `str` to a charlist, passes every code point through
  # `_cipher/1` and turns the result back into a string.
  def cipher(str) do
    encoded =
      str
      |> String.to_charlist()
      |> Enum.map(&_cipher/1)

    List.to_string(encoded)
  end

  # Mirrors a lowercase ASCII letter (?a..?z, i.e. 97..122) to `219 - code`;
  # every other code point is returned unchanged.
  def _cipher(int) when int >= ?a and int <= ?z do
    219 - int
  end
  def _cipher(int) do
    int
  end

  # Encrypts the sample message, prints it, then applies the cipher again
  # to the encrypted text and prints that result too.
  def run() do
    message = "Elixir is a dynamic, functional language designed for building scalable and maintainable applications."

    message
    |> cipher()
    |> IO.inspect(label: "08. 暗号文 暗号化")
    |> cipher()
    |> IO.inspect(label: "08. 暗号文 復号化")
  end

09. Typoglycemia

スペースで区切られた単語列に対して，各単語の先頭と末尾の文字は残し，それ以外の文字の順序をランダムに並び替えるプログラムを作成せよ．ただし，長さが４以下の単語は並び替えないこととする．適当な英語の文（例えば"I couldn't believe that I could actually understand what I was reading : the phenomenal power of the human mind ."）を与え，その実行結果を確認せよ．

Typoglycemia

ランダム並び替えにはEnum.shuffle/1を利用した。
コンマやピリオドを適切に扱うためにスペースを追加、削除する関数を用意した。（例文では予めスペースが入れてあるため処理しなくても問題ない。）

  # Splits `str` into tokens (punctuation marks become tokens of their own),
  # shuffles the inner characters of each token via `shuffle_inside/1`,
  # joins the tokens with single spaces and re-attaches the punctuation.
  def make_typoglycemia(str) do
    tokens =
      str
      |> insert_spaces_before_marks()
      |> String.split()

    tokens
    |> Enum.map(fn token ->
      token
      |> String.codepoints()
      |> shuffle_inside()
      |> Enum.join()
    end)
    |> Enum.join(" ")
    |> remove_spaces_before_marks()
  end

  # Shuffles all elements of `list` except the first and the last one.
  # Lists with four or fewer elements are returned unchanged.
  def shuffle_inside([first | rest] = list) when length(list) > 4 do
    # Split off the last element without the descending range 1..-2, which
    # modern Elixir treats as a negative-step range and warns about.
    {inside, [last]} = Enum.split(rest, -1)
    Enum.concat([[first], Enum.shuffle(inside), [last]])
  end
  def shuffle_inside(list), do: list

  # Inserts a space in front of every `,`, `.`, `:`, `;`, `!` and `?`,
  # handling one kind of mark after the other.
  def insert_spaces_before_marks(str) do
    Enum.reduce([",", ".", ":", ";", "!", "?"], str, fn mark, acc ->
      String.replace(acc, mark, " " <> mark)
    end)
  end
  # Removes one space in front of every `,`, `.`, `:`, `;`, `!` and `?`,
  # handling one kind of mark after the other.
  def remove_spaces_before_marks(str) do
    Enum.reduce([",", ".", ":", ";", "!", "?"], str, fn mark, acc ->
      String.replace(acc, " " <> mark, mark)
    end)
  end

  # Prints a typoglycemia version of the exercise sentence.
  def run() do
    sentence = "I couldn't believe that I could actually understand what I was reading : the phenomenal power of the human mind ."

    sentence
    |> make_typoglycemia()
    |> IO.inspect(label: "09, Typoglycemia")
  end

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up