自然言語処理をdocker(186)で
https://qiita.com/kaizen_nagoya/items/e29cbaed8370e7913487
tr, awkの処理を自動化することを検討。
失敗の記録:文字列処理スクリプトの場合 docker(188)
https://qiita.com/kaizen_nagoya/items/dcf07a892ce32966fbb3
から出発。
PDFtoTEXTでは、単語分割がうまくできなかった。Acrobat ReaderでTXTファイルとして出力する方が精度が高かった。
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
#
# Recorded snapshot of ttowc.sh: for each entry in the current directory,
# lower-case a text file and feed the result to an awk program.
# All ../ paths are resolved against the directory the script is started
# from, not against the location of the script file.
#
# `*` expands to every entry name in the current directory, extension
# included, so an entry that already ends in .txt produces an input path
# ending in ".txt.txt" once ".txt" is appended below.
for File in *; do
# Map A-Z to a-z; reads ../text/<name>.txt and writes ../text/<name>.smt.
tr 'A-Z' 'a-z' < ../text/${File}.txt > ../text/${File}.smt
# NOTE(review): this reads from ../txt/ although the line above wrote the
# .smt file to ../text/ -- confirm which directory is intended.
# The output directory ../wc/ must already exist; the redirection does not
# create it.
awk -f ../gs.awk ../txt/${File}.smt > ../wc/${File}.wc
done
# chmod +x ttowc.sh
root@bbf22e8ed49c:/rmp/llm# ./ttowc.sh
./ttowc.sh: line 6: ../text/2101.00678v1.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2101.00678v1.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2309.14322v2.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2309.14322v2.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2311.16502v4.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2311.16502v4.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1.wc.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1.wc.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1L.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1L.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1s.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1s.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/text.txt: No such file or directory
./ttowc.sh: line 7: ../wc/text.wc: No such file or directory
./ttowc.sh: line 6: ../text/ttowc.sh.txt: No such file or directory
./ttowc.sh: line 7: ../wc/ttowc.sh.wc: No such file or directory
./ttowc.sh: line 6: ../text/wc.txt: No such file or directory
./ttowc.sh: line 7: ../wc/wc.wc: No such file or directory
./ttowc.sh: line 6: ../text/wc.awk.txt: No such file or directory
./ttowc.sh: line 7: ../wc/wc.awk.wc: No such file or directory
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
#
# Recorded snapshot of ttowc.sh: same loop as before, but the input name is
# now used as-is instead of having ".txt" appended.
# The ../ prefixes still point one level above the directory the script is
# started from.
for File in *; do
# Map A-Z to a-z; reads ../text/<name> and writes ../text/<name>.smt.
tr 'A-Z' 'a-z' < ../text/${File} > ../text/${File}.smt
# NOTE(review): input is taken from ../txt/ while the .smt file above was
# written to ../text/ -- confirm which directory is intended.
awk -f ../gs.awk ../txt/${File}.smt > ../wc/${File}.wc
done
root@bbf22e8ed49c:/rmp/llm# ./ttowc.sh
./ttowc.sh: line 6: ../text/2101.00678v1.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2101.00678v1.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2309.14322v2.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2309.14322v2.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2311.16502v4.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2311.16502v4.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1.wc: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1.wc.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1L.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1L.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1s.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1s.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/text: No such file or directory
./ttowc.sh: line 7: ../wc/text.wc: No such file or directory
./ttowc.sh: line 6: ../text/ttowc.sh: No such file or directory
./ttowc.sh: line 7: ../wc/ttowc.sh.wc: No such file or directory
./ttowc.sh: line 6: ../text/wc: No such file or directory
./ttowc.sh: line 7: ../wc/wc.wc: No such file or directory
./ttowc.sh: line 6: ../text/wc.awk: No such file or directory
./ttowc.sh: line 7: ../wc/wc.awk.wc: No such file or directory
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
#
# Recorded snapshot of ttowc.sh: data paths now use ./text, ./txt and ./wc
# below the current directory; the awk program is still looked up one level
# above it.
#
# `*` iterates over every entry of the current directory, not over the
# contents of ./text, so names that do not exist under ./text are attempted
# as well.
for File in *; do
# Map A-Z to a-z; reads ./text/<name> and writes ./text/<name>.smt.
tr 'A-Z' 'a-z' < ./text/${File} > ./text/${File}.smt
# awk loads its program from ../gs.awk (relative to the current directory).
# NOTE(review): input is taken from ./txt/ while the .smt file above was
# written to ./text/ -- confirm which directory is intended.
awk -f ../gs.awk ./txt/${File}.smt > ./wc/${File}.wc
done
root@bbf22e8ed49c:/rmp/llm# ./ttowc.sh
awk: cannot open "../gs.awk" (No such file or directory)
awk: cannot open "../gs.awk" (No such file or directory)
awk: cannot open "../gs.awk" (No such file or directory)
awk: cannot open "../gs.awk" (No such file or directory)
./ttowc.sh: line 6: ./text/2408.16293v1.wc: No such file or directory
awk: cannot open "../gs.awk" (No such file or directory)
awk: cannot open "../gs.awk" (No such file or directory)
awk: cannot open "../gs.awk" (No such file or directory)
./ttowc.sh: line 6: ./text/text: No such file or directory
awk: cannot open "../gs.awk" (No such file or directory)
./ttowc.sh: line 6: ./text/ttowc.sh: No such file or directory
awk: cannot open "../gs.awk" (No such file or directory)
./ttowc.sh: line 6: ./text/wc: No such file or directory
awk: cannot open "../gs.awk" (No such file or directory)
./ttowc.sh: line 6: ./text/wc.awk: No such file or directory
awk: cannot open "../gs.awk" (No such file or directory)
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
#
# Recorded snapshot of ttowc.sh: the awk program is now wc.awk, looked up in
# the current directory; data paths are unchanged from the previous listing.
for File in *; do
# Map A-Z to a-z; reads ./text/<name> and writes ./text/<name>.smt.
tr 'A-Z' 'a-z' < ./text/${File} > ./text/${File}.smt
# NOTE(review): the .smt file is written to ./text/ above but read from
# ./txt/ here, so awk cannot find it unless ./txt/ holds a copy.
awk -f wc.awk ./txt/${File}.smt > ./wc/${File}.wc
done
root@bbf22e8ed49c:/rmp/llm# ./ttowc.sh
awk: cannot open "./txt/2101.00678v1.txt.smt" (No such file or directory)
awk: cannot open "./txt/2309.14322v2.txt.smt" (No such file or directory)
awk: cannot open "./txt/2311.16502v4.txt.smt" (No such file or directory)
awk: cannot open "./txt/2408.16293v1.txt.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/2408.16293v1.wc: No such file or directory
awk: cannot open "./txt/2408.16293v1.wc.smt" (No such file or directory)
awk: cannot open "./txt/2408.16293v1L.txt.smt" (No such file or directory)
awk: cannot open "./txt/2408.16293v1s.txt.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/text: No such file or directory
awk: cannot open "./txt/text.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/ttowc.sh: No such file or directory
awk: cannot open "./txt/ttowc.sh.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/wc: No such file or directory
awk: cannot open "./txt/wc.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/wc.awk: No such file or directory
awk: cannot open "./txt/wc.awk.smt" (No such file or directory)
# mkdir small
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
#
# Recorded snapshot of ttowc.sh: lower-cased copies now go to ./small under
# the same file name (no .smt suffix), and awk reads them back from there.
# ./small and ./wc must already exist; the redirections do not create them.
#
# `*` still expands to every entry of the current directory -- including
# sub-directories and the script itself -- rather than to the files in ./text.
for File in *; do
# Map A-Z to a-z; reads ./text/<name> and writes ./small/<name>.
tr 'A-Z' 'a-z' < ./text/${File} > ./small/${File}
# Run wc.awk (taken from the current directory) on the lower-cased copy and
# store its output as ./wc/<name>.wc.
awk -f wc.awk ./small/${File} > ./wc/${File}.wc
done
はっと気がついた。これまでの処理でできた不必要なファイルまで処理対象にしているせいで、エラーになっているのかもしれない。
おお、いっぱい無駄なファイルがある。データ中心設計っすね。
root@bbf22e8ed49c:/rmp/llm# ./ttowc.sh
./ttowc.sh: line 6: ./text/2408.16293v1.wc: No such file or directory
awk: cannot open "./small/2408.16293v1.wc.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/2408.16293v1L.txt: No such file or directory
awk: cannot open "./small/2408.16293v1L.txt.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/2408.16293v1s.txt: No such file or directory
awk: cannot open "./small/2408.16293v1s.txt.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/small: No such file or directory
awk: cannot open "./small/small.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/text: No such file or directory
awk: cannot open "./small/text.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/ttowc.sh: No such file or directory
awk: cannot open "./small/ttowc.sh.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/wc: No such file or directory
awk: cannot open "./small/wc.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/wc.awk: No such file or directory
awk: cannot open "./small/wc.awk.smt" (No such file or directory)
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
#
# Recorded snapshot of ttowc.sh: input files are now read from the current
# directory itself, and results go to the sibling directories ../small and
# ../wc, so the script has to be started from inside the directory that holds
# the text files.
for File in *; do
# Map A-Z to a-z; reads ./<name> and writes ../small/<name>.
tr 'A-Z' 'a-z' < ./${File} > ../small/${File}
# wc.awk is looked up in the current directory, i.e. next to the input files,
# not next to the script.
awk -f wc.awk ../small/${File} > ../wc/${File}.wc
done
root@bbf22e8ed49c:/rmp/llm# cd text
root@bbf22e8ed49c:/rmp/llm/text# ../ttowc.sh
../ttowc.sh: line 6: ./text/2101.00678v1.txt: No such file or directory
../ttowc.sh: line 7: ./wc/2101.00678v1.txt.wc: No such file or directory
../ttowc.sh: line 6: ./text/2309.14322v2.txt: No such file or directory
../ttowc.sh: line 7: ./wc/2309.14322v2.txt.wc: No such file or directory
../ttowc.sh: line 6: ./text/2311.16502v4.txt: No such file or directory
../ttowc.sh: line 7: ./wc/2311.16502v4.txt.wc: No such file or directory
../ttowc.sh: line 6: ./text/2408.16293v1.txt: No such file or directory
../ttowc.sh: line 7: ./wc/2408.16293v1.txt.wc: No such file or directory
../ttowc.sh: line 6: ./text/old: No such file or directory
../ttowc.sh: line 7: ./wc/old.wc: No such file or directory
ここで新たな問題が発覚。スクリプトをこの記事上で編集し、同じ修正をdocker側に手で打ち直していたため、docker側のスクリプトでは ./text/ という文字列を削除し忘れていた。
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
#
# Recorded snapshot of ttowc.sh: the loop is restricted to *.txt entries of
# the current directory; lower-cased copies go to ../small/<name>.smt and the
# awk output to ../wc/<name>.wc. Meant to be started from inside the directory
# that holds the .txt files.
#
# The two reference URLs above must stay comments: without the leading '#'
# bash would try to execute each URL as a command.
for File in *.txt; do
# NOTE(review): File already ends in .txt (it comes from the *.txt glob), so
# "${File}.txt" names <name>.txt.txt. Kept as recorded; drop the extra
# ".txt" to read the matched file itself.
tr 'A-Z' 'a-z' < ./${File}.txt > ../small/${File}.smt
# NOTE(review): wc.awk is looked up in the current directory, so it has to be
# present next to the .txt files (or be referenced as ../wc.awk).
awk -f wc.awk ../small/${File}.smt > ../wc/${File}.wc
done
root@bbf22e8ed49c:/rmp/llm/text# ./ttowc.sh
./ttowc.sh: line 7: ./2101.00678v1.txt.txt: No such file or directory
awk: cannot open "wc.awk" (No such file or directory)
./ttowc.sh: line 7: ./2309.14322v2.txt.txt: No such file or directory
awk: cannot open "wc.awk" (No such file or directory)
./ttowc.sh: line 7: ./2311.16502v4.txt.txt: No such file or directory
awk: cannot open "wc.awk" (No such file or directory)
./ttowc.sh: line 7: ./2408.16293v1.txt.txt: No such file or directory
awk: cannot open "wc.awk" (No such file or directory)
root@bbf22e8ed49c:/rmp/llm/text# ./ttowc.sh
awk: cannot open "wc.awk" (No such file or directory)
awk: cannot open "wc.awk" (No such file or directory)
awk: cannot open "wc.awk" (No such file or directory)
awk: cannot open "wc.awk" (No such file or directory)
root@bbf22e8ed49c:/rmp/llm/text# cp ../wc.awk .
root@bbf22e8ed49c:/rmp/llm/text# ./ttowc.sh
やっと動いた。12回くらいの試行錯誤だった。