1
1

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

Qiita 自然言語システムAdvent Calendar 2024

Day 9

awk, tr 用 shell script ttowc.sh docker(187)

Last updated at Posted at 2024-11-10

自然言語処理をdocker(186)で
https://qiita.com/kaizen_nagoya/items/e29cbaed8370e7913487

tr, awkの処理を自動化することを検討。

失敗の記録:文字列処理スクリプトの場合 docker(188)
https://qiita.com/kaizen_nagoya/items/dcf07a892ce32966fbb3
から出発。

PDFtoTEXTは、単語分割がうまくできなかった。Acrobat ReaderでTXTファイル出力の方が精度が高かった。

ttowc.sh
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
# Attempt 1: for every entry of the current directory, lower-case it with tr and run gs.awk to produce a .wc file.
 for File in *; do  # '*' yields names that already carry their extension (e.g. NAME.txt)
	    tr 'A-Z' 'a-z' < ../text/${File}.txt > ../text/${File}.smt  # so the input path becomes NAME.txt.txt
	    awk -f ../gs.awk ../txt/${File}.smt > ../wc/${File}.wc  # NOTE(review): reads ../txt/ while tr wrote to ../text/ -- confirm which is intended
 done
bash
 # chmod +x ttowc.sh 
bash
 root@bbf22e8ed49c:/rmp/llm# ./ttowc.sh
./ttowc.sh: line 6: ../text/2101.00678v1.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2101.00678v1.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2309.14322v2.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2309.14322v2.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2311.16502v4.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2311.16502v4.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1.wc.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1.wc.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1L.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1L.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1s.txt.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1s.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/text.txt: No such file or directory
./ttowc.sh: line 7: ../wc/text.wc: No such file or directory
./ttowc.sh: line 6: ../text/ttowc.sh.txt: No such file or directory
./ttowc.sh: line 7: ../wc/ttowc.sh.wc: No such file or directory
./ttowc.sh: line 6: ../text/wc.txt: No such file or directory
./ttowc.sh: line 7: ../wc/wc.wc: No such file or directory
./ttowc.sh: line 6: ../text/wc.awk.txt: No such file or directory
./ttowc.sh: line 7: ../wc/wc.awk.wc: No such file or directory
ttowc.sh
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
# Attempt 2: the extra ".txt" suffix on the tr input is gone; every path is still relative to the parent directory.
 for File in *; do
	    tr 'A-Z' 'a-z' < ../text/${File} > ../text/${File}.smt  # the recorded run from /rmp/llm finds nothing under ../text/
	    awk -f ../gs.awk ../txt/${File}.smt > ../wc/${File}.wc  # the redirect into ../wc/ fails in the recorded run as well
 done
bash
root@bbf22e8ed49c:/rmp/llm# ./ttowc.sh
./ttowc.sh: line 6: ../text/2101.00678v1.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2101.00678v1.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2309.14322v2.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2309.14322v2.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2311.16502v4.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2311.16502v4.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1.wc: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1.wc.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1L.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1L.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/2408.16293v1s.txt: No such file or directory
./ttowc.sh: line 7: ../wc/2408.16293v1s.txt.wc: No such file or directory
./ttowc.sh: line 6: ../text/text: No such file or directory
./ttowc.sh: line 7: ../wc/text.wc: No such file or directory
./ttowc.sh: line 6: ../text/ttowc.sh: No such file or directory
./ttowc.sh: line 7: ../wc/ttowc.sh.wc: No such file or directory
./ttowc.sh: line 6: ../text/wc: No such file or directory
./ttowc.sh: line 7: ../wc/wc.wc: No such file or directory
./ttowc.sh: line 6: ../text/wc.awk: No such file or directory
./ttowc.sh: line 7: ../wc/wc.awk.wc: No such file or directory
ttowc.sh
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
# Attempt 3: data directories are now ./text, ./txt and ./wc below the current directory.
 for File in *; do  # still lists the current directory, not ./text, so names absent from ./text fail on the tr line
	    tr 'A-Z' 'a-z' < ./text/${File} > ./text/${File}.smt
	    awk -f ../gs.awk ./txt/${File}.smt > ./wc/${File}.wc  # awk cannot open ../gs.awk in the recorded run
 done
bash
 root@bbf22e8ed49c:/rmp/llm# ./ttowc.sh
awk: cannot open "../gs.awk" (No such file or directory)
awk: cannot open "../gs.awk" (No such file or directory)
awk: cannot open "../gs.awk" (No such file or directory)
awk: cannot open "../gs.awk" (No such file or directory)
./ttowc.sh: line 6: ./text/2408.16293v1.wc: No such file or directory
awk: cannot open "../gs.awk" (No such file or directory)
awk: cannot open "../gs.awk" (No such file or directory)
awk: cannot open "../gs.awk" (No such file or directory)
./ttowc.sh: line 6: ./text/text: No such file or directory
awk: cannot open "../gs.awk" (No such file or directory)
./ttowc.sh: line 6: ./text/ttowc.sh: No such file or directory
awk: cannot open "../gs.awk" (No such file or directory)
./ttowc.sh: line 6: ./text/wc: No such file or directory
awk: cannot open "../gs.awk" (No such file or directory)
./ttowc.sh: line 6: ./text/wc.awk: No such file or directory
awk: cannot open "../gs.awk" (No such file or directory)
ttowc.sh
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
# Attempt 4: the awk program is now wc.awk, looked up in the current directory.
 for File in *; do
	    tr 'A-Z' 'a-z' < ./text/${File} > ./text/${File}.smt  # writes NAME.smt into ./text/
	    awk -f wc.awk ./txt/${File}.smt > ./wc/${File}.wc  # but reads it back from ./txt/, which awk cannot open in the recorded run
 done
bash
 root@bbf22e8ed49c:/rmp/llm# ./ttowc.sh
awk: cannot open "./txt/2101.00678v1.txt.smt" (No such file or directory)
awk: cannot open "./txt/2309.14322v2.txt.smt" (No such file or directory)
awk: cannot open "./txt/2311.16502v4.txt.smt" (No such file or directory)
awk: cannot open "./txt/2408.16293v1.txt.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/2408.16293v1.wc: No such file or directory
awk: cannot open "./txt/2408.16293v1.wc.smt" (No such file or directory)
awk: cannot open "./txt/2408.16293v1L.txt.smt" (No such file or directory)
awk: cannot open "./txt/2408.16293v1s.txt.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/text: No such file or directory
awk: cannot open "./txt/text.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/ttowc.sh: No such file or directory
awk: cannot open "./txt/ttowc.sh.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/wc: No such file or directory
awk: cannot open "./txt/wc.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/wc.awk: No such file or directory
awk: cannot open "./txt/wc.awk.smt" (No such file or directory)
# mkdir small
ttowc.sh
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
# Attempt 5: lower-cased copies go to ./small/ and awk reads them back from the same directory.
 for File in *; do  # iterates the current directory, so entries that do not exist under ./text fail on the tr line
	    tr 'A-Z' 'a-z' < ./text/${File} > ./small/${File}
	    awk -f wc.awk ./small/${File} > ./wc/${File}.wc  # NOTE(review): the recorded run opens ./small/NAME.smt, so the script that ran presumably differed from this listing
 done

はっと気がついた。これまでの処理でできた不要なファイルまで処理対象にしていて、それがエラーになっているのかもしれない。

おお、いっぱい無駄なファイルがある。データ中心設計っすね。

root@bbf22e8ed49c:/rmp/llm# ./ttowc.sh
./ttowc.sh: line 6: ./text/2408.16293v1.wc: No such file or directory
awk: cannot open "./small/2408.16293v1.wc.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/2408.16293v1L.txt: No such file or directory
awk: cannot open "./small/2408.16293v1L.txt.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/2408.16293v1s.txt: No such file or directory
awk: cannot open "./small/2408.16293v1s.txt.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/small: No such file or directory
awk: cannot open "./small/small.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/text: No such file or directory
awk: cannot open "./small/text.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/ttowc.sh: No such file or directory
awk: cannot open "./small/ttowc.sh.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/wc: No such file or directory
awk: cannot open "./small/wc.smt" (No such file or directory)
./ttowc.sh: line 6: ./text/wc.awk: No such file or directory
awk: cannot open "./small/wc.awk.smt" (No such file or directory)

ttowc.sh
#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
# Attempt 6: reads each entry of the current directory and writes to ../small/ and ../wc/; the recorded run starts it from inside text/.
 for File in *; do
	    tr 'A-Z' 'a-z' < ./${File} > ../small/${File}  # NOTE(review): the recorded run still reports ./text/ paths, so the executed copy differed from this listing
	    awk -f wc.awk ../small/${File} > ../wc/${File}.wc  # wc.awk is looked up relative to the current directory
 done

root@bbf22e8ed49c:/rmp/llm# cd text
root@bbf22e8ed49c:/rmp/llm/text# ../ttowc.sh
../ttowc.sh: line 6: ./text/2101.00678v1.txt: No such file or directory
../ttowc.sh: line 7: ./wc/2101.00678v1.txt.wc: No such file or directory
../ttowc.sh: line 6: ./text/2309.14322v2.txt: No such file or directory
../ttowc.sh: line 7: ./wc/2309.14322v2.txt.wc: No such file or directory
../ttowc.sh: line 6: ./text/2311.16502v4.txt: No such file or directory
../ttowc.sh: line 7: ./wc/2311.16502v4.txt.wc: No such file or directory
../ttowc.sh: line 6: ./text/2408.16293v1.txt: No such file or directory
../ttowc.sh: line 7: ./wc/2408.16293v1.txt.wc: No such file or directory
../ttowc.sh: line 6: ./text/old: No such file or directory
../ttowc.sh: line 7: ./wc/old.wc: No such file or directory

ここで新たな問題が発覚。この記事上で編集した内容と同じ修正をdocker上で手打ちしていたため、./text/という文字列を削除し忘れていた。

#!/bin/bash
# https://news.mynavi.jp/article/bashonwindows-17/
# https://qiita.com/kaizen_nagoya/items/319672853519990cee42
# Attempt 7: restrict the loop to *.txt in the current directory, lower-case
# each file into ../small/ and run wc.awk on it, writing the result to ../wc/.
for File in *.txt; do
    tr 'A-Z' 'a-z' < ./${File}.txt > ../small/${File}.smt  # NOTE: File already ends in .txt, so this opens NAME.txt.txt (the failure recorded in the run that follows)
    awk -f wc.awk ../small/${File}.smt > ../wc/${File}.wc  # wc.awk is resolved relative to the current directory
done


root@bbf22e8ed49c:/rmp/llm/text# ./ttowc.sh
./ttowc.sh: line 7: ./2101.00678v1.txt.txt: No such file or directory
awk: cannot open "wc.awk" (No such file or directory)
./ttowc.sh: line 7: ./2309.14322v2.txt.txt: No such file or directory
awk: cannot open "wc.awk" (No such file or directory)
./ttowc.sh: line 7: ./2311.16502v4.txt.txt: No such file or directory
awk: cannot open "wc.awk" (No such file or directory)
./ttowc.sh: line 7: ./2408.16293v1.txt.txt: No such file or directory
awk: cannot open "wc.awk" (No such file or directory)

root@bbf22e8ed49c:/rmp/llm/text# ./ttowc.sh 
awk: cannot open "wc.awk" (No such file or directory)
awk: cannot open "wc.awk" (No such file or directory)
awk: cannot open "wc.awk" (No such file or directory)
awk: cannot open "wc.awk" (No such file or directory)
root@bbf22e8ed49c:/rmp/llm/text# cp ../wc.awk .
root@bbf22e8ed49c:/rmp/llm/text# ./ttowc.sh 

やっとうごいた。12回くらいの試行錯誤

1
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
1

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?