LoginSignup
0
0

More than 3 years have passed since last update.

YouTube, Deepspeech, with Google Colaboratory [testing_0005] : DeepSpeech output ’json’ [0002]

Last updated at Posted at 2021-03-01

DeepSpeech の音声認識結果を json で受け取ると、

                                         transcripts
0  {'confidence': -1793.291259765625, 'words': [{'wor...
1  {'confidence': -87105.90625, 'words': [{'word'...
2  {'confidence': -87105.90625, 'words': [{'word'...

こうなっていました。

これは、

import pandas as pd
import json
import pprint
#from collections import OrderedDict

# Quick look at the overall shape of the DeepSpeech JSON result.
# The open file object is handed straight to pandas: passing the JSON text
# itself to read_json() is deprecated in recent pandas releases.
# (json.load(f) is the alternative when plain dicts/lists are wanted.)
with open('/content/json (1).txt', 'r') as f:
    jso = pd.read_json(f)

# pprint of a DataFrame shows its truncated, table-like repr.
pprint.pprint(jso)

の結果です。json をパースする方法は色々あるようですが、詳細に内容を見る前に「どうなっているのかなー」とざっと開いてみるには、この pandas で見るのが良さそうでした。

全部見ると、単語ごとの出現時刻(start_time)と長さ(duration)をまとめたものがずらっと並ぶので、非常に長い行数、または一行でずーっと続く文字列となります。

import pandas as pd
import json
import pprint
#from collections import OrderedDict

# Parse the whole result file with the standard json module and dump the
# resulting dict in one go (swap print for pprint.pprint for a wrapped view).
with open('json.txt', 'r') as f:
    jso = json.loads(f.read())

print(jso)
{'transcripts': [{'confidence': -1793.291259765625, 'words': [{'word': 'you', 'start_time': 0.56, 'duration': 0.12}, {'word': 'may', 'start_time': 0.74, 'duration': 0.14}, {'word': 'write', 'start_time': 1.0, 'duration': 0.2}, {'word': 'me', 'start_time': 1.3, 'duration': 0.16}, {'word': 'down', 'start_time': 1.54, 'duration': 0.18}, {'word': 'in', 'start_time': 1.84, 'duration': 0.1}, {'word': 'history', 'start_time': 2.0, 'duration': 1.06}, {'word': 'with', 'start_time': 3.12, 'duration': 0.12}, {'word': 'your', 'start_time': 3.26, 'duration': 0.16}, {'word': 'visit', 'start_time': 3.5, 'duration': 0.32}, {'word': 'wished', 'start_time': 3.86, 'duration': 0.38}, {'word': 'lines', 'start_time': 4.34, 'duration': 1.1}, {'word': 'you', 'start_time': 5.52, 'duration': 0.1}, {'word': 'may', 'start_time': 5.66, 'duration': 0.22}, {'word': 'try', 'start_time': 6.0, 'duration': 0.3}, {'word': 'me', 'start_time': 6.34, 'duration': 0.12}, {'word': 'in', 'start_time': 6.54, 'duration': 0.06}, {'word': 'the', 'start_time': 6.64, 'duration': 0.08}, {'word': 'very', 'start_time': 6.78, 'duration': 0.28}, {'word': 'dirt', 'start_time': 7.14, 'duration': 1.12}, {'word': 'but', 'start_time': 8.32, 'duration': 0.22}, {'word': 'still', 'start_time': 8.6, 'duration': 0.3}, {'word': 'like', 'start_time': 8.98, 'duration': 0.38}, {'word': 'dust', 'start_time': 9.48, 'duration': 1.48}, {'word': 'or', 'start_time': 11.1, 'duration': 1.68}, {'word': 'does', 'start_time': 12.86, 'duration': 0.24}, {'word': 'my', 'start_time': 13.16, 'duration': 0.24}, {'word': 'sauciness', 'start_time': 13.48, 'duration': 0.52}, {'word': 'upset', 'start_time': 14.06, 'duration': 0.38}, {'word': 'you', 'start_time': 14.46, 'duration': 0.72}, {'word': 'why', 'start_time': 15.36, 'duration': 0.14}, {'word': 'are', 'start_time': 15.64, 'duration': 0.12}, {'word': 'you', 'start_time': 15.8, 'duration': 0.12}, {'word': 'beside', 'start_time': 16.02, 'duration': 0.32}, {'word': 'with', 'start_time': 16.38, 
'duration': 0.12}, {'word': 'gloom', 'start_time': 16.58, 'duration': 0.72}, {'word': 'to', 'start_time': 17.42, 'duration': 0.1}, {'word': 'cause', 'start_time': 17.58, 'duration': 0.22}, {'word': 'i', 'start_time': 17.88, 'duration': 0.54}, {'word': 'walk', 'start_time': 18.5, 'duration': 0.22}, {'word': 'in', 'start_time': 18.8, 'duration': 0.1}, {'word': 'the', 'start_time': 18.96, 'duration': 0.18}, {'word': 'i', 'start_time': 19.28, 'duration': 0.14}, {'word': 'have', 'start_time': 19.48, 'duration': 0.52}, {'word': 'oil', 'start_time': 20.18, 'duration': 0.38}, {'word': 'wells', 'start_time': 20.68, 'duration': 0.38}, {'word': 'pumping', 'start_time': 21.12, 'duration': 0.64}, {'word': 'my', 'start_time': 21.82, 'duration': 0.26}, {'word': 'living', 'start_time': 22.14, 'duration': 0.42}, {'word': 'room', 'start_time': 22.66, 'duration': 1.68}, {'word': 'sailors', 'start_time': 24.44, 'duration': 1.38}, {'word': 'and', 'start_time': 25.94, 'duration': 0.1}, {'word': 'like', 'start_time': 26.12, 'duration': 0.5}, {'word': 'songs', 'start_time': 26.72, 'duration': 0.32}, {'word': 'with', 'start_time': 27.1, 'duration': 0.2}, {'word': 'a', 'start_time': 27.44, 'duration': 0.1}, {'word': 'cuttenclips', 'start_time': 27.62, 'duration': 3.38}, {'word': 'springing', 'start_time': 31.02, 'duration': 0.48}, {'word': 'high', 'start_time': 31.66, 'duration': 1.2}, {'word': 'still', 'start_time': 32.92, 'duration': 0.2}, {'word': 'i', 'start_time': 33.24, 'duration': 0.14}, {'word': 'wore', 'start_time': 33.46, 'duration': 0.9}, {'word': 'did', 'start_time': 34.42, 'duration': 0.16}, {'word': 'you', 'start_time': 34.62, 'duration': 0.12}, {'word': 'want', 'start_time': 34.8, 'duration': 0.18}, {'word': 'to', 'start_time': 35.04, 'duration': 0.08}, {'word': 'see', 'start_time': 35.18, 'duration': 0.16}, {'word': 'me', 'start_time': 35.4, 'duration': 0.18}, {'word': 'broken', 'start_time': 35.64, 'duration': 1.2}, {'word': 'bowed', 'start_time': 37.0, 'duration': 0.4}, 
{'word': 'head', 'start_time': 37.5, 'duration': 0.22}, {'word': 'and', 'start_time': 37.82, 'duration': 0.14}, {'word': 'lord', 'start_time': 38.08, 'duration': 0.46}, {'word': 'eyes', 'start_time': 38.82, 'duration': 0.82}, {'word': 'soltali', 'start_time': 39.7, 'duration': 1.0}, {'word': 'down', 'start_time': 40.76, 'duration': 0.24}, {'word': 'like', 'start_time': 41.04, 'duration': 0.36}, {'word': 'pedrosan', 'start_time': 41.46, 'duration': 1.9}, {'word': 'by', 'start_time': 43.46, 'duration': 0.18}, {'word': 'my', 'start_time': 43.78, 'duration': 0.3}, {'word': 'soul', 'start_time': 44.18, 'duration': 0.3}, {'word': 'societaires', 'start_time': 44.5, 'duration': 2.74}, {'word': 'oudenard', 'start_time': 47.32, 'duration': 4.08}, {'word': 'isolated', 'start_time': 51.5, 'duration': 1.7}, {'word': 'as', 'start_time': 53.32, 'duration': 0.1}, {'word': 'if', 'start_time': 53.54, 'duration': 0.16}, {'word': 'i', 'start_time': 53.8, 'duration': 0.12}, {'word': 'have', 'start_time': 53.98, 'duration': 0.26}, {'word': 'gold', 'start_time': 54.36, 'duration': 0.46}, {'word': "man's", 'start_time': 54.98, 'duration': 0.36}, {'word': 'digging', 'start_time': 55.42, 'duration': 0.28}, {'word': 'in', 'start_time': 55.78, 'duration': 0.08}, {'word': 'my', 'start_time': 55.92, 'duration': 0.2}, {'word': 'own', 'start_time': 56.3, 'duration': 0.26}, {'word': 'back', 'start_time': 56.64, 'duration': 0.28}, {'word': 'yard', 'start_time': 57.0, 'duration': 1.12}, {'word': 'you', 'start_time': 58.16, 'duration': 0.12}, {'word': 'can', 'start_time': 58.34, 'duration': 0.16}, {'word': 'shoot', 'start_time': 58.54, 'duration': 0.24}, {'word': 'me', 'start_time': 58.82, 'duration': 0.14}, {'word': 'with', 'start_time': 59.0, 'duration': 0.16}, {'word': 'your', 'start_time': 59.22, 'duration': 0.18}, {'word': 'words', 'start_time': 59.54, 'duration': 0.76}, {'word': 'you', 'start_time': 60.36, 'duration': 0.14}, {'word': 'can', 'start_time': 60.56, 'duration': 0.2}, {'word': 'cut', 
'start_time': 60.84, 'duration': 0.2}, {'word': 'me', 'start_time': 61.06, 'duration': 0.14}, {'word': 'with', 'start_time': 61.26, 'duration': 0.16}, {'word': 'your', 'start_time': 61.46, 'duration': 0.22}, {'word': 'lies', 'start_time': 61.84, 'duration': 0.76}, {'word': 'you', 'start_time': 62.66, 'duration': 0.14}, {'word': 'can', 'start_time': 62.86, 'duration': 0.28}, {'word': 'kill', 'start_time': 63.2, 'duration': 0.22}, {'word': 'me', 'start_time': 63.46, 'duration': 0.14}, {'word': 'with', 'start_time': 63.66, 'duration': 0.12}, {'word': 'your', 'start_time': 63.84, 'duration': 0.16}, {'word': 'hatefulness', 'start_time': 64.06, 'duration': 0.54}, {'word': 'but', 'start_time': 64.66, 'duration': 0.1}, {'word': 'just', 'start_time': 64.84, 'duration': 0.26}, {'word': 'like', 'start_time': 65.14, 'duration': 0.42}, {'word': 'life', 'start_time': 65.64, 'duration': 1.42}, {'word': 'i', 'start_time': 67.18, 'duration': 1.46}, {'word': 'does', 'start_time': 68.68, 'duration': 0.24}, {'word': 'my', 'start_time': 68.96, 'duration': 0.26}, {'word': 'senatorian', 'start_time': 69.28, 'duration': 12.02}, {'word': 'as', 'start_time': 81.4, 'duration': 0.18}, {'word': 'if', 'start_time': 81.72, 'duration': 0.22}, {'word': 'i', 'start_time': 82.06, 'duration': 0.1}, {'word': 'have', 'start_time': 82.22, 'duration': 0.26}, {'word': 'diamonds', 'start_time': 82.56, 'duration': 0.5}, {'word': 'at', 'start_time': 83.16, 'duration': 0.1}, {'word': 'the', 'start_time': 83.3, 'duration': 0.1}, {'word': 'meeting', 'start_time': 83.46, 'duration': 0.4}, {'word': 'of', 'start_time': 84.0, 'duration': 0.12}, {'word': 'my', 'start_time': 84.22, 'duration': 0.3}, {'word': 'time', 'start_time': 84.72, 'duration': 2.72}, {'word': 'out', 'start_time': 87.58, 'duration': 0.18}, {'word': 'of', 'start_time': 87.84, 'duration': 0.12}, {'word': 'the', 'start_time': 88.0, 'duration': 0.12}, {'word': 'huts', 'start_time': 88.18, 'duration': 0.28}, {'word': 'of', 'start_time': 88.56, 
'duration': 0.12}, {'word': 'history', 'start_time': 88.76, 'duration': 0.74}, {'word': 'shame', 'start_time': 89.6, 'duration': 0.32}, {'word': 'i', 'start_time': 90.04, 'duration': 0.18}, {'word': 'rise', 'start_time': 90.38, 'duration': 1.2}, {'word': 'up', 'start_time': 91.7, 'duration': 0.1}, {'word': 'from', 'start_time': 91.84, 'duration': 0.16}, {'word': 'a', 'start_time': 92.14, 'duration': 0.08}, {'word': 'past', 'start_time': 92.28, 'duration': 0.34}, {'word': 'rooted', 'start_time': 92.7, 'duration': 0.36}, {'word': 'in', 'start_time': 93.18, 'duration': 0.18}, {'word': 'pain', 'start_time': 93.46, 'duration': 0.72}, {'word': 'i', 'start_time': 94.32, 'duration': 0.14}, {'word': 'ran', 'start_time': 94.54, 'duration': 0.94}, {'word': 'a', 'start_time': 95.62, 'duration': 0.06}, {'word': 'black', 'start_time': 95.72, 'duration': 0.26}, {'word': 'ocean', 'start_time': 96.06, 'duration': 0.56}, {'word': 'heaving', 'start_time': 96.74, 'duration': 0.44}, {'word': 'and', 'start_time': 97.28, 'duration': 0.22}, {'word': 'would', 'start_time': 97.74, 'duration': 0.92}, {'word': 'willingly', 'start_time': 98.76, 'duration': 1.32}, {'word': 'and', 'start_time': 100.22, 'duration': 0.14}, {'word': 'bearing', 'start_time': 100.44, 'duration': 0.3}, {'word': 'it', 'start_time': 100.82, 'duration': 1.8}, {'word': 'leaving', 'start_time': 102.66, 'duration': 0.36}, {'word': 'behind', 'start_time': 103.1, 'duration': 0.56}, {'word': 'it', 'start_time': 103.78, 'duration': 0.2}, {'word': 'of', 'start_time': 104.1, 'duration': 0.14}, {'word': 'terror', 'start_time': 104.32, 'duration': 0.64}, {'word': 'and', 'start_time': 105.1, 'duration': 0.32}, {'word': 'fear', 'start_time': 105.54, 'duration': 0.8}, {'word': 'as', 'start_time': 107.04, 'duration': 0.9}, {'word': 'into', 'start_time': 108.14, 'duration': 0.3}, {'word': 'a', 'start_time': 108.52, 'duration': 0.06}, {'word': 'daybreak', 'start_time': 108.64, 'duration': 0.54}, {'word': 'miraculously', 'start_time': 
109.24, 'duration': 1.04}, {'word': 'clear', 'start_time': 110.34, 'duration': 1.26}, {'word': 'in', 'start_time': 111.74, 'duration': 1.42}, {'word': 'bringing', 'start_time': 113.22, 'duration': 0.42}, {'word': 'the', 'start_time': 113.7, 'duration': 0.1}, {'word': 'gifts', 'start_time': 113.86, 'duration': 0.32}, {'word': 'that', 'start_time': 114.24, 'duration': 0.22}, {'word': 'my', 'start_time': 114.56, 'duration': 0.18}, {'word': 'ancestors', 'start_time': 114.86, 'duration': 0.66}, {'word': 'gave', 'start_time': 115.66, 'duration': 0.8}, {'word': 'i', 'start_time': 116.64, 'duration': 0.12}, {'word': 'am', 'start_time': 116.84, 'duration': 0.14}, {'word': 'the', 'start_time': 117.06, 'duration': 0.24}, {'word': 'whole', 'start_time': 117.4, 'duration': 0.58}, {'word': 'and', 'start_time': 118.1, 'duration': 0.1}, {'word': 'the', 'start_time': 118.26, 'duration': 0.18}, {'word': 'dream', 'start_time': 118.54, 'duration': 0.78}, {'word': 'of', 'start_time': 119.38, 'duration': 0.1}, {'word': 'the', 'start_time': 119.54, 'duration': 0.22}, {'word': 'slave', 'start_time': 119.8, 'duration': 1.18}, {'word': 'and', 'start_time': 121.2, 'duration': 0.3}, {'word': 'so', 'start_time': 121.64, 'duration': 4.68}, {'word': 'that', 'start_time': 126.42, 'duration': 0.14}]}, {'confidence': -1795.8509521484375, 'words': [{'word': 'you', 'start_time': 0.56, 'duration': 0.12}, {'word': 'may', 'start_time': 0.74, 'duration': 0.14}, {'word': 'write', 'start_time': 1.0, 'duration': 0.2}, {'word': 'me', 'start_time': 1.3, 'duration': 0.16}, {'word': 'down', 'start_time': 1.54, 'duration': 0.18}, {'word': 'in', 'start_time': 1.84, 'duration': 0.1}, {'word': 'history', 'start_time': 2.0, 'duration': 1.06}, {'word': 'with', 'start_time': 3.12, 'duration': 0.12}, {'word': 'your', 'start_time': 3.26, 'duration': 0.16}, {'word': 'visit', 'start_time': 3.5, 'duration': 0.32}, {'word': 'wished', 'start_time': 3.86, 'duration': 0.38}, {'word': 'lines', 'start_time': 4.34, 'duration': 
1.1}, {'word': 'you', 'start_time': 5.52, 'duration': 0.1}, {'word': 'may', 'start_time': 5.66, 'duration': 0.22}, {'word': 'try', 'start_time': 6.0, 'duration': 0.3}, {'word': 'me', 'start_time': 6.34, 'duration': 0.12}, {'word': 'in', 'start_time': 6.54, 'duration': 0.06}, {'word': 'the', 'start_time': 6.64, 'duration': 0.08}, {'word': 'very', 'start_time': 6.78, 'duration': 0.28}, {'word': 'dirt', 'start_time': 7.14, 'duration': 1.12}, {'word': 'but', 'start_time': 8.32, 'duration': 0.22}, {'word': 'still', 'start_time': 8.6, 'duration': 0.3}, {'word': 'like', 'start_time': 8.98, 'duration': 0.38}, {'word': 'dust', 'start_time': 9.48, 'duration': 1.48}, {'word': 'or', 'start_time': 11.1, 'duration': 1.68}, {'word': 'does', 'start_time': 12.86, 'duration': 0.24}, {'word': 'my', 'start_time': 13.16, 'duration': 0.24}, {'word': 'sauciness', 'start_time': 13.48, 'duration': 0.52}, {'word': 'upset', 'start_time': 14.06, 'duration': 0.38}, {'word': 'you', 'start_time': 14.46, 'duration': 0.72}, {'word': 'why', 'start_time': 15.36, 'duration': 0.14}, {'word': 'are', 'start_time': 15.64, 'duration': 0.12}, {'word': 'you', 'start_time': 15.8, 'duration': 0.12}, {'word': 'beside', 'start_time': 16.02, 'duration': 0.32}, {'word': 'with', 'start_time': 16.38, 'duration': 0.12}, {'word': 'gloom', 'start_time': 16.58, 'duration': 0.72}, {'word': 'to', 'start_time': 17.42, 'duration': 0.1}, {'word': 'cause', 'start_time': 17.58, 'duration': 0.22}, {'word': 'i', 'start_time': 17.88, 'duration': 0.54}, {'word': 'walk', 'start_time': 18.5, 'duration': 0.22}, {'word': 'in', 'start_time': 18.8, 'duration': 0.1}, {'word': 'the', 'start_time': 18.96, 'duration': 0.18}, {'word': 'i', 'start_time': 19.28, 'duration': 0.14}, {'word': 'have', 'start_time': 19.48, 'duration': 0.52}, {'word': 'oil', 'start_time': 20.18, 'duration': 0.38}, {'word': 'wells', 'start_time': 20.68, 'duration': 0.38}, {'word': 'pumping', 'start_time': 21.12, 'duration': 0.64}, {'word': 'my', 'start_time': 21.82, 
'duration': 0.26}, {'word': 'living', 'start_time': 22.14, 'duration': 0.42}, {'word': 'room', 'start_time': 22.66, 'duration': 1.68}, {'word': 'sailors', 'start_time': 24.44, 'duration': 1.38}, {'word': 'and', 'start_time': 25.94, 'duration': 0.1}, {'word': 'like', 'start_time': 26.12, 'duration': 0.5}, {'word': 'songs', 'start_time': 26.72, 'duration': 0.32}, {'word': 'with', 'start_time': 27.1, 'duration': 0.2}, {'word': 'a', 'start_time': 27.44, 'duration': 0.1}, {'word': 'cuttenclips', 'start_time': 27.62, 'duration': 3.38}, {'word': 'springing', 'start_time': 31.02, 'duration': 0.48}, {'word': 'high', 'start_time': 31.66, 'duration': 1.2}, {'word': 'still', 'start_time': 32.92, 'duration': 0.2}, {'word': 'i', 'start_time': 33.24, 'duration': 0.14}, {'word': 'wore', 'start_time': 33.46, 'duration': 0.9}, {'word': 'did', 'start_time': 34.42, 'duration': 0.16}, {'word': 'you', 'start_time': 34.62, 'duration': 0.12}, {'word': 'want', 'start_time': 34.8, 'duration': 0.18}, {'word': 'to', 'start_time': 35.04, 'duration': 0.08}, {'word': 'see', 'start_time': 35.18, 'duration': 0.16}, {'word': 'me', 'start_time': 35.4, 'duration': 0.18}, {'word': 'broken', 'start_time': 35.64, 'duration': 1.2}, {'word': 'bowed', 'start_time': 37.0, 'duration': 0.4}, {'word': 'head', 'start_time': 37.5, 'duration': 0.22}, {'word': 'and', 'start_time': 37.82, 'duration': 0.14}, {'word': 'lord', 'start_time': 38.08, 'duration': 0.46}, {'word': 'eyes', 'start_time': 38.82, 'duration': 0.82}, {'word': 'soltali', 'start_time': 39.7, 'duration': 1.0}, {'word': 'down', 'start_time': 40.76, 'duration': 0.24}, {'word': 'like', 'start_time': 41.04, 'duration': 0.36}, {'word': 'pedrosan', 'start_time': 41.46, 'duration': 1.9}, {'word': 'by', 'start_time': 43.46, 'duration': 0.18}, {'word': 'my', 'start_time': 43.78, 'duration': 0.3}, {'word': 'soul', 'start_time': 44.18, 'duration': 0.3}, {'word': 'societaires', 'start_time': 44.5, 'duration': 2.74}, {'word': 'oudenard', 'start_time': 47.32, 
'duration': 4.08}, {'word': 'isolated', 'start_time': 51.5, 'duration': 1.7}, {'word': 'as', 'start_time': 53.32, 'duration': 0.1}, {'word': 'if', 'start_time': 53.54, 'duration': 0.16}, {'word': 'i', 'start_time': 53.8, 'duration': 0.12}, {'word': 'have', 'start_time': 53.98, 'duration': 0.26}, {'word': 'gold', 'start_time': 54.36, 'duration': 0.46}, {'word': "man's", 'start_time': 54.98, 'duration': 0.36}, {'word': 'digging', 'start_time': 55.42, 'duration': 0.28}, {'word': 'in', 'start_time': 55.78, 'duration': 0.08}, {'word': 'my', 'start_time': 55.92, 'duration': 0.2}, {'word': 'own', 'start_time': 56.3, 'duration': 0.26}, {'word': 'back', 'start_time': 56.64, 'duration': 0.28}, {'word': 'yard', 'start_time': 57.0, 'duration': 1.12}, {'word': 'you', 'start_time': 58.16, 'duration': 0.12}, {'word': 'can', 'start_time': 58.34, 'duration': 0.16}, {'word': 'shoot', 'start_time': 58.54, 'duration': 0.24}, {'word': 'me', 'start_time': 58.82, 'duration': 0.14}, {'word': 'with', 'start_time': 59.0, 'duration': 0.16}, {'word': 'your', 'start_time': 59.22, 'duration': 0.18}, {'word': 'words', 'start_time': 59.54, 'duration': 0.76}, {'word': 'you', 'start_time': 60.36, 'duration': 0.14}, {'word': 'can', 'start_time': 60.56, 'duration': 0.2}, {'word': 'cut', 'start_time': 60.84, 'duration': 0.2}, {'word': 'me', 'start_time': 61.06, 'duration': 0.14}, {'word': 'with', 'start_time': 61.26, 'duration': 0.16}, {'word': 'your', 'start_time': 61.46, 'duration': 0.22}, {'word': 'lies', 'start_time': 61.84, 'duration': 0.76}, {'word': 'you', 'start_time': 62.66, 'duration': 0.14}, {'word': 'can', 'start_time': 62.86, 'duration': 0.28}, {'word': 'kill', 'start_time': 63.2, 'duration': 0.22}, {'word': 'me', 'start_time': 63.46, 'duration': 0.14}, {'word': 'with', 'start_time': 63.66, 'duration': 0.12}, {'word': 'your', 'start_time': 63.84, 'duration': 0.16}, {'word': 'hatefulness', 'start_time': 64.06, 'duration': 0.54}, {'word': 'but', 'start_time': 64.66, 'duration': 0.1}, 
{'word': 'just', 'start_time': 64.84, 'duration': 0.26}, {'word': 'like', 'start_time': 65.14, 'duration': 0.42}, {'word': 'life', 'start_time': 65.64, 'duration': 1.42}, {'word': 'i', 'start_time': 67.18, 'duration': 1.46}, {'word': 'does', 'start_time': 68.68, 'duration': 0.24}, {'word': 'my', 'start_time': 68.96, 'duration': 0.26}, {'word': 'senatorian', 'start_time': 69.28, 'duration': 12.02}, {'word': 'as', 'start_time': 81.4, 'duration': 0.18}, {'word': 'if', 'start_time': 81.72, 'duration': 0.22}, {'word': 'i', 'start_time': 82.06, 'duration': 0.1}, {'word': 'have', 'start_time': 82.22, 'duration': 0.26}, {'word': 'diamonds', 'start_time': 82.56, 'duration': 0.5}, {'word': 'at', 'start_time': 83.16, 'duration': 0.1}, {'word': 'the', 'start_time': 83.3, 'duration': 0.1}, {'word': 'meeting', 'start_time': 83.46, 'duration': 0.4}, {'word': 'of', 'start_time': 84.0, 'duration': 0.12}, {'word': 'my', 'start_time': 84.22, 'duration': 0.3}, {'word': 'time', 'start_time': 84.72, 'duration': 2.72}, {'word': 'out', 'start_time': 87.58, 'duration': 0.18}, {'word': 'of', 'start_time': 87.84, 'duration': 0.12}, {'word': 'the', 'start_time': 88.0, 'duration': 0.12}, {'word': 'huts', 'start_time': 88.18, 'duration': 0.28}, {'word': 'of', 'start_time': 88.56, 'duration': 0.12}, {'word': 'history', 'start_time': 88.76, 'duration': 0.74}, {'word': 'shame', 'start_time': 89.6, 'duration': 0.32}, {'word': 'i', 'start_time': 90.04, 'duration': 0.18}, {'word': 'rise', 'start_time': 90.38, 'duration': 1.2}, {'word': 'up', 'start_time': 91.7, 'duration': 0.1}, {'word': 'from', 'start_time': 91.84, 'duration': 0.16}, {'word': 'a', 'start_time': 92.14, 'duration': 0.08}, {'word': 'past', 'start_time': 92.28, 'duration': 0.34}, {'word': 'rooted', 'start_time': 92.7, 'duration': 0.36}, {'word': 'in', 'start_time': 93.18, 'duration': 0.18}, {'word': 'pain', 'start_time': 93.46, 'duration': 0.72}, {'word': 'i', 'start_time': 94.32, 'duration': 0.14}, {'word': 'ran', 'start_time': 94.54, 
'duration': 0.94}, {'word': 'a', 'start_time': 95.62, 'duration': 0.06}, {'word': 'black', 'start_time': 95.72, 'duration': 0.26}, {'word': 'ocean', 'start_time': 96.06, 'duration': 0.56}, {'word': 'heaving', 'start_time': 96.74, 'duration': 0.44}, {'word': 'and', 'start_time': 97.28, 'duration': 0.22}, {'word': 'would', 'start_time': 97.74, 'duration': 0.92}, {'word': 'willingly', 'start_time': 98.76, 'duration': 1.32}, {'word': 'and', 'start_time': 100.22, 'duration': 0.14}, {'word': 'bearing', 'start_time': 100.44, 'duration': 0.3}, {'word': 'it', 'start_time': 100.82, 'duration': 1.8}, {'word': 'leaving', 'start_time': 102.66, 'duration': 0.36}, {'word': 'behind', 'start_time': 103.1, 'duration': 0.56}, {'word': 'it', 'start_time': 103.78, 'duration': 0.2}, {'word': 'of', 'start_time': 104.1, 'duration': 0.14}, {'word': 'terror', 'start_time': 104.32, 'duration': 0.64}, {'word': 'and', 'start_time': 105.1, 'duration': 0.32}, {'word': 'fear', 'start_time': 105.54, 'duration': 0.8}, {'word': 'as', 'start_time': 107.04, 'duration': 0.9}, {'word': 'into', 'start_time': 108.14, 'duration': 0.3}, {'word': 'a', 'start_time': 108.52, 'duration': 0.06}, {'word': 'daybreak', 'start_time': 108.64, 'duration': 0.54}, {'word': 'miraculously', 'start_time': 109.24, 'duration': 1.04}, {'word': 'clear', 'start_time': 110.34, 'duration': 1.26}, {'word': 'in', 'start_time': 111.74, 'duration': 1.42}, {'word': 'bringing', 'start_time': 113.22, 'duration': 0.42}, {'word': 'the', 'start_time': 113.7, 'duration': 0.1}, {'word': 'gifts', 'start_time': 113.86, 'duration': 0.32}, {'word': 'that', 'start_time': 114.24, 'duration': 0.22}, {'word': 'my', 'start_time': 114.56, 'duration': 0.18}, {'word': 'ancestors', 'start_time': 114.86, 'duration': 0.66}, {'word': 'gave', 'start_time': 115.66, 'duration': 0.8}, {'word': 'i', 'start_time': 116.64, 'duration': 0.12}, {'word': 'am', 'start_time': 116.84, 'duration': 0.14}, {'word': 'the', 'start_time': 117.06, 'duration': 0.24}, {'word': 
'whole', 'start_time': 117.4, 'duration': 0.58}, {'word': 'and', 'start_time': 118.1, 'duration': 0.1}, {'word': 'the', 'start_time': 118.26, 'duration': 0.18}, {'word': 'dream', 'start_time': 118.54, 'duration': 0.78}, {'word': 'of', 'start_time': 119.38, 'duration': 0.1}, {'word': 'the', 'start_time': 119.54, 'duration': 0.22}, {'word': 'slave', 'start_time': 119.8, 'duration': 1.18}, {'word': 'and', 'start_time': 121.2, 'duration': 0.3}, {'word': 'so', 'start_time': 121.64, 'duration': 4.68}, {'word': 'then', 'start_time': 126.42, 'duration': 0.14}]}, {'confidence': -1796.1273193359375, 'words': [{'word': 'you', 'start_time': 0.56, 'duration': 0.12}, {'word': 'may', 'start_time': 0.74, 'duration': 0.14}, {'word': 'write', 'start_time': 1.0, 'duration': 0.2}, {'word': 'me', 'start_time': 1.3, 'duration': 0.16}, {'word': 'down', 'start_time': 1.54, 'duration': 0.18}, {'word': 'in', 'start_time': 1.84, 'duration': 0.1}, {'word': 'history', 'start_time': 2.0, 'duration': 1.06}, {'word': 'with', 'start_time': 3.12, 'duration': 0.12}, {'word': 'your', 'start_time': 3.26, 'duration': 0.16}, {'word': 'visit', 'start_time': 3.5, 'duration': 0.32}, {'word': 'wished', 'start_time': 3.86, 'duration': 0.38}, {'word': 'lines', 'start_time': 4.34, 'duration': 1.1}, {'word': 'you', 'start_time': 5.52, 'duration': 0.1}, {'word': 'may', 'start_time': 5.66, 'duration': 0.22}, {'word': 'try', 'start_time': 6.0, 'duration': 0.3}, {'word': 'me', 'start_time': 6.34, 'duration': 0.12}, {'word': 'in', 'start_time': 6.54, 'duration': 0.06}, {'word': 'the', 'start_time': 6.64, 'duration': 0.08}, {'word': 'very', 'start_time': 6.78, 'duration': 0.28}, {'word': 'dirt', 'start_time': 7.14, 'duration': 1.12}, {'word': 'but', 'start_time': 8.32, 'duration': 0.22}, {'word': 'still', 'start_time': 8.6, 'duration': 0.3}, {'word': 'like', 'start_time': 8.98, 'duration': 0.38}, {'word': 'dust', 'start_time': 9.48, 'duration': 1.48}, {'word': 'or', 'start_time': 11.1, 'duration': 1.68}, {'word': 
'does', 'start_time': 12.86, 'duration': 0.24}, {'word': 'my', 'start_time': 13.16, 'duration': 0.24}, {'word': 'sauciness', 'start_time': 13.48, 'duration': 0.52}, {'word': 'upset', 'start_time': 14.06, 'duration': 0.38}, {'word': 'you', 'start_time': 14.46, 'duration': 0.72}, {'word': 'why', 'start_time': 15.36, 'duration': 0.14}, {'word': 'are', 'start_time': 15.64, 'duration': 0.12}, {'word': 'you', 'start_time': 15.8, 'duration': 0.12}, {'word': 'beside', 'start_time': 16.02, 'duration': 0.32}, {'word': 'with', 'start_time': 16.38, 'duration': 0.12}, {'word': 'gloom', 'start_time': 16.58, 'duration': 0.72}, {'word': 'to', 'start_time': 17.42, 'duration': 0.1}, {'word': 'cause', 'start_time': 17.58, 'duration': 0.22}, {'word': 'i', 'start_time': 17.88, 'duration': 0.54}, {'word': 'walk', 'start_time': 18.5, 'duration': 0.22}, {'word': 'in', 'start_time': 18.8, 'duration': 0.1}, {'word': 'the', 'start_time': 18.96, 'duration': 0.18}, {'word': 'i', 'start_time': 19.28, 'duration': 0.14}, {'word': 'have', 'start_time': 19.48, 'duration': 0.52}, {'word': 'oil', 'start_time': 20.18, 'duration': 0.38}, {'word': 'wells', 'start_time': 20.68, 'duration': 0.38}, {'word': 'pumping', 'start_time': 21.12, 'duration': 0.64}, {'word': 'my', 'start_time': 21.82, 'duration': 0.26}, {'word': 'living', 'start_time': 22.14, 'duration': 0.42}, {'word': 'room', 'start_time': 22.66, 'duration': 1.68}, {'word': 'sailors', 'start_time': 24.44, 'duration': 1.38}, {'word': 'and', 'start_time': 25.94, 'duration': 0.1}, {'word': 'like', 'start_time': 26.12, 'duration': 0.5}, {'word': 'songs', 'start_time': 26.72, 'duration': 0.32}, {'word': 'with', 'start_time': 27.1, 'duration': 0.2}, {'word': 'a', 'start_time': 27.44, 'duration': 0.1}, {'word': 'cuttenclips', 'start_time': 27.62, 'duration': 3.38}, {'word': 'springing', 'start_time': 31.02, 'duration': 0.48}, {'word': 'high', 'start_time': 31.66, 'duration': 1.2}, {'word': 'still', 'start_time': 32.92, 'duration': 0.2}, {'word': 'i', 
'start_time': 33.24, 'duration': 0.14}, {'word': 'wore', 'start_time': 33.46, 'duration': 0.9}, {'word': 'did', 'start_time': 34.42, 'duration': 0.16}, {'word': 'you', 'start_time': 34.62, 'duration': 0.12}, {'word': 'want', 'start_time': 34.8, 'duration': 0.18}, {'word': 'to', 'start_time': 35.04, 'duration': 0.08}, {'word': 'see', 'start_time': 35.18, 'duration': 0.16}, {'word': 'me', 'start_time': 35.4, 'duration': 0.18}, {'word': 'broken', 'start_time': 35.64, 'duration': 1.2}, {'word': 'bowed', 'start_time': 37.0, 'duration': 0.4}, {'word': 'head', 'start_time': 37.5, 'duration': 0.22}, {'word': 'and', 'start_time': 37.82, 'duration': 0.14}, {'word': 'lord', 'start_time': 38.08, 'duration': 0.46}, {'word': 'eyes', 'start_time': 38.82, 'duration': 0.82}, {'word': 'soltali', 'start_time': 39.7, 'duration': 1.0}, {'word': 'down', 'start_time': 40.76, 'duration': 0.24}, {'word': 'like', 'start_time': 41.04, 'duration': 0.36}, {'word': 'pedrosan', 'start_time': 41.46, 'duration': 1.9}, {'word': 'by', 'start_time': 43.46, 'duration': 0.18}, {'word': 'my', 'start_time': 43.78, 'duration': 0.3}, {'word': 'soul', 'start_time': 44.18, 'duration': 0.3}, {'word': 'societaires', 'start_time': 44.5, 'duration': 2.74}, {'word': 'oudenard', 'start_time': 47.32, 'duration': 4.08}, {'word': 'isolated', 'start_time': 51.5, 'duration': 1.7}, {'word': 'as', 'start_time': 53.32, 'duration': 0.1}, {'word': 'if', 'start_time': 53.54, 'duration': 0.16}, {'word': 'i', 'start_time': 53.8, 'duration': 0.12}, {'word': 'have', 'start_time': 53.98, 'duration': 0.26}, {'word': 'gold', 'start_time': 54.36, 'duration': 0.46}, {'word': "man's", 'start_time': 54.98, 'duration': 0.36}, {'word': 'digging', 'start_time': 55.42, 'duration': 0.28}, {'word': 'in', 'start_time': 55.78, 'duration': 0.08}, {'word': 'my', 'start_time': 55.92, 'duration': 0.2}, {'word': 'own', 'start_time': 56.3, 'duration': 0.26}, {'word': 'back', 'start_time': 56.64, 'duration': 0.28}, {'word': 'yard', 'start_time': 
57.0, 'duration': 1.12}, {'word': 'you', 'start_time': 58.16, 'duration': 0.12}, {'word': 'can', 'start_time': 58.34, 'duration': 0.16}, {'word': 'shoot', 'start_time': 58.54, 'duration': 0.24}, {'word': 'me', 'start_time': 58.82, 'duration': 0.14}, {'word': 'with', 'start_time': 59.0, 'duration': 0.16}, {'word': 'your', 'start_time': 59.22, 'duration': 0.18}, {'word': 'words', 'start_time': 59.54, 'duration': 0.76}, {'word': 'you', 'start_time': 60.36, 'duration': 0.14}, {'word': 'can', 'start_time': 60.56, 'duration': 0.2}, {'word': 'cut', 'start_time': 60.84, 'duration': 0.2}, {'word': 'me', 'start_time': 61.06, 'duration': 0.14}, {'word': 'with', 'start_time': 61.26, 'duration': 0.16}, {'word': 'your', 'start_time': 61.46, 'duration': 0.22}, {'word': 'lies', 'start_time': 61.84, 'duration': 0.76}, {'word': 'you', 'start_time': 62.66, 'duration': 0.14}, {'word': 'can', 'start_time': 62.86, 'duration': 0.28}, {'word': 'kill', 'start_time': 63.2, 'duration': 0.22}, {'word': 'me', 'start_time': 63.46, 'duration': 0.14}, {'word': 'with', 'start_time': 63.66, 'duration': 0.12}, {'word': 'your', 'start_time': 63.84, 'duration': 0.16}, {'word': 'hatefulness', 'start_time': 64.06, 'duration': 0.54}, {'word': 'but', 'start_time': 64.66, 'duration': 0.1}, {'word': 'just', 'start_time': 64.84, 'duration': 0.26}, {'word': 'like', 'start_time': 65.14, 'duration': 0.42}, {'word': 'life', 'start_time': 65.64, 'duration': 1.42}, {'word': 'i', 'start_time': 67.18, 'duration': 1.46}, {'word': 'does', 'start_time': 68.68, 'duration': 0.24}, {'word': 'my', 'start_time': 68.96, 'duration': 0.26}, {'word': 'senatorian', 'start_time': 69.28, 'duration': 12.02}, {'word': 'as', 'start_time': 81.4, 'duration': 0.18}, {'word': 'if', 'start_time': 81.72, 'duration': 0.22}, {'word': 'i', 'start_time': 82.06, 'duration': 0.1}, {'word': 'have', 'start_time': 82.22, 'duration': 0.26}, {'word': 'diamonds', 'start_time': 82.56, 'duration': 0.5}, {'word': 'at', 'start_time': 83.16, 'duration': 
0.1}, {'word': 'the', 'start_time': 83.3, 'duration': 0.1}, {'word': 'meeting', 'start_time': 83.46, 'duration': 0.4}, {'word': 'of', 'start_time': 84.0, 'duration': 0.12}, {'word': 'my', 'start_time': 84.22, 'duration': 0.3}, {'word': 'time', 'start_time': 84.72, 'duration': 2.72}, {'word': 'out', 'start_time': 87.58, 'duration': 0.18}, {'word': 'of', 'start_time': 87.84, 'duration': 0.12}, {'word': 'the', 'start_time': 88.0, 'duration': 0.12}, {'word': 'huts', 'start_time': 88.18, 'duration': 0.28}, {'word': 'of', 'start_time': 88.56, 'duration': 0.12}, {'word': 'history', 'start_time': 88.76, 'duration': 0.74}, {'word': 'shame', 'start_time': 89.6, 'duration': 0.32}, {'word': 'i', 'start_time': 90.04, 'duration': 0.18}, {'word': 'rise', 'start_time': 90.38, 'duration': 1.2}, {'word': 'up', 'start_time': 91.7, 'duration': 0.1}, {'word': 'from', 'start_time': 91.84, 'duration': 0.16}, {'word': 'a', 'start_time': 92.14, 'duration': 0.08}, {'word': 'past', 'start_time': 92.28, 'duration': 0.34}, {'word': 'rooted', 'start_time': 92.7, 'duration': 0.36}, {'word': 'in', 'start_time': 93.18, 'duration': 0.18}, {'word': 'pain', 'start_time': 93.46, 'duration': 0.72}, {'word': 'i', 'start_time': 94.32, 'duration': 0.14}, {'word': 'ran', 'start_time': 94.54, 'duration': 0.94}, {'word': 'a', 'start_time': 95.62, 'duration': 0.06}, {'word': 'black', 'start_time': 95.72, 'duration': 0.26}, {'word': 'ocean', 'start_time': 96.06, 'duration': 0.56}, {'word': 'heaving', 'start_time': 96.74, 'duration': 0.44}, {'word': 'and', 'start_time': 97.28, 'duration': 0.22}, {'word': 'would', 'start_time': 97.74, 'duration': 0.92}, {'word': 'willingly', 'start_time': 98.76, 'duration': 1.32}, {'word': 'and', 'start_time': 100.22, 'duration': 0.14}, {'word': 'bearing', 'start_time': 100.44, 'duration': 0.3}, {'word': 'it', 'start_time': 100.82, 'duration': 1.8}, {'word': 'leaving', 'start_time': 102.66, 'duration': 0.36}, {'word': 'behind', 'start_time': 103.1, 'duration': 0.56}, {'word': 
'it', 'start_time': 103.78, 'duration': 0.2}, {'word': 'of', 'start_time': 104.1, 'duration': 0.14}, {'word': 'terror', 'start_time': 104.32, 'duration': 0.64}, {'word': 'and', 'start_time': 105.1, 'duration': 0.32}, {'word': 'fear', 'start_time': 105.54, 'duration': 0.8}, {'word': 'as', 'start_time': 107.04, 'duration': 0.9}, {'word': 'into', 'start_time': 108.14, 'duration': 0.3}, {'word': 'a', 'start_time': 108.52, 'duration': 0.06}, {'word': 'daybreak', 'start_time': 108.64, 'duration': 0.54}, {'word': 'miraculously', 'start_time': 109.24, 'duration': 1.04}, {'word': 'clear', 'start_time': 110.34, 'duration': 1.26}, {'word': 'in', 'start_time': 111.74, 'duration': 1.42}, {'word': 'bringing', 'start_time': 113.22, 'duration': 0.42}, {'word': 'the', 'start_time': 113.7, 'duration': 0.1}, {'word': 'gifts', 'start_time': 113.86, 'duration': 0.32}, {'word': 'that', 'start_time': 114.24, 'duration': 0.22}, {'word': 'my', 'start_time': 114.56, 'duration': 0.18}, {'word': 'ancestors', 'start_time': 114.86, 'duration': 0.66}, {'word': 'gave', 'start_time': 115.66, 'duration': 0.8}, {'word': 'i', 'start_time': 116.64, 'duration': 0.12}, {'word': 'am', 'start_time': 116.84, 'duration': 0.14}, {'word': 'the', 'start_time': 117.06, 'duration': 0.24}, {'word': 'whole', 'start_time': 117.4, 'duration': 0.58}, {'word': 'and', 'start_time': 118.1, 'duration': 0.1}, {'word': 'the', 'start_time': 118.26, 'duration': 0.18}, {'word': 'dream', 'start_time': 118.54, 'duration': 0.78}, {'word': 'of', 'start_time': 119.38, 'duration': 0.1}, {'word': 'the', 'start_time': 119.54, 'duration': 0.22}, {'word': 'slaves', 'start_time': 119.8, 'duration': 1.18}, {'word': 'and', 'start_time': 121.2, 'duration': 0.3}, {'word': 'so', 'start_time': 121.64, 'duration': 4.68}, {'word': 'then', 'start_time': 126.42, 'duration': 0.14}]}]}

ちょっとまだ confidence を理解していないのですが、きっと認識の候補かなぁ...と仮定して、['transcripts'][0] つまり

                                         transcripts
0  {'confidence': -1793.291259765625, 'words': [{'wor...

のなかのものを抽出して、json から SubRip 形式(つまり YouTube でデフォルトで使われている字幕のファイル形式)のファイルに変換する Python のコードを考えてみました。Google Colab 用ですが、これは単に json ファイル(ここではなぜか拡張子を .json ではなく .txt にしています)から json をパースするだけのプログラムなので、ローカル用に変更して実行しても十分速いのではないかなと思います。まだ試していないので、あくまで「(やっ)タラ、(もし、や)レバ」の話です。

import json
import datetime
from google.colab import files
import copy
import sys
#sys.setrecursionlimit(30000)

# Uncomment to choose the input file through Colab's upload dialog.
#uploaded = files.upload()

# Name of the JSON file that the script below opens and parses.
upfilename = 'json.txt'


# When the upload dialog is enabled, take the uploaded file's name instead.
#for fn in uploaded.keys():
#    print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(uploaded[fn])))
#    upfilename = fn

def fmttime(seconds):
    """Return `seconds` formatted as a SubRip timestamp ``HH:MM:SS,mmm``.

    The offset is added to ``datetime.datetime.min`` and only the
    time-of-day part is kept, so values of 24 hours or more wrap around.
    Milliseconds are truncated, not rounded.
    """
    offset = datetime.timedelta(seconds=seconds)
    moment = (datetime.datetime.min + offset).time()
    # The first three digits of the microsecond field are the milliseconds.
    return '{:%H:%M:%S},{:03d}'.format(moment, moment.microsecond // 1000)

def build_captions(words, max_gap=1.0, max_speech=4.0):
    """Group word timings into caption-sized chunks.

    Args:
        words: iterable of dicts with the keys 'word', 'start_time' and
            'duration' (times in seconds), as found under
            ``jso['transcripts'][i]['words']``.
        max_gap: a pause of at least this many seconds between the end of
            one word and the start of the next closes the current caption.
        max_speech: once the summed word durations of a caption reach this
            many seconds, the caption is closed after the current word.

    Returns:
        A list of ``(start_seconds, end_seconds, text)`` tuples in order of
        appearance. Entries whose 'word' is empty are skipped.
    """
    captions = []
    pending = []    # words collected for the caption being built
    start = 0.0     # start time of the first pending word
    prev_end = 0.0  # end time (start + duration) of the latest word
    spoken = 0.0    # summed word durations of the pending caption

    for entry in words:
        word = entry.get('word', '')
        if not word:
            continue
        begin = entry['start_time']
        length = entry['duration']

        # Measure the pause against the real end of the previous word, so
        # short gaps between words do not add up to a false "silence".
        if pending and begin - prev_end >= max_gap:
            captions.append((start, prev_end, ' '.join(pending)))
            pending = []
            spoken = 0.0

        if not pending:
            start = begin
        pending.append(word)
        spoken += length
        prev_end = begin + length

        if spoken >= max_speech:
            captions.append((start, prev_end, ' '.join(pending)))
            pending = []
            spoken = 0.0

    # Whatever is left after the last word becomes the final caption.
    if pending:
        captions.append((start, prev_end, ' '.join(pending)))
    return captions


# Notebook cells and scripts run as '__main__'; the guard only keeps the
# file I/O from running when this code is imported.
if __name__ == '__main__':
    filename = 'subtitle.srt'  # the SubRip text is written to this file
    with open(upfilename, 'r') as up_f:
        jso = json.load(up_f)

    # Use the first candidate transcript.
    confidence = jso['transcripts'][0]
    with open(filename, 'w', encoding='utf8') as down_f:
        for lineNum, (begin, end, text) in enumerate(
                build_captions(confidence['words']), start=1):
            entry = '{}\n{} --> {}\n{}\n\n'.format(
                lineNum, fmttime(begin), fmttime(end), text)
            down_f.write(entry)
            print(entry, end='')  # also show the captions in the notebook

#files.download(filename) # download .srt file

SubRip の形式であれば(あればね。これがちゃんと SubRip の要件を満たしていたら)、字幕編集のプログラムで見ることもできるでしょう、きっと、たぶんね。
(字幕編集プログラムについては、こちらを参照:
https://qiita.com/dauuricus/items/863dd4d087b3aff6455d )

いまおもいついたのですが、上記のようにセンテンスで取り出さずに word 単位でタイムシートにすれば、認識したことばと位置をつかって、Tracker のようなシンセサイザーみたいなのがつくれそうですね。
というところでようやく理解したのだけれども、mozilla はもしかして TTS のために STT をやってるのかな?センテンスで取り出すニーズを補完する方がわかりやすいと思うのは、VOSK と比較した場合、VOSK では json から 'text' ですぐにセンテンスが抽出されるようになっているからだ。なので VOSK では比較的簡単に字幕ぽいものの抽出はできる。1

confidence の意味がよくわからない。
なぜ文の評価のようになっているんだろうか?
https://discourse.mozilla.org/t/obtain-per-word-confidence-score/44969

ドキュメントだと検索しても、なぜなのかについては出てこなさそうであったため、どうやって confidence 0,1,2 を word ごとに順に並べて表示するように json から取り出すか考えて、

        def print_word(n):
            """Print the words of the first three candidate transcripts in turn.

            Starting with transcript `n`, the transcripts in
            ``jso['transcripts']`` are visited round-robin and one word is
            printed per visit as ``confidence; <index>: <word>``. A transcript
            that has run out of words is skipped. `jso` itself is left
            unchanged.

            Args:
                n: index of the transcript to start with.
            """
            # Only the first three candidates are compared.
            columns = [
                [ob['word'] for ob in candidate['words'] if 'word' in ob]
                for candidate in jso['transcripts'][:3]
            ]
            if not columns:
                return

            n %= len(columns)
            cursors = [0] * len(columns)  # next unread word per transcript
            remaining = sum(len(column) for column in columns)
            # A loop instead of recursion: the call depth no longer grows
            # with the number of words.
            while remaining:
                column = columns[n]
                if cursors[n] < len(column):
                    print('confidence;', str(n) + ':', column[cursors[n]])
                    cursors[n] += 1
                    remaining -= 1
                n = (n + 1) % len(columns)

        print_word(0)

ようやく期待したものにはなったが、これはもっとうまいやり方が必ずあるはず。

こうなった
import copy
import sys
##sys.setrecursionlimit(30000)

        def list_copy(n):
            """Deep-copy the word lists of the first `n` transcripts and print them.

            The copies are handed to print_word(), which consumes them, so
            the data held in `jso` stays intact.
            """
            clones = [
                copy.deepcopy(jso['transcripts'][index]['words'])
                for index in range(n)
            ]
            print_word(clones, 0)

        # check confidence
        def print_word(copy, n):
            """Print the first value of every entry, cycling through the lists.

            Starting with list `n`, the lists in `copy` are visited
            round-robin. Each visit prints the first value of that list's next
            dict as ``confidence: <index>: <value>``. A list that has run out
            of entries is skipped. Every list in `copy` is empty when the
            function returns.

            Args:
                copy: list of lists of dicts. NOTE: the parameter name hides
                    the `copy` module inside this function.
                n: index of the list to start with.
            """
            if not copy:
                return

            total = len(copy)
            cursors = [0] * total  # next unread entry per list
            remaining = sum(len(words) for words in copy)
            # A loop instead of recursion: the call depth no longer grows
            # with the number of entries.
            while remaining:
                words = copy[n]
                if cursors[n] < len(words):
                    dic = words[cursors[n]]
                    print("confidence:", str(n) + ':', list(dic.values())[0])
                    cursors[n] += 1
                    remaining -= 1
                n = n + 1 if n + 1 < total else 0

            # Leave the lists empty so callers see every entry as consumed.
            for words in copy:
                words.clear()

        # Copy and print the word lists of transcripts 0, 1 and 2.
        list_copy(3)

Confidence リストをコピーして word を抽出してプリントするだけ。
できたと思って、もっと長い 7000 ループのものを回してみると再帰のエラーになったため、改良した。多分これでいいはず。リストからポップしていって、そのディクショナリーのキーが 'word' であるか比べて、そうであればプリントして、3 つのそれぞれリストが空になれば 3 つとも止まって再帰関数を抜ける。

一応将来的に confidence の数が増えてもこのままでいいはず。
現在は 3 つの confidence であるので、list_copy() で 3 としている。

だがしかし、どこまで最適化しても再帰で書くかぎり、扱う行数が増えれば RecursionError(maximum recursion depth exceeded)、つまり再帰の呼び出し回数の上限を超えてしまうエラーになるので、pandas でこうやって見る方が、再帰パズルを考え尽くすよりも次のことに早く進められる。
(までに数日かかった)

import pandas as pd
import json

# Load the whole JSON document into a DataFrame.
with open('json.txt', 'r') as f:
    jso = pd.read_json(f)

# to_string() renders the frame without truncating it.
print(jso.to_string())

並べて確かめると word の単語は3つともに同じであったりなかったりする。

confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: may
confidence; 1: may
confidence; 2: may
confidence; 0: write
confidence; 1: write
confidence; 2: write
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: down
confidence; 1: down
confidence; 2: down
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: history
confidence; 1: history
confidence; 2: history
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: your
confidence; 1: your
confidence; 2: your
confidence; 0: visit
confidence; 1: visit
confidence; 2: visit
confidence; 0: wished
confidence; 1: wished
confidence; 2: wished
confidence; 0: lines
confidence; 1: lines
confidence; 2: lines
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: may
confidence; 1: may
confidence; 2: may
confidence; 0: try
confidence; 1: try
confidence; 2: try
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: very
confidence; 1: very
confidence; 2: very
confidence; 0: dirt
confidence; 1: dirt
confidence; 2: dirt
confidence; 0: but
confidence; 1: but
confidence; 2: but
confidence; 0: still
confidence; 1: still
confidence; 2: still
confidence; 0: like
confidence; 1: like
confidence; 2: like
confidence; 0: dust
confidence; 1: dust
confidence; 2: dust
confidence; 0: or
confidence; 1: or
confidence; 2: or
confidence; 0: does
confidence; 1: does
confidence; 2: does
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: sauciness
confidence; 1: sauciness
confidence; 2: sauciness
confidence; 0: upset
confidence; 1: upset
confidence; 2: upset
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: why
confidence; 1: why
confidence; 2: why
confidence; 0: are
confidence; 1: are
confidence; 2: are
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: beside
confidence; 1: beside
confidence; 2: beside
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: gloom
confidence; 1: gloom
confidence; 2: gloom
confidence; 0: to
confidence; 1: to
confidence; 2: to
confidence; 0: cause
confidence; 1: cause
confidence; 2: cause
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: walk
confidence; 1: walk
confidence; 2: walk
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: have
confidence; 1: have
confidence; 2: have
confidence; 0: oil
confidence; 1: oil
confidence; 2: oil
confidence; 0: wells
confidence; 1: wells
confidence; 2: wells
confidence; 0: pumping
confidence; 1: pumping
confidence; 2: pumping
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: living
confidence; 1: living
confidence; 2: living
confidence; 0: room
confidence; 1: room
confidence; 2: room
confidence; 0: sailors
confidence; 1: sailors
confidence; 2: sailors
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: like
confidence; 1: like
confidence; 2: like
confidence; 0: songs
confidence; 1: songs
confidence; 2: songs
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: a
confidence; 1: a
confidence; 2: a
confidence; 0: cuttenclips
confidence; 1: cuttenclips
confidence; 2: cuttenclips
confidence; 0: springing
confidence; 1: springing
confidence; 2: springing
confidence; 0: high
confidence; 1: high
confidence; 2: high
confidence; 0: still
confidence; 1: still
confidence; 2: still
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: wore
confidence; 1: wore
confidence; 2: wore
confidence; 0: did
confidence; 1: did
confidence; 2: did
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: want
confidence; 1: want
confidence; 2: want
confidence; 0: to
confidence; 1: to
confidence; 2: to
confidence; 0: see
confidence; 1: see
confidence; 2: see
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: broken
confidence; 1: broken
confidence; 2: broken
confidence; 0: bowed
confidence; 1: bowed
confidence; 2: bowed
confidence; 0: head
confidence; 1: head
confidence; 2: head
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: lord
confidence; 1: lord
confidence; 2: lord
confidence; 0: eyes
confidence; 1: eyes
confidence; 2: eyes
confidence; 0: soltali
confidence; 1: soltali
confidence; 2: soltali
confidence; 0: down
confidence; 1: down
confidence; 2: down
confidence; 0: like
confidence; 1: like
confidence; 2: like
confidence; 0: pedrosan
confidence; 1: pedrosan
confidence; 2: pedrosan
confidence; 0: by
confidence; 1: by
confidence; 2: by
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: soul
confidence; 1: soul
confidence; 2: soul
confidence; 0: societaires
confidence; 1: societaires
confidence; 2: societaires
confidence; 0: oudenard
confidence; 1: oudenard
confidence; 2: oudenard
confidence; 0: isolated
confidence; 1: isolated
confidence; 2: isolated
confidence; 0: as
confidence; 1: as
confidence; 2: as
confidence; 0: if
confidence; 1: if
confidence; 2: if
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: have
confidence; 1: have
confidence; 2: have
confidence; 0: gold
confidence; 1: gold
confidence; 2: gold
confidence; 0: man's
confidence; 1: man's
confidence; 2: man's
confidence; 0: digging
confidence; 1: digging
confidence; 2: digging
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: own
confidence; 1: own
confidence; 2: own
confidence; 0: back
confidence; 1: back
confidence; 2: back
confidence; 0: yard
confidence; 1: yard
confidence; 2: yard
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: can
confidence; 1: can
confidence; 2: can
confidence; 0: shoot
confidence; 1: shoot
confidence; 2: shoot
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: your
confidence; 1: your
confidence; 2: your
confidence; 0: words
confidence; 1: words
confidence; 2: words
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: can
confidence; 1: can
confidence; 2: can
confidence; 0: cut
confidence; 1: cut
confidence; 2: cut
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: your
confidence; 1: your
confidence; 2: your
confidence; 0: lies
confidence; 1: lies
confidence; 2: lies
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: can
confidence; 1: can
confidence; 2: can
confidence; 0: kill
confidence; 1: kill
confidence; 2: kill
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: your
confidence; 1: your
confidence; 2: your
confidence; 0: hatefulness
confidence; 1: hatefulness
confidence; 2: hatefulness
confidence; 0: but
confidence; 1: but
confidence; 2: but
confidence; 0: just
confidence; 1: just
confidence; 2: just
confidence; 0: like
confidence; 1: like
confidence; 2: like
confidence; 0: life
confidence; 1: life
confidence; 2: life
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: does
confidence; 1: does
confidence; 2: does
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: senatorian
confidence; 1: senatorian
confidence; 2: senatorian
confidence; 0: as
confidence; 1: as
confidence; 2: as
confidence; 0: if
confidence; 1: if
confidence; 2: if
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: have
confidence; 1: have
confidence; 2: have
confidence; 0: diamonds
confidence; 1: diamonds
confidence; 2: diamonds
confidence; 0: at
confidence; 1: at
confidence; 2: at
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: meeting
confidence; 1: meeting
confidence; 2: meeting
confidence; 0: of
confidence; 1: of
confidence; 2: of
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: time
confidence; 1: time
confidence; 2: time
confidence; 0: out
confidence; 1: out
confidence; 2: out
confidence; 0: of
confidence; 1: of
confidence; 2: of
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: huts
confidence; 1: huts
confidence; 2: huts
confidence; 0: of
confidence; 1: of
confidence; 2: of
confidence; 0: history
confidence; 1: history
confidence; 2: history
confidence; 0: shame
confidence; 1: shame
confidence; 2: shame
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: rise
confidence; 1: rise
confidence; 2: rise
confidence; 0: up
confidence; 1: up
confidence; 2: up
confidence; 0: from
confidence; 1: from
confidence; 2: from
confidence; 0: a
confidence; 1: a
confidence; 2: a
confidence; 0: past
confidence; 1: past
confidence; 2: past
confidence; 0: rooted
confidence; 1: rooted
confidence; 2: rooted
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: pain
confidence; 1: pain
confidence; 2: pain
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: ran
confidence; 1: ran
confidence; 2: ran
confidence; 0: a
confidence; 1: a
confidence; 2: a
confidence; 0: black
confidence; 1: black
confidence; 2: black
confidence; 0: ocean
confidence; 1: ocean
confidence; 2: ocean
confidence; 0: heaving
confidence; 1: heaving
confidence; 2: heaving
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: would
confidence; 1: would
confidence; 2: would
confidence; 0: willingly
confidence; 1: willingly
confidence; 2: willingly
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: bearing
confidence; 1: bearing
confidence; 2: bearing
confidence; 0: it
confidence; 1: it
confidence; 2: it
confidence; 0: leaving
confidence; 1: leaving
confidence; 2: leaving
confidence; 0: behind
confidence; 1: behind
confidence; 2: behind
confidence; 0: it
confidence; 1: it
confidence; 2: it
confidence; 0: of
confidence; 1: of
confidence; 2: of
confidence; 0: terror
confidence; 1: terror
confidence; 2: terror
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: fear
confidence; 1: fear
confidence; 2: fear
confidence; 0: as
confidence; 1: as
confidence; 2: as
confidence; 0: into
confidence; 1: into
confidence; 2: into
confidence; 0: a
confidence; 1: a
confidence; 2: a
confidence; 0: daybreak
confidence; 1: daybreak
confidence; 2: daybreak
confidence; 0: miraculously
confidence; 1: miraculously
confidence; 2: miraculously
confidence; 0: clear
confidence; 1: clear
confidence; 2: clear
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: bringing
confidence; 1: bringing
confidence; 2: bringing
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: gifts
confidence; 1: gifts
confidence; 2: gifts
confidence; 0: that
confidence; 1: that
confidence; 2: that
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: ancestors
confidence; 1: ancestors
confidence; 2: ancestors
confidence; 0: gave
confidence; 1: gave
confidence; 2: gave
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: am
confidence; 1: am
confidence; 2: am
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: whole
confidence; 1: whole
confidence; 2: whole
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: dream
confidence; 1: dream
confidence; 2: dream
confidence; 0: of
confidence; 1: of
confidence; 2: of
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: slave
confidence; 1: slave
confidence; 2: slaves
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: so
confidence; 1: so
confidence; 2: so
confidence; 0: that
confidence; 1: then
confidence; 2: then

ということで、confidence については、やはりよくわからない。たぶん、この精度では、このように認識するという精度が三つあるという事かな?

その前に、話はじめ、話中、話終わりのフラグを付けたらば、もしかして、なんかよいのかな?ということを考えて、一応フラグを立ててみた。

        # Excerpt of the caption-building loop. sentence, totaltime,
        # lastword_time, starttime, lineNum and fmttime are defined outside
        # this excerpt.
        # Use candidate transcript 0.
        confidence =jso['transcripts'][0]
        #print(confidence)
        for i,ob in enumerate(confidence['words']):
            #continue
            #print(i,ob) 
            # Flags for "speech starts" / "speech ends" at this word. Within
            # this excerpt they are only assigned, never read.
            talk_start = True
            talk_end = False

            # Keys are handled in the dict's own order. The 'start_time'
            # branch inspects `sentence` after the word was appended, and the
            # 'duration' branch reads p_time set by the 'start_time' branch.
            for key in ob:
                if key == 'word':
                    ###print(jso['transcripts'][0]['words'][i][key])
                    # Collect every non-empty word for the current caption.
                    if ob[key] != '':
                        sentence.append(ob.get(key))
                    ###print(*sentence)

                elif key == 'start_time':
                    ###print(jso['transcripts'][0]['words'][i][key])
                    time = ob[key]
                    # Less than 1 second since lastword_time: still talking.
                    if  time - lastword_time < 1:
                        talk_start = False
                        talk_end = False

                    elif time - lastword_time >= 1: # 1 second of silence
                        talk_start = False
                        talk_end = True
                        ### block >
                        totaltime = 0
                        endtime = fmttime(lastword_time)
                        if len(sentence) > 1:
                            # The word just appended belongs to the next
                            # caption: take it out, print the rest, put it back.
                            temp = sentence.pop()
                            print(lineNum)
                            lineNum += 1
                            # endtime2 was set in the 'duration' branch of an
                            # earlier word, from the same lastword_time.
                            print(starttime,'-->',endtime2)
                            # this word goes to next caption
                            kotoba = ''
                            for word in sentence:
                                kotoba += word + ' '
                            print(kotoba.rstrip())
                            print()
                            sentence.clear()
                            sentence.append(temp) # new caption
                        ### <  block 

                    # Exactly one collected word: it opens a new caption, so
                    # remember its start time.
                    if len(sentence) == 1 :
                        talk_start = True
                        talk_end = False
                        starttime = fmttime(time)
                        p_time = time

                elif key == 'duration':
                    ###print(jso['transcripts'][0]['words'][i][key])
                    # totaltime sums the word durations of the current caption.
                    # lastword_time is the caption start plus that sum, so
                    # pauses between words are not included.
                    totaltime += ob[key] 
                    lastword_time = p_time + totaltime

                    endtime2 = fmttime(lastword_time)

                    #print('in :',fmttime(time),'>>',*sentence)
                    #print('end :',fmttime(time+totaltime)) 
                    #print('< > :',fmttime(totaltime))

                    if totaltime >= 4: # 4 seconds of speech go into 1 caption
                        ### block >
                        totaltime = 0    
                        endtime = fmttime(lastword_time)
                        print(lineNum)
                        lineNum += 1
                        print(starttime,'-->',endtime)
                        kotoba = ''
                        for word in sentence:
                            kotoba += word + ' '

                        print(kotoba.rstrip())
                        print()
                        sentence.clear()
                        ### < block
                    # Last word of the transcript: print whatever is left.
                    elif totaltime < 4 and i + 1 == len(confidence['words']): 
                        ### block >
                        totaltime = 0    
                        endtime = fmttime(lastword_time)
                        print(lineNum)
                        lineNum += 1
                        print(starttime,'-->',endtime)
                        kotoba = ''
                        for word in sentence:
                            kotoba += word + ' '

                        print(kotoba.rstrip())
                        print()
                        sentence.clear()
                        ### < block

Rf.
OpenMPT
https://en.m.wikipedia.org/wiki/OpenMPT

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0