More than 3 years have passed since last update.

YouTube, Deepspeech, with Google Colaboratory [testing_0005] : DeepSpeech output ’json’ [0002]

Last updated at 2021-03-07 / Posted at 2021-03-01

DeepSpeech の音声認識結果を json で受け取ると、

                                         transcripts
0  {'confidence': -1793.291259765625, 'words': [{'wor...
1  {'confidence': -87105.90625, 'words': [{'word'...
2  {'confidence': -87105.90625, 'words': [{'word'...

こうなっていました。

これは、

import pandas as pd
import json
import pprint
# from collections import OrderedDict

# Load the DeepSpeech JSON output into a DataFrame for a quick overview.
# The open file object is handed straight to pandas: passing the raw JSON
# text (the result of f.read()) to pd.read_json is deprecated in recent
# pandas releases. Other ways to parse the same file are json.load(f) or
# json.loads(f.read()).
with open('/content/json (1).txt', 'r', encoding='utf-8') as f:
    jso = pd.read_json(f)

pprint.pprint(jso)

の結果です。json をパースする方法は色々あるようですが、内容を詳細に見る前に「どうなっているのかなー」とざっと開いてみるには、この pandas で見る方法が良さそうでした。

全部を見ると、単語ごとの出現時刻（start_time）と尺（duration）がひとまとめにされたものが並ぶので、非常に長い行数、または一行でずーーーっと続く文字列になります。

import pandas as pd
import json
import pprint
# from collections import OrderedDict

# Parse the file with the standard-library json module.
with open('json.txt', 'r') as f:
    jso = json.load(f)

# A plain print shows the parsed dict as-is, i.e. as one very long line.
print(jso)

{'transcripts': [{'confidence': -1793.291259765625, 'words': [{'word': 'you', 'start_time': 0.56, 'duration': 0.12}, {'word': 'may', 'start_time': 0.74, 'duration': 0.14}, {'word': 'write', 'start_time': 1.0, 'duration': 0.2}, {'word': 'me', 'start_time': 1.3, 'duration': 0.16}, {'word': 'down', 'start_time': 1.54, 'duration': 0.18}, {'word': 'in', 'start_time': 1.84, 'duration': 0.1}, {'word': 'history', 'start_time': 2.0, 'duration': 1.06}, {'word': 'with', 'start_time': 3.12, 'duration': 0.12}, {'word': 'your', 'start_time': 3.26, 'duration': 0.16}, {'word': 'visit', 'start_time': 3.5, 'duration': 0.32}, {'word': 'wished', 'start_time': 3.86, 'duration': 0.38}, {'word': 'lines', 'start_time': 4.34, 'duration': 1.1}, {'word': 'you', 'start_time': 5.52, 'duration': 0.1}, {'word': 'may', 'start_time': 5.66, 'duration': 0.22}, {'word': 'try', 'start_time': 6.0, 'duration': 0.3}, {'word': 'me', 'start_time': 6.34, 'duration': 0.12}, {'word': 'in', 'start_time': 6.54, 'duration': 0.06}, {'word': 'the', 'start_time': 6.64, 'duration': 0.08}, {'word': 'very', 'start_time': 6.78, 'duration': 0.28}, {'word': 'dirt', 'start_time': 7.14, 'duration': 1.12}, {'word': 'but', 'start_time': 8.32, 'duration': 0.22}, {'word': 'still', 'start_time': 8.6, 'duration': 0.3}, {'word': 'like', 'start_time': 8.98, 'duration': 0.38}, {'word': 'dust', 'start_time': 9.48, 'duration': 1.48}, {'word': 'or', 'start_time': 11.1, 'duration': 1.68}, {'word': 'does', 'start_time': 12.86, 'duration': 0.24}, {'word': 'my', 'start_time': 13.16, 'duration': 0.24}, {'word': 'sauciness', 'start_time': 13.48, 'duration': 0.52}, {'word': 'upset', 'start_time': 14.06, 'duration': 0.38}, {'word': 'you', 'start_time': 14.46, 'duration': 0.72}, {'word': 'why', 'start_time': 15.36, 'duration': 0.14}, {'word': 'are', 'start_time': 15.64, 'duration': 0.12}, {'word': 'you', 'start_time': 15.8, 'duration': 0.12}, {'word': 'beside', 'start_time': 16.02, 'duration': 0.32}, {'word': 'with', 'start_time': 16.38, 
'duration': 0.12}, {'word': 'gloom', 'start_time': 16.58, 'duration': 0.72}, {'word': 'to', 'start_time': 17.42, 'duration': 0.1}, {'word': 'cause', 'start_time': 17.58, 'duration': 0.22}, {'word': 'i', 'start_time': 17.88, 'duration': 0.54}, {'word': 'walk', 'start_time': 18.5, 'duration': 0.22}, {'word': 'in', 'start_time': 18.8, 'duration': 0.1}, {'word': 'the', 'start_time': 18.96, 'duration': 0.18}, {'word': 'i', 'start_time': 19.28, 'duration': 0.14}, {'word': 'have', 'start_time': 19.48, 'duration': 0.52}, {'word': 'oil', 'start_time': 20.18, 'duration': 0.38}, {'word': 'wells', 'start_time': 20.68, 'duration': 0.38}, {'word': 'pumping', 'start_time': 21.12, 'duration': 0.64}, {'word': 'my', 'start_time': 21.82, 'duration': 0.26}, {'word': 'living', 'start_time': 22.14, 'duration': 0.42}, {'word': 'room', 'start_time': 22.66, 'duration': 1.68}, {'word': 'sailors', 'start_time': 24.44, 'duration': 1.38}, {'word': 'and', 'start_time': 25.94, 'duration': 0.1}, {'word': 'like', 'start_time': 26.12, 'duration': 0.5}, {'word': 'songs', 'start_time': 26.72, 'duration': 0.32}, {'word': 'with', 'start_time': 27.1, 'duration': 0.2}, {'word': 'a', 'start_time': 27.44, 'duration': 0.1}, {'word': 'cuttenclips', 'start_time': 27.62, 'duration': 3.38}, {'word': 'springing', 'start_time': 31.02, 'duration': 0.48}, {'word': 'high', 'start_time': 31.66, 'duration': 1.2}, {'word': 'still', 'start_time': 32.92, 'duration': 0.2}, {'word': 'i', 'start_time': 33.24, 'duration': 0.14}, {'word': 'wore', 'start_time': 33.46, 'duration': 0.9}, {'word': 'did', 'start_time': 34.42, 'duration': 0.16}, {'word': 'you', 'start_time': 34.62, 'duration': 0.12}, {'word': 'want', 'start_time': 34.8, 'duration': 0.18}, {'word': 'to', 'start_time': 35.04, 'duration': 0.08}, {'word': 'see', 'start_time': 35.18, 'duration': 0.16}, {'word': 'me', 'start_time': 35.4, 'duration': 0.18}, {'word': 'broken', 'start_time': 35.64, 'duration': 1.2}, {'word': 'bowed', 'start_time': 37.0, 'duration': 0.4}, 
{'word': 'head', 'start_time': 37.5, 'duration': 0.22}, {'word': 'and', 'start_time': 37.82, 'duration': 0.14}, {'word': 'lord', 'start_time': 38.08, 'duration': 0.46}, {'word': 'eyes', 'start_time': 38.82, 'duration': 0.82}, {'word': 'soltali', 'start_time': 39.7, 'duration': 1.0}, {'word': 'down', 'start_time': 40.76, 'duration': 0.24}, {'word': 'like', 'start_time': 41.04, 'duration': 0.36}, {'word': 'pedrosan', 'start_time': 41.46, 'duration': 1.9}, {'word': 'by', 'start_time': 43.46, 'duration': 0.18}, {'word': 'my', 'start_time': 43.78, 'duration': 0.3}, {'word': 'soul', 'start_time': 44.18, 'duration': 0.3}, {'word': 'societaires', 'start_time': 44.5, 'duration': 2.74}, {'word': 'oudenard', 'start_time': 47.32, 'duration': 4.08}, {'word': 'isolated', 'start_time': 51.5, 'duration': 1.7}, {'word': 'as', 'start_time': 53.32, 'duration': 0.1}, {'word': 'if', 'start_time': 53.54, 'duration': 0.16}, {'word': 'i', 'start_time': 53.8, 'duration': 0.12}, {'word': 'have', 'start_time': 53.98, 'duration': 0.26}, {'word': 'gold', 'start_time': 54.36, 'duration': 0.46}, {'word': "man's", 'start_time': 54.98, 'duration': 0.36}, {'word': 'digging', 'start_time': 55.42, 'duration': 0.28}, {'word': 'in', 'start_time': 55.78, 'duration': 0.08}, {'word': 'my', 'start_time': 55.92, 'duration': 0.2}, {'word': 'own', 'start_time': 56.3, 'duration': 0.26}, {'word': 'back', 'start_time': 56.64, 'duration': 0.28}, {'word': 'yard', 'start_time': 57.0, 'duration': 1.12}, {'word': 'you', 'start_time': 58.16, 'duration': 0.12}, {'word': 'can', 'start_time': 58.34, 'duration': 0.16}, {'word': 'shoot', 'start_time': 58.54, 'duration': 0.24}, {'word': 'me', 'start_time': 58.82, 'duration': 0.14}, {'word': 'with', 'start_time': 59.0, 'duration': 0.16}, {'word': 'your', 'start_time': 59.22, 'duration': 0.18}, {'word': 'words', 'start_time': 59.54, 'duration': 0.76}, {'word': 'you', 'start_time': 60.36, 'duration': 0.14}, {'word': 'can', 'start_time': 60.56, 'duration': 0.2}, {'word': 'cut', 
'start_time': 60.84, 'duration': 0.2}, {'word': 'me', 'start_time': 61.06, 'duration': 0.14}, {'word': 'with', 'start_time': 61.26, 'duration': 0.16}, {'word': 'your', 'start_time': 61.46, 'duration': 0.22}, {'word': 'lies', 'start_time': 61.84, 'duration': 0.76}, {'word': 'you', 'start_time': 62.66, 'duration': 0.14}, {'word': 'can', 'start_time': 62.86, 'duration': 0.28}, {'word': 'kill', 'start_time': 63.2, 'duration': 0.22}, {'word': 'me', 'start_time': 63.46, 'duration': 0.14}, {'word': 'with', 'start_time': 63.66, 'duration': 0.12}, {'word': 'your', 'start_time': 63.84, 'duration': 0.16}, {'word': 'hatefulness', 'start_time': 64.06, 'duration': 0.54}, {'word': 'but', 'start_time': 64.66, 'duration': 0.1}, {'word': 'just', 'start_time': 64.84, 'duration': 0.26}, {'word': 'like', 'start_time': 65.14, 'duration': 0.42}, {'word': 'life', 'start_time': 65.64, 'duration': 1.42}, {'word': 'i', 'start_time': 67.18, 'duration': 1.46}, {'word': 'does', 'start_time': 68.68, 'duration': 0.24}, {'word': 'my', 'start_time': 68.96, 'duration': 0.26}, {'word': 'senatorian', 'start_time': 69.28, 'duration': 12.02}, {'word': 'as', 'start_time': 81.4, 'duration': 0.18}, {'word': 'if', 'start_time': 81.72, 'duration': 0.22}, {'word': 'i', 'start_time': 82.06, 'duration': 0.1}, {'word': 'have', 'start_time': 82.22, 'duration': 0.26}, {'word': 'diamonds', 'start_time': 82.56, 'duration': 0.5}, {'word': 'at', 'start_time': 83.16, 'duration': 0.1}, {'word': 'the', 'start_time': 83.3, 'duration': 0.1}, {'word': 'meeting', 'start_time': 83.46, 'duration': 0.4}, {'word': 'of', 'start_time': 84.0, 'duration': 0.12}, {'word': 'my', 'start_time': 84.22, 'duration': 0.3}, {'word': 'time', 'start_time': 84.72, 'duration': 2.72}, {'word': 'out', 'start_time': 87.58, 'duration': 0.18}, {'word': 'of', 'start_time': 87.84, 'duration': 0.12}, {'word': 'the', 'start_time': 88.0, 'duration': 0.12}, {'word': 'huts', 'start_time': 88.18, 'duration': 0.28}, {'word': 'of', 'start_time': 88.56, 
'duration': 0.12}, {'word': 'history', 'start_time': 88.76, 'duration': 0.74}, {'word': 'shame', 'start_time': 89.6, 'duration': 0.32}, {'word': 'i', 'start_time': 90.04, 'duration': 0.18}, {'word': 'rise', 'start_time': 90.38, 'duration': 1.2}, {'word': 'up', 'start_time': 91.7, 'duration': 0.1}, {'word': 'from', 'start_time': 91.84, 'duration': 0.16}, {'word': 'a', 'start_time': 92.14, 'duration': 0.08}, {'word': 'past', 'start_time': 92.28, 'duration': 0.34}, {'word': 'rooted', 'start_time': 92.7, 'duration': 0.36}, {'word': 'in', 'start_time': 93.18, 'duration': 0.18}, {'word': 'pain', 'start_time': 93.46, 'duration': 0.72}, {'word': 'i', 'start_time': 94.32, 'duration': 0.14}, {'word': 'ran', 'start_time': 94.54, 'duration': 0.94}, {'word': 'a', 'start_time': 95.62, 'duration': 0.06}, {'word': 'black', 'start_time': 95.72, 'duration': 0.26}, {'word': 'ocean', 'start_time': 96.06, 'duration': 0.56}, {'word': 'heaving', 'start_time': 96.74, 'duration': 0.44}, {'word': 'and', 'start_time': 97.28, 'duration': 0.22}, {'word': 'would', 'start_time': 97.74, 'duration': 0.92}, {'word': 'willingly', 'start_time': 98.76, 'duration': 1.32}, {'word': 'and', 'start_time': 100.22, 'duration': 0.14}, {'word': 'bearing', 'start_time': 100.44, 'duration': 0.3}, {'word': 'it', 'start_time': 100.82, 'duration': 1.8}, {'word': 'leaving', 'start_time': 102.66, 'duration': 0.36}, {'word': 'behind', 'start_time': 103.1, 'duration': 0.56}, {'word': 'it', 'start_time': 103.78, 'duration': 0.2}, {'word': 'of', 'start_time': 104.1, 'duration': 0.14}, {'word': 'terror', 'start_time': 104.32, 'duration': 0.64}, {'word': 'and', 'start_time': 105.1, 'duration': 0.32}, {'word': 'fear', 'start_time': 105.54, 'duration': 0.8}, {'word': 'as', 'start_time': 107.04, 'duration': 0.9}, {'word': 'into', 'start_time': 108.14, 'duration': 0.3}, {'word': 'a', 'start_time': 108.52, 'duration': 0.06}, {'word': 'daybreak', 'start_time': 108.64, 'duration': 0.54}, {'word': 'miraculously', 'start_time': 
109.24, 'duration': 1.04}, {'word': 'clear', 'start_time': 110.34, 'duration': 1.26}, {'word': 'in', 'start_time': 111.74, 'duration': 1.42}, {'word': 'bringing', 'start_time': 113.22, 'duration': 0.42}, {'word': 'the', 'start_time': 113.7, 'duration': 0.1}, {'word': 'gifts', 'start_time': 113.86, 'duration': 0.32}, {'word': 'that', 'start_time': 114.24, 'duration': 0.22}, {'word': 'my', 'start_time': 114.56, 'duration': 0.18}, {'word': 'ancestors', 'start_time': 114.86, 'duration': 0.66}, {'word': 'gave', 'start_time': 115.66, 'duration': 0.8}, {'word': 'i', 'start_time': 116.64, 'duration': 0.12}, {'word': 'am', 'start_time': 116.84, 'duration': 0.14}, {'word': 'the', 'start_time': 117.06, 'duration': 0.24}, {'word': 'whole', 'start_time': 117.4, 'duration': 0.58}, {'word': 'and', 'start_time': 118.1, 'duration': 0.1}, {'word': 'the', 'start_time': 118.26, 'duration': 0.18}, {'word': 'dream', 'start_time': 118.54, 'duration': 0.78}, {'word': 'of', 'start_time': 119.38, 'duration': 0.1}, {'word': 'the', 'start_time': 119.54, 'duration': 0.22}, {'word': 'slave', 'start_time': 119.8, 'duration': 1.18}, {'word': 'and', 'start_time': 121.2, 'duration': 0.3}, {'word': 'so', 'start_time': 121.64, 'duration': 4.68}, {'word': 'that', 'start_time': 126.42, 'duration': 0.14}]}, {'confidence': -1795.8509521484375, 'words': [{'word': 'you', 'start_time': 0.56, 'duration': 0.12}, {'word': 'may', 'start_time': 0.74, 'duration': 0.14}, {'word': 'write', 'start_time': 1.0, 'duration': 0.2}, {'word': 'me', 'start_time': 1.3, 'duration': 0.16}, {'word': 'down', 'start_time': 1.54, 'duration': 0.18}, {'word': 'in', 'start_time': 1.84, 'duration': 0.1}, {'word': 'history', 'start_time': 2.0, 'duration': 1.06}, {'word': 'with', 'start_time': 3.12, 'duration': 0.12}, {'word': 'your', 'start_time': 3.26, 'duration': 0.16}, {'word': 'visit', 'start_time': 3.5, 'duration': 0.32}, {'word': 'wished', 'start_time': 3.86, 'duration': 0.38}, {'word': 'lines', 'start_time': 4.34, 'duration': 
1.1}, {'word': 'you', 'start_time': 5.52, 'duration': 0.1}, {'word': 'may', 'start_time': 5.66, 'duration': 0.22}, {'word': 'try', 'start_time': 6.0, 'duration': 0.3}, {'word': 'me', 'start_time': 6.34, 'duration': 0.12}, {'word': 'in', 'start_time': 6.54, 'duration': 0.06}, {'word': 'the', 'start_time': 6.64, 'duration': 0.08}, {'word': 'very', 'start_time': 6.78, 'duration': 0.28}, {'word': 'dirt', 'start_time': 7.14, 'duration': 1.12}, {'word': 'but', 'start_time': 8.32, 'duration': 0.22}, {'word': 'still', 'start_time': 8.6, 'duration': 0.3}, {'word': 'like', 'start_time': 8.98, 'duration': 0.38}, {'word': 'dust', 'start_time': 9.48, 'duration': 1.48}, {'word': 'or', 'start_time': 11.1, 'duration': 1.68}, {'word': 'does', 'start_time': 12.86, 'duration': 0.24}, {'word': 'my', 'start_time': 13.16, 'duration': 0.24}, {'word': 'sauciness', 'start_time': 13.48, 'duration': 0.52}, {'word': 'upset', 'start_time': 14.06, 'duration': 0.38}, {'word': 'you', 'start_time': 14.46, 'duration': 0.72}, {'word': 'why', 'start_time': 15.36, 'duration': 0.14}, {'word': 'are', 'start_time': 15.64, 'duration': 0.12}, {'word': 'you', 'start_time': 15.8, 'duration': 0.12}, {'word': 'beside', 'start_time': 16.02, 'duration': 0.32}, {'word': 'with', 'start_time': 16.38, 'duration': 0.12}, {'word': 'gloom', 'start_time': 16.58, 'duration': 0.72}, {'word': 'to', 'start_time': 17.42, 'duration': 0.1}, {'word': 'cause', 'start_time': 17.58, 'duration': 0.22}, {'word': 'i', 'start_time': 17.88, 'duration': 0.54}, {'word': 'walk', 'start_time': 18.5, 'duration': 0.22}, {'word': 'in', 'start_time': 18.8, 'duration': 0.1}, {'word': 'the', 'start_time': 18.96, 'duration': 0.18}, {'word': 'i', 'start_time': 19.28, 'duration': 0.14}, {'word': 'have', 'start_time': 19.48, 'duration': 0.52}, {'word': 'oil', 'start_time': 20.18, 'duration': 0.38}, {'word': 'wells', 'start_time': 20.68, 'duration': 0.38}, {'word': 'pumping', 'start_time': 21.12, 'duration': 0.64}, {'word': 'my', 'start_time': 21.82, 
'duration': 0.26}, {'word': 'living', 'start_time': 22.14, 'duration': 0.42}, {'word': 'room', 'start_time': 22.66, 'duration': 1.68}, {'word': 'sailors', 'start_time': 24.44, 'duration': 1.38}, {'word': 'and', 'start_time': 25.94, 'duration': 0.1}, {'word': 'like', 'start_time': 26.12, 'duration': 0.5}, {'word': 'songs', 'start_time': 26.72, 'duration': 0.32}, {'word': 'with', 'start_time': 27.1, 'duration': 0.2}, {'word': 'a', 'start_time': 27.44, 'duration': 0.1}, {'word': 'cuttenclips', 'start_time': 27.62, 'duration': 3.38}, {'word': 'springing', 'start_time': 31.02, 'duration': 0.48}, {'word': 'high', 'start_time': 31.66, 'duration': 1.2}, {'word': 'still', 'start_time': 32.92, 'duration': 0.2}, {'word': 'i', 'start_time': 33.24, 'duration': 0.14}, {'word': 'wore', 'start_time': 33.46, 'duration': 0.9}, {'word': 'did', 'start_time': 34.42, 'duration': 0.16}, {'word': 'you', 'start_time': 34.62, 'duration': 0.12}, {'word': 'want', 'start_time': 34.8, 'duration': 0.18}, {'word': 'to', 'start_time': 35.04, 'duration': 0.08}, {'word': 'see', 'start_time': 35.18, 'duration': 0.16}, {'word': 'me', 'start_time': 35.4, 'duration': 0.18}, {'word': 'broken', 'start_time': 35.64, 'duration': 1.2}, {'word': 'bowed', 'start_time': 37.0, 'duration': 0.4}, {'word': 'head', 'start_time': 37.5, 'duration': 0.22}, {'word': 'and', 'start_time': 37.82, 'duration': 0.14}, {'word': 'lord', 'start_time': 38.08, 'duration': 0.46}, {'word': 'eyes', 'start_time': 38.82, 'duration': 0.82}, {'word': 'soltali', 'start_time': 39.7, 'duration': 1.0}, {'word': 'down', 'start_time': 40.76, 'duration': 0.24}, {'word': 'like', 'start_time': 41.04, 'duration': 0.36}, {'word': 'pedrosan', 'start_time': 41.46, 'duration': 1.9}, {'word': 'by', 'start_time': 43.46, 'duration': 0.18}, {'word': 'my', 'start_time': 43.78, 'duration': 0.3}, {'word': 'soul', 'start_time': 44.18, 'duration': 0.3}, {'word': 'societaires', 'start_time': 44.5, 'duration': 2.74}, {'word': 'oudenard', 'start_time': 47.32, 
'duration': 4.08}, {'word': 'isolated', 'start_time': 51.5, 'duration': 1.7}, {'word': 'as', 'start_time': 53.32, 'duration': 0.1}, {'word': 'if', 'start_time': 53.54, 'duration': 0.16}, {'word': 'i', 'start_time': 53.8, 'duration': 0.12}, {'word': 'have', 'start_time': 53.98, 'duration': 0.26}, {'word': 'gold', 'start_time': 54.36, 'duration': 0.46}, {'word': "man's", 'start_time': 54.98, 'duration': 0.36}, {'word': 'digging', 'start_time': 55.42, 'duration': 0.28}, {'word': 'in', 'start_time': 55.78, 'duration': 0.08}, {'word': 'my', 'start_time': 55.92, 'duration': 0.2}, {'word': 'own', 'start_time': 56.3, 'duration': 0.26}, {'word': 'back', 'start_time': 56.64, 'duration': 0.28}, {'word': 'yard', 'start_time': 57.0, 'duration': 1.12}, {'word': 'you', 'start_time': 58.16, 'duration': 0.12}, {'word': 'can', 'start_time': 58.34, 'duration': 0.16}, {'word': 'shoot', 'start_time': 58.54, 'duration': 0.24}, {'word': 'me', 'start_time': 58.82, 'duration': 0.14}, {'word': 'with', 'start_time': 59.0, 'duration': 0.16}, {'word': 'your', 'start_time': 59.22, 'duration': 0.18}, {'word': 'words', 'start_time': 59.54, 'duration': 0.76}, {'word': 'you', 'start_time': 60.36, 'duration': 0.14}, {'word': 'can', 'start_time': 60.56, 'duration': 0.2}, {'word': 'cut', 'start_time': 60.84, 'duration': 0.2}, {'word': 'me', 'start_time': 61.06, 'duration': 0.14}, {'word': 'with', 'start_time': 61.26, 'duration': 0.16}, {'word': 'your', 'start_time': 61.46, 'duration': 0.22}, {'word': 'lies', 'start_time': 61.84, 'duration': 0.76}, {'word': 'you', 'start_time': 62.66, 'duration': 0.14}, {'word': 'can', 'start_time': 62.86, 'duration': 0.28}, {'word': 'kill', 'start_time': 63.2, 'duration': 0.22}, {'word': 'me', 'start_time': 63.46, 'duration': 0.14}, {'word': 'with', 'start_time': 63.66, 'duration': 0.12}, {'word': 'your', 'start_time': 63.84, 'duration': 0.16}, {'word': 'hatefulness', 'start_time': 64.06, 'duration': 0.54}, {'word': 'but', 'start_time': 64.66, 'duration': 0.1}, 
{'word': 'just', 'start_time': 64.84, 'duration': 0.26}, {'word': 'like', 'start_time': 65.14, 'duration': 0.42}, {'word': 'life', 'start_time': 65.64, 'duration': 1.42}, {'word': 'i', 'start_time': 67.18, 'duration': 1.46}, {'word': 'does', 'start_time': 68.68, 'duration': 0.24}, {'word': 'my', 'start_time': 68.96, 'duration': 0.26}, {'word': 'senatorian', 'start_time': 69.28, 'duration': 12.02}, {'word': 'as', 'start_time': 81.4, 'duration': 0.18}, {'word': 'if', 'start_time': 81.72, 'duration': 0.22}, {'word': 'i', 'start_time': 82.06, 'duration': 0.1}, {'word': 'have', 'start_time': 82.22, 'duration': 0.26}, {'word': 'diamonds', 'start_time': 82.56, 'duration': 0.5}, {'word': 'at', 'start_time': 83.16, 'duration': 0.1}, {'word': 'the', 'start_time': 83.3, 'duration': 0.1}, {'word': 'meeting', 'start_time': 83.46, 'duration': 0.4}, {'word': 'of', 'start_time': 84.0, 'duration': 0.12}, {'word': 'my', 'start_time': 84.22, 'duration': 0.3}, {'word': 'time', 'start_time': 84.72, 'duration': 2.72}, {'word': 'out', 'start_time': 87.58, 'duration': 0.18}, {'word': 'of', 'start_time': 87.84, 'duration': 0.12}, {'word': 'the', 'start_time': 88.0, 'duration': 0.12}, {'word': 'huts', 'start_time': 88.18, 'duration': 0.28}, {'word': 'of', 'start_time': 88.56, 'duration': 0.12}, {'word': 'history', 'start_time': 88.76, 'duration': 0.74}, {'word': 'shame', 'start_time': 89.6, 'duration': 0.32}, {'word': 'i', 'start_time': 90.04, 'duration': 0.18}, {'word': 'rise', 'start_time': 90.38, 'duration': 1.2}, {'word': 'up', 'start_time': 91.7, 'duration': 0.1}, {'word': 'from', 'start_time': 91.84, 'duration': 0.16}, {'word': 'a', 'start_time': 92.14, 'duration': 0.08}, {'word': 'past', 'start_time': 92.28, 'duration': 0.34}, {'word': 'rooted', 'start_time': 92.7, 'duration': 0.36}, {'word': 'in', 'start_time': 93.18, 'duration': 0.18}, {'word': 'pain', 'start_time': 93.46, 'duration': 0.72}, {'word': 'i', 'start_time': 94.32, 'duration': 0.14}, {'word': 'ran', 'start_time': 94.54, 
'duration': 0.94}, {'word': 'a', 'start_time': 95.62, 'duration': 0.06}, {'word': 'black', 'start_time': 95.72, 'duration': 0.26}, {'word': 'ocean', 'start_time': 96.06, 'duration': 0.56}, {'word': 'heaving', 'start_time': 96.74, 'duration': 0.44}, {'word': 'and', 'start_time': 97.28, 'duration': 0.22}, {'word': 'would', 'start_time': 97.74, 'duration': 0.92}, {'word': 'willingly', 'start_time': 98.76, 'duration': 1.32}, {'word': 'and', 'start_time': 100.22, 'duration': 0.14}, {'word': 'bearing', 'start_time': 100.44, 'duration': 0.3}, {'word': 'it', 'start_time': 100.82, 'duration': 1.8}, {'word': 'leaving', 'start_time': 102.66, 'duration': 0.36}, {'word': 'behind', 'start_time': 103.1, 'duration': 0.56}, {'word': 'it', 'start_time': 103.78, 'duration': 0.2}, {'word': 'of', 'start_time': 104.1, 'duration': 0.14}, {'word': 'terror', 'start_time': 104.32, 'duration': 0.64}, {'word': 'and', 'start_time': 105.1, 'duration': 0.32}, {'word': 'fear', 'start_time': 105.54, 'duration': 0.8}, {'word': 'as', 'start_time': 107.04, 'duration': 0.9}, {'word': 'into', 'start_time': 108.14, 'duration': 0.3}, {'word': 'a', 'start_time': 108.52, 'duration': 0.06}, {'word': 'daybreak', 'start_time': 108.64, 'duration': 0.54}, {'word': 'miraculously', 'start_time': 109.24, 'duration': 1.04}, {'word': 'clear', 'start_time': 110.34, 'duration': 1.26}, {'word': 'in', 'start_time': 111.74, 'duration': 1.42}, {'word': 'bringing', 'start_time': 113.22, 'duration': 0.42}, {'word': 'the', 'start_time': 113.7, 'duration': 0.1}, {'word': 'gifts', 'start_time': 113.86, 'duration': 0.32}, {'word': 'that', 'start_time': 114.24, 'duration': 0.22}, {'word': 'my', 'start_time': 114.56, 'duration': 0.18}, {'word': 'ancestors', 'start_time': 114.86, 'duration': 0.66}, {'word': 'gave', 'start_time': 115.66, 'duration': 0.8}, {'word': 'i', 'start_time': 116.64, 'duration': 0.12}, {'word': 'am', 'start_time': 116.84, 'duration': 0.14}, {'word': 'the', 'start_time': 117.06, 'duration': 0.24}, {'word': 
'whole', 'start_time': 117.4, 'duration': 0.58}, {'word': 'and', 'start_time': 118.1, 'duration': 0.1}, {'word': 'the', 'start_time': 118.26, 'duration': 0.18}, {'word': 'dream', 'start_time': 118.54, 'duration': 0.78}, {'word': 'of', 'start_time': 119.38, 'duration': 0.1}, {'word': 'the', 'start_time': 119.54, 'duration': 0.22}, {'word': 'slave', 'start_time': 119.8, 'duration': 1.18}, {'word': 'and', 'start_time': 121.2, 'duration': 0.3}, {'word': 'so', 'start_time': 121.64, 'duration': 4.68}, {'word': 'then', 'start_time': 126.42, 'duration': 0.14}]}, {'confidence': -1796.1273193359375, 'words': [{'word': 'you', 'start_time': 0.56, 'duration': 0.12}, {'word': 'may', 'start_time': 0.74, 'duration': 0.14}, {'word': 'write', 'start_time': 1.0, 'duration': 0.2}, {'word': 'me', 'start_time': 1.3, 'duration': 0.16}, {'word': 'down', 'start_time': 1.54, 'duration': 0.18}, {'word': 'in', 'start_time': 1.84, 'duration': 0.1}, {'word': 'history', 'start_time': 2.0, 'duration': 1.06}, {'word': 'with', 'start_time': 3.12, 'duration': 0.12}, {'word': 'your', 'start_time': 3.26, 'duration': 0.16}, {'word': 'visit', 'start_time': 3.5, 'duration': 0.32}, {'word': 'wished', 'start_time': 3.86, 'duration': 0.38}, {'word': 'lines', 'start_time': 4.34, 'duration': 1.1}, {'word': 'you', 'start_time': 5.52, 'duration': 0.1}, {'word': 'may', 'start_time': 5.66, 'duration': 0.22}, {'word': 'try', 'start_time': 6.0, 'duration': 0.3}, {'word': 'me', 'start_time': 6.34, 'duration': 0.12}, {'word': 'in', 'start_time': 6.54, 'duration': 0.06}, {'word': 'the', 'start_time': 6.64, 'duration': 0.08}, {'word': 'very', 'start_time': 6.78, 'duration': 0.28}, {'word': 'dirt', 'start_time': 7.14, 'duration': 1.12}, {'word': 'but', 'start_time': 8.32, 'duration': 0.22}, {'word': 'still', 'start_time': 8.6, 'duration': 0.3}, {'word': 'like', 'start_time': 8.98, 'duration': 0.38}, {'word': 'dust', 'start_time': 9.48, 'duration': 1.48}, {'word': 'or', 'start_time': 11.1, 'duration': 1.68}, {'word': 
'does', 'start_time': 12.86, 'duration': 0.24}, {'word': 'my', 'start_time': 13.16, 'duration': 0.24}, {'word': 'sauciness', 'start_time': 13.48, 'duration': 0.52}, {'word': 'upset', 'start_time': 14.06, 'duration': 0.38}, {'word': 'you', 'start_time': 14.46, 'duration': 0.72}, {'word': 'why', 'start_time': 15.36, 'duration': 0.14}, {'word': 'are', 'start_time': 15.64, 'duration': 0.12}, {'word': 'you', 'start_time': 15.8, 'duration': 0.12}, {'word': 'beside', 'start_time': 16.02, 'duration': 0.32}, {'word': 'with', 'start_time': 16.38, 'duration': 0.12}, {'word': 'gloom', 'start_time': 16.58, 'duration': 0.72}, {'word': 'to', 'start_time': 17.42, 'duration': 0.1}, {'word': 'cause', 'start_time': 17.58, 'duration': 0.22}, {'word': 'i', 'start_time': 17.88, 'duration': 0.54}, {'word': 'walk', 'start_time': 18.5, 'duration': 0.22}, {'word': 'in', 'start_time': 18.8, 'duration': 0.1}, {'word': 'the', 'start_time': 18.96, 'duration': 0.18}, {'word': 'i', 'start_time': 19.28, 'duration': 0.14}, {'word': 'have', 'start_time': 19.48, 'duration': 0.52}, {'word': 'oil', 'start_time': 20.18, 'duration': 0.38}, {'word': 'wells', 'start_time': 20.68, 'duration': 0.38}, {'word': 'pumping', 'start_time': 21.12, 'duration': 0.64}, {'word': 'my', 'start_time': 21.82, 'duration': 0.26}, {'word': 'living', 'start_time': 22.14, 'duration': 0.42}, {'word': 'room', 'start_time': 22.66, 'duration': 1.68}, {'word': 'sailors', 'start_time': 24.44, 'duration': 1.38}, {'word': 'and', 'start_time': 25.94, 'duration': 0.1}, {'word': 'like', 'start_time': 26.12, 'duration': 0.5}, {'word': 'songs', 'start_time': 26.72, 'duration': 0.32}, {'word': 'with', 'start_time': 27.1, 'duration': 0.2}, {'word': 'a', 'start_time': 27.44, 'duration': 0.1}, {'word': 'cuttenclips', 'start_time': 27.62, 'duration': 3.38}, {'word': 'springing', 'start_time': 31.02, 'duration': 0.48}, {'word': 'high', 'start_time': 31.66, 'duration': 1.2}, {'word': 'still', 'start_time': 32.92, 'duration': 0.2}, {'word': 'i', 
'start_time': 33.24, 'duration': 0.14}, {'word': 'wore', 'start_time': 33.46, 'duration': 0.9}, {'word': 'did', 'start_time': 34.42, 'duration': 0.16}, {'word': 'you', 'start_time': 34.62, 'duration': 0.12}, {'word': 'want', 'start_time': 34.8, 'duration': 0.18}, {'word': 'to', 'start_time': 35.04, 'duration': 0.08}, {'word': 'see', 'start_time': 35.18, 'duration': 0.16}, {'word': 'me', 'start_time': 35.4, 'duration': 0.18}, {'word': 'broken', 'start_time': 35.64, 'duration': 1.2}, {'word': 'bowed', 'start_time': 37.0, 'duration': 0.4}, {'word': 'head', 'start_time': 37.5, 'duration': 0.22}, {'word': 'and', 'start_time': 37.82, 'duration': 0.14}, {'word': 'lord', 'start_time': 38.08, 'duration': 0.46}, {'word': 'eyes', 'start_time': 38.82, 'duration': 0.82}, {'word': 'soltali', 'start_time': 39.7, 'duration': 1.0}, {'word': 'down', 'start_time': 40.76, 'duration': 0.24}, {'word': 'like', 'start_time': 41.04, 'duration': 0.36}, {'word': 'pedrosan', 'start_time': 41.46, 'duration': 1.9}, {'word': 'by', 'start_time': 43.46, 'duration': 0.18}, {'word': 'my', 'start_time': 43.78, 'duration': 0.3}, {'word': 'soul', 'start_time': 44.18, 'duration': 0.3}, {'word': 'societaires', 'start_time': 44.5, 'duration': 2.74}, {'word': 'oudenard', 'start_time': 47.32, 'duration': 4.08}, {'word': 'isolated', 'start_time': 51.5, 'duration': 1.7}, {'word': 'as', 'start_time': 53.32, 'duration': 0.1}, {'word': 'if', 'start_time': 53.54, 'duration': 0.16}, {'word': 'i', 'start_time': 53.8, 'duration': 0.12}, {'word': 'have', 'start_time': 53.98, 'duration': 0.26}, {'word': 'gold', 'start_time': 54.36, 'duration': 0.46}, {'word': "man's", 'start_time': 54.98, 'duration': 0.36}, {'word': 'digging', 'start_time': 55.42, 'duration': 0.28}, {'word': 'in', 'start_time': 55.78, 'duration': 0.08}, {'word': 'my', 'start_time': 55.92, 'duration': 0.2}, {'word': 'own', 'start_time': 56.3, 'duration': 0.26}, {'word': 'back', 'start_time': 56.64, 'duration': 0.28}, {'word': 'yard', 'start_time': 
57.0, 'duration': 1.12}, {'word': 'you', 'start_time': 58.16, 'duration': 0.12}, {'word': 'can', 'start_time': 58.34, 'duration': 0.16}, {'word': 'shoot', 'start_time': 58.54, 'duration': 0.24}, {'word': 'me', 'start_time': 58.82, 'duration': 0.14}, {'word': 'with', 'start_time': 59.0, 'duration': 0.16}, {'word': 'your', 'start_time': 59.22, 'duration': 0.18}, {'word': 'words', 'start_time': 59.54, 'duration': 0.76}, {'word': 'you', 'start_time': 60.36, 'duration': 0.14}, {'word': 'can', 'start_time': 60.56, 'duration': 0.2}, {'word': 'cut', 'start_time': 60.84, 'duration': 0.2}, {'word': 'me', 'start_time': 61.06, 'duration': 0.14}, {'word': 'with', 'start_time': 61.26, 'duration': 0.16}, {'word': 'your', 'start_time': 61.46, 'duration': 0.22}, {'word': 'lies', 'start_time': 61.84, 'duration': 0.76}, {'word': 'you', 'start_time': 62.66, 'duration': 0.14}, {'word': 'can', 'start_time': 62.86, 'duration': 0.28}, {'word': 'kill', 'start_time': 63.2, 'duration': 0.22}, {'word': 'me', 'start_time': 63.46, 'duration': 0.14}, {'word': 'with', 'start_time': 63.66, 'duration': 0.12}, {'word': 'your', 'start_time': 63.84, 'duration': 0.16}, {'word': 'hatefulness', 'start_time': 64.06, 'duration': 0.54}, {'word': 'but', 'start_time': 64.66, 'duration': 0.1}, {'word': 'just', 'start_time': 64.84, 'duration': 0.26}, {'word': 'like', 'start_time': 65.14, 'duration': 0.42}, {'word': 'life', 'start_time': 65.64, 'duration': 1.42}, {'word': 'i', 'start_time': 67.18, 'duration': 1.46}, {'word': 'does', 'start_time': 68.68, 'duration': 0.24}, {'word': 'my', 'start_time': 68.96, 'duration': 0.26}, {'word': 'senatorian', 'start_time': 69.28, 'duration': 12.02}, {'word': 'as', 'start_time': 81.4, 'duration': 0.18}, {'word': 'if', 'start_time': 81.72, 'duration': 0.22}, {'word': 'i', 'start_time': 82.06, 'duration': 0.1}, {'word': 'have', 'start_time': 82.22, 'duration': 0.26}, {'word': 'diamonds', 'start_time': 82.56, 'duration': 0.5}, {'word': 'at', 'start_time': 83.16, 'duration': 
0.1}, {'word': 'the', 'start_time': 83.3, 'duration': 0.1}, {'word': 'meeting', 'start_time': 83.46, 'duration': 0.4}, {'word': 'of', 'start_time': 84.0, 'duration': 0.12}, {'word': 'my', 'start_time': 84.22, 'duration': 0.3}, {'word': 'time', 'start_time': 84.72, 'duration': 2.72}, {'word': 'out', 'start_time': 87.58, 'duration': 0.18}, {'word': 'of', 'start_time': 87.84, 'duration': 0.12}, {'word': 'the', 'start_time': 88.0, 'duration': 0.12}, {'word': 'huts', 'start_time': 88.18, 'duration': 0.28}, {'word': 'of', 'start_time': 88.56, 'duration': 0.12}, {'word': 'history', 'start_time': 88.76, 'duration': 0.74}, {'word': 'shame', 'start_time': 89.6, 'duration': 0.32}, {'word': 'i', 'start_time': 90.04, 'duration': 0.18}, {'word': 'rise', 'start_time': 90.38, 'duration': 1.2}, {'word': 'up', 'start_time': 91.7, 'duration': 0.1}, {'word': 'from', 'start_time': 91.84, 'duration': 0.16}, {'word': 'a', 'start_time': 92.14, 'duration': 0.08}, {'word': 'past', 'start_time': 92.28, 'duration': 0.34}, {'word': 'rooted', 'start_time': 92.7, 'duration': 0.36}, {'word': 'in', 'start_time': 93.18, 'duration': 0.18}, {'word': 'pain', 'start_time': 93.46, 'duration': 0.72}, {'word': 'i', 'start_time': 94.32, 'duration': 0.14}, {'word': 'ran', 'start_time': 94.54, 'duration': 0.94}, {'word': 'a', 'start_time': 95.62, 'duration': 0.06}, {'word': 'black', 'start_time': 95.72, 'duration': 0.26}, {'word': 'ocean', 'start_time': 96.06, 'duration': 0.56}, {'word': 'heaving', 'start_time': 96.74, 'duration': 0.44}, {'word': 'and', 'start_time': 97.28, 'duration': 0.22}, {'word': 'would', 'start_time': 97.74, 'duration': 0.92}, {'word': 'willingly', 'start_time': 98.76, 'duration': 1.32}, {'word': 'and', 'start_time': 100.22, 'duration': 0.14}, {'word': 'bearing', 'start_time': 100.44, 'duration': 0.3}, {'word': 'it', 'start_time': 100.82, 'duration': 1.8}, {'word': 'leaving', 'start_time': 102.66, 'duration': 0.36}, {'word': 'behind', 'start_time': 103.1, 'duration': 0.56}, {'word': 
'it', 'start_time': 103.78, 'duration': 0.2}, {'word': 'of', 'start_time': 104.1, 'duration': 0.14}, {'word': 'terror', 'start_time': 104.32, 'duration': 0.64}, {'word': 'and', 'start_time': 105.1, 'duration': 0.32}, {'word': 'fear', 'start_time': 105.54, 'duration': 0.8}, {'word': 'as', 'start_time': 107.04, 'duration': 0.9}, {'word': 'into', 'start_time': 108.14, 'duration': 0.3}, {'word': 'a', 'start_time': 108.52, 'duration': 0.06}, {'word': 'daybreak', 'start_time': 108.64, 'duration': 0.54}, {'word': 'miraculously', 'start_time': 109.24, 'duration': 1.04}, {'word': 'clear', 'start_time': 110.34, 'duration': 1.26}, {'word': 'in', 'start_time': 111.74, 'duration': 1.42}, {'word': 'bringing', 'start_time': 113.22, 'duration': 0.42}, {'word': 'the', 'start_time': 113.7, 'duration': 0.1}, {'word': 'gifts', 'start_time': 113.86, 'duration': 0.32}, {'word': 'that', 'start_time': 114.24, 'duration': 0.22}, {'word': 'my', 'start_time': 114.56, 'duration': 0.18}, {'word': 'ancestors', 'start_time': 114.86, 'duration': 0.66}, {'word': 'gave', 'start_time': 115.66, 'duration': 0.8}, {'word': 'i', 'start_time': 116.64, 'duration': 0.12}, {'word': 'am', 'start_time': 116.84, 'duration': 0.14}, {'word': 'the', 'start_time': 117.06, 'duration': 0.24}, {'word': 'whole', 'start_time': 117.4, 'duration': 0.58}, {'word': 'and', 'start_time': 118.1, 'duration': 0.1}, {'word': 'the', 'start_time': 118.26, 'duration': 0.18}, {'word': 'dream', 'start_time': 118.54, 'duration': 0.78}, {'word': 'of', 'start_time': 119.38, 'duration': 0.1}, {'word': 'the', 'start_time': 119.54, 'duration': 0.22}, {'word': 'slaves', 'start_time': 119.8, 'duration': 1.18}, {'word': 'and', 'start_time': 121.2, 'duration': 0.3}, {'word': 'so', 'start_time': 121.64, 'duration': 4.68}, {'word': 'then', 'start_time': 126.42, 'duration': 0.14}]}]}

まだ confidence をちゃんと理解していないのですが、きっと transcripts の各要素が認識結果の候補なのかなぁ...と仮定して、['transcripts'][0] つまり

                                         transcripts
0  {'confidence': -1793.291259765625, 'words': [{'wor...

のなかのものを抽出して、json から SubRip 形式（つまり YouTube でデフォルトで使われている字幕のファイル形式）のファイルに変換する python のコードを考えてみました。Google Colab 用ですが、これは単に json ファイル（ここではなぜか拡張子を .json ではなく .txt にしています）を読み込んでパースするだけのプログラムなので、ローカル用に変更して実行しても十分速いのではないかなと思います。やってないので、「（やっ）タラ、（もし、や）レバ」です。

import json
import datetime
from google.colab import files
import copy
import sys
# sys.setrecursionlimit(30000)

# uploaded = files.upload()

# Name of the JSON result file that the conversion script below opens and parses.
upfilename = 'json.txt'


# for fn in uploaded.keys():
#    print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(uploaded[fn])))
#    upfilename = fn

def fmttime(seconds):
    """Format a number of seconds as a SubRip timestamp ``HH:MM:SS,mmm``.

    The value is turned into a time of day by adding it to
    ``datetime.datetime.min``, so only the time-of-day part survives (whole
    days are dropped). The fractional part is truncated, not rounded, to
    milliseconds.
    """
    clock = (datetime.datetime.min + datetime.timedelta(seconds=seconds)).time()
    return '{:02d}:{:02d}:{:02d},{:03d}'.format(
        clock.hour, clock.minute, clock.second, clock.microsecond // 1000
    )

# Convert the first transcript of the JSON result into SubRip-style captions:
# an index line, a "start --> end" line, the caption text and a blank line.
original_stdout = sys.stdout # backup of the real stdout, for the (commented-out) redirect below
filename = 'subtitle.srt' # intended destination of the subtitle text
with open(upfilename, 'r') as up_f:
    line = up_f.read()
    jso = json.loads(line)
    ###print(jso['transcripts'][0]['words'])
    # NOTE(review): opening with 'w' creates/truncates subtitle.srt, but every
    # print() below goes to stdout because the redirect is commented out, so
    # nothing is written to the file as the code stands.
    with open(filename,'w',encoding='utf8') as down_f:

#        sys.stdout = down_f #""" stdout to file """"

        # totaltime: summed word durations of the caption being built.
        # sentence:  words collected for the caption being built.
        totaltime = 0
        sentence = []

        # endtime / starttime: formatted timestamps of the current caption.
        # lastword_time: estimated end (in seconds) of the last word seen.
        # lineNum: running caption index printed before each caption.
        endtime = ''
        starttime = ''
        lastword_time = 0
        lineNum = 1

        # Only the first entry of 'transcripts' is used.
        confidence =jso['transcripts'][0]
        #print(confidence)
        for i,ob in enumerate(confidence['words']):
            #continue
            #print(i,ob)
            # NOTE(review): talk_start / talk_end are assigned throughout this
            # loop but never read anywhere in this script.
            talk_start = True
            talk_end = False

            # NOTE(review): the branches below depend on each other's state
            # (the 'start_time' branch inspects the sentence filled by the
            # 'word' branch, the 'duration' branch uses p_time set by the
            # 'start_time' branch), so this presumably assumes the keys are
            # iterated in the order word, start_time, duration - confirm.
            for key in ob:
                if key == 'word':
                    ###print(jso['transcripts'][0]['words'][i][key])
                    # Empty words are not added to the caption text.
                    if ob[key] != '':
                        sentence.append(ob.get(key))
                    ###print(*sentence)

                elif key == 'start_time':
                    ###print(jso['transcripts'][0]['words'][i][key])
                    time = ob[key]
                    # Gap between this word's start and the estimated end of
                    # the previous word.
                    if  time - lastword_time < 1:
                        talk_start = False
                        talk_end = False

                    elif time - lastword_time >= 1: # 1 second of silence
                        talk_start = False
                        talk_end = True
                        ### block >
                        totaltime = 0
                        # endtime is recomputed here, but the caption line
                        # below prints endtime2, which the preceding
                        # 'duration' step derived from the same lastword_time.
                        endtime = fmttime(lastword_time)
                        if len(sentence) > 1:
                            # The word just appended belongs to the next
                            # caption: take it out, emit the earlier words,
                            # then put it back as the first word of a new one.
                            temp = sentence.pop()
                            print(lineNum)
                            lineNum += 1
                            print(starttime,'-->',endtime2)
                            # this word goes to next caption
                            kotoba = ''
                            for word in sentence:
                                kotoba += word + ' '
                            print(kotoba.rstrip())
                            print()
                            sentence.clear()
                            sentence.append(temp) # new caption
                        ### <  block

                    # Exactly one collected word means this word opens a
                    # caption: remember its start both formatted and raw.
                    if len(sentence) == 1 :
                        talk_start = True
                        talk_end = False
                        starttime = fmttime(time)
                        p_time = time

                elif key == 'duration':
                    ###print(jso['transcripts'][0]['words'][i][key])
                    # The end of the caption is estimated as its start plus
                    # the summed word durations; pauses between words are not
                    # included in this sum.
                    totaltime += ob[key]
                    lastword_time = p_time + totaltime

                    endtime2 = fmttime(lastword_time)

                    #print('in :',fmttime(time),'>>',*sentence)
                    #print('end :',fmttime(time+totaltime))
                    #print('< > :',fmttime(totaltime))

                    if totaltime >= 4: # 4 seconds of speech go into 1 caption
                        ### block >
                        totaltime = 0
                        endtime = fmttime(lastword_time)
                        print(lineNum)
                        lineNum += 1
                        print(starttime,'-->',endtime)
                        kotoba = ''
                        for word in sentence:
                            kotoba += word + ' '

                        print(kotoba.rstrip())
                        print()
                        sentence.clear()
                        ### < block
                    # Last word of the transcript: flush whatever is left.
                    elif totaltime < 4 and i + 1 == len(confidence['words']):
                        ### block >
                        totaltime = 0
                        endtime = fmttime(lastword_time)
                        print(lineNum)
                        lineNum += 1
                        print(starttime,'-->',endtime)
                        kotoba = ''
                        for word in sentence:
                            kotoba += word + ' '

                        print(kotoba.rstrip())
                        print()
                        sentence.clear()
                        ### < block
#        sys.stdout = original_stdout # stdout back 

# files.download(filename) # download .srt file

SubRip の形式であれば（あればね。これがちゃんとSubRipの要件満たしていたら）、字幕編集のプログラムで見ることもできるでしょう、きっと、たぶんね。
（字幕編集プログラムについては、
https://qiita.com/dauuricus/items/863dd4d087b3aff6455d ）

いまおもいついたのですが、上記のようにセンテンスで取り出さずに word 単位でタイムシートにすれば、認識したことばと位置をつかって、Tracker　のようなシンセサイザーみたいなのがつくれそうですね。
というところでようやく理解したのだけれども mozilla はもしかして TTS のために STT をやってるのかな？センテンスで取り出すニーズを補完する方がわかりやすいと思うのは、VOSK と比較した場合、VOSK では json から 'text' ですぐにセンテンスが抽出されるようになっているからだ。なので VOSK では比較的簡単に字幕ぽいものの抽出はできる。¹

confidence の意味がよくわからない。
なぜ文の評価のようになっているんだろうか？
https://discourse.mozilla.org/t/obtain-per-word-confidence-score/44969

ドキュメントだと検索しても、なぜなのかについては出てこなさそうであったため、どうやって confidence 0，1，2 を word ごとに順に並べて表示するように json から取り出すか考えて、

        def print_word(n):
            """Print the words of all transcripts interleaved, position by position.

            Reads the module-level ``jso`` (the parsed JSON result). For each
            word position, one line ``confidence; <index>: <word>`` is printed
            per transcript that still has a word at that position, beginning
            with transcript ``n`` and cycling through the others in index
            order.

            Unlike the earlier recursive version, this walk is iterative (no
            recursion-depth limit on long transcripts), leaves ``jso``
            unmodified, works for any number of transcripts, and always labels
            a word with the index of the transcript it came from, even when
            the transcripts have different lengths.

            Args:
                n: index of the transcript to start each round with; values
                    outside the valid range wrap around.
            """
            transcripts = jso['transcripts']
            count = len(transcripts)
            if count == 0:
                return

            # Visit order within one round: n, n+1, ..., wrapping to 0.
            order = [(n + offset) % count for offset in range(count)]
            longest = max(len(transcripts[idx]['words']) for idx in order)

            for pos in range(longest):
                for idx in order:
                    words = transcripts[idx]['words']
                    # Shorter transcripts simply drop out of later rounds.
                    if pos < len(words) and 'word' in words[pos]:
                        print('confidence;', str(idx) + ':', words[pos]['word'])
                
        # Start the word listing at transcript index 0.
        print_word(0)

ようやく期待したものにはなったが、これはもっとうまいやり方が必ずあるはず。

こうなった

import copy
import sys
## sys.setrecursionlimit(30000)

        def list_copy(n):
            """Deep-copy the word lists of the first ``n`` transcripts and print them.

            The copies are built from the module-level ``jso`` and handed to
            ``print_word`` with a starting index of 0, so ``print_word`` works
            on copies rather than on the parsed JSON itself.
            """
            word_lists = [
                copy.deepcopy(jso['transcripts'][idx]['words'])
                for idx in range(n)
            ]
            print_word(word_lists, 0)
        
        # check confidence
        def print_word(copy, n):
            """Print the first value of every word entry, interleaved across lists.

            ``copy`` is a list of word lists (one per transcript); each entry
            is a dict whose first value is printed as
            ``confidence: <index>: <value>``. In the data shown in this article
            the first key of every entry is ``'word'``. Entries are visited
            position by position: in each round one entry is taken from every
            list that still has one, beginning with list ``n`` and cycling in
            index order, until all lists are exhausted.

            The walk is iterative, so the input length is not limited by the
            interpreter's recursion depth. As before, the lists inside ``copy``
            are consumed: every one of them is empty when the function returns.

            Args:
                copy: list of lists of word dicts. The name shadows the
                    ``copy`` module inside this function and is kept for
                    compatibility with existing callers.
                n: index of the list to start each round with; values outside
                    the valid range wrap around.
            """
            count = len(copy)
            if count == 0:
                return

            # Visit order within one round: n, n+1, ..., wrapping to 0.
            order = [(n + offset) % count for offset in range(count)]
            longest = max(len(copy[idx]) for idx in order)

            for pos in range(longest):
                for idx in order:
                    if pos >= len(copy[idx]):
                        continue  # this list is already exhausted
                    entry = copy[idx][pos]
                    if entry:  # an empty dict has no value to show
                        print("confidence:", str(idx) + ':', next(iter(entry.values())))

            # Keep the historical contract: the word lists end up drained.
            for words in copy:
                words.clear()
                            
        # Copy the word lists of the first 3 transcripts and print them.
        list_copy(3)

Confidence リストをコピーして word を抽出してプリントするだけ。
できたと思って、もっと長い 7000 ループのものを回してみると再帰のエラーになったため、改良した。多分これでいいはず。リストからポップしていって、そのディクショナリーのキーが 'word' であるか比べて、そうであればプリントして、3 つのそれぞれリストが空になれば 3 つとも止まって再帰関数を抜ける。

一応将来的に confidence の数が増えてもこのままでいいはず。
現在は 3 つの confidence であるので、list_copy() で 3 としている。

だがしかし、どこまで最適化してもこの方法だと、扱う行数が増えればリカーシブマックスなんとかエラー、つまり再帰の呼び出し回数の上限を超えてしまうエラー（RecursionError）になるので、pandas でこうやって見る方が、再帰パズルで考え尽くすよりは次のことに早く進められる。
（までに数日かかった）

import pandas as pd
import json

# Parse the JSON result straight into a DataFrame; the file is only needed
# while pandas reads it.
with open('json.txt', 'r') as f:
    jso = pd.read_json(f)

# Print the whole frame as text.
print(jso.to_string())

並べて確かめると word の単語は３つともに同じであったりなかったりする。

confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: may
confidence; 1: may
confidence; 2: may
confidence; 0: write
confidence; 1: write
confidence; 2: write
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: down
confidence; 1: down
confidence; 2: down
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: history
confidence; 1: history
confidence; 2: history
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: your
confidence; 1: your
confidence; 2: your
confidence; 0: visit
confidence; 1: visit
confidence; 2: visit
confidence; 0: wished
confidence; 1: wished
confidence; 2: wished
confidence; 0: lines
confidence; 1: lines
confidence; 2: lines
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: may
confidence; 1: may
confidence; 2: may
confidence; 0: try
confidence; 1: try
confidence; 2: try
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: very
confidence; 1: very
confidence; 2: very
confidence; 0: dirt
confidence; 1: dirt
confidence; 2: dirt
confidence; 0: but
confidence; 1: but
confidence; 2: but
confidence; 0: still
confidence; 1: still
confidence; 2: still
confidence; 0: like
confidence; 1: like
confidence; 2: like
confidence; 0: dust
confidence; 1: dust
confidence; 2: dust
confidence; 0: or
confidence; 1: or
confidence; 2: or
confidence; 0: does
confidence; 1: does
confidence; 2: does
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: sauciness
confidence; 1: sauciness
confidence; 2: sauciness
confidence; 0: upset
confidence; 1: upset
confidence; 2: upset
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: why
confidence; 1: why
confidence; 2: why
confidence; 0: are
confidence; 1: are
confidence; 2: are
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: beside
confidence; 1: beside
confidence; 2: beside
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: gloom
confidence; 1: gloom
confidence; 2: gloom
confidence; 0: to
confidence; 1: to
confidence; 2: to
confidence; 0: cause
confidence; 1: cause
confidence; 2: cause
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: walk
confidence; 1: walk
confidence; 2: walk
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: have
confidence; 1: have
confidence; 2: have
confidence; 0: oil
confidence; 1: oil
confidence; 2: oil
confidence; 0: wells
confidence; 1: wells
confidence; 2: wells
confidence; 0: pumping
confidence; 1: pumping
confidence; 2: pumping
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: living
confidence; 1: living
confidence; 2: living
confidence; 0: room
confidence; 1: room
confidence; 2: room
confidence; 0: sailors
confidence; 1: sailors
confidence; 2: sailors
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: like
confidence; 1: like
confidence; 2: like
confidence; 0: songs
confidence; 1: songs
confidence; 2: songs
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: a
confidence; 1: a
confidence; 2: a
confidence; 0: cuttenclips
confidence; 1: cuttenclips
confidence; 2: cuttenclips
confidence; 0: springing
confidence; 1: springing
confidence; 2: springing
confidence; 0: high
confidence; 1: high
confidence; 2: high
confidence; 0: still
confidence; 1: still
confidence; 2: still
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: wore
confidence; 1: wore
confidence; 2: wore
confidence; 0: did
confidence; 1: did
confidence; 2: did
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: want
confidence; 1: want
confidence; 2: want
confidence; 0: to
confidence; 1: to
confidence; 2: to
confidence; 0: see
confidence; 1: see
confidence; 2: see
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: broken
confidence; 1: broken
confidence; 2: broken
confidence; 0: bowed
confidence; 1: bowed
confidence; 2: bowed
confidence; 0: head
confidence; 1: head
confidence; 2: head
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: lord
confidence; 1: lord
confidence; 2: lord
confidence; 0: eyes
confidence; 1: eyes
confidence; 2: eyes
confidence; 0: soltali
confidence; 1: soltali
confidence; 2: soltali
confidence; 0: down
confidence; 1: down
confidence; 2: down
confidence; 0: like
confidence; 1: like
confidence; 2: like
confidence; 0: pedrosan
confidence; 1: pedrosan
confidence; 2: pedrosan
confidence; 0: by
confidence; 1: by
confidence; 2: by
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: soul
confidence; 1: soul
confidence; 2: soul
confidence; 0: societaires
confidence; 1: societaires
confidence; 2: societaires
confidence; 0: oudenard
confidence; 1: oudenard
confidence; 2: oudenard
confidence; 0: isolated
confidence; 1: isolated
confidence; 2: isolated
confidence; 0: as
confidence; 1: as
confidence; 2: as
confidence; 0: if
confidence; 1: if
confidence; 2: if
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: have
confidence; 1: have
confidence; 2: have
confidence; 0: gold
confidence; 1: gold
confidence; 2: gold
confidence; 0: man's
confidence; 1: man's
confidence; 2: man's
confidence; 0: digging
confidence; 1: digging
confidence; 2: digging
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: own
confidence; 1: own
confidence; 2: own
confidence; 0: back
confidence; 1: back
confidence; 2: back
confidence; 0: yard
confidence; 1: yard
confidence; 2: yard
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: can
confidence; 1: can
confidence; 2: can
confidence; 0: shoot
confidence; 1: shoot
confidence; 2: shoot
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: your
confidence; 1: your
confidence; 2: your
confidence; 0: words
confidence; 1: words
confidence; 2: words
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: can
confidence; 1: can
confidence; 2: can
confidence; 0: cut
confidence; 1: cut
confidence; 2: cut
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: your
confidence; 1: your
confidence; 2: your
confidence; 0: lies
confidence; 1: lies
confidence; 2: lies
confidence; 0: you
confidence; 1: you
confidence; 2: you
confidence; 0: can
confidence; 1: can
confidence; 2: can
confidence; 0: kill
confidence; 1: kill
confidence; 2: kill
confidence; 0: me
confidence; 1: me
confidence; 2: me
confidence; 0: with
confidence; 1: with
confidence; 2: with
confidence; 0: your
confidence; 1: your
confidence; 2: your
confidence; 0: hatefulness
confidence; 1: hatefulness
confidence; 2: hatefulness
confidence; 0: but
confidence; 1: but
confidence; 2: but
confidence; 0: just
confidence; 1: just
confidence; 2: just
confidence; 0: like
confidence; 1: like
confidence; 2: like
confidence; 0: life
confidence; 1: life
confidence; 2: life
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: does
confidence; 1: does
confidence; 2: does
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: senatorian
confidence; 1: senatorian
confidence; 2: senatorian
confidence; 0: as
confidence; 1: as
confidence; 2: as
confidence; 0: if
confidence; 1: if
confidence; 2: if
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: have
confidence; 1: have
confidence; 2: have
confidence; 0: diamonds
confidence; 1: diamonds
confidence; 2: diamonds
confidence; 0: at
confidence; 1: at
confidence; 2: at
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: meeting
confidence; 1: meeting
confidence; 2: meeting
confidence; 0: of
confidence; 1: of
confidence; 2: of
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: time
confidence; 1: time
confidence; 2: time
confidence; 0: out
confidence; 1: out
confidence; 2: out
confidence; 0: of
confidence; 1: of
confidence; 2: of
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: huts
confidence; 1: huts
confidence; 2: huts
confidence; 0: of
confidence; 1: of
confidence; 2: of
confidence; 0: history
confidence; 1: history
confidence; 2: history
confidence; 0: shame
confidence; 1: shame
confidence; 2: shame
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: rise
confidence; 1: rise
confidence; 2: rise
confidence; 0: up
confidence; 1: up
confidence; 2: up
confidence; 0: from
confidence; 1: from
confidence; 2: from
confidence; 0: a
confidence; 1: a
confidence; 2: a
confidence; 0: past
confidence; 1: past
confidence; 2: past
confidence; 0: rooted
confidence; 1: rooted
confidence; 2: rooted
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: pain
confidence; 1: pain
confidence; 2: pain
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: ran
confidence; 1: ran
confidence; 2: ran
confidence; 0: a
confidence; 1: a
confidence; 2: a
confidence; 0: black
confidence; 1: black
confidence; 2: black
confidence; 0: ocean
confidence; 1: ocean
confidence; 2: ocean
confidence; 0: heaving
confidence; 1: heaving
confidence; 2: heaving
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: would
confidence; 1: would
confidence; 2: would
confidence; 0: willingly
confidence; 1: willingly
confidence; 2: willingly
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: bearing
confidence; 1: bearing
confidence; 2: bearing
confidence; 0: it
confidence; 1: it
confidence; 2: it
confidence; 0: leaving
confidence; 1: leaving
confidence; 2: leaving
confidence; 0: behind
confidence; 1: behind
confidence; 2: behind
confidence; 0: it
confidence; 1: it
confidence; 2: it
confidence; 0: of
confidence; 1: of
confidence; 2: of
confidence; 0: terror
confidence; 1: terror
confidence; 2: terror
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: fear
confidence; 1: fear
confidence; 2: fear
confidence; 0: as
confidence; 1: as
confidence; 2: as
confidence; 0: into
confidence; 1: into
confidence; 2: into
confidence; 0: a
confidence; 1: a
confidence; 2: a
confidence; 0: daybreak
confidence; 1: daybreak
confidence; 2: daybreak
confidence; 0: miraculously
confidence; 1: miraculously
confidence; 2: miraculously
confidence; 0: clear
confidence; 1: clear
confidence; 2: clear
confidence; 0: in
confidence; 1: in
confidence; 2: in
confidence; 0: bringing
confidence; 1: bringing
confidence; 2: bringing
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: gifts
confidence; 1: gifts
confidence; 2: gifts
confidence; 0: that
confidence; 1: that
confidence; 2: that
confidence; 0: my
confidence; 1: my
confidence; 2: my
confidence; 0: ancestors
confidence; 1: ancestors
confidence; 2: ancestors
confidence; 0: gave
confidence; 1: gave
confidence; 2: gave
confidence; 0: i
confidence; 1: i
confidence; 2: i
confidence; 0: am
confidence; 1: am
confidence; 2: am
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: whole
confidence; 1: whole
confidence; 2: whole
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: dream
confidence; 1: dream
confidence; 2: dream
confidence; 0: of
confidence; 1: of
confidence; 2: of
confidence; 0: the
confidence; 1: the
confidence; 2: the
confidence; 0: slave
confidence; 1: slave
confidence; 2: slaves
confidence; 0: and
confidence; 1: and
confidence; 2: and
confidence; 0: so
confidence; 1: so
confidence; 2: so
confidence; 0: that
confidence; 1: then
confidence; 2: then

ということで、confidence については、やはりよくわからない。たぶん、この精度では、このように認識するという精度が三つあるという事かな？

その前に、話はじめ、話中、話終わりのフラグを付けたらば、もしかして、なんかよいのかな？ということを考えて、一応フラグを立ててみた。

        # Excerpt of the caption loop; it relies on names set up outside this
        # excerpt (jso, sentence, totaltime, lastword_time, lineNum, starttime,
        # fmttime). Only the first entry of 'transcripts' is used.
        confidence =jso['transcripts'][0]
        #print(confidence)
        for i,ob in enumerate(confidence['words']):
            #continue
            #print(i,ob)
            # talk_start / talk_end are the "start of talk" / "end of talk"
            # flags; within this excerpt they are assigned but never read.
            talk_start = True
            talk_end = False

            # NOTE(review): the branches below depend on each other's state,
            # so this presumably assumes the keys are iterated in the order
            # word, start_time, duration - confirm.
            for key in ob:
                if key == 'word':
                    ###print(jso['transcripts'][0]['words'][i][key])
                    # Empty words are not added to the caption text.
                    if ob[key] != '':
                        sentence.append(ob.get(key))
                    ###print(*sentence)

                elif key == 'start_time':
                    ###print(jso['transcripts'][0]['words'][i][key])
                    time = ob[key]
                    # Gap between this word's start and the estimated end of
                    # the previous word.
                    if  time - lastword_time < 1:
                        talk_start = False
                        talk_end = False

                    elif time - lastword_time >= 1: # 1 second of silence
                        talk_start = False
                        talk_end = True
                        ### block >
                        totaltime = 0
                        # endtime is recomputed here, but the caption line
                        # below prints endtime2, which the preceding
                        # 'duration' step derived from the same lastword_time.
                        endtime = fmttime(lastword_time)
                        if len(sentence) > 1:
                            # The word just appended belongs to the next
                            # caption: take it out, emit the earlier words,
                            # then put it back as the first word of a new one.
                            temp = sentence.pop()
                            print(lineNum)
                            lineNum += 1
                            print(starttime,'-->',endtime2)
                            # this word goes to next caption
                            kotoba = ''
                            for word in sentence:
                                kotoba += word + ' '
                            print(kotoba.rstrip())
                            print()
                            sentence.clear()
                            sentence.append(temp) # new caption
                        ### <  block

                    # Exactly one collected word means this word opens a
                    # caption: remember its start both formatted and raw.
                    if len(sentence) == 1 :
                        talk_start = True
                        talk_end = False
                        starttime = fmttime(time)
                        p_time = time

                elif key == 'duration':
                    ###print(jso['transcripts'][0]['words'][i][key])
                    # The end of the caption is estimated as its start plus
                    # the summed word durations; pauses between words are not
                    # included in this sum.
                    totaltime += ob[key]
                    lastword_time = p_time + totaltime

                    endtime2 = fmttime(lastword_time)

                    #print('in :',fmttime(time),'>>',*sentence)
                    #print('end :',fmttime(time+totaltime))
                    #print('< > :',fmttime(totaltime))

                    if totaltime >= 4: # 4 seconds of speech go into 1 caption
                        ### block >
                        totaltime = 0
                        endtime = fmttime(lastword_time)
                        print(lineNum)
                        lineNum += 1
                        print(starttime,'-->',endtime)
                        kotoba = ''
                        for word in sentence:
                            kotoba += word + ' '

                        print(kotoba.rstrip())
                        print()
                        sentence.clear()
                        ### < block
                    # Last word of the transcript: flush whatever is left.
                    elif totaltime < 4 and i + 1 == len(confidence['words']):
                        ### block >
                        totaltime = 0
                        endtime = fmttime(lastword_time)
                        print(lineNum)
                        lineNum += 1
                        print(starttime,'-->',endtime)
                        kotoba = ''
                        for word in sentence:
                            kotoba += word + ' '

                        print(kotoba.rstrip())
                        print()
                        sentence.clear()
                        ### < block

Rf.
OpenMPT
https://en.m.wikipedia.org/wiki/OpenMPT

https://qiita.com/dauuricus/items/20d46731c1f6696c6c23 ↩

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up