This article is a Private article. Only a writer and users who know the URL can access it.
Please change open range to public in publish setting if you want to share this article with other users.
More than 3 years have passed since last update.
YouTube subtitles

Last updated at 2021-02-07 · Posted at 2021-02-05
## Extracting the video ID from a YouTube URL and extracting the subtitles to subtitle.txt
google colab
subtitle_text_extractor
import sys
from contextlib import redirect_stdout
from urllib.parse import urlparse, parse_qs

from google.colab import files
from youtube_transcript_api import YouTubeTranscriptApi
#from googletrans import translator
#import time

# URL of the video whose captions are extracted; the video id is parsed from it below.
urltext = 'https://www.youtube.com/watch?app=desktop&t=0s&v=PNZtCP4K8AE'

# URLs to process, wrapped in a list so the loop further down can iterate over them.
args = [urltext]

# Placeholder; overwritten by the URL-parsing loop further down.
video_id = ''

print()  # emit one blank line before the video id is reported

def extract_video_id(url):
    """Return the YouTube video id contained in *url*, or None if there is none.

    Recognised forms:
      * ``https://youtu.be/<id>``
      * ``https://<host>/watch?v=<id>``
      * ``https://<host>/embed/<id>``, ``/v/<id>``, ``/shorts/<id>``, ``/live/<id>``

    where ``<host>`` is ``youtube.com`` or its ``www.``, ``m.`` or ``music.``
    subdomain.  Every failure (unknown host, unknown path, missing or empty
    id) yields None rather than an exception or an empty string.
    """
    parsed = urlparse(url)
    host = parsed.hostname or ''

    if host == 'youtu.be':
        # Short links carry the id as the first path segment.
        return parsed.path.lstrip('/').split('/')[0] or None

    if host in {'www.youtube.com', 'youtube.com', 'm.youtube.com', 'music.youtube.com'}:
        if parsed.path == '/watch':
            # .get() instead of ['v']: a watch URL without a v parameter is a
            # normal "no id" case, not a KeyError.
            values = parse_qs(parsed.query).get('v')
            return values[0] if values else None
        segments = parsed.path.split('/')
        if len(segments) >= 3 and segments[1] in {'embed', 'v', 'shorts', 'live'}:
            return segments[2] or None

    return None

# Resolve the video id of every URL; the id of the last URL is the one used below.
for url in args:
    video_id = extract_video_id(url)
    print('youtube video_id:', video_id)

# To bypass URL parsing, assign the id directly, e.g. video_id = 'PNZtCP4K8AE'.
# Each transcript entry is read as a dict; only its 'text' value is used here
# (the original debug prints also referenced 'start' and 'duration').
transcript_entries = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])

# Trim surrounding whitespace and fold embedded line breaks into spaces so that
# every caption becomes a single line.
text_list = [
    entry['text'].strip().replace('\n', ' ')
    for entry in transcript_entries
]
del transcript_entries

original_stdout = sys.stdout ## stdout backup (still assigned back explicitly at the end of the report)
filename = 'subtitle.txt' ## print subtitle text to this file
# redirect_stdout puts sys.stdout back even when a transcript request raises, so
# a failure no longer leaves stdout pointing at a closed file.  The encoding is
# fixed to UTF-8 so non-ASCII captions do not depend on the platform default.
with open(filename, 'w', encoding='utf-8') as f, redirect_stdout(f):

    print("haywhnk-A.K.A-@dauuricus")
    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
    print("YouTube captions")
    print("- - - - - - - - - - - - - - - - - - -  YouTube  - - - - - - - - - - - - - - - - - - -")
    print()
    print()
    line = YouTubeTranscriptApi.list_transcripts(video_id)

    transcript = line.find_transcript(['en'])

    # Print the English captions numbered from 1 and keep their raw text for
    # the line-merging section further down.
    caption_line = []
    for count, dict_obj in enumerate(transcript.fetch(), start=1):
        print(count, dict_obj['text'])
        caption_line.append(dict_obj['text'])
    print()
    print()
    print("************************************************************************************")
    print()
    print("Youtube captions")
    print("- - - - - - - - - - - - - - - - - - translated - - - - - - - - - - - - - - - - - - -")
    print()
    print()

    # 'eo' is the language code for Esperanto; change it to pick another target language.
    translated = transcript.translate('eo')
    for count, dict_obj in enumerate(translated.fetch(), start=1):
        print(count, dict_obj['text'])

    print()
    print("-----------------------------------------------------------------------------------")
    print()
    print("captions text compositimg")
    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
    print()
    print()

#    text_e = [] ##2 lines to 1 line
#    i = 0
#    txt_e = ''
#    for count,l in enumerate(caption_line):
#        if i == 0:
#          txt_e += l
#          i = i + 1
#          text_e.append(txt_e)
#        elif i == 1:
#          txt_e += ' ' +l
#          text_e.pop()
#          text_e.append(txt_e)
#          i = 0
#          txt_e = ''

    def line_edit2(textlines):
        """Merge every two consecutive lines into one, joined by a single space.

        A trailing unpaired line is kept as it is.  Returns a new list.
        """
        merged = []
        for index, caption in enumerate(textlines):
            if index % 2:
                # Second line of a pair: attach it to the line stored just before.
                merged[-1] = merged[-1] + ' ' + caption
            else:
                merged.append(caption)
        return merged

    def line_edit3(textlines):
        """Merge every three consecutive lines into one, joined by single spaces.

        A trailing group of one or two lines is kept as a shorter merged line.
        Returns a new list; *textlines* is not modified.
        """
        text_compo = []
        for index, caption in enumerate(textlines):
            if index % 3 == 0:
                # First line of a group starts a fresh entry.  The previous
                # version seeded later groups with ' ', which gave every group
                # after the first a stray leading space.
                text_compo.append(caption)
            else:
                text_compo[-1] += ' ' + caption
        return text_compo

    def line_edit(textlines):
        """Merge lines two at a time (toggle style), joined by a single space.

        The first line of each pair is stored immediately and replaced by the
        joined text once its partner arrives, so a trailing unpaired line
        survives unchanged.  Returns a new list.
        """
        joined = []
        pending = None  # first half of the current pair, or None between pairs
        for caption in textlines:
            if pending is None:
                pending = '' + caption
                joined.append(pending)
            else:
                joined[-1] = pending + ' ' + caption
                pending = None
        return joined

#    text_compo = [] ##2 lines to 1 line
#    i = 0
#    txt = ''
#    for count,l in enumerate(text_list):
#        if i == 0:
#          txt += l
#          i = i + 1
#          text_compo.append(txt)
#        elif i == 1:
#          txt += ' ' +l
#          text_compo.pop()
#          text_compo.append(txt)
#          i = 0
#          txt = ''
    rule = "************************************************************************************"

    # Cleaned caption lines, numbered from 1.
    print()
    print()
    print(rule)
    print()
    print()
    for number, caption in enumerate(text_list, start=1):
        print(number, caption)
    print()
    print()
    print(rule)
    print("  1/2 ")
    print()

    # Captions merged two lines at a time: first the fetched captions, then
    # the cleaned ones.
    for number, merged in enumerate(line_edit(caption_line), start=1):
        print(number, '  ', merged)

    print()
    print()
    for number, merged in enumerate(line_edit(text_list), start=1):
        print(number, '  ', merged)

    print()
    print(rule)
    print()
    print()
    print(rule)
    print()
    print("Thank you.")

    sys.stdout = original_stdout  # send later output back to the original stdout

#files.download(filename)
Case: merging 3 lines into 1 line
Rf.
https://qiita.com/dauuricus/private/9e70c4c25566fedb9c19
You get articles that match your needs
You can efficiently read back useful information
You can use dark theme
What you can do with signing up