## Extracting the video id from a YouTube URL and extracting subtitles to subtitle.txt
google colab
subtitle_text_extractor
from youtube_transcript_api import YouTubeTranscriptApi
#from googletrans import translator
from google.colab import files
#import time
import sys
from urllib.parse import urlparse, parse_qs
# Sample YouTube watch URL; the video id is carried in its `v` query parameter.
urltext = 'https://www.youtube.com/watch?app=desktop&t=0s&v=PNZtCP4K8AE'
# URLs to process (only the sample URL here).
args = [urltext]
# Placeholder; overwritten with the id extracted from each URL further down.
video_id = ''
print()
def extract_video_id(url):
    """Return the video id embedded in a YouTube URL, or None if not found.

    Recognised forms:
      * https://youtu.be/<id>
      * https://(www.|m.)youtube.com/watch?v=<id>
      * https://(www.|m.)youtube.com/embed/<id>
      * https://(www.|m.)youtube.com/v/<id>
      * https://(www.|m.)youtube.com/shorts/<id>

    Args:
        url: The URL string to inspect.

    Returns:
        The video id as a string, or None when the host/path is not one of
        the recognised forms or a /watch URL carries no `v` parameter.
    """
    parts = urlparse(url)
    host = parts.hostname
    path = parts.path

    # Short links carry the id directly as the path.
    if host == 'youtu.be':
        return path[1:]

    if host in {'www.youtube.com', 'youtube.com', 'm.youtube.com'}:
        if path == '/watch':
            # parse_qs omits absent keys, so use .get() instead of indexing
            # to fall through to None rather than raising KeyError.
            values = parse_qs(parts.query).get('v')
            return values[0] if values else None
        # For these forms the id is the path segment right after the prefix.
        if path.startswith(('/embed/', '/v/', '/shorts/')):
            return path.split('/')[2]

    # Unrecognised URL.
    return None
# Resolve the video id for every URL in args; the id from the last URL is the
# one used by the rest of the script.
for url in args:
    video_id = extract_video_id(url)
    print('youtube video_id:',video_id)
##video_id = 'PNZtCP4K8AE' ## youtube video_id

# Fetch the 'en' transcript and keep only each entry's caption text, trimmed
# of surrounding whitespace and with embedded newlines flattened to spaces.
# (Each entry also carries 'start' and 'duration', which are not used here.)
text_list = [
    entry['text'].strip().replace('\n', ' ')
    for entry in YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
]
# Keep a reference to the real stdout so it can be put back once the report
# has been written.
# NOTE(review): stdout is swapped by hand with no try/finally visible, so an
# exception raised while it is redirected would leave sys.stdout pointing at
# the report file — confirm whether that matters for this script.
original_stdout = sys.stdout ## stdout backup
filename = 'subtitle.txt' ## print subtitle text to this file
with open(filename, 'w') as f:
sys.stdout = f # stdout to file: every print() from here on is written to `filename`
print("haywhnk-A.K.A-@dauuricus")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
print("YouTube captions")
print("- - - - - - - - - - - - - - - - - - - YouTube - - - - - - - - - - - - - - - - - - -")
print()
print()
# List the transcripts available for the video and select the one matching
# language code 'en'.
line = YouTubeTranscriptApi.list_transcripts(video_id)
transcript = line.find_transcript(['en'])
#print(transcript.fetch())
# Print each caption numbered from 1 and collect its text in caption_line.
caption_line =[]
for count, dict_obj in enumerate(transcript.fetch()):
print( count +1, dict_obj['text'] )
caption_line.append(dict_obj['text'])
print()
print()
print("************************************************************************************")
print()
print("Youtube captions")
print("- - - - - - - - - - - - - - - - - - translated - - - - - - - - - - - - - - - - - - -")
print()
print()
# Request a translation of the selected transcript using language code 'eo',
# then print the translated captions numbered from 1.
translated = transcript.translate('eo')
for count, dict_obj in enumerate(translated.fetch()):# NOTE(review): this was tagged "japanese", but the code requested above is 'eo' — confirm the intended language
print( count+1, dict_obj['text'] )
print()
print("-----------------------------------------------------------------------------------")
print()
print("captions text compositimg")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
print()
print()
##text_list[:] = [a for a in text_list if a != ' ']
##text_list[:] = [l.replace('\n',' ') for l in text_list]
# text_e = [] ##2 lines to 1 line
# i = 0
# txt_e = ''
# for count,l in enumerate(caption_line):
# if i == 0:
# txt_e += l
# i = i + 1
# text_e.append(txt_e)
# elif i == 1:
# txt_e += ' ' +l
# text_e.pop()
# text_e.append(txt_e)
# i = 0
# txt_e = ''
def line_edit2(textlines):
    """Join consecutive pairs of lines into single lines (2 lines -> 1 line).

    Items at even positions (0, 2, 4, ...) start a new output line; the item
    that follows each of them is appended to it after a single space. A
    trailing unpaired item is kept on its own.

    Args:
        textlines: Iterable of caption strings.

    Returns:
        A new list of the merged strings.
    """
    merged = []
    for position, piece in enumerate(textlines):
        if position % 2:
            # Second half of a pair: extend the line opened just before it.
            merged[-1] += ' ' + piece
        else:
            merged.append(piece)
    return merged
def line_edit3(textlines):
    """Join consecutive triples of lines into single lines (3 lines -> 1 line).

    Every third item (positions 0, 3, 6, ...) starts a new output line; the
    following two items are appended to it, each preceded by a single space.
    A trailing group of one or two items is kept as a shorter line.

    Args:
        textlines: Iterable of caption strings.

    Returns:
        A new list of the merged strings.
    """
    text_compo = []
    for count, l in enumerate(textlines):
        if count % 3 == 0:
            # Start each group from the bare text. The previous version reset
            # its accumulator to ' ' after a full triple, which prefixed every
            # group after the first with a stray space.
            text_compo.append(l)
        else:
            text_compo[-1] += ' ' + l
    return text_compo
def line_edit(textlines):
    """Join consecutive pairs of lines into single lines (2 lines -> 1 line).

    Works as a two-state toggle ("click, click"): the first item of a pair
    opens a new output line, the second is appended to it after a single
    space. A trailing unpaired item is kept on its own.

    Args:
        textlines: Iterable of caption strings.

    Returns:
        A new list of the merged strings.
    """
    pairs = []
    pair_open = False
    for piece in textlines:
        if pair_open:
            # Second half of the pair: extend the line opened on the last step.
            pairs[-1] += ' ' + piece
        else:
            pairs.append('' + piece)
        pair_open = not pair_open
    return pairs
# text_compo = [] ##2 lines to 1 line
# i = 0
# txt = ''
# for count,l in enumerate(text_list):
# if i == 0:
# txt += l
# i = i + 1
# text_compo.append(txt)
# elif i == 1:
# txt += ' ' +l
# text_compo.pop()
# text_compo.append(txt)
# i = 0
# txt = ''
print()
print()
print("************************************************************************************")
print()
print()
# Print the cleaned caption lines collected in text_list, numbered from 1.
for count, l in enumerate(text_list):
print(count+1,l)
print()
print()
print("************************************************************************************")
print(" 1/2 ")
print()
# Merge the captions in caption_line two at a time with line_edit and print
# the merged lines, numbered from 1.
text_compo = line_edit(caption_line)
for count, l in enumerate(text_compo):
print(count+1,' ',l)
print()
print()
# Same two-at-a-time merge, applied to text_list this time.
text_compo = line_edit(text_list)
for count, l in enumerate(text_compo):
print(count+1,' ',l)
print()
print("************************************************************************************")
print()
print()
print("************************************************************************************")
print()
print("Thank you.")
# Put back the stdout object that was saved before the redirect.
sys.stdout = original_stdout # stdout back
#files.download(filename)
Reference:
https://qiita.com/dauuricus/private/9e70c4c25566fedb9c19