A program, written in Python, that extracts subtitles from YouTube videos.
For example, suppose you want to get the subtitles for this YouTube video:
Minecraft 1.7: Modding Tutorial --Episode 1 --Development Environment!*
In that case, the video_id is the last 11 characters of the URL below.¹
youtube.com/watch?v=0ULz-oCUbEg
video_id = '0ULz-oCUbEg'
If you set this value in the program, the subtitles of that YouTube video will be downloaded.
Before watching a video, it is a good idea to read its subtitles in advance and check the contents as text.
Google Colaboratory: The following Python program, 'youtube-subtitle-trans.ipynb', runs on a web service called Google Colaboratory (Colab). You have to create an account to use Google Colab, but it is free to use. It runs Python programs in a browser.
This program also runs in the browser.
Copy each piece of code below, paste it into a "cell" in Google Colab, and press the button that looks like a play button on the left of the cell to execute the code.
First run the two installation commands; once both installations have succeeded, the third, slightly longer piece of code (the program for viewing subtitles) will work.
Install two software modules
googletrans==4.0.0-rc1
pip install googletrans==4.0.0-rc1
youtube_transcript_api
pip install youtube_transcript_api
Program to download subtitles
youtube-subtitle-trans.ipynb
from youtube_transcript_api import YouTubeTranscriptApi
from googletrans import Translator
video_id = '0ULz-oCUbEg'  # YouTube video_id (the value after "watch?v=" in the URL)

# Download the English captions for the video. Each entry is indexed by
# 'text' below; only that field is kept.
captions = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])

# Caption text only: surrounding whitespace trimmed and any newline inside a
# caption flattened to a single space.
text_list = [entry['text'].strip().replace('\n', ' ') for entry in captions]

# The raw caption entries are not needed once the text has been extracted.
del captions
# ---- Section 1: the original English captions, numbered from 1 ----
print("@dauuricus")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
print("YouTube captions")
print("- - - - - - - - - - - - - - - - - - - YouTube - - - - - - - - - - - - - - - - - - -")
print()
print()

# Look up the English transcript through the listing API; the same transcript
# object is reused further down to request a translated version.
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
transcript = transcript_list.find_transcript(['en'])

for number, caption in enumerate(transcript.fetch(), start=1):
    print(number, caption['text'])
print()
print()
print("************************************************************************************")
print()

# ---- Section 2: the Japanese ('ja') translation requested from the transcript ----
print("Youtube captions")
print("- - - - - - - - - - - - - - - - - - translated - - - - - - - - - - - - - - - - - - -")
print()
print()

translated_transcript = transcript.translate('ja')
for number, caption in enumerate(translated_transcript.fetch(), start=1):
    print(number, caption['text'])
print()
print("-----------------------------------------------------------------------------------")
print()
# The headings for a caption-by-caption googletrans pass are disabled; only
# the blank-line spacing of that section is still printed.
print()
print()

# "2 lines to 1": join consecutive captions in pairs with a single space.
# When the number of captions is odd, the final caption stands alone.
text_compo = [
    ' '.join(text_list[start:start + 2])
    for start in range(0, len(text_list), 2)
]

print()
print("************************************************************************************")
print()
print()
print()
# googletrans client used by the translation loop that follows this section.
translator = Translator()

# ---- Show the merged ("2 lines to 1") captions, numbered from 1 ----
print()
print("------------------------------------------------------------------------------------")
print()
print("google translate API")
print("- - - - - - - - - - - - - - - - - - 2 lines to 1 - - - - - - - - - - - - - - - - - -")
print()
print()
for number, sentence in enumerate(text_compo, start=1):
    print(number, sentence)

# ---- Heading for the googletrans translation of the merged captions ----
print()
print("************************************************************************************")
print()
print("google translate API")
print("- - - - - - - - - - - - - - - - - - translated - - - - - - - - - - - - - - - - - - -")
print()
print()
# Translate each merged caption line into Japanese ('ja') with googletrans and
# print it with its 1-based line number.
#
# After every BATCH_SIZE lines the Translator is replaced with a fresh
# instance, exactly as the original loop did.
# NOTE(review): presumably this works around googletrans failing on long
# runs of requests -- confirm before changing BATCH_SIZE.
BATCH_SIZE = 30

next_reset = BATCH_SIZE
for number, sentence in enumerate(text_compo, start=1):
    translated = translator.translate(sentence, dest='ja')
    print(number, translated.text)
    if number >= next_reset:
        # Batch boundary reached: schedule the next one and start over with
        # a new Translator object.
        next_reset += BATCH_SIZE
        translator = Translator()

print()
print()
print("************************************************************************************")
print()
print("Thank you.")
Cf. Full comparison
googletrans Supported languages list
It is for specifying the language of subtitles programmatically.
Hebrew is like these two.
- 'iw':'hebrew'
- 'he':'hebrew'
In the program code above, there is a place that says dest='ja'. 'ja' means Japanese; I think that if you change this to 'he', 'hebrew', or 'iw', the subtitles will be translated into Hebrew.
Also, there is a place that says ['en'] in the program code. It means English, because the default setting in the program module 'googletrans' is English. If the original subtitles are in English, leave it as it is; if the subtitles are in another language, select the desired language from the list below and change it.
However, at this point, English and Spanish ('es', 'spanish') seem to get subtitles added automatically, but subtitles in other languages don't seem to work very well.
For the time being, the original subtitles are assumed to be in English and are set to be translated into Japanese, so the setting goes from ['en'] to 'ja'.
So try python.
If the YouTube video is long, Google Translate will raise an error, but the subtitle translation will still be displayed.
In addition, every line up to the end of the YouTube video is displayed as a pair of a number and a sentence, so you can check how far the translation got.
import googletrans
# Print every language googletrans supports as "index (code, name)",
# numbered from 0.
#
# The table is read with items() instead of being drained with popitem():
# popitem() removes each entry, which would leave googletrans.LANGUAGES empty
# for anything that reads it later in the same session. The order printed is
# the same as popping every item and reversing the result.
box = list(googletrans.LANGUAGES.items())
for num, language in enumerate(box):
    print(num, language)
languages list | |||
---|---|---|---|
'af': 'afrikaans' | 'sq': 'albanian' | 'am': 'amharic' | 'ar': 'arabic' |
'hy': 'armenian' | 'az': 'azerbaijani' | 'eu': 'basque' | 'be': 'belarusian' |
'bn': 'bengali' | 'bs': 'bosnian' | 'bg': 'bulgarian' | 'ca': 'catalan' |
'ceb': 'cebuano' | 'ny': 'chichewa' | 'zh-cn': 'chinese (simplified)' | 'zh-tw': 'chinese (traditional)' |
'co': 'corsican' | 'hr': 'croatian' | 'cs': 'czech' | 'da': 'danish' |
'nl': 'dutch' | 'en': 'english' | 'eo': 'esperanto' | 'et': 'estonian' |
'fi': 'finnish' | 'fr': 'french' | 'fy': 'frisian' | 'gl': 'galician' |
'ka': 'georgian' | 'de': 'german' | 'el': 'greek' | 'gu': 'gujarati' |
'ht': 'haitian creole' | 'ha': 'hausa' | 'haw': 'hawaiian' | 'iw': 'hebrew' |
'he': 'hebrew' | 'hi': 'hindi' | 'hmn': 'hmong' | 'hu': 'hungarian' |
'is': 'icelandic' | 'ig': 'igbo' | 'id': 'indonesian' | 'ga': 'irish' |
'it': 'italian' | 'ja': 'japanese' | 'jw': 'javanese' | 'kn': 'kannada' |
'kk': 'kazakh' | 'km': 'khmer' | 'ko': 'korean' | 'ku': 'kurdish (kurmanji)' |
'ky': 'kyrgyz' | 'lo': 'lao' | 'la': 'latin' | 'lv': 'latvian' |
'lt': 'lithuanian' | 'lb': 'luxembourgish' | 'mk': 'macedonian' | 'mg': 'malagasy' |
'ms': 'malay' | 'ml': 'malayalam' | 'mt': 'maltese' | 'mi': 'maori' |
'mr': 'marathi' | 'mn': 'mongolian' | 'my': 'myanmar (burmese)' | 'ne': 'nepali' |
'no': 'norwegian' | 'or': 'odia' | 'ps': 'pashto' | 'fa': 'persian' |
'pl': 'polish' | 'pt': 'portuguese' | 'pa': 'punjabi' | 'ro': 'romanian' |
'ru': 'russian' | 'sm': 'samoan' | 'gd': 'scots gaelic' | 'sr': 'serbian' |
'st': 'sesotho' | 'sn': 'shona' | 'sd': 'sindhi' | 'si': 'sinhala' |
'sk': 'slovak' | 'sl': 'slovenian' | 'so': 'somali' | 'es': 'spanish' |
'su': 'sundanese' | 'sw': 'swahili' | 'sv': 'swedish' | 'tg': 'tajik' |
'ta': 'tamil' | 'te': 'telugu' | 'th': 'thai' | 'tr': 'turkish' |
'uk': 'ukrainian' | 'ur': 'urdu' | 'ug': 'uyghur' | 'uz': 'uzbek' |
'vi': 'vietnamese' | 'cy': 'welsh' | 'xh': 'xhosa' | 'yi': 'yiddish' |
'yo': 'yoruba' | 'zu': 'zulu' |
-
Extracting video_id from YouTube URL programming code(https://qiita.com/dauuricus/private/9e70c4c25566fedb9c19) ↩