This article is a Private article. Only a writer and users who know the URL can access it.
Please change the publication scope to public in the publish settings if you want to share this article with other users.

More than 3 years have passed since last update.

youtube_transcript_api, googletrans

Python

Last updated at 2021-02-13 / Posted at 2021-02-12

pip install youtube_transcript_api googletrans==4.0.0-rc1 h2==3.*

### Easy and lazy compositing

Case: Google Colab

from youtube_transcript_api import YouTubeTranscriptApi
from google.colab import files
#import time
import sys
from urllib.parse import urlparse, parse_qs

# URL of the video to process; `args` is the list of URLs that the
# id-extraction loop further down iterates over.
urltext ='https://www.youtube.com/watch?v=JV7GqhDQosE' 
args = [urltext]
video_id = ''  # placeholder, overwritten once a URL has been parsed

# Two blank lines of console output before the report starts.
print()
print()


def extract_video_id(url):
    """Return the YouTube video id contained in *url*, or None.

    Recognised URL shapes:
      * https://youtu.be/<id>
      * https://(www.|m.|music.)youtube.com/watch?v=<id>
      * https://(www.|m.|music.)youtube.com/embed/<id>
      * https://(www.|m.|music.)youtube.com/v/<id>
      * https://(www.|m.|music.)youtube.com/shorts/<id>
      * https://(www.|m.|music.)youtube.com/live/<id>

    Args:
        url: The URL string to inspect.

    Returns:
        The video id as a string, or None when the URL is not a recognised
        YouTube URL or carries no id (for example a /watch URL without a
        ``v`` query parameter).
    """
    parts = urlparse(url)
    host = parts.hostname
    # Non-empty path segments, so stray or trailing slashes are ignored.
    segments = [segment for segment in parts.path.split('/') if segment]

    if host == 'youtu.be':
        return segments[0] if segments else None

    if host in {'www.youtube.com', 'youtube.com',
                'm.youtube.com', 'music.youtube.com'}:
        if parts.path == '/watch':
            # .get() instead of ['v'] so a missing parameter yields None
            # rather than raising KeyError.
            ids = parse_qs(parts.query).get('v')
            return ids[0] if ids else None
        if len(segments) >= 2 and segments[0] in {'embed', 'v', 'shorts', 'live'}:
            return segments[1]

    # Not a URL shape we know how to handle.
    return None

# Resolve the video id for every configured URL; the last URL wins.
for url in args:
    video_id = extract_video_id(url)
    print('youtube video_id:',video_id)

# English transcript entries; each entry's 'text' field is read below.
line = list(YouTubeTranscriptApi.get_transcript(video_id,languages=['en']))

# Normalise every caption: trim surrounding whitespace, drop carriage
# returns and turn embedded newlines into spaces. (strip() already removes
# trailing '\n' and '\r', so no separate rstrip() calls are needed.)
text_list = []
for l in line:
    l['text'] = l['text'].strip().replace('\r','').replace('\n',' ')
    text_list.append(l['text'])

# The raw entries are no longer needed; only text_list is used from here on.
del line
##print(text_list)

##original_stdout = sys.stdout ## stdout backup
# NOTE(review): the file below is opened (and truncated) for writing, but the
# sys.stdout redirection is commented out and no print() passes file=f, so
# everything in this block is printed to stdout and nothing is written to `f`.
filename = 'subtitle.txt' ## print subtitle text to this file
with open(filename, 'w') as f:
    ##sys.stdout = f # stdout to file

    # Report header.
    print('youtube video_id:',video_id)
    print()
    print("haywhnk-A.K.A-@dauuricus")
    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
    print("YouTube captions")
    print("- - - - - - - - - - - - - - - - - - -  YouTube  - - - - - - - - - - - - - - - - - - -")
    print()
    print()
    # `line` is rebound here to the list of available transcripts.
    line = YouTubeTranscriptApi.list_transcripts(video_id)    

    # Pick the English transcript from that list.
    transcript = line.find_transcript(['en'])
   #print(transcript.fetch())
    
    # Collect the caption texts, then print them with 1-based numbering.
    caption_line =[]
    for count, dict_obj in enumerate(transcript.fetch()):
        ##print(count+1,'  ', dict_obj['text'] )
        caption_line.append(dict_obj['text'])
    for count, l in enumerate(caption_line):
        print(count+1,'  ',l)

    print()
    print()
    print("************************************************************************************")
    print()
    print("Youtube captions")
    print("- - - - - - - - - - - - - - - - - - translated - - - - - - - - - - - - - - - - - - -")
    print()
    print()
  
    # Request the Japanese ('ja') translation of the same transcript and
    # print it with 1-based numbering.
    translated = transcript.translate('ja')
    for count, dict_obj in enumerate(translated.fetch()):# japanese
        print( count+1,'  ', dict_obj['text'] )
        

##    print()
##    print("-----------------------------------------------------------------------------------")
##    print()
##    print("captions text compositimg")
##    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
##    print()
##    print()
    
    def line_edit2(textlines):
        """Merge every two consecutive lines into one.

        Lines are joined with a single space. When the number of lines is
        odd, the final line is kept on its own.

        Args:
            textlines: Iterable of caption strings.

        Returns:
            A new list of merged strings.
        """
        lines = list(textlines)
        return [' '.join(lines[start:start + 2])
                for start in range(0, len(lines), 2)]

    def line_edit3(textlines):
        """Merge every three consecutive lines into one.

        Lines are joined with a single space. A trailing group of one or two
        lines is merged the same way and kept as the last element.

        Args:
            textlines: Iterable of caption strings.

        Returns:
            A new list of merged strings.
        """
        lines = list(textlines)
        groups = (lines[start:start + 3] for start in range(0, len(lines), 3))
        return [' '.join(group) for group in groups]

    def line_edit(textlines):
        """Merge every two consecutive lines into one.

        Lines in even positions (0-based) start a new output entry; lines in
        odd positions are appended to the previous entry after a space.

        Args:
            textlines: Iterable of caption strings.

        Returns:
            A new list of merged strings.
        """
        merged = []
        for position, text in enumerate(textlines):
            if position % 2 == 0:
                merged.append(text)
            else:
                merged[-1] = merged[-1] + ' ' + text
        return merged

    # Separator banners before the condensed ("shrink") text section.
    print()
    print()
    print("************************************************************************************")
    print()
    print()
    print()
    print()
    print("************************************************************************************")
    print("shrink text")
    print()
##    for count, l in enumerate(text_e):
##        print(count+1,l)
    print()
    print()
#    text_compo = (line_edit2(text_list))
#    for count, l in enumerate(text_compo):
#        print(l)
    print()
    print()
    # Condense the cleaned captions in four passes: groups of 3, then 3,
    # then 2, then 2, so each printed line combines up to 36 captions.
    text_compo = (line_edit3(text_list))
    text_compo[:] = (line_edit3(text_compo))
    #for count, l in enumerate(text_compo):
    #    print(l)
    text_compo2 = (line_edit2(text_compo))
    text_compo2[:] = (line_edit2(text_compo2))
    # text_compo2 is also the input of the translation loop later on.
    for count, l in enumerate(text_compo2):
        print(l)

    # Closing banners.
    print()
    print("************************************************************************************")
    print()
    print()
    print("************************************************************************************")
    print()
    print("Thank you.")


#sys.stdout = original_stdout # stdout back 

##files.download(filename)

#import re
import h2.connection
import h2.config
from googletrans import Translator
#import sys

#####uploaded = files.upload()

####filename = ''
####for fn in uploaded.keys():
####  print('User uploaded file "{name}" with length {length} bytes'.format(
####      name=fn, length=len(uploaded[fn])))
####  filename = fn


#filename = 'subtitle.txt'
#args= sys.argv
##args = [('translate.py'),filename]

##print('open '+args[1])
##with open(args[1]) as f: # uploaded file
##  line = f.readlines() 

##line[:] = [l.strip() for l in line]
##line[:] = [l.rstrip('\n') for l in line]
##line[:] = [a for a in line if a != '']
##line[:] = [l.replace('\n',' ') for l in line]
##line[:] = [l.replace('\r',' ') for l in line]
#print(line)

#print()

####for line_num,l in enumerate(line):
####  if re.search(r'.*?i'm$',l):
####    print(line_num,'   ',l)
####  elif re.search(r'.*?to/Z',l):
####    print(line_num,'   ',l)
####  if re.search(r'.*?the$',l):
####    print(line_num,'   ',l)
####  elif re.search(r'.*?the/Z',l):
####    print(line_num,'   ',l)


#for line_num,l in enumerate(line):
#    print(line_num,'   ',l)
    
# googletrans client; replaced by a fresh instance after every 20 lines.
translator = Translator()
num = 20  # 1-based line number at which the client is next recreated
filename = 'translated.txt'
backup_stdout = sys.stdout  # kept for the (currently disabled) stdout redirection
print("translating...")
print()

# NOTE(review): stdout is not redirected and nothing writes to `f`, so the
# translations are printed to the console and the file stays empty.
with open(filename,'w') as f:
    for count, l in enumerate(text_compo2):
        # Translate one condensed line to Japanese and print the result.
        translated = translator.translate(l, dest='ja')
        print(translated.text)
        if count + 1 >= num:
            # Batch boundary reached: drop the client and start a new one.
            # presumably this works around long-lived session problems — TODO confirm
            del translator
            num = num + 20
            translator = Translator()
del translator
print("saving...",filename)

#   files.download(filename) # translated.txt

Case: not Google Colab

youtube-translate.py


from youtube_transcript_api import YouTubeTranscriptApi
#import time
import sys
from urllib.parse import urlparse, parse_qs

# URL of the video to process; `args` is the list of URLs that the
# id-extraction loop further down iterates over.
urltext ='https://www.youtube.com/watch?v=dQqkHjvj2zU' 
args = [urltext]
video_id = ''  # placeholder, overwritten once a URL has been parsed

# Two blank lines of console output before the report starts.
print()
print()


def extract_video_id(url):
    """Return the YouTube video id contained in *url*, or None.

    Recognised URL shapes:
      * https://youtu.be/<id>
      * https://(www.|m.|music.)youtube.com/watch?v=<id>
      * https://(www.|m.|music.)youtube.com/embed/<id>
      * https://(www.|m.|music.)youtube.com/v/<id>
      * https://(www.|m.|music.)youtube.com/shorts/<id>
      * https://(www.|m.|music.)youtube.com/live/<id>

    Args:
        url: The URL string to inspect.

    Returns:
        The video id as a string, or None when the URL is not a recognised
        YouTube URL or carries no id (for example a /watch URL without a
        ``v`` query parameter).
    """
    parts = urlparse(url)
    host = parts.hostname
    # Non-empty path segments, so stray or trailing slashes are ignored.
    segments = [segment for segment in parts.path.split('/') if segment]

    if host == 'youtu.be':
        return segments[0] if segments else None

    if host in {'www.youtube.com', 'youtube.com',
                'm.youtube.com', 'music.youtube.com'}:
        if parts.path == '/watch':
            # .get() instead of ['v'] so a missing parameter yields None
            # rather than raising KeyError.
            ids = parse_qs(parts.query).get('v')
            return ids[0] if ids else None
        if len(segments) >= 2 and segments[0] in {'embed', 'v', 'shorts', 'live'}:
            return segments[1]

    # Not a URL shape we know how to handle.
    return None

# Resolve the video id for every configured URL; the last URL wins.
for url in args:
    video_id = extract_video_id(url)
    print('youtube video_id:',video_id)

#video_id = 'PNZtCP4K8AE' ## youtube video_id example
# English transcript entries; each entry's 'text' field is read below.
line = list(YouTubeTranscriptApi.get_transcript(video_id,languages=['en']))

# Normalise every caption: trim surrounding whitespace, drop carriage
# returns and turn embedded newlines into spaces. (strip() already removes
# trailing '\n' and '\r', so no separate rstrip() calls are needed.)
text_list = []
for l in line:
    l['text'] = l['text'].strip().replace('\r','').replace('\n',' ')
    text_list.append(l['text'])

# The raw entries are no longer needed; only text_list is used from here on.
del line
##print(text_list)

##original_stdout = sys.stdout ## stdout backup
# NOTE(review): the file below is opened (and truncated) for writing, but the
# sys.stdout redirection is commented out and no print() passes file=f, so
# everything in this block is printed to stdout and nothing is written to `f`.
filename = 'subtitle.txt' ## print subtitle text to this file
with open(filename, 'w') as f:
    ##sys.stdout = f # stdout to file

    # Report header.
    print('youtube video_id:',video_id)
    print()
    print("haywhnk-A.K.A-@dauuricus")
    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
    print("YouTube captions")
    print("- - - - - - - - - - - - - - - - - - -  YouTube  - - - - - - - - - - - - - - - - - - -")
    print()
    print()
    # `line` is rebound here to the list of available transcripts.
    line = YouTubeTranscriptApi.list_transcripts(video_id)    

    # Pick the English transcript from that list.
    transcript = line.find_transcript(['en'])
   #print(transcript.fetch())
    
    # Print each caption text as-is, without numbering. caption_line is
    # initialised but not filled here (the append is commented out below).
    caption_line =[]
    for count, dict_obj in enumerate(transcript.fetch()):
        print( dict_obj['text'] )
##        caption_line.append(dict_obj['text'])
##    print()
##    print()
##    print("************************************************************************************")
##    print()
##    print("Youtube captions")
##    print("- - - - - - - - - - - - - - - - - - translated - - - - - - - - - - - - - - - - - - -")
##    print()
##    print()
  
##    translated = transcript.translate('ja')
##    for count, dict_obj in enumerate(translated.fetch()):# japanese
##        print( count+1, dict_obj['text'] )
        

##    print()
##    print("-----------------------------------------------------------------------------------")
##    print()
##    print("captions text compositimg")
##    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
##    print()
##    print()
    
    def line_edit2(textlines):
        """Merge every two consecutive lines into one.

        Lines are joined with a single space. When the number of lines is
        odd, the final line is kept on its own.

        Args:
            textlines: Iterable of caption strings.

        Returns:
            A new list of merged strings.
        """
        lines = list(textlines)
        return [' '.join(lines[start:start + 2])
                for start in range(0, len(lines), 2)]

    def line_edit3(textlines):
        """Merge every three consecutive lines into one.

        Lines are joined with a single space. A trailing group of one or two
        lines is merged the same way and kept as the last element.

        Args:
            textlines: Iterable of caption strings.

        Returns:
            A new list of merged strings.
        """
        lines = list(textlines)
        groups = (lines[start:start + 3] for start in range(0, len(lines), 3))
        return [' '.join(group) for group in groups]

    def line_edit(textlines):
        """Merge every two consecutive lines into one.

        Lines in even positions (0-based) start a new output entry; lines in
        odd positions are appended to the previous entry after a space.

        Args:
            textlines: Iterable of caption strings.

        Returns:
            A new list of merged strings.
        """
        merged = []
        for position, text in enumerate(textlines):
            if position % 2 == 0:
                merged.append(text)
            else:
                merged[-1] = merged[-1] + ' ' + text
        return merged

###    print()
###    print()
###    print("************************************************************************************")
###    print()
###    print()
##    for count, l in enumerate(text_list):
##        print(count+1,l)
    # Separator banner before the condensed ("shrink") text section.
    print()
    print()
    print("************************************************************************************")
    print("shrink text")
    print()
##    for count, l in enumerate(text_e):
##        print(count+1,l)
    print()
    print()
#    text_compo = (line_edit2(text_list))
#    for count, l in enumerate(text_compo):
#        print(l)
    print()
    print()
    # Condense the cleaned captions in four passes: groups of 2, then 3,
    # then 2, then 2, so each printed line combines up to 24 captions.
    text_compo = (line_edit2(text_list))
    text_compo[:] = (line_edit3(text_compo))
    #for count, l in enumerate(text_compo):
    #    print(l)
    text_compo2 = (line_edit2(text_compo))
    text_compo2[:] = (line_edit2(text_compo2))
    # text_compo2 is also the input of the translation loop later on.
    for count, l in enumerate(text_compo2):
        print(l)

    # Closing banners.
    print()
    print("************************************************************************************")
    print()
    print()
    print("************************************************************************************")
    print()
    print("Thank you.")


#sys.stdout = original_stdout # stdout back 


#import re
import h2.connection
import h2.config
from googletrans import Translator
    
# googletrans client; replaced by a fresh instance after every 20 lines.
translator = Translator()
num = 20  # 1-based line number at which the client is next recreated
filename = 'translated.txt'
backup_stdout = sys.stdout  # kept for the (currently disabled) stdout redirection
print("translating...")

# NOTE(review): stdout is not redirected and nothing writes to `f`, so the
# translations are printed to the console and the file stays empty.
with open(filename,'w') as f:
    for count, l in enumerate(text_compo2):
        # Never true as written, since enumerate() starts counting at 0.
        # presumably a manual knob for resuming part-way through — TODO confirm
        if count < 0:
            continue
        # Translate one condensed line to Japanese and print the result.
        translated = translator.translate(l, dest='ja')
        print(translated.text)
        if count + 1 >= num:
            # Batch boundary reached: drop the client and start a new one.
            del translator
            num = num + 20
            translator = Translator()
del translator
print("saving...",filename)

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up