Fetching YouTube transcript data from a list of videos

I have a function that can fetch the transcript of a YouTube video. (You don't have to understand every line.)

!pip install youtube_transcript_api
!pip install simplejson
from youtube_transcript_api import YouTubeTranscriptApi

import urllib.request
import json
import urllib


VideoID = 'LfC6pv8VISk'

def fetch_transcript():
    """Print the title and every available transcript of one video.

    The video is identified by the module-level ``VideoID`` string; the
    function takes no arguments and returns nothing. The title is read
    from YouTube's oEmbed endpoint, and the transcripts come from
    ``YouTubeTranscriptApi``.
    """
    # Build the oEmbed request URL; urlencode escapes the nested watch URL.
    params = {"format": "json", "url": "https://www.youtube.com/watch?v=%s" % VideoID}
    url = "https://www.youtube.com/oembed"
    query_string = urllib.parse.urlencode(params)
    url = url + "?" + query_string

    with urllib.request.urlopen(url) as response:
        response_text = response.read()
        data = json.loads(response_text.decode())
        print('Titel: ' + data['title'])

    # retrieve the available transcripts
    transcript_list = YouTubeTranscriptApi.list_transcripts(VideoID)

    # iterate over all available transcripts
    for transcript in transcript_list:

        # fetch the actual transcript data
        data = transcript.fetch()
        print(data)

    # filter for language
    # NOTE(review): the English transcript is looked up here but the result
    # is never fetched, printed, or returned.
    transcript = transcript_list.find_transcript(['en'])

fetch_transcript()

Output: [{'text': "Okay, here we go. This one's gonna be\nquick. So get your coffee ready today.", 'start': 0.15, 'duration': 2.97...}]

Now I collected a list of video_ids like:

VideoID = ['LfC6pv8VISk', 'befUVytFC80', '4c_rKOaTquM']

How can I iterate over the list using my function? My function only takes a single string like 'LfC6pv8VISk'

                                                           #the single string ↓

params = {"format": "json", "url": "https://www.youtube.com/watch?v=%s" % VideoID}

Writing `for i in VideoID: fetch_transcript()` doesn't work. It feels like it should be easy, but I'm stuck.

CodePudding user response:

Try making your function accept the video ID as a parameter.

You can safely ignore imports from typing, it's just for annotations.

from typing import Iterable, Tuple, Dict, List, Any

from youtube_transcript_api import YouTubeTranscriptApi

import urllib.request
import json
import urllib


def fetch_transcript(vid_id: str) -> Tuple[str, List[Dict[str, Any]]]:
    """Fetch the title and the English transcript of a single video.

    Args:
        vid_id: The YouTube video ID, e.g. ``'LfC6pv8VISk'``.

    Returns:
        A ``(title, transcript)`` tuple. ``title`` is the ``"title"`` field
        of the oEmbed JSON response; ``transcript`` is whatever ``fetch()``
        returns for the transcript found for language ``'en'``.
    """
    params = {"format": "json", "url": f"https://www.youtube.com/watch?v={vid_id}"}
    base_url = "https://www.youtube.com/oembed"

    # urlencode escapes the nested watch URL so it is safe as a query value.
    query_string = urllib.parse.urlencode(params)
    url = f"{base_url}?{query_string}"

    print(f"Fetching from {url}")

    with urllib.request.urlopen(url) as response:
        response_text = response.read()
        title = json.loads(response_text.decode())["title"]

    # retrieve the available transcripts
    transcript_list = YouTubeTranscriptApi.list_transcripts(vid_id)

    return title, transcript_list.find_transcript(['en']).fetch()


def fetch_transcript_gen(vid_ids: Iterable):
    """Lazily yield ``fetch_transcript(video_id)`` for each ID in *vid_ids*.

    Results are produced one at a time, in input order, so each video is
    only fetched when the consumer asks for the next item.
    """
    yield from map(fetch_transcript, vid_ids)


if __name__ == "__main__":
    from pprint import pprint

    # Demo run: fetch each sample video in turn and show its title, the
    # number of transcript entries, and the pretty-printed transcript.
    sample_video_ids = ['LfC6pv8VISk', 'befUVytFC80', '4c_rKOaTquM']
    for video_title, entries in fetch_transcript_gen(sample_video_ids):
        entry_count = len(entries)
        print(f"\nTitle: {video_title}\nData length: {entry_count}\n")
        pprint(entries)

Output:

Fetching from https://www.youtube.com/oembed?format=json&url=https://www.youtube.com/watch?v=LfC6pv8VISk

Title: KILL Linux processes!! (also manage them) // Linux for Hackers // EP 7
Data length: 399

[{'duration': 2.97,
  'start': 0.15,
  'text': "Okay, here we go. This one's gonna be\n"
          'quick. So get your coffee ready today.'},
 {'duration': 4.4,
  'start': 3.12,
  'text': "We're gonna learn how to manage our\n"
          'processes, processes  and Lennox.'},
...

Fetching from https://www.youtube.com/oembed?format=json&url=https://www.youtube.com/watch?v=befUVytFC80

Title: you need to create a Cryptocurrency RIGHT NOW!! (Solana token)
Data length: 837

...

Fetching from https://www.youtube.com/oembed?format=json&url=https://www.youtube.com/watch?v=4c_rKOaTquM

Title: putting 5G and MEC to the test!! (does it even matter??)
Data length: 741

...