I have a function that fetches the transcript of a YouTube video. (You don't have to understand every line.)
!pip install youtube_transcript_api
!pip install simplejson
from youtube_transcript_api import YouTubeTranscriptApi
import urllib.request
import json
import urllib
# ID of the YouTube video whose title and transcripts are fetched below.
VideoID = 'LfC6pv8VISk'


def fetch_transcript():
    """Print the title and every available transcript of the video ``VideoID``.

    The title is read from YouTube's oEmbed endpoint and the transcripts are
    obtained through ``YouTubeTranscriptApi``. The function takes no
    arguments: the video is selected by the module-level ``VideoID`` string.
    Nothing is returned; all results are printed.
    """
    params = {"format": "json", "url": "https://www.youtube.com/watch?v=%s" % VideoID}
    url = "https://www.youtube.com/oembed"
    query_string = urllib.parse.urlencode(params)
    url = url + "?" + query_string

    with urllib.request.urlopen(url) as response:
        response_text = response.read()
        data = json.loads(response_text.decode())
        print('Titel: ' + data['title'])

    # retrieve the available transcripts
    transcript_list = YouTubeTranscriptApi.list_transcripts(VideoID)

    # iterate over all available transcripts
    for transcript in transcript_list:
        # fetch the actual transcript data
        data = transcript.fetch()
        print(data)

    # filter for language (the result is looked up but not used any further)
    transcript = transcript_list.find_transcript(['en'])


fetch_transcript()
Output: [{'text': "Okay, here we go. This one's gonna be\nquick. So get your coffee ready today.", 'start': 0.15, 'duration': 2.97...}]
Now I have collected a list of video IDs, like this:
VideoID = ['LfC6pv8VISk', 'befUVytFC80', '4c_rKOaTquM']
How can I iterate over the list using my function? My function only works with a single string such as 'LfC6pv8VISk'.
#the single string ↓
params = {"format": "json", "url": "https://www.youtube.com/watch?v=%s" % VideoID}
Looping with `for i in VideoID: fetch_transcript()` doesn't work. It feels like it should be easy, but I'm stuck.
CodePudding user response:
Try making your function accept the video ID as a parameter.
You can safely ignore imports from typing, it's just for annotations.
from typing import Iterable, Tuple, Dict, List, Any
from youtube_transcript_api import YouTubeTranscriptApi
import urllib.request
import json
import urllib
def fetch_transcript(vid_id) -> Tuple[str, List[Dict[str, Any]]]:
    """Fetch the title and the English transcript of a single YouTube video.

    Args:
        vid_id: The YouTube video ID, e.g. ``'LfC6pv8VISk'``.

    Returns:
        A ``(title, transcript)`` tuple. ``title`` is the ``"title"`` field of
        the oEmbed JSON response; ``transcript`` is whatever ``fetch()``
        returns for the ``'en'`` transcript.

    Errors raised by ``urlopen`` or by the transcript API are not caught here
    and propagate to the caller.
    """
    params = {"format": "json", "url": f"https://www.youtube.com/watch?v={vid_id}"}
    url = "https://www.youtube.com/oembed"
    query_string = urllib.parse.urlencode(params)
    url = url + "?" + query_string
    print(f"Fetching from {url}")
    with urllib.request.urlopen(url) as response:
        response_text = response.read()
    title = json.loads(response_text.decode())["title"]
    # retrieve the available transcripts
    transcript_list = YouTubeTranscriptApi.list_transcripts(vid_id)
    return title, transcript_list.find_transcript(['en']).fetch()
def fetch_transcript_gen(vid_ids: Iterable) -> Iterable[Tuple[str, List[Dict[str, Any]]]]:
    """Lazily yield the result of ``fetch_transcript`` for each video ID.

    Args:
        vid_ids: Any iterable of YouTube video IDs.

    Yields:
        Whatever ``fetch_transcript`` returns for each ID, in input order.
        Being a generator, each video is only fetched when the consumer asks
        for the next item.
    """
    for video_id in vid_ids:
        yield fetch_transcript(video_id)
if __name__ == '__main__':
    from pprint import pprint

    # Demo: walk over several video IDs and show, for each one, the title,
    # the number of transcript entries and the pretty-printed transcript.
    for title_, data_ in fetch_transcript_gen(['LfC6pv8VISk', 'befUVytFC80', '4c_rKOaTquM']):
        print(f"\nTitle: {title_}\nData length: {len(data_)}\n")
        pprint(data_)
Output:
Fetching from https://www.youtube.com/oembed?format=json&url=https://www.youtube.com/watch?v=LfC6pv8VISk
Title: KILL Linux processes!! (also manage them) // Linux for Hackers // EP 7
Data length: 399
[{'duration': 2.97,
'start': 0.15,
'text': "Okay, here we go. This one's gonna be\n"
'quick. So get your coffee ready today.'},
{'duration': 4.4,
'start': 3.12,
'text': "We're gonna learn how to manage our\n"
'processes, processes and Lennox.'},
...
Fetching from https://www.youtube.com/oembed?format=json&url=https://www.youtube.com/watch?v=befUVytFC80
Title: you need to create a Cryptocurrency RIGHT NOW!! (Solana token)
Data length: 837
...
Fetching from https://www.youtube.com/oembed?format=json&url=https://www.youtube.com/watch?v=4c_rKOaTquM
Title: putting 5G and MEC to the test!! (does it even matter??)
Data length: 741
...