Skip to content

Commit

Permalink
Add feature to extract timestamps from youtube videos (#1345)
Browse files Browse the repository at this point in the history
  • Loading branch information
Esparon1 committed May 6, 2024
1 parent 797dea1 commit 78301ee
Showing 1 changed file with 20 additions and 1 deletion.
21 changes: 20 additions & 1 deletion embedchain/loaders/youtube_video.py
@@ -1,5 +1,12 @@
import hashlib

import json
import logging
# Optional-dependency guard: fetching transcripts needs the third-party
# `youtube-transcript-api` package, which may not be installed.
try:
    from youtube_transcript_api import YouTubeTranscriptApi
except ImportError as exc:
    # Re-raise with a copy-pasteable install command (the previous message
    # carried a stray double quote inside the backticks) and chain the
    # original error so the underlying import failure stays visible.
    raise ImportError(
        "YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api`"
    ) from exc
try:
from langchain_community.document_loaders import YoutubeLoader
except ImportError:
Expand All @@ -25,6 +32,18 @@ def load_data(self, url):
metadata = doc[0].metadata
metadata["url"] = url


video_id = url.split("v=")[1].split('&')[0]
try:
# Fetching transcript data
transcript = YouTubeTranscriptApi.get_transcript(video_id,languages=['en'])
# Serialize the transcript to a JSON string; ensure_ascii=True escapes non-ASCII characters
metadata["transcript"] = json.dumps(transcript, ensure_ascii=True)
except Exception as e:
logging.exception(f"Failed to fetch transcript for video {url}")
metadata["transcript"] = "Unavailable"


output.append(
{
"content": content,
Expand Down

0 comments on commit 78301ee

Please sign in to comment.