Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 9701: NEXT_DATA field video extraction for bbc US website #9705

Merged
merged 27 commits into from
May 17, 2024
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
43 changes: 42 additions & 1 deletion yt_dlp/extractor/bbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'url': 'http://www.bbc.com/news/world-europe-32668511',
'info_dict': {
'id': 'world-europe-32668511',
'title': 'Russia stages massive WW2 parade',
'title': 'Russia stages massive WW2 parade despite Western boycott',
'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
},
'playlist_count': 2,
Expand Down Expand Up @@ -791,6 +791,17 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'timestamp': 1638230731,
'upload_date': '20211130',
kylegustavo marked this conversation as resolved.
Show resolved Hide resolved
},
}, {
# video described by a JSON string inside the script element with id __NEXT_DATA__
'url': 'https://www.bbc.com/news/uk-68546268',
kylegustavo marked this conversation as resolved.
Show resolved Hide resolved
'info_dict': {
'id': 'p0hj0lq7',
'ext': 'mp4',
'title': 'Nasser Hospital doctor describes his treatment by IDF',
'description': 'Doctor Abu Sabha said he was detained by Israeli forces after the raid on Nasser Hospital and feared for his life.\n\nThe IDF said "during the activity, about 200 terrorists and suspects of terrorist activity were detained, including some who posed as medical teams, many weapons were found, as well as closed medicines intended for Israeli hostages."',
kylegustavo marked this conversation as resolved.
Show resolved Hide resolved
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1710270205000,
kylegustavo marked this conversation as resolved.
Show resolved Hide resolved
},
}, {
# single video article embedded with data-media-vpid
'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
Expand Down Expand Up @@ -1255,6 +1266,36 @@ def extract_all(pattern):
lambda s: self._parse_json(s, playlist_id, fatal=False),
re.findall(pattern, webpage))))

# Article accessed from the US, with a single embedded video (e.g.
# https://www.bbc.com/news/uk-68546268)
next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id), (
'props', 'pageProps', 'page'), get_all=False)
kylegustavo marked this conversation as resolved.
Show resolved Hide resolved
video_data = traverse_obj(next_data, (
..., 'contents', lambda _, v: v['type'] == 'video'), get_all=False)
if video_data:
timestamp = traverse_obj(next_data, (
..., 'contents', lambda _, v: v['type'] == 'timestamp',
'model', 'timestamp', {int_or_none}), get_all=False)
model = traverse_obj(video_data, (
'model', 'blocks', lambda _, v: v['type'] == 'media',
'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata',
'model'), get_all=False)
if model:
item_id = try_get(model, lambda x: x['versions'][0]['versionId'])
kylegustavo marked this conversation as resolved.
Show resolved Hide resolved
formats, subtitles = self._download_media_selector(item_id)
synopses = model.get('synopses') or {}
entries.append({
'id': item_id,
'title': model.get('title'),
'thumbnail': model.get('imageUrl'),
'formats': formats,
'subtitles': subtitles,
'timestamp': timestamp,
'description': dict_get(synopses, ('long', 'medium', 'short'))
})
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)

# Multiple video article (e.g.
# http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
Expand Down