Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ie/youtube] Extract comments with or without new format #9775

Merged
merged 28 commits into from
May 17, 2024
Merged
Changes from 12 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ee81ca4
apply patch from issues/9358#issuecomment-2072600506
jakeogh Apr 23, 2024
16cb4fe
fix typo in previous patch, like count, and use direct dict access
jakeogh Apr 23, 2024
6083596
handle KeyError: 'frameworkUpdates' when the old comment format is se…
jakeogh Apr 23, 2024
2ef6563
fix old comment extraction
jakeogh Apr 23, 2024
4da1db9
fix like_count
jakeogh Apr 24, 2024
800906c
fix indent
jakeogh Apr 24, 2024
2763473
fix another indent
jakeogh Apr 24, 2024
17bb443
replace dict access with try_get()
jakeogh Apr 24, 2024
3ef6517
replace dict access with traverse_obj() and use likeCountA11y
jakeogh Apr 24, 2024
a1102d7
add commentRenderer fix from @minamotorin
jakeogh May 6, 2024
f4c1de1
apply patch from @minamotorin of example code by @shoxie007
jakeogh May 6, 2024
5a3a4f1
@bbilly1 correctly parse like_count for cases > 1000
jakeogh May 6, 2024
cf9751a
two whitespace fixes
jakeogh May 15, 2024
f09e66b
use traverse_obj with mutations variable
jakeogh May 15, 2024
1a0cf3c
fix test for None
jakeogh May 15, 2024
4701ad6
remove .get() call from content
jakeogh May 15, 2024
6f5c669
move continue block, filter() comment_keys for None
jakeogh May 15, 2024
f6ced29
use get_first() and remove .get()
jakeogh May 15, 2024
8d428b4
use single traversal
jakeogh May 15, 2024
0ef6c93
use traverse_obj for time_text
jakeogh May 15, 2024
1cee8e7
whitespace change
jakeogh May 15, 2024
743ed06
remove pinned_text var
jakeogh May 15, 2024
1adea35
add {bool}
jakeogh May 15, 2024
1872982
fix author_is_verified
jakeogh May 15, 2024
90b1129
use single traversal pattern
jakeogh May 15, 2024
54b823b
readability
pukkandan May 15, 2024
053cde7
Apply suggestions from code review
bashonly May 15, 2024
47a6fb7
use traversal and check for `comment_keys`
bashonly May 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
97 changes: 85 additions & 12 deletions yt_dlp/extractor/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import time
import traceback
import urllib.parse
jakeogh marked this conversation as resolved.
Show resolved Hide resolved

from .common import InfoExtractor, SearchInfoExtractor
from .openload import PhantomJSwrapper
from ..compat import functools
Expand Down Expand Up @@ -3307,7 +3306,57 @@ def _extract_heatmap(self, data):
'value': ('intensityScoreNormalized', {float_or_none}),
})) or None

def _extract_comment(self, comment_renderer, parent=None):
def _extract_comment(self, view_model, entities, parent=None):
    """Build a comment info dict from the new (view-model based) comment format.

    @param view_model  The comment's `commentViewModel` dict; only its
                       `pinnedText` key is read here.
    @param entities    Iterable of entity mutation dicts. The first
                       `payload.commentEntityPayload` and the first
                       `payload.engagementToolbarStateEntityPayload` found
                       among them supply the comment data.
    @param parent      ID of the parent comment, or a falsy value for a
                       top-level comment (reported as 'root').
    @returns           A dict describing the comment. Fields that cannot be
                       found are None (or absent for the optional flags);
                       'id' is None if no comment entity payload is present.
    """
    # Either payload may be missing (traverse_obj then yields None), so every
    # lookup below goes through traverse_obj instead of chained .get() calls,
    # which would raise AttributeError on None.
    comment_entity_payload = traverse_obj(
        entities, (..., 'payload', 'commentEntityPayload', {dict}), get_all=False)
    toolbar_entity_payload = traverse_obj(
        entities, (..., 'payload', 'engagementToolbarStateEntityPayload', {dict}), get_all=False)

    info = {
        'id': traverse_obj(comment_entity_payload, ('properties', 'commentId')),
        'text': traverse_obj(comment_entity_payload, ('properties', 'content', 'content', {str})),
        'like_count': parse_count(
            traverse_obj(comment_entity_payload, ('toolbar', 'likeCountA11y', {str}))) or 0,
        'author_id': traverse_obj(comment_entity_payload, ('author', 'channelId', {self.ucid_or_none})),
        'author': traverse_obj(comment_entity_payload, ('author', 'displayName', {str})),
        'author_thumbnail': traverse_obj(comment_entity_payload, ('author', 'avatarThumbnailUrl', {url_or_none})),
        'parent': parent or 'root',
    }

    # Timestamp is an estimate calculated from the current time and time_text
    time_text = traverse_obj(comment_entity_payload, ('properties', 'publishedTime', {str})) or ''
    info.update({
        # FIXME: non-standard, but we need a way of showing that it is an estimate.
        '_time_text': time_text,
        'timestamp': self._parse_time_text(time_text),
    })

    info['author_url'] = urljoin(
        'https://www.youtube.com',
        traverse_obj(
            comment_entity_payload,
            ('author', 'channelCommand', 'innertubeCommand', 'browseEndpoint', 'canonicalBaseUrl'),
            expected_type=str, get_all=False))

    # Only report uploader status when the payload actually carries the flag
    author_is_uploader = traverse_obj(comment_entity_payload, ('author', 'isCreator'))
    if author_is_uploader is not None:
        info['author_is_uploader'] = author_is_uploader

    if traverse_obj(toolbar_entity_payload, 'heartState') == 'TOOLBAR_HEART_STATE_HEARTED':
        info['is_favorited'] = True

    # NOTE(review): the flag's wire type is not visible here; accept both a
    # JSON boolean and the string 'true' so a real `true` is not reported as False.
    is_verified = traverse_obj(comment_entity_payload, ('author', 'isVerified'))
    info['author_is_verified'] = is_verified is True or is_verified == 'true'

    if traverse_obj(view_model, 'pinnedText'):
        info['is_pinned'] = True

    return info

def _extract_comment_old(self, comment_renderer, parent=None):
comment_id = comment_renderer.get('commentId')
if not comment_id:
return
Expand Down Expand Up @@ -3388,21 +3437,39 @@ def extract_header(contents):
break
return _continuation

def extract_thread(contents):
def extract_thread(contents, entity_payloads):
if not parent:
tracker['current_page_thread'] = 0
for content in contents:
if not parent and tracker['total_parent_comments'] >= max_parents:
yield
comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
comment_renderer = get_first(
(comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
expected_type=dict, default={})

comment = self._extract_comment(comment_renderer, parent)
if not comment:
continue
comment_id = comment['id']
# old comment format
if entity_payloads is None:
jakeogh marked this conversation as resolved.
Show resolved Hide resolved
comment_renderer = get_first(
(comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
expected_type=dict, default={})

comment = self._extract_comment_old(comment_renderer, parent)
if not comment:
continue
comment_id = comment['id']
jakeogh marked this conversation as resolved.
Show resolved Hide resolved

# new comment format
else:
view_model = traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel'))
if not view_model:
view_model = content.get('commentViewModel')
if not view_model:
continue
jakeogh marked this conversation as resolved.
Show resolved Hide resolved
comment_id = view_model['commentId']
comment_key = view_model.get('commentKey')
toolbar_state_key = view_model.get('toolbarStateKey')
entities = traverse_obj(entity_payloads, lambda _, v: v["entityKey"] in [comment_key, toolbar_state_key])

comment = self._extract_comment(view_model, entities, parent)
jakeogh marked this conversation as resolved.
Show resolved Hide resolved

jakeogh marked this conversation as resolved.
Show resolved Hide resolved
if comment.get('is_pinned'):
tracker['pinned_comment_ids'].add(comment_id)
# Sometimes YouTube may break and give us infinite looping comments.
Expand Down Expand Up @@ -3495,7 +3562,7 @@ def extract_thread(contents):
check_get_keys = None
if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
check_get_keys = [[*continuation_items_path, ..., (
'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]]
try:
response = self._extract_response(
item_id=None, query=continuation,
Expand Down Expand Up @@ -3527,10 +3594,16 @@ def extract_thread(contents):
break
continue

for entry in extract_thread(continuation_items):
if 'frameworkUpdates' in response:
_iterator = extract_thread(continuation_items, response['frameworkUpdates']['entityBatchUpdate']['mutations'])
else:
_iterator = extract_thread(continuation_items, None)

for entry in _iterator:
jakeogh marked this conversation as resolved.
Show resolved Hide resolved
if not entry:
return
yield entry

jakeogh marked this conversation as resolved.
Show resolved Hide resolved
continuation = self._extract_continuation({'contents': continuation_items})
if continuation:
break
Expand Down