-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add management command for deleting duplicate captions from YouTube #2409
base: master
Are you sure you want to change the base?
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,122 @@ | ||||||||||
"""Management command to delete duplicate captions from YouTube videos""" | ||||||||||
|
||||||||||
import logging | ||||||||||
from io import BytesIO | ||||||||||
|
||||||||||
from django.core.management.base import CommandError | ||||||||||
from googleapiclient.errors import HttpError | ||||||||||
from googleapiclient.http import MediaIoBaseUpload | ||||||||||
|
||||||||||
from main.management.commands.filter import WebsiteFilterCommand | ||||||||||
from videos.constants import DESTINATION_YOUTUBE | ||||||||||
from videos.models import VideoFile | ||||||||||
from videos.youtube import CAPTION_UPLOAD_NAME, YouTubeApi | ||||||||||
from websites.models import Website | ||||||||||
|
||||||||||
# Module-level logger; used below to record per-video YouTube API failures.
log = logging.getLogger(__name__)
# Name of the legacy captions track that the command copies from and deletes.
LEGACY_CAPTIONS_NAME = "ocw_studio_upload"
# Message of the CommandError raised when the website filter matches nothing.
ERROR_NO_WEBSITES = "No matching websites found."
|
||||||||||
|
||||||||||
class Command(WebsiteFilterCommand):
    """
    Checks which of the 'CC (English)' and 'ocw_studio_upload' captions tracks
    was updated most recently; tracks with any other name (for example
    auto-translations) are ignored.
    If it's 'ocw_studio_upload', copy into 'CC (English)' and remove the
    'ocw_studio_upload' track.
    If it's 'CC (English)' and there is a track named 'ocw_studio_upload',
    remove the 'ocw_studio_upload' track.
    """

    # Reuse the docstring above as the command's help text.
    help = __doc__

    # Only tracks with these names take part in the "newest track" decision.
    # Other tracks (e.g. an auto-translation) may have a more recent
    # lastUpdated value and must not prevent the clean-up.
    _MANAGED_TRACK_NAMES = (LEGACY_CAPTIONS_NAME, CAPTION_UPLOAD_NAME)

    def handle(self, *args, **options):
        """
        Handle the deletion of duplicate captions from YouTube videos.

        Raises:
            CommandError: if the website filter matches no websites.
        """
        super().handle(*args, **options)

        filtered_websites = self.filter_websites(websites=Website.objects.all())
        if not filtered_websites.exists():
            raise CommandError(ERROR_NO_WEBSITES)

        video_files = VideoFile.objects.filter(
            destination=DESTINATION_YOUTUBE,
            destination_id__isnull=False,
            video__website__in=filtered_websites,
        ).select_related("video", "video__website")

        youtube = YouTubeApi()

        for vf in video_files:
            video_id = vf.destination_id
            try:
                self._process_video(youtube, video_id)
            except HttpError:
                # Best effort: log the failure and carry on with the next video.
                log.exception("Error processing video %s", video_id)

    def _process_video(self, youtube, video_id):
        """Remove the duplicate legacy captions track of a single video."""
        captions_response = (
            youtube.client.captions().list(part="snippet", videoId=video_id).execute()
        )
        managed_tracks = [
            track
            for track in captions_response.get("items", [])
            if track["snippet"].get("name") in self._MANAGED_TRACK_NAMES
        ]
        legacy_tracks = [
            track
            for track in managed_tracks
            if track["snippet"].get("name") == LEGACY_CAPTIONS_NAME
        ]
        if not legacy_tracks:
            # Without a legacy track there is nothing to copy or delete.
            return

        # max() returns the first of several equally recent tracks, which is
        # the same choice a stable descending sort would put at index 0.
        newest = max(
            managed_tracks,
            key=lambda track: track["snippet"].get("lastUpdated", ""),
        )

        if newest["snippet"].get("name") == LEGACY_CAPTIONS_NAME:
            self._copy_to_cc_english(youtube, video_id, newest, managed_tracks)
            youtube.client.captions().delete(id=newest["id"]).execute()
        else:
            # The newest managed track is 'CC (English)', so the legacy
            # tracks are stale duplicates.
            for track in legacy_tracks:
                youtube.client.captions().delete(id=track["id"]).execute()

    @staticmethod
    def _copy_to_cc_english(youtube, video_id, source_track, managed_tracks):
        """Upload the content of source_track as the 'CC (English)' track."""
        caption_content = (
            youtube.client.captions().download(id=source_track["id"]).execute()
        )
        media_body = MediaIoBaseUpload(
            BytesIO(caption_content),
            mimetype="text/vtt",
            chunksize=-1,
            resumable=True,
        )
        cc_english = [
            track
            for track in managed_tracks
            if track["snippet"].get("name") == CAPTION_UPLOAD_NAME
        ]
        if cc_english:
            # Overwrite the existing 'CC (English)' track in place.
            youtube.client.captions().update(
                part="snippet",
                body={"id": cc_english[0]["id"]},
                media_body=media_body,
            ).execute()
        else:
            youtube.client.captions().insert(
                part="snippet",
                sync=False,
                body={
                    "snippet": {
                        "language": "en",
                        "name": CAPTION_UPLOAD_NAME,
                        "videoId": video_id,
                    }
                },
                media_body=media_body,
            ).execute()
Comment on lines
+134
to
+135
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider logging the specific
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is actually incorrect. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
""" | ||
Tests for the delete_duplicate_captions_youtube management command. | ||
|
||
Verifies that the command inspects YouTube caption tracks for each VideoFile and handles | ||
'ocw_studio_upload' vs. 'CC (English)' as expected. | ||
""" | ||
|
||
import pytest | ||
from django.core.management import call_command | ||
|
||
from videos.constants import DESTINATION_YOUTUBE | ||
from videos.factories import VideoFactory, VideoFileFactory | ||
from videos.youtube import CAPTION_UPLOAD_NAME | ||
from websites.factories import WebsiteFactory | ||
|
||
# Apply the django_db mark to every test in this module.
pytestmark = pytest.mark.django_db
|
||
|
||
@pytest.fixture()
def mock_youtube_api(mocker):
    """
    Patch YouTubeApi so that the legacy 'ocw_studio_upload' track is listed
    as more recently updated than the 'CC (English)' track.
    """
    legacy_track = {
        "id": "caption_id_legacy",
        "snippet": {
            "name": "ocw_studio_upload",
            "lastUpdated": "2023-10-01T12:00:00.000Z",
        },
    }
    cc_english_track = {
        "id": "caption_id_other",
        "snippet": {
            "name": CAPTION_UPLOAD_NAME,
            "lastUpdated": "2023-09-30T12:00:00.000Z",
        },
    }

    api = mocker.patch(
        "videos.management.commands.delete_duplicate_captions_youtube.YouTubeApi"
    ).return_value
    captions = api.client.captions.return_value
    captions.list.return_value.execute.return_value = {
        "items": [legacy_track, cc_english_track]
    }
    captions.download.return_value.execute.return_value = b"some vtt data"
    return api
|
||
|
||
def test_delete_duplicate_captions_youtube_command(mock_youtube_api):
    """
    Tests that the command finds VideoFile objects, checks if the newest
    caption name is 'ocw_studio_upload', and copies it to 'CC (English)' before deleting it.
    """
    website = WebsiteFactory.create(name="Test Site", short_id="test-site")

    video = VideoFactory.create(website=website)
    VideoFileFactory.create(
        video=video,
        destination=DESTINATION_YOUTUBE,
        destination_id="dummy_youtube_id",
    )

    call_command("delete_duplicate_captions_youtube", filter="test-site")

    captions = mock_youtube_api.client.captions.return_value
    captions.list.assert_called_with(part="snippet", videoId="dummy_youtube_id")
    captions.download.assert_called_once()

    # The fixture already contains a 'CC (English)' track, so the copy must
    # overwrite that track instead of inserting a second one.
    captions.update.assert_called_once()
    _, update_kwargs = captions.update.call_args
    assert update_kwargs["body"] == {"id": "caption_id_other"}
    captions.insert.assert_not_called()

    captions.delete.assert_called_once_with(id="caption_id_legacy")
Comment on lines
+71
to
+78
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider consolidating the assertions into a single block or using calls = [
mocker.call.client.captions.return_value.list(part="snippet", videoId="dummy_youtube_id"),
mocker.call.client.captions.return_value.download(),
mocker.call.client.captions.return_value.delete(id="caption_id_legacy")
]
mock_youtube_api.assert_has_calls(calls) |
||
|
||
|
||
@pytest.fixture()
def mock_youtube_api_cc_english_newest(mocker):
    """
    Patch YouTubeApi so that the 'CC (English)' track is listed as more
    recently updated than the legacy 'ocw_studio_upload' track.
    """
    legacy_track = {
        "id": "caption_id_legacy",
        "snippet": {
            "name": "ocw_studio_upload",
            "lastUpdated": "2023-09-30T12:00:00.000Z",
        },
    }
    cc_english_track = {
        "id": "caption_id_other",
        "snippet": {
            "name": CAPTION_UPLOAD_NAME,
            "lastUpdated": "2023-10-01T12:00:00.000Z",
        },
    }

    api = mocker.patch(
        "videos.management.commands.delete_duplicate_captions_youtube.YouTubeApi"
    ).return_value
    captions = api.client.captions.return_value
    captions.list.return_value.execute.return_value = {
        "items": [legacy_track, cc_english_track]
    }
    captions.download.return_value.execute.return_value = b"some vtt data"
    return api
|
||
|
||
def test_delete_duplicate_captions_youtube_command_cc_english_newest(
    mock_youtube_api_cc_english_newest,
):
    """
    When 'CC (English)' is the newest track and a legacy 'ocw_studio_upload'
    track also exists, the legacy track is deleted outright: nothing is
    downloaded from it and 'CC (English)' is neither updated nor inserted.
    """
    site = WebsiteFactory.create(name="Test Site", short_id="test-site")
    VideoFileFactory.create(
        video=VideoFactory.create(website=site),
        destination=DESTINATION_YOUTUBE,
        destination_id="dummy_youtube_id",
    )

    call_command("delete_duplicate_captions_youtube", filter="test-site")

    captions = mock_youtube_api_cc_english_newest.client.captions.return_value
    captions.list.assert_called_with(part="snippet", videoId="dummy_youtube_id")
    # No copy may happen in this scenario.
    for untouched in (captions.download, captions.update, captions.insert):
        untouched.assert_not_called()
    captions.delete.assert_called_once_with(id="caption_id_legacy")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should get the newest item that matches either of the two concerned names.
Maybe it's because of the settings (or the sample video itself), but we get an auto-translation in
data:image/s3,"s3://crabby-images/9da6e/9da6e463b33b1f5498ec1b74f6946565269f47c5" alt="Screenshot 2025-02-25 at 5 24 52 PM"
data:image/s3,"s3://crabby-images/77f15/77f155d9e5a6269524b5ecb88679d0cb16387579" alt="Screenshot 2025-02-25 at 6 39 46 PM"
Russian
as well. In my testing I got the newest item as below. Since it did not match any of the expected values in the if-else
that followed, it did not work for me.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated in d0371bc