Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add management command for deleting duplicate captions from YouTube #2409

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 122 additions & 0 deletions videos/management/commands/delete_duplicate_captions_youtube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Management command to delete duplicate captions from YouTube videos"""

import logging
from io import BytesIO

from django.core.management.base import CommandError
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseUpload

from main.management.commands.filter import WebsiteFilterCommand
from videos.constants import DESTINATION_YOUTUBE
from videos.models import VideoFile
from videos.youtube import CAPTION_UPLOAD_NAME, YouTubeApi
from websites.models import Website

log = logging.getLogger(__name__)
LEGACY_CAPTIONS_NAME = "ocw_studio_upload"
ERROR_NO_WEBSITES = "No matching websites found."


class Command(WebsiteFilterCommand):
    """
    Checks which of the 'CC (English)' and 'ocw_studio_upload' captions tracks
    was updated most recently; any other track on the video is ignored.
    If it's 'ocw_studio_upload', copy into 'CC (English)' and remove the
    'ocw_studio_upload' track.
    If it's 'CC (English)' and there is a track named 'ocw_studio_upload',
    remove the 'ocw_studio_upload' track.
    """

    # The class docstring doubles as the command's help text.
    help = __doc__

    def handle(self, *args, **options):
        """
        Handle the deletion of duplicate captions from YouTube videos.

        Narrows all websites with ``self.filter_websites`` (inherited from
        ``WebsiteFilterCommand``; presumably driven by the command's filter
        options), then processes every YouTube ``VideoFile`` that has a
        destination id and belongs to one of those websites.

        Raises:
            CommandError: if no website is left after filtering.
        """
        super().handle(*args, **options)

        filtered_websites = self.filter_websites(websites=Website.objects.all())
        if not filtered_websites.exists():
            raise CommandError(ERROR_NO_WEBSITES)

        video_files = VideoFile.objects.filter(
            destination=DESTINATION_YOUTUBE,
            destination_id__isnull=False,
            video__website__in=filtered_websites,
        ).select_related("video", "video__website")

        youtube = YouTubeApi()

        for vf in video_files:
            video_id = vf.destination_id
            try:
                self._deduplicate_captions(youtube, video_id)
            except HttpError:
                # Best effort: log the failure (log.exception already attaches
                # the exception and traceback) and move on to the next video.
                log.exception("Error processing video %s", video_id)

    def _deduplicate_captions(self, youtube, video_id):
        """
        Collapse the legacy and current caption tracks of a single video.

        Args:
            youtube: a ``YouTubeApi`` instance whose ``client`` exposes the
                ``captions()`` resource.
            video_id: the YouTube id of the video to process.

        ``HttpError`` raised by any API call propagates to the caller.
        """
        response = (
            youtube.client.captions().list(part="snippet", videoId=video_id).execute()
        )

        # Only the two tracks this command manages take part in the comparison.
        # NOTE: a video can carry other tracks as well (an auto-translated
        # Russian track was seen during review) and such a track may be the most
        # recently updated one. Picking the newest across *all* tracks then
        # matched neither branch below and the command silently did nothing.
        managed_names = (LEGACY_CAPTIONS_NAME, CAPTION_UPLOAD_NAME)
        tracks = sorted(
            (
                track
                for track in response.get("items", [])
                if track["snippet"].get("name") in managed_names
            ),
            # Timestamps are compared as strings; presumably ISO-8601, which
            # orders chronologically. A missing value sorts as the oldest.
            key=lambda track: track["snippet"].get("lastUpdated", ""),
            reverse=True,
        )
        if not tracks:
            return

        newest = tracks[0]
        legacy_tracks = [
            track for track in tracks if track["snippet"]["name"] == LEGACY_CAPTIONS_NAME
        ]
        cc_english = [
            track for track in tracks if track["snippet"]["name"] == CAPTION_UPLOAD_NAME
        ]

        if newest["snippet"]["name"] == CAPTION_UPLOAD_NAME:
            # 'CC (English)' is already the freshest copy: drop any legacy track
            # without copying from it.
            for track in legacy_tracks:
                youtube.client.captions().delete(id=track["id"]).execute()
            return

        # The legacy track is the freshest: copy its content into 'CC (English)'
        # (updating the existing track or creating it), then delete the legacy
        # track that was copied.
        caption_id = newest["id"]
        caption_content = youtube.client.captions().download(id=caption_id).execute()
        media_body = MediaIoBaseUpload(
            BytesIO(caption_content),
            mimetype="text/vtt",
            chunksize=-1,
            resumable=True,
        )

        if cc_english:
            youtube.client.captions().update(
                part="snippet",
                body={"id": cc_english[0]["id"]},
                media_body=media_body,
            ).execute()
        else:
            youtube.client.captions().insert(
                part="snippet",
                sync=False,
                body={
                    "snippet": {
                        "language": "en",
                        "name": CAPTION_UPLOAD_NAME,
                        "videoId": video_id,
                    }
                },
                media_body=media_body,
            ).execute()

        youtube.client.captions().delete(id=caption_id).execute()
Comment on lines +134 to +135

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Consider logging the specific HttpError that occurred, or handling different types of HttpError differently. This could provide more context for debugging and monitoring. For example, a 404 error might indicate a video was deleted, while a quota error might require a different response.

Suggested change
except HttpError:
log.exception("Error processing video %s", video_id)
except HttpError as e:
log.exception("Error processing video %s: %s", video_id, e)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is actually incorrect. log.exception includes the actual exception, so making this suggested change causes a linting error.

137 changes: 137 additions & 0 deletions videos/management/commands/delete_duplicate_captions_youtube_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""
Tests for the delete_duplicate_captions_youtube management command.

Verifies that the command inspects YouTube caption tracks for each VideoFile and handles
'ocw_studio_upload' vs. 'CC (English)' as expected.
"""

import pytest
from django.core.management import call_command

from videos.constants import DESTINATION_YOUTUBE
from videos.factories import VideoFactory, VideoFileFactory
from videos.youtube import CAPTION_UPLOAD_NAME
from websites.factories import WebsiteFactory

pytestmark = pytest.mark.django_db


@pytest.fixture()
def mock_youtube_api(mocker):
    """
    Patch ``YouTubeApi`` in the command module and return the mocked instance.

    The mocked captions listing holds two tracks: 'ocw_studio_upload' (the more
    recently updated one) and the ``CAPTION_UPLOAD_NAME`` track. Downloading a
    caption yields a small bytes payload.
    """
    legacy_track = {
        "id": "caption_id_legacy",
        "snippet": {
            "name": "ocw_studio_upload",
            "lastUpdated": "2023-10-01T12:00:00.000Z",
        },
    }
    current_track = {
        "id": "caption_id_other",
        "snippet": {
            "name": CAPTION_UPLOAD_NAME,
            "lastUpdated": "2023-09-30T12:00:00.000Z",
        },
    }

    api = mocker.patch(
        "videos.management.commands.delete_duplicate_captions_youtube.YouTubeApi"
    ).return_value
    captions_api = api.client.captions.return_value
    captions_api.list.return_value.execute.return_value = {
        "items": [legacy_track, current_track]
    }
    captions_api.download.return_value.execute.return_value = b"some vtt data"
    return api


def test_delete_duplicate_captions_youtube_command(mock_youtube_api):
    """
    With 'ocw_studio_upload' as the newest track, the command lists the video's
    captions, downloads the legacy track once, and deletes that legacy track.
    """
    site = WebsiteFactory.create(name="Test Site", short_id="test-site")
    VideoFileFactory.create(
        video=VideoFactory.create(website=site),
        destination=DESTINATION_YOUTUBE,
        destination_id="dummy_youtube_id",
    )

    call_command("delete_duplicate_captions_youtube", filter="test-site")

    # Every captions() call on the mocked client returns this same object.
    captions_api = mock_youtube_api.client.captions.return_value
    captions_api.list.assert_called_with(part="snippet", videoId="dummy_youtube_id")
    captions_api.download.assert_called_once()
    captions_api.delete.assert_called_once_with(id="caption_id_legacy")
Comment on lines +71 to +78

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Consider consolidating the assertions into a single block or using assert_has_calls to verify the sequence of calls. This can improve test readability and maintainability.

    calls = [
        mocker.call.client.captions.return_value.list(part="snippet", videoId="dummy_youtube_id"),
        mocker.call.client.captions.return_value.download(),
        mocker.call.client.captions.return_value.delete(id="caption_id_legacy")
    ]
    mock_youtube_api.assert_has_calls(calls)



@pytest.fixture()
def mock_youtube_api_cc_english_newest(mocker):
    """
    Patch ``YouTubeApi`` in the command module and return the mocked instance.

    Variant in which the ``CAPTION_UPLOAD_NAME`` track is the more recently
    updated one and 'ocw_studio_upload' is the older track.
    """
    legacy_track = {
        "id": "caption_id_legacy",
        "snippet": {
            "name": "ocw_studio_upload",
            "lastUpdated": "2023-09-30T12:00:00.000Z",
        },
    }
    current_track = {
        "id": "caption_id_other",
        "snippet": {
            "name": CAPTION_UPLOAD_NAME,
            "lastUpdated": "2023-10-01T12:00:00.000Z",
        },
    }

    api = mocker.patch(
        "videos.management.commands.delete_duplicate_captions_youtube.YouTubeApi"
    ).return_value
    captions_api = api.client.captions.return_value
    captions_api.list.return_value.execute.return_value = {
        "items": [legacy_track, current_track]
    }
    captions_api.download.return_value.execute.return_value = b"some vtt data"
    return api


def test_delete_duplicate_captions_youtube_command_cc_english_newest(
    mock_youtube_api_cc_english_newest,
):
    """
    With 'CC (English)' as the newest track and a legacy track present, the
    command deletes the legacy track and never downloads, updates, or inserts
    a caption.
    """
    site = WebsiteFactory.create(name="Test Site", short_id="test-site")
    VideoFileFactory.create(
        video=VideoFactory.create(website=site),
        destination=DESTINATION_YOUTUBE,
        destination_id="dummy_youtube_id",
    )

    call_command("delete_duplicate_captions_youtube", filter="test-site")

    captions_api = mock_youtube_api_cc_english_newest.client.captions.return_value
    captions_api.list.assert_called_with(part="snippet", videoId="dummy_youtube_id")
    # Nothing is copied in this scenario.
    for copy_call in (captions_api.download, captions_api.update, captions_api.insert):
        copy_call.assert_not_called()
    captions_api.delete.assert_called_once_with(id="caption_id_legacy")