From 7071b4b6f48cb9a2f766064f3a5c23f92b293718 Mon Sep 17 00:00:00 2001 From: Guy Martin Date: Fri, 9 Feb 2024 16:34:50 +0000 Subject: [PATCH] feat: support VTT transcript file format in addition to SRT closes #433 --- modules/Admin/Language/en/Episode.php | 4 +- modules/Media/Entities/Transcript.php | 17 +++- modules/Media/TranscriptParser.php | 107 +++++++++++++++++++++++++- themes/cp_admin/episode/create.php | 2 +- themes/cp_admin/episode/edit.php | 2 +- 5 files changed, 123 insertions(+), 9 deletions(-) diff --git a/modules/Admin/Language/en/Episode.php b/modules/Admin/Language/en/Episode.php index 1a5e57a4..4fa846e3 100644 --- a/modules/Admin/Language/en/Episode.php +++ b/modules/Admin/Language/en/Episode.php @@ -139,9 +139,9 @@ return [ 'location_name' => 'Location name or address', 'location_name_hint' => 'This can be a real or fictional location', 'transcript' => 'Transcript (subtitles / closed captions)', - 'transcript_hint' => 'Only .srt are allowed.', + 'transcript_hint' => 'Only .srt or .vtt are allowed.', 'transcript_download' => 'Download transcript', - 'transcript_file' => 'Transcript file (.srt)', + 'transcript_file' => 'Transcript file (.srt or .vtt)', 'transcript_remote_url' => 'Remote url for transcript', 'transcript_file_delete' => 'Delete transcript file', 'chapters' => 'Chapters', diff --git a/modules/Media/Entities/Transcript.php b/modules/Media/Entities/Transcript.php index 33e9ac4e..f0e9a351 100644 --- a/modules/Media/Entities/Transcript.php +++ b/modules/Media/Entities/Transcript.php @@ -76,16 +76,25 @@ class Transcript extends BaseMedia private function saveJsonTranscript(): void { - $srtContent = file_get_contents($this->file->getRealPath()); + $transcriptContent = file_get_contents($this->file->getRealPath()); $transcriptParser = new TranscriptParser(); - if ($srtContent === false) { + if ($transcriptContent === false) { throw new Exception('Could not read transcript file at ' . 
$this->file->getRealPath()); } - $transcriptJson = $transcriptParser->loadString($srtContent) - ->parseSrt(); + $transcript_format = $this->file->getExtension(); + switch ($transcript_format) { + case 'vtt': + $transcriptJson = $transcriptParser->loadString($transcriptContent) + ->parseVtt(); + break; + case 'srt': + default: + $transcriptJson = $transcriptParser->loadString($transcriptContent) + ->parseSrt(); + } $tempFilePath = WRITEPATH . 'uploads/' . $this->file->getRandomName(); file_put_contents($tempFilePath, $transcriptJson); diff --git a/modules/Media/TranscriptParser.php b/modules/Media/TranscriptParser.php index a8b61e4a..90167d0a 100644 --- a/modules/Media/TranscriptParser.php +++ b/modules/Media/TranscriptParser.php @@ -3,7 +3,7 @@ declare(strict_types=1); /** - * Generates and renders a breadcrumb based on the current url segments + * Converts a SRT or VTT file to JSON * * @copyright 2022 Ad Aures * @license https://www.gnu.org/licenses/agpl-3.0.en.html AGPL3 @@ -107,9 +107,114 @@ class TranscriptParser return $jsonString; } + public function parseVtt(): string + { + if (! defined('VTT_STATE_HEADER')) { + define('VTT_STATE_HEADER', 0); + } + + if (! defined('VTT_STATE_BLANK')) { + define('VTT_STATE_BLANK', 1); + } + + if (! defined('VTT_STATE_TIME')) { + define('VTT_STATE_TIME', 2); + } + + if (! 
defined('VTT_STATE_TEXT')) { + define('VTT_STATE_TEXT', 3); + } + + $subs = []; + $state = VTT_STATE_HEADER; + $subNum = 0; + $subText = ''; + $subTime = ''; + + $lines = explode(PHP_EOL, $this->transcriptContent); + // add a newline as last item, if it isn't already a newline + if ($lines[array_key_last($lines)] !== '') { + $lines[] = PHP_EOL; + } + + foreach ($lines as $line) { + switch ($state) { + case VTT_STATE_HEADER: + $state = VTT_STATE_BLANK; + break; + + case VTT_STATE_BLANK: + $state = VTT_STATE_TIME; + break; + + case VTT_STATE_TIME: + $subTime = trim($line); + $state = VTT_STATE_TEXT; + break; + + case VTT_STATE_TEXT: + if (trim($line) === '') { + $sub = new stdClass(); + $sub->number = $subNum; + [$startTime, $endTime] = explode(' --> ', $subTime); + $sub->startTime = $this->getSecondsFromVTTTimeString($startTime); + $sub->endTime = $this->getSecondsFromVTTTimeString($endTime); + $sub->text = trim($subText); + if ($subSpeaker !== '') { + $sub->speaker = trim((string) $subSpeaker); + } + + $subText = ''; + $state = VTT_STATE_TIME; + $subs[] = $sub; + ++$subNum; + } elseif ($subText !== '') { + $subText .= PHP_EOL . $line; + } else { + /** VTT includes a lot of information on the spoken line + * An example may look like this (the leading voice tag is reconstructed here for illustration): + * <v Fred>So this is it + * We need to break this down into its components, namely: + * 1. The actual words for the caption + * 2. Who is speaking + * 3. Any styling cues encoded in the VTT (which we dump) + * More information: https://www.w3.org/TR/webvtt1/ + * How the code below does it: on a text line reached while the cue text is still empty, a leading angle-bracket tag is matched, its angle brackets are removed, the remainder is split on spaces and the second piece is taken as the speaker; the matched tag is then stripped from the line before it is appended to the caption text. + * NOTE(review): the pattern is greedy, so on a line holding several tags it matches through the last '>' on that line - confirm this is intended. + * NOTE(review): a tag without a space (for example an italics tag) leaves no second piece to use as the speaker, and $subSpeaker is only assigned in this branch although it is read when a cue is closed - verify both cases are handled. + */ + $vtt_speaker_pattern = '/^<.*>/'; + $removethese = ['<', '>']; + preg_match($vtt_speaker_pattern, $line, $matches); + if (isset($matches[0])) { + $subVoiceCue = explode(' ', str_replace($removethese, '', $matches[0])); + $subSpeaker = $subVoiceCue[1]; + } else { + $subSpeaker = ''; + } + + $subText .= preg_replace($vtt_speaker_pattern, '', $line); + } + + break; + } + } + + $jsonString = json_encode($subs, JSON_PRETTY_PRINT); + + if (! 
$jsonString) { + throw new Exception('Failed to parse VTT to JSON.'); + } + + return $jsonString; + } + private function getSecondsFromTimeString(string $timeString): float { $timeString = explode(',', $timeString); return (strtotime($timeString[0]) - strtotime('TODAY')) + (float) "0.{$timeString[1]}"; } + + private function getSecondsFromVTTTimeString(string $timeString): float + { + $timeString = explode('.', $timeString); + return (strtotime($timeString[0]) - strtotime('TODAY')) + (float) "0.{$timeString[1]}"; + } } diff --git a/themes/cp_admin/episode/create.php b/themes/cp_admin/episode/create.php index aabe2e3e..2f3f6196 100644 --- a/themes/cp_admin/episode/create.php +++ b/themes/cp_admin/episode/create.php @@ -167,7 +167,7 @@
- +
diff --git a/themes/cp_admin/episode/edit.php b/themes/cp_admin/episode/edit.php index e506617d..4ad4aed2 100644 --- a/themes/cp_admin/episode/edit.php +++ b/themes/cp_admin/episode/edit.php @@ -197,7 +197,7 @@
- +