mirror of
https://code.castopod.org/adaures/castopod
synced 2025-04-19 04:51:17 +00:00
refactor: replace transcript parser by mantas-done/subtitles library
This commit is contained in:
parent
cf4808ab63
commit
ef0e641f69
@ -20,6 +20,7 @@
|
||||
"james-heinrich/getid3": "^2.0.0-beta5",
|
||||
"league/commonmark": "^2.4.2",
|
||||
"league/html-to-markdown": "5.1.1",
|
||||
"mantas-done/subtitles": "^1.0.21",
|
||||
"melbahja/seo": "^v2.1.1",
|
||||
"michalsn/codeigniter4-uuid": "v1.0.2",
|
||||
"mpratt/embera": "^2.0.36",
|
||||
|
1137
composer.lock
generated
1137
composer.lock
generated
File diff suppressed because it is too large
Load Diff
73
modules/Media/CastopodSubtitles/CastopodSubtitles.php
Normal file
73
modules/Media/CastopodSubtitles/CastopodSubtitles.php
Normal file
@ -0,0 +1,73 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @copyright 2024 Ad Aures
|
||||
* @license https://www.gnu.org/licenses/agpl-3.0.en.html AGPL3
|
||||
* @link https://castopod.org/
|
||||
*/
|
||||
|
||||
namespace Modules\Media\CastopodSubtitles;
|
||||
|
||||
use Done\Subtitles\Code\Converters\ConverterContract;
|
||||
use Done\Subtitles\Code\Converters\SrtConverter;
|
||||
use Done\Subtitles\Code\Converters\VttConverter;
|
||||
use Done\Subtitles\Subtitles;
|
||||
use Exception;
|
||||
|
||||
class CastopodSubtitles extends Subtitles
{
    /**
     * Converters available to this class, looked up by their `format` key.
     *
     * @var array<array{extension:string,format:string,name:string,class:class-string<ConverterContract>}>
     */
    public static $formats = [
        [
            'extension' => 'vtt',
            'format' => 'vtt',
            'name' => 'WebVTT',
            'class' => VttConverter::class,
        ],
        [
            'extension' => 'srt',
            'format' => 'srt',
            'name' => 'SubRip',
            'class' => SrtConverter::class,
        ],
        [
            'extension' => 'json',
            'format' => 'json',
            'name' => 'JSON',
            'class' => JSONConverter::class,
        ],
    ];

    /**
     * Renders the loaded subtitles (`$this->internal_format`) with the converter
     * registered for the requested format.
     *
     * @param string $format one of the `format` values declared in self::$formats
     * @param array<mixed> $options passed through untouched to the converter
     * @return string
     */
    public function content($format, $options = [])
    {
        return $this->getConverterByFormat($format)
            ->internalFormatToFileContent($this->internal_format, $options);
    }

    /**
     * Instantiates the converter whose `format` entry strictly equals $format.
     *
     * @param string $format
     * @return ConverterContract
     */
    private function getConverterByFormat($format)
    {
        foreach (self::$formats as $formatDefinition) {
            if ($formatDefinition['format'] !== $format) {
                continue;
            }

            $converterClass = $formatDefinition['class'];

            /** @var ConverterContract $matchingConverter */
            $matchingConverter = new $converterClass();

            return $matchingConverter;
        }

        // no entry of self::$formats matched
        throw new Exception("Can't find suitable converter, for format: {$format}");
    }
}
|
67
modules/Media/CastopodSubtitles/JSONConverter.php
Normal file
67
modules/Media/CastopodSubtitles/JSONConverter.php
Normal file
@ -0,0 +1,67 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @copyright 2024 Ad Aures
|
||||
* @license https://www.gnu.org/licenses/agpl-3.0.en.html AGPL3
|
||||
* @link https://castopod.org/
|
||||
*/
|
||||
|
||||
namespace Modules\Media\CastopodSubtitles;
|
||||
|
||||
use Done\Subtitles\Code\Converters\ConverterContract;
|
||||
|
||||
class JSONConverter implements ConverterContract
{
    /**
     * Tells whether the content is a JSON transcript this converter can read.
     *
     * Being valid JSON is not enough: a scalar, `null` or an arbitrary object
     * also decodes without error but cannot be turned into segments. The content
     * must decode to an array whose entries all look like transcript segments.
     *
     * @param string $file_content
     * @return bool
     */
    public function canParseFileContent($file_content)
    {
        $segments = json_decode($file_content, true);

        if (! is_array($segments)) {
            return false;
        }

        foreach ($segments as $segment) {
            if (! $this->isSegment($segment)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Converts a JSON transcript into the start/end/lines segment structure.
     *
     * Content that does not decode to an array yields an empty result, and
     * entries that are not valid segments are skipped.
     *
     * @param string $file_content
     * @param string $original_file_content
     * @return array<array{start:int|float,end:int|float,lines:array<string>}>
     */
    public function fileContentToInternalFormat($file_content, $original_file_content)
    {
        $segments = json_decode($file_content, true);

        if (! is_array($segments)) {
            return [];
        }

        $internalFormat = [];
        foreach ($segments as $segment) {
            if (! $this->isSegment($segment)) {
                continue;
            }

            // Split on any newline convention instead of PHP_EOL, so the result
            // does not depend on the OS that produced or reads the file.
            $lines = preg_split('/\r\n|\r|\n/', $segment['text']);

            $internalFormat[] = [
                'start' => $segment['startTime'],
                'end' => $segment['endTime'],
                'lines' => $lines === false ? [$segment['text']] : $lines,
            ];
        }

        return $internalFormat;
    }

    /**
     * Serializes segments to a JSON array of number/startTime/endTime/text objects.
     *
     * `number` is the key of the segment in $internal_format.
     *
     * @param array<array{start:int|float,end:int|float,lines:array<string>}> $internal_format
     * @param array<mixed> $output_settings not used by this converter
     *
     * @throws \JsonException when the segments cannot be encoded (e.g. invalid UTF-8),
     *                        instead of silently returning an empty document
     */
    public function internalFormatToFileContent(array $internal_format, array $output_settings): string
    {
        /** @var array<array{number:int,startTime:int|float,endTime:int|float,text:string}> $jsonTranscriptArray */
        $jsonTranscriptArray = [];

        foreach ($internal_format as $key => $value) {
            $jsonTranscriptArray[] = [
                'number' => $key,
                'startTime' => $value['start'],
                'endTime' => $value['end'],
                'text' => implode(PHP_EOL, $value['lines']),
            ];
        }

        return json_encode($jsonTranscriptArray, JSON_THROW_ON_ERROR);
    }

    /**
     * Checks that a decoded JSON entry has numeric `startTime`/`endTime` and a string `text`.
     *
     * @param mixed $segment
     */
    private function isSegment($segment): bool
    {
        return is_array($segment)
            && isset($segment['startTime'], $segment['endTime'], $segment['text'])
            && (is_int($segment['startTime']) || is_float($segment['startTime']))
            && (is_int($segment['endTime']) || is_float($segment['endTime']))
            && is_string($segment['text']);
    }
}
|
@ -10,98 +10,7 @@ declare(strict_types=1);
|
||||
|
||||
namespace Modules\Media\Entities;
|
||||
|
||||
use CodeIgniter\Files\File;
|
||||
use Exception;
|
||||
use Modules\Media\TranscriptParser;
|
||||
|
||||
class Transcript extends BaseMedia
{
    /**
     * Storage key of the JSON file generated from the transcript.
     */
    public ?string $json_key = null;

    /**
     * URL of the generated JSON file, resolved through the file manager service.
     */
    public ?string $json_url = null;

    protected string $type = 'transcript';

    /**
     * Initializes the base file properties, then restores `json_key` / `json_url`
     * when the stored file metadata contains a `json_key` entry.
     */
    public function initFileProperties(): void
    {
        parent::initFileProperties();

        if ($this->file_metadata !== null && array_key_exists('json_key', $this->file_metadata)) {
            helper('media');

            $this->json_key = $this->file_metadata['json_key'];
            $this->json_url = service('file_manager')
                ->getUrl($this->json_key);
        }
    }

    /**
     * Attaches the transcript file and records, in the file metadata, the key
     * under which its JSON version is stored.
     */
    public function setFile(File $file): self
    {
        parent::setFile($file);

        // lstat() returns false (not null) on failure, so `?? []` never fell back
        // and the key assignment below was applied to `false`.
        $fileStat = lstat((string) $file);
        $metadata = $fileStat === false ? [] : $fileStat;

        helper('filesystem');

        // set metadata (generated json file path)
        $this->json_key = change_file_path($this->file_key, '', 'json');
        $metadata['json_key'] = $this->json_key;

        $this->attributes['file_metadata'] = json_encode($metadata, JSON_INVALID_UTF8_IGNORE);

        $this->file = $file;

        return $this;
    }

    /**
     * Generates and stores the JSON transcript, then saves the transcript file itself.
     */
    public function saveFile(): void
    {
        $this->saveJsonTranscript();

        parent::saveFile();
    }

    /**
     * Deletes the transcript file and, when one is referenced, its JSON counterpart.
     *
     * @return bool false as soon as the parent deletion fails; otherwise the result
     *              of deleting the JSON file, or true when there is none
     */
    public function deleteFile(): bool
    {
        if (! parent::deleteFile()) {
            return false;
        }

        if ($this->json_key) {
            return service('file_manager')->delete($this->json_key);
        }

        return true;
    }

    /**
     * Converts the transcript file to JSON and saves it under `$this->json_key`.
     *
     * Files with a `vtt` extension are parsed as WebVTT; every other extension
     * is parsed as SRT.
     *
     * @throws Exception when the transcript cannot be read or the temporary JSON
     *                   file cannot be written
     */
    private function saveJsonTranscript(): void
    {
        $transcriptContent = file_get_contents($this->file->getRealPath());

        if ($transcriptContent === false) {
            throw new Exception('Could not read transcript file at ' . $this->file->getRealPath());
        }

        $transcriptParser = (new TranscriptParser())->loadString($transcriptContent);

        $transcriptJson = $this->file->getExtension() === 'vtt'
            ? $transcriptParser->parseVtt()
            : $transcriptParser->parseSrt();

        $tempFilePath = WRITEPATH . 'uploads/' . $this->file->getRandomName();

        // Fail loudly rather than handing a missing file to the file manager.
        if (file_put_contents($tempFilePath, $transcriptJson) === false) {
            throw new Exception('Could not write transcript json to ' . $tempFilePath);
        }

        $newTranscriptJson = new File($tempFilePath, true);

        service('file_manager')
            ->save($newTranscriptJson, $this->json_key);
    }
}
|
||||
|
@ -1,220 +0,0 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Converts a SRT or VTT file to JSON
|
||||
*
|
||||
* @copyright 2022 Ad Aures
|
||||
* @license https://www.gnu.org/licenses/agpl-3.0.en.html AGPL3
|
||||
* @link https://castopod.org/
|
||||
*/
|
||||
|
||||
namespace Modules\Media;
|
||||
|
||||
use Exception;
|
||||
use stdClass;
|
||||
|
||||
class TranscriptParser
{
    // Parser states are class constants rather than define()d globals, so that
    // calling the parser no longer adds names to the global constant table.
    private const SRT_STATE_SUBNUMBER = 0;

    private const SRT_STATE_TIME = 1;

    private const SRT_STATE_TEXT = 2;

    private const VTT_STATE_HEADER = 0;

    private const VTT_STATE_TIME = 1;

    private const VTT_STATE_TEXT = 2;

    /**
     * Matches a WebVTT voice tag at the start of a cue line, e.g. `<v.loud.top John>`.
     * Group 1 is the annotation (the speaker name), which may contain spaces.
     */
    private const VTT_VOICE_PATTERN = '/^<v(?:\.[^ \t>]*)?(?:[ \t]+([^>]*))?>/';

    /**
     * Raw transcript content set by loadString().
     */
    protected string $transcriptContent;

    /**
     * Sets the transcript content to parse.
     */
    public function loadString(string $content): self
    {
        $this->transcriptContent = $content;

        return $this;
    }

    /**
     * Parses the loaded content as SRT.
     *
     * Adapted from: https://stackoverflow.com/a/11659306
     *
     * Each cue becomes an object with `number`, `startTime`, `endTime` (seconds)
     * and `text`. Any newline convention is accepted, extra blank lines between
     * cues are tolerated, and the last cue is kept even when the file does not
     * end with a blank line.
     *
     * @return string Returns the json encoded string
     *
     * @throws Exception when a timing line is malformed or the result cannot be encoded
     */
    public function parseSrt(): string
    {
        $subs = [];
        $state = self::SRT_STATE_SUBNUMBER;
        $subNum = '';
        $subText = '';
        $subTime = '';

        foreach ($this->splitLines($this->transcriptContent) as $line) {
            switch ($state) {
                case self::SRT_STATE_SUBNUMBER:
                    // a blank line here is only extra spacing between cues
                    if (trim($line) !== '') {
                        $subNum = trim($line);
                        $state = self::SRT_STATE_TIME;
                    }

                    break;

                case self::SRT_STATE_TIME:
                    $subTime = trim($line);
                    $state = self::SRT_STATE_TEXT;
                    break;

                case self::SRT_STATE_TEXT:
                    if (trim($line) === '') {
                        $subs[] = $this->buildSub((int) $subNum, $subTime, $subText, 'SRT');
                        $subText = '';
                        $state = self::SRT_STATE_SUBNUMBER;
                    } elseif ($subText !== '') {
                        $subText .= PHP_EOL . $line;
                    } else {
                        $subText .= $line;
                    }

                    break;
            }
        }

        if ($state === self::SRT_STATE_TEXT) {
            // The file was missing its trailing blank line: the cue being read has
            // not been emitted yet. Build it from the pending values; the previous
            // cue object must not be reused.
            $subs[] = $this->buildSub((int) $subNum, $subTime, $subText, 'SRT');
        }

        $jsonString = json_encode($subs, JSON_PRETTY_PRINT);

        if ($jsonString === false) {
            throw new Exception('Failed to parse SRT to JSON.');
        }

        return $jsonString;
    }

    /**
     * Parses the loaded content as WebVTT.
     *
     * The first line is treated as the header. After it, every line that is not
     * a timing line (blank lines, header metadata, cue identifiers, NOTE blocks)
     * is skipped until a cue starts. Cues are numbered from 0. A leading voice
     * tag such as `<v.loud.top John>` is removed from the text and its annotation
     * is exposed as `speaker`; closing `</v>` tags are dropped.
     *
     * More information: https://www.w3.org/TR/webvtt1/
     *
     * @return string Returns the json encoded string
     *
     * @throws Exception when a timing line is malformed or the result cannot be encoded
     */
    public function parseVtt(): string
    {
        $subs = [];
        $state = self::VTT_STATE_HEADER;
        $subNum = 0;
        $subText = '';
        $subTime = '';
        $subSpeaker = '';

        $lines = $this->splitLines($this->transcriptContent);
        // sentinel blank line so that the last cue is always flushed
        $lines[] = '';

        foreach ($lines as $line) {
            switch ($state) {
                case self::VTT_STATE_HEADER:
                    $state = self::VTT_STATE_TIME;
                    break;

                case self::VTT_STATE_TIME:
                    if (strpos($line, '-->') !== false) {
                        $subTime = trim($line);
                        $state = self::VTT_STATE_TEXT;
                    }

                    break;

                case self::VTT_STATE_TEXT:
                    if (trim($line) === '') {
                        $sub = $this->buildSub($subNum, $subTime, str_replace('</v>', '', $subText), 'VTT');
                        if ($subSpeaker !== '') {
                            $sub->speaker = $subSpeaker;
                        }

                        $subs[] = $sub;
                        ++$subNum;

                        $subText = '';
                        $subSpeaker = '';
                        $state = self::VTT_STATE_TIME;
                    } elseif ($subText !== '') {
                        $subText .= PHP_EOL . $line;
                    } else {
                        // Only a leading voice tag is stripped. Matching it precisely
                        // (rather than greedily up to the last `>`) keeps the spoken
                        // text when the line also contains a closing tag.
                        if (preg_match(self::VTT_VOICE_PATTERN, $line, $matches) === 1) {
                            $subSpeaker = trim($matches[1] ?? '');
                            $line = substr($line, strlen($matches[0]));
                        }

                        $subText .= $line;
                    }

                    break;
            }
        }

        $jsonString = json_encode($subs, JSON_PRETTY_PRINT);

        if ($jsonString === false) {
            throw new Exception('Failed to parse VTT to JSON.');
        }

        return $jsonString;
    }

    /**
     * Splits content into lines on `\r\n`, `\r` or `\n`, after removing a leading UTF-8 BOM.
     *
     * @return array<string>
     */
    private function splitLines(string $content): array
    {
        if (strncmp($content, "\xEF\xBB\xBF", 3) === 0) {
            $content = substr($content, 3);
        }

        $lines = preg_split('/\r\n|\r|\n/', $content);

        return $lines === false ? [$content] : $lines;
    }

    /**
     * Builds one cue object from its number, timing line and accumulated text.
     *
     * @throws Exception when the timing line is malformed
     */
    private function buildSub(int $number, string $timingLine, string $text, string $format): stdClass
    {
        [$startTime, $endTime] = $this->parseTimingLine($timingLine, $format);

        $sub = new stdClass();
        $sub->number = $number;
        $sub->startTime = $startTime;
        $sub->endTime = $endTime;
        $sub->text = trim($text);

        return $sub;
    }

    /**
     * Extracts the start and end times, in seconds, from a `start --> end` line.
     *
     * Anything following the end timestamp (such as WebVTT cue settings) is ignored.
     *
     * @return array{0:float,1:float}
     *
     * @throws Exception when the line has no `-->` separator or a timestamp is invalid
     */
    private function parseTimingLine(string $timingLine, string $format): array
    {
        $parts = preg_split('/\s+-->\s+/', trim($timingLine), 2);

        if ($parts === false || count($parts) !== 2) {
            throw new Exception("Invalid {$format} timing line: {$timingLine}");
        }

        $endParts = preg_split('/\s+/', $parts[1], 2);
        $endTimestamp = $endParts === false ? $parts[1] : $endParts[0];

        return [$this->getSecondsFromTimeString($parts[0]), $this->getSecondsFromTimeString($endTimestamp)];
    }

    /**
     * Converts `[hh:]mm:ss[,|.]fraction` into seconds.
     *
     * The value is computed arithmetically: going through strtotime() made the
     * result depend on the current date and timezone, and rejected hour counts
     * above 23 as well as the hour-less form allowed by WebVTT.
     *
     * @throws Exception when the timestamp does not match the expected form
     */
    private function getSecondsFromTimeString(string $timeString): float
    {
        $pattern = '/^(?:(\d+):)?(\d{1,2}):(\d{1,2})(?:[.,](\d+))?$/';

        if (preg_match($pattern, trim($timeString), $matches) !== 1) {
            throw new Exception("Invalid timestamp: {$timeString}");
        }

        $wholeSeconds = ((int) $matches[1]) * 3600 + ((int) $matches[2]) * 60 + (int) $matches[3];
        $fraction = isset($matches[4]) ? (float) ('0.' . $matches[4]) : 0.0;

        return $wholeSeconds + $fraction;
    }
}
|
@ -13,6 +13,7 @@ namespace Modules\MediaClipper;
|
||||
use App\Entities\Episode;
|
||||
use Exception;
|
||||
use GdImage;
|
||||
use Modules\Media\CastopodSubtitles\CastopodSubtitles;
|
||||
use Modules\Media\Entities\Transcript;
|
||||
use Modules\Media\FileManagers\FileManagerInterface;
|
||||
use Modules\MediaClipper\Config\MediaClipper;
|
||||
@ -122,66 +123,17 @@ class VideoClipper
|
||||
throw new Exception('Episode does not have a transcript!');
|
||||
}
|
||||
|
||||
if ($this->episode->transcript->json_url) {
|
||||
$this->generateSubtitlesClipFromJson($this->episode->transcript->json_key);
|
||||
} else {
|
||||
$subtitlesInput = $this->episode->transcript->file_url;
|
||||
$subtitleClipCmd = "ffmpeg -y -i {$subtitlesInput} -ss {$this->start} -t {$this->duration} {$this->subtitlesClipOutput}";
|
||||
exec($subtitleClipCmd);
|
||||
}
|
||||
}
|
||||
$subtitleString = file_get_contents($this->episode->transcript->file_url);
|
||||
|
||||
public function generateSubtitlesClipFromJson(string $jsonFileKey): void
|
||||
{
|
||||
/** @var FileManagerInterface $fileManager */
|
||||
$fileManager = service('file_manager');
|
||||
|
||||
$jsonTranscriptString = (string) $fileManager->getFileContents($jsonFileKey);
|
||||
if ($jsonTranscriptString === '') {
|
||||
throw new Exception('Cannot get transcript json contents.');
|
||||
if (! $subtitleString) {
|
||||
throw new Exception('Could not load transcript: ' . $this->episode->transcript->file_url);
|
||||
}
|
||||
|
||||
$jsonTranscript = json_decode($jsonTranscriptString, true);
|
||||
if ($jsonTranscript === null) {
|
||||
throw new Exception('Transcript json is invalid.');
|
||||
}
|
||||
$subtitles = CastopodSubtitles::loadFromString($subtitleString);
|
||||
|
||||
$srtClip = '';
|
||||
$segmentIndex = 1;
|
||||
foreach ($jsonTranscript as $segment) {
|
||||
$startTime = null;
|
||||
$endTime = null;
|
||||
$subtitles->trim($this->start, $this->end);
|
||||
|
||||
if ($segment['startTime'] < $this->end && $segment['endTime'] > $this->start) {
|
||||
$startTime = $segment['startTime'] - $this->start;
|
||||
$endTime = $segment['endTime'] - $this->start;
|
||||
}
|
||||
|
||||
if ($segment['startTime'] < $this->start && $this->start < $segment['endTime']) {
|
||||
$startTime = 0;
|
||||
}
|
||||
|
||||
if ($segment['startTime'] < $this->end && $segment['endTime'] >= $this->end) {
|
||||
$endTime = $this->duration;
|
||||
}
|
||||
|
||||
if ($startTime !== null && $endTime !== null) {
|
||||
$formattedStartTime = $this->formatSeconds($startTime);
|
||||
$formattedEndTime = $this->formatSeconds($endTime);
|
||||
$srtClip .= <<<CODE_SAMPLE
|
||||
{$segmentIndex}
|
||||
{$formattedStartTime} --> {$formattedEndTime}
|
||||
{$segment['text']}
|
||||
|
||||
|
||||
CODE_SAMPLE;
|
||||
|
||||
++$segmentIndex;
|
||||
}
|
||||
}
|
||||
|
||||
// create srt clip file
|
||||
file_put_contents($this->subtitlesClipOutput, $srtClip);
|
||||
$subtitles->save($this->subtitlesClipOutput);
|
||||
}
|
||||
|
||||
public function formatSeconds(float $seconds): string
|
||||
|
Loading…
x
Reference in New Issue
Block a user