commit a8a0a85239536dac8f6ce9496cbcab1c7e5b23f0 Author: Juraj Bednar Date: Tue Nov 5 14:45:19 2024 +0100 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ccc280f --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +config.json +episodes*.db +markdown_to_speech +poetry.lock +*.xml +config*.json +run*.sh +*_audio +web-config.json +your_audio +web-episodes.db diff --git a/README-web.md b/README-web.md new file mode 100644 index 0000000..ea6a55a --- /dev/null +++ b/README-web.md @@ -0,0 +1,155 @@ +# Web Interface for RSS-to-Podcast Converter + +This document explains how to set up and use the web interface for the RSS-to-Podcast Converter. The web interface allows users to create accounts, add websites to their personal podcast feed, and obtain their personalized RSS feed URL. + +## Overview + +The web application provides: + +- **User Registration and Login**: Users can create accounts and log in securely. +- **Personalized Podcast Feed**: Each user gets a unique podcast feed identified by their username. +- **Adding Websites**: Users can add any website URL to their feed, which will be processed and converted into podcast episodes. +- **Lo-fi 8-bit ZX Spectrum Style**: The user interface is designed with a retro aesthetic. + +## Setup Instructions + +### Prerequisites + +- Python 3.x +- Required Python packages (install via `pip` or `poetry`): + - Flask + - Flask-Login + - SQLAlchemy + - Other dependencies from the main project + +### Database Recommendation + +**Important**: For the web application, it is highly recommended to use PostgreSQL instead of SQLite due to potential locking issues with multiple users accessing the database simultaneously. + +### Environment Variables + +#### `FLASK_SECRET_KEY` + +The application uses a secret key for session management. You can set the secret key via the `FLASK_SECRET_KEY` environment variable. If this variable is not set, the application will generate a random secret key every time it starts. This approach is suitable for development but not recommended for production. + +### Installation + +1. **Clone the Repository** + + ```bash + git clone https://github.com/yourusername/rss-to-podcast.git + cd rss-to-podcast + ``` + +2. **Install Dependencies** + + To install with web interface support, include the `web` dependency group: + + ```bash + poetry install --with web + ``` + + This installs additional dependencies required for the web interface, including `Flask`, `Flask-Login`, and `SQLAlchemy`. + +3. **Prepare the Database** + + - Create a PostgreSQL database for the application. + - Use the `database-tool.py` script to create the necessary tables: + + ```bash + python database-tool.py create --db postgresql://user:password@localhost:5432/mydatabase + ``` + +4. **Configure the Web Application** + + - Update `web-config.json` to include your PostgreSQL connection string: + + ```json + { + "database": "postgresql://user:password@localhost:5432/mydatabase", + ... + } + ``` + + - Ensure the `web-config.json` template is in the project root directory. + - Set the `FLASK_SECRET_KEY` environment variable for security. + +### Running the Web Application + +Start the Flask web server: + +```bash +python app.py --config web-config.json +``` + +The application will be accessible at `http://localhost:5000`. 
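+
+For anything beyond local testing, set `FLASK_SECRET_KEY` to a stable random value before starting the server, so sessions survive restarts (see Environment Variables above). A minimal sketch, assuming a POSIX-compatible shell:
+
+```bash
+# generate a persistent secret key and hand it to the app via the environment
+export FLASK_SECRET_KEY=$(python -c 'import secrets; print(secrets.token_hex(32))')
+python app.py --config web-config.json
+```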
+
+### Running the Queue Processor
+
+In a separate terminal, run the process that handles the queue of pending episodes:
+
+```bash
+python process_website_queue.py --config web-config.json
+```
+
+This script continuously checks for pending episodes and processes them.
+
+## Usage Instructions
+
+### Registering an Account
+
+1. Navigate to `http://localhost:5000/register`.
+2. Fill in a username and password.
+3. Submit the form to create your account.
+
+### Logging In
+
+1. Go to `http://localhost:5000/login`.
+2. Enter your username and password.
+3. Click "Login" to access your dashboard.
+
+### Dashboard
+
+After logging in, you will see:
+
+- **Your Personalized RSS Feed URL**: This is the URL you can add to your podcast player to receive your episodes.
+- **Copy Icon**: Click this icon to copy the feed URL to your clipboard.
+- **Add Website**: An input box where you can enter a website URL.
+- **"Load it up" Button**: Click this to add the website to your processing queue.
+
+### Adding a Website
+
+1. Enter the full URL of the website you want to convert into a podcast episode.
+2. Click "Load it up".
+3. You will receive a confirmation message: "The podcast has been loaded up into the queue, let's roll the tape!"
+
+### Accessing Your Podcast Feed
+
+- Copy your personalized RSS feed URL from the dashboard.
+- Add this URL to your preferred podcast app to receive your episodes once they're processed.
+
+## Customization
+
+### Retro Styling
+
+The user interface is styled to resemble 8-bit ZX Spectrum graphics. You can customize the templates and CSS in the `templates` and `static` directories to modify the appearance.
+
+### Config Template
+
+The `web-config.json` file serves as the base configuration for all users. Placeholders like `{podcast_id}` are replaced with the user's specific details at runtime.
+
+### Security Considerations
+
+- Set the `FLASK_SECRET_KEY` environment variable to a secure, random value (see Environment Variables above) rather than relying on the key generated at startup.
+- Ensure that the database is secured and not accessible from the web.
+- Use PostgreSQL for better concurrency and to avoid locking issues.
+
+## Troubleshooting
+
+- **Database Errors**: Ensure the database credentials are correct and the database is running.
+- **Processing Issues**: Check the logs of `process_website_queue.py` for any errors during episode processing.
+
+## Additional Information
+
+- The web interface interacts with the existing processing scripts, reusing the core functionality.
+- Users are isolated by their `podcast_id`, which is derived from their username.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8b345fa
--- /dev/null
+++ b/README.md
@@ -0,0 +1,253 @@
+# RSS-to-Podcast Converter
+
+A Python-based application that converts blog posts from an RSS feed into a value-for-value-enabled podcast, allowing listeners to engage with content via audio. This project automates the transformation of blog articles into podcast episodes by using text-to-speech technology, providing an easy way for listeners to consume written content as audio.
+
+Another use-case is converting websites into podcast episodes. They are added manually using `add_website.py`. In that case, there is no source RSS feed.
+
+For this use-case, there is also a web application, where users can generate their own podcast feed from articles they want to read. It powers [loaditfor.me](https://loaditfor.me/). Feel free to run your own instance, see [README-web.md](README-web.md).
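+
+For example, adding a single article by hand looks like this (the command is documented in detail under Usage below):
+
+```bash
+python add_website.py "https://example.com/article" --config config.json
+```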
+
+## Overview
+
+The RSS-to-Podcast Converter pulls articles from an RSS feed, processes the content using a text-to-speech (TTS) model, and generates podcast episodes in MP3 format. The episodes are then assembled into an RSS feed compatible with podcast players, complete with metadata, audio, and descriptions. The generated podcast includes a **value-for-value** system, enabling micropayments and splits for creators via the Lightning Network.
+
+This project uses a database to track processed episodes and ensure each article is only converted once. It allows manual skipping of articles that may not be suitable for TTS conversion, such as posts with embedded videos or images.
+
+## Features
+
+- **Automated Podcast Generation**: Converts blog articles from an RSS feed into podcast episodes using a TTS model.
+- **Customizable Episode Templates**: Configurable episode description templates to link back to the original article.
+- **Configurable Audio Stitching**: Customizable introduction and conclusion audio segments for each episode.
+- **Value-for-Value Integration**: Supports micropayments with customizable splits, allowing listeners to contribute directly.
+- **Automatic Skipping and Reprocessing**: Tracks processed articles to avoid duplicate conversions, with options to reprocess episodes if necessary.
+- **Optional LLM Processing and Verification**: Uses an LLM to optimize text for TTS and verifies the content to avoid unsuitable output.
+- **Customizable Output**: Allows custom intro, outro, and conversion settings for generated MP3 files.
+- **Manual Episode Addition**: Supports adding episodes manually without a source RSS feed.
+- **Flexible Feed Generation**: Can regenerate the RSS feed without processing episodes.
+- **Website Content Addition**: Adds content from any website URL directly into the database for processing.
+- **Support for SQLite and PostgreSQL**: Supports both SQLite and PostgreSQL databases via SQLAlchemy.
+
+## Skipping Podcast Entries
+
+A configuration option lets you automatically skip episodes whose titles match one or more regular expressions. This is especially useful if, for example, you want to avoid reprocessing content that is already in audio form. To use this feature, add a `skip_regexps` array to your configuration file. For instance, to skip any episode whose title contains "audio" or "Audio", add the following:
+
+```json
+"skip_regexps": [
+    "[Aa]udio"
+]
+```
+
+When processing the RSS feed, if an episode's title matches any of these patterns, the episode is immediately marked as skipped and is not converted to speech.
+
+## Dependencies
+
+Install dependencies using Poetry:
+
+```bash
+poetry install
+poetry add psycopg2-binary # for postgresql support
+```
+
+## Database Options
+
+This application supports both SQLite and PostgreSQL databases via SQLAlchemy.
+
+### Configuring the Database
+
+In your configuration file (`config.json`), you can specify the database connection using the `database` parameter.
+
+- **SQLite (Default)**: If you provide a plain filename in `database`, the application will use SQLite.
+- **PostgreSQL**: If you provide a PostgreSQL connection string (starting with `postgresql://`) in `database`, the application will connect to the specified PostgreSQL database.
+
+Example of using PostgreSQL in `config.json`:
+
+```json
+{
+  "database": "postgresql://user:password@localhost:5432/mydatabase",
+  ...
+} +``` + +### Creating Database Tables + +You need to create the database tables before running the application. Use the `database-tool.py` script to create tables. + +For SQLite: + +```bash +python database-tool.py create --db episodes.db +``` + +For PostgreSQL: + +```bash +python database-tool.py create --db postgresql://user:password@localhost:5432/mydatabase +``` + +### Migrating Between Databases + +To migrate data between SQLite and PostgreSQL, use the `migrate` command in `database-tool.py`. + +Example migrating from SQLite to PostgreSQL: + +```bash +python database-tool.py migrate --from episodes.db --to postgresql://user:password@localhost:5432/mydatabase +``` + +Example migrating from PostgreSQL to SQLite: + +```bash +python database-tool.py migrate --from postgresql://user:password@localhost:5432/mydatabase --to episodes.db +``` + +## Configuration + +The project uses a JSON configuration file to define input sources, output settings, and TTS processing details. See the sample configuration file (`config.json.sample`) for details. + +## Running the Application + +When running the application, it will use the database specified in your configuration file. + +For example: + +```bash +python main.py --config config.json +``` + +## Managing the Database + +Use the `database-tool.py` script to manage your database, including creating tables and migrating data. + +### Creating Tables + +```bash +python database-tool.py create --db [database_url_or_filename] +``` + +### Migrating Data + +```bash +python database-tool.py migrate --from [source_db_url_or_filename] --to [destination_db_url_or_filename] +``` + +## Usage + +### Running the Conversion + +The project includes a command-line interface to manage feed processing. Use the following command to start processing the feed: + +```bash +python main.py --config config.json +``` + +### Command-Line Options for `main.py` + +- `--config`: Path to the configuration JSON file. +- `--episode-limit`: Limit the number of episodes to process. +- `--episode-guid`: Process a specific episode by GUID. +- `--reprocess`: Reprocess episodes that are already marked as processed. +- `--only-feed`: Generate the RSS feed without processing episodes. + +Example: + +```bash +python main.py --config config.json --episode-limit 10 --reprocess +``` + +### Managing Episodes with `episode-tool.py` + +`episode-tool.py` allows you to manage episodes in the database, including adding new episodes manually. + +#### Adding a New Episode Manually + +```bash +echo "This is the content of the episode." | python episode-tool.py --new-episode --title "Episode Title" --config config.json +``` + +- **Options**: + - `--new-episode`: Add a new episode to the database. + - `--title`: Title of the episode (required). + - `--guid`: GUID for the episode (optional). If not provided, it's generated based on the link or the current date and title. + - `--link`: Link associated with the episode (optional). + - `--description`: Description of the episode (optional). + - `--date`: Publication date of the episode (optional). Defaults to the current date and time. + - `--markdown`: Content is in Markdown format (default). + - `--html`: Content is in HTML format. + - `--config`: Path to the configuration JSON file. 
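+
+For example, a piped HTML article with an explicit link (the file name is illustrative):
+
+```bash
+cat article.html | python episode-tool.py --new-episode --title "Episode Title" --html --link "https://example.com/article" --config config.json
+```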
+
+#### Other Episode Management Commands
+
+- **List All Episode GUIDs**:
+
+  ```bash
+  python episode-tool.py --list-guids --config config.json
+  ```
+
+- **Mark an Episode as Skipped**:
+
+  ```bash
+  python episode-tool.py --guid "episode-guid" --skip --config config.json
+  ```
+
+- **Reprocess an Episode**:
+
+  ```bash
+  python episode-tool.py --guid "episode-guid" --reprocess --config config.json
+  ```
+
+- **Delete an Episode**:
+
+  ```bash
+  python episode-tool.py --guid "episode-guid" --delete --config config.json
+  ```
+
+### Adding a Website with `add_website.py`
+
+`add_website.py` allows you to add content from any website URL directly into the database for processing.
+
+#### Usage
+
+```bash
+python add_website.py "https://example.com/article" --config config.json
+```
+
+- **Positional Arguments**:
+  - `url`: The URL of the website to add.
+
+- **Options**:
+  - `--config`: Path to the configuration JSON file (optional, defaults to `config.json`).
+  - `--db`: Database filename or connection string (overrides the one specified in the config file).
+
+#### Example
+
+```bash
+python add_website.py "https://example.com/blog-post" --db episodes.db
+```
+
+This command fetches the content and title from the provided URL using the `trafilatura` library and adds it to the database with the status set to `pending`. The content will then be processed the next time you run `main.py`.
+
+## Preprocessing with Regular Expressions
+
+You can specify optional preprocessing regular expressions in your configuration file under the `preprocess_regexps` key. This feature allows you to define an array of regular expressions and their replacements, which will be applied to both the title and content before converting them to speech.
+
+### Example Configuration
+
+```json
+"preprocess_regexps": [
+  {
+    "regexp": " 1-2 ",
+    "replacement": " one to two "
+  },
+  {
+    "regexp": "\\bAI\\b",
+    "replacement": "Artificial Intelligence"
+  }
+]
+```
+
+## Is this free?
+
+This project is free to use, modify, etc. It is free and open source software.
+
+I invested quite a lot of work into this project and related projects that made text to speech possible. I ask you to leave the generated value4value block intact (you can add your splits via config).
+
+[If you found this useful, I appreciate returning the value - pay what it's worth to you](https://juraj.bednar.io/en/support-me/).
diff --git a/add_website.py b/add_website.py
new file mode 100644
index 0000000..65ae0e2
--- /dev/null
+++ b/add_website.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+import argparse
+import sys
+import json
+from datetime import datetime
+import trafilatura
+from utils import get_engine, create_tables, Episode
+from sqlalchemy.orm import sessionmaker
+
+def add_website_to_db(db_url, url, podcast_id):
+    # Fetch and extract content using trafilatura
+    downloaded = trafilatura.fetch_url(url)
+    if not downloaded:
+        print(f"Failed to download content from URL: {url}")
+        sys.exit(1)
+
+    content = trafilatura.extract(downloaded, include_comments=False)
+    metadata = trafilatura.extract_metadata(downloaded)
+    title = metadata.title if metadata and metadata.title else "Untitled"
+
+    # trafilatura.extract() returns None when nothing could be extracted
+    if not content or not content.strip():
+        print("No content extracted from the URL.")
+        sys.exit(1)
+
+    # Generate GUID using the URL
+    article_guid = url
+
+    # Get current date and time in the required format
+    pub_date = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S +0000')
+
+    # Connect to the database and insert the episode
+    engine = get_engine(db_url)
+    Session = sessionmaker(bind=engine)
+    session = Session()
+    create_tables(engine)
+
+    episode = Episode(
+        podcast_id=podcast_id,
+        article_guid=article_guid,
+        title=title,
+        link=url,
+        pub_date=pub_date,
+        description='',
+        content=content,
+        processing_status='pending',
+        skipped=False
+    )
+    session.add(episode)
+    session.commit()
+    session.close()
+    print(f"Website '{title}' added to the database with GUID '{article_guid}' under podcast_id '{podcast_id}'.")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Add a website to the episodes database.")
+    parser.add_argument("url", help="URL of the website to add.")
+    parser.add_argument("--config", default="config.json", help="Path to configuration file.")
+    parser.add_argument("--db", help="Database filename or connection string (overrides config).")
+
+    args = parser.parse_args()
+
+    # Load configuration
+    config = {}
+    if args.config:
+        try:
+            with open(args.config, 'r') as f:
+                config = json.load(f)
+            config['config_file_path'] = args.config
+        except Exception as e:
+            print(f"Warning: Could not load configuration file: {e}")
+            if not args.db:
+                sys.exit(1)
+
+    # Set database filename or connection string
+    db_url = args.db or config.get('database', 'episodes.db')
+
+    # Get podcast_id from config
+    podcast_id = config.get('podcast_id')
+    if not podcast_id:
+        podcast_id = config.get('output_rss_feed', {}).get('atom_link', {}).get('href', 'default_podcast_id')
+
+    # Add website to the database
+    add_website_to_db(db_url, args.url, podcast_id)
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..89a733b
--- /dev/null
+++ b/app.py
@@ -0,0 +1,154 @@
+from flask import Flask, render_template, request, redirect, url_for, flash, g
+from flask_login import LoginManager, login_user, logout_user, login_required, current_user, UserMixin
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.exc import IntegrityError
+from utils import get_engine, create_tables, User as UserModel, Episode
+import json
+import hashlib
+import os
+from add_website import add_website_to_db
+from web_utils import generate_config, sanitize_username
+import secrets
+import argparse
+
+app = Flask(__name__)
+app.secret_key = os.environ.get('FLASK_SECRET_KEY', secrets.token_hex(16))
+
+login_manager = LoginManager()
+login_manager.init_app(app)
+login_manager.login_view = 'login'
+
+config = {}
+engine = None
+Session = 
None + +# Initialize database and session maker at the start +def initialize_app(config_path): + global config, engine, Session + try: + with open(config_path, 'r') as f: + config = json.load(f) + except FileNotFoundError: + print(f"Error: Config file '{config_path}' not found. Exiting.") + exit(1) + + db_url = config.get('database', 'web-episodes.db') + engine = get_engine(db_url) + Session = sessionmaker(bind=engine) + create_tables(engine) + +class User(UserMixin): + def __init__(self, id_, username): + self.id = id_ + self.username = username + +@login_manager.user_loader +def load_user(user_id): + session = get_db_session() + user_row = session.query(UserModel).filter(UserModel.id == int(user_id)).first() + session.close() + if user_row: + return User(user_row.id, user_row.username) + return None + +def get_db_session(): + if 'db_session' not in g: + if Session is None: + raise RuntimeError("Database session is not initialized. Ensure 'initialize_app()' is called before using the application.") + g.db_session = Session() + return g.db_session + +@app.teardown_appcontext +def close_db_session(exception): + db_session = g.pop('db_session', None) + if db_session is not None: + db_session.close() + +@app.route('/register', methods=['GET', 'POST']) +def register(): + if request.method == 'POST': + username = request.form['username'] + password = request.form['password'] + if not sanitize_username(username): + flash('Username can only contain letters, numbers, "-", and "_".') + return render_template('register.html') + # Salt and hash the password + salt = os.urandom(16) + password_hash = hashlib.pbkdf2_hmac('sha256', password.encode(), salt, 100000) + # Store salt and hash together + password_storage = (salt + password_hash).hex() # Convert to hex string for storage + + session = get_db_session() + try: + user = UserModel(username=username, password=password_storage) + session.add(user) + session.commit() + flash('Registration successful. 
Please log in.') + return redirect(url_for('login')) + except IntegrityError: + session.rollback() + flash('Username already exists.') + finally: + session.close() + return render_template('register.html') + +@app.route('/login', methods=['GET', 'POST']) +def login(): + if request.method == 'POST': + username = request.form['username'] + password = request.form['password'] + + session = get_db_session() + user_row = session.query(UserModel).filter(UserModel.username == username).first() + session.close() + if user_row: + stored_password = bytes.fromhex(user_row.password) + salt = stored_password[:16] + stored_hash = stored_password[16:] + password_hash = hashlib.pbkdf2_hmac('sha256', password.encode(), salt, 100000) + if password_hash == stored_hash: + user = User(user_row.id, user_row.username) + login_user(user) + return redirect(url_for('dashboard')) + else: + flash('Invalid username or password.') + else: + flash('Invalid username or password.') + return render_template('login.html') + +@app.route('/logout') +@login_required +def logout(): + logout_user() + return redirect(url_for('login')) + +@app.route('/', methods=['GET', 'POST']) +@login_required +def dashboard(): + podcast_id = f"{current_user.username}_001" + feed_config = generate_config(config, podcast_id) + feed_url = feed_config['output_rss_feed']['atom_link']['href'] + if request.method == 'POST': + url = request.form['url'] + db_url = config.get('database', 'web-episodes.db') + add_website_to_db(db_url, url, podcast_id) + flash("The podcast has been loaded up into the queue, let's roll the tape!") + + return render_template('dashboard.html', feed_url=feed_url) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Run the Flask application') + parser.add_argument('--port', type=int, help='Port to run the server on') + parser.add_argument('--host', help='Host to run the server on') + parser.add_argument('--config', default='web-config.json', help='Path to the configuration file') + + args = parser.parse_args() + + initialize_app(args.config) + + # Determine host and port + host = args.host or config.get('listen', {}).get('host', '127.0.0.1') + port = args.port or config.get('listen', {}).get('port', 5000) + + print(f"Starting server on {host}:{port}") + app.run(host=host, port=port, debug=False) diff --git a/config.json.sample b/config.json.sample new file mode 100644 index 0000000..41dc68b --- /dev/null +++ b/config.json.sample @@ -0,0 +1,136 @@ +{ + "source_rss_feed_url": "https://example.com/feed.xml", + "database": "episodes.db", + "audio_output_directory": "assets/podcast", + "audio_url_prefix": "https://example.com/audioblog/", + "feed_output_filename": "feed.xml", + "reprocess_episode": false, + "process_articles_since": "01-2020", + "episode_description_template": "For the original, text version of this blog, go to: {episode_link}.", + "output_rss_feed": { + "title": "Podcast Title", + "link": "https://example.com", + "description": "Podcast Description", + "language": "en-US", + "last_build_date": null, + "update_period": "hourly", + "update_frequency": 1, + "generator": "Your Generator", + "image": { + "url": "https://example.com/image.jpg", + "title": "Podcast Image Title", + "link": "https://example.com", + "width": 32, + "height": 32 + }, + "atom_link": { + "href": "https://example.com/audioblog/feed.xml", + "rel": "self", + "type": "application/rss+xml" + }, + "itunes": { + "summary": "iTunes Summary", + "author": "Author Name", + "explicit": false, + "image": { + "href": 
"https://example.com/image.jpg" + }, + "owner": { + "name": "Author Name", + "email": "author@example.com" + }, + "subtitle": "Podcast Subtitle", + "category": [ + { + "text": "Education", + "subcategory": "Self-Improvement" + }, + { + "text": "Society & Culture", + "subcategory": "Philosophy" + } + ] + }, + "googleplay": { + "category": "Education" + }, + "podcast": { + "locked": "yes", + "license": "Your License", + "medium": "podcast", + "podping": { + "usesPodping": "true" + }, + "value": { + "type": "lightning", + "method": "keysend", + "suggested": "0.00000005000", + "recipients": [ + { + "type": "node", + "split": "99", + "address": "02f1246b8fe904a5c5193504d8069532b1fb8692b84fb3eb64318b201238f60ff1", + "name": "Main Recipient" + }, + { + "name": "boostbot@fountain.fm", + "type": "node", + "split": "1", + "address": "03b6f613e88bd874177c28c6ad83b3baba43c4c656f56be1f8df84669556054b79", + "customKey": "906608", + "customValue": "01IMQkt4BFzAiSynxcQQqd", + "fee": "true" + } + ] + } + } + }, + "blog_to_speech_llm": { + "enabled": true, + "endpoint": "http://127.0.0.1:11434", + "model": "dolphin-llama3:70b", + "prompt": "Please convert the following text into a format suitable for an audio narration, with focus on minimal wording changes. Preserve markdown formatting. The text should be easy to understand for listeners without visual cues, such as parentheses or list formatting. Maintain the original meaning and details but adapt it for spoken language, do not summarize, simplify, keep the content as close to the original as possible. Break up long sentences if necessary to enhance clarity and listenability. If you add numbered bullet points, write them in words instead of numbers (\"First.\" instead of \"1.\"). Change words into pronounced forms. \"fountain.fm\" would become \"fountain dot fm\", \"2.0\" would be \"two point oh\". Only output converted text, nothing else. Text:.", + "max_chunk_size": 1000, + "max_length_difference": 0.1 + }, + "llm_verifier": { + "enabled": true, + "endpoint": "http://localhost:11434", + "model": "gemma3:27b", + "prompt": "Verify that the output text maintains the same meaning and content as the input text, without adding extra information or omitting important details. The output should be adapted for spoken language but should not significantly alter the original content. Respond with 'Valid' if the output meets these criteria, or 'Wrong' if it does not. 
Do not output anything else.", + "expected_output": "Valid" + }, + "tts_options": { + "ref_audio": "", + "pause": 0.5, + "alpha": 0.3, + "beta": 0.7, + "diffusion_steps": 5, + "embedding_scale": 1.0, + "min_similarity": 0.9, + "pause_h1_before": 2.0, + "pause_h1_after": 0.7, + "pause_h2_before": 1.5, + "pause_h2_after": 0.7, + "pause_h3_before": 0.7, + "pause_h3_after": 0.7, + "split_at_headings": false, + "max_retries": 1 + }, + "prefix_audio_files": ["intro.wav"], + "postfix_audio_files": ["outro.wav"], + "audio_speedup": 1.3, + "mp3_conversion": { + "bitrate": "192k", + "codec": "libmp3lame" + }, + "preprocess_regexps": [ + { + "regexp": " 1-2 ", + "replacement": " one to two " + } + ], + "skip_regexps": [ + "[Aa]udio" + ] +} diff --git a/content_processing.py b/content_processing.py new file mode 100644 index 0000000..4488396 --- /dev/null +++ b/content_processing.py @@ -0,0 +1,121 @@ +import re +from bs4 import BeautifulSoup +from markdownify import markdownify as md +from ollama_client import OllamaClient +import nltk +from nltk.tokenize import sent_tokenize +from colorama import init, Fore, Style + +# Initialize colorama +init(autoreset=True) + +def clean_and_convert_content(content, is_markdown): + if not is_markdown: + # Clean HTML and convert to Markdown + clean_html = clean_html_content(content) + markdown_content = convert_html_to_markdown(clean_html) + else: + markdown_content = content + + # Remove images and links + markdown_content = remove_images_and_links(markdown_content) + return markdown_content + +def prepare_for_speech(markdown_content, config): + # Optional LLM Processing + if config['blog_to_speech_llm']['enabled']: + markdown_content = process_with_llm(markdown_content, config) + return markdown_content + +def clean_html_content(html_content): + soup = BeautifulSoup(html_content, 'html.parser') + for script in soup(['script', 'style']): + script.decompose() + return str(soup) + +def convert_html_to_markdown(html_content): + return md(html_content, strip=['a', 'img', 'b', 'i'], heading_style="ATX", escape_asterisks=False, + escape_underscores=False, escape_misc=False, bullets='*') + +def remove_images_and_links(markdown_content): + markdown_content = re.sub(r'!\[.*?\]\(.*?\)', '', markdown_content) + markdown_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', markdown_content) + return markdown_content + +def process_with_llm(markdown_content, config): + try: + ollama_client = OllamaClient(config['blog_to_speech_llm']['model'], url=config['blog_to_speech_llm']['endpoint']) + max_chunk_size = config['blog_to_speech_llm']['max_chunk_size'] + + chunks = split_content(markdown_content, max_chunk_size) + processed_chunks = [] + + for chunk in chunks: + prompt = f"{config['blog_to_speech_llm']['prompt']}\n\n{chunk}" + print(f"{Fore.GREEN}Processing chunk with LLM:{Style.RESET_ALL} \"{chunk}\"") + response = ollama_client.generate(prompt) + print(f"{Fore.GREEN}Processed chunk:{Style.RESET_ALL} \"{response}\"") + + if response and isinstance(response, str): + verified = verify_output(chunk, response, config) + if verified: + processed_chunks.append(response) + else: + processed_chunks.append(chunk) + else: + processed_chunks.append(chunk) + + if len(processed_chunks) == len(chunks): + return '\n\n'.join(processed_chunks) + + print(f"{Fore.RED}LLM processing failed. 
Using original content.{Style.RESET_ALL}") + return markdown_content + except Exception as e: + print(f"{Fore.RED}Error in LLM processing: {e}{Style.RESET_ALL}") + return markdown_content + +def split_content(content, max_chunk_size): + sentences = sent_tokenize(content) + chunks = [] + current_chunk = "" + + for sentence in sentences: + if len(current_chunk) + len(sentence) + 1 <= max_chunk_size: + current_chunk += sentence + "\n" + else: + chunks.append(current_chunk.strip()) + current_chunk = sentence + "\n" + + if current_chunk: + chunks.append(current_chunk.strip()) + + return chunks + +def verify_output(input_text, output_text, config): + max_length_difference = config['blog_to_speech_llm']['max_length_difference'] + verifier_config = config['llm_verifier'] + + # Length check + input_length = len(input_text) + output_length = len(output_text) + length_difference = abs(input_length - output_length) / input_length + + if length_difference > max_length_difference: + print(f"{Fore.RED}Length difference ({length_difference:.2%}) exceeds maximum allowed ({max_length_difference:.2%}){Style.RESET_ALL}") + return False + + # If LLM verifier is not enabled, return True after passing length check + if not verifier_config['enabled']: + return True + + ollama_client = OllamaClient(verifier_config['model'], url=verifier_config['endpoint']) + + prompt = f"{verifier_config['prompt']}\n\nInput: {input_text}\n\nOutput: {output_text}" + + response = ollama_client.generate(prompt) + if response.strip() == verifier_config['expected_output']: + print(f"{Fore.GREEN}LLM validated the output, yay!{Style.RESET_ALL}") + return True + + print(f"{Fore.RED}LLM verification failed:{Style.RESET_ALL} {response.strip()}") + return False diff --git a/database-tool.py b/database-tool.py new file mode 100644 index 0000000..001acec --- /dev/null +++ b/database-tool.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +import argparse +from utils import get_engine, create_tables, Episode, User +from sqlalchemy.orm import sessionmaker + +def create_db(db_url): + engine = get_engine(db_url) + create_tables(engine) + print(f"Database tables created in {db_url}") + +def migrate_db(from_db_url, to_db_url): + from_engine = get_engine(from_db_url) + to_engine = get_engine(to_db_url) + + from_Session = sessionmaker(bind=from_engine) + to_Session = sessionmaker(bind=to_engine) + + create_tables(to_engine) + + from_session = from_Session() + to_session = to_Session() + + try: + # Migrate Episodes + episodes = from_session.query(Episode).all() + for episode in episodes: + to_session.merge(episode) + # Migrate Users + users = from_session.query(User).all() + for user in users: + to_session.merge(user) + to_session.commit() + print(f"Migration from {from_db_url} to {to_db_url} completed successfully.") + except Exception as e: + to_session.rollback() + print(f"Error during migration: {e}") + finally: + from_session.close() + to_session.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Database management tool.") + subparsers = parser.add_subparsers(dest='command') + + # Create command + create_parser = subparsers.add_parser('create', help='Create database tables.') + create_parser.add_argument('--db', required=True, help='Database filename or connection string.') + + # Migrate command + migrate_parser = subparsers.add_parser('migrate', help='Migrate database.') + migrate_parser.add_argument('--from', dest='from_db', required=True, help='Source database filename or connection string.') + 
migrate_parser.add_argument('--to', dest='to_db', required=True, help='Destination database filename or connection string.') + + args = parser.parse_args() + + if args.command == 'create': + create_db(args.db) + elif args.command == 'migrate': + migrate_db(args.from_db, args.to_db) + else: + parser.print_help() diff --git a/episode-tool.py b/episode-tool.py new file mode 100755 index 0000000..3e64dc0 --- /dev/null +++ b/episode-tool.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +import argparse +import sys +import json +from datetime import datetime +from utils import get_engine, create_tables, Episode, slugify +from sqlalchemy.orm import sessionmaker +from content_processing import clean_and_convert_content + +def list_episodes(db_url, list_type, exclude_options, podcast_id): + engine = get_engine(db_url) + Session = sessionmaker(bind=engine) + session = Session() + + try: + query = session.query(Episode).filter(Episode.podcast_id == podcast_id) + if exclude_options.get('exclude_skipped'): + query = query.filter(Episode.skipped == False) + if exclude_options.get('exclude_processed'): + query = query.filter(Episode.processing_status != 'processed') + if exclude_options.get('exclude_pending'): + query = query.filter(Episode.processing_status != 'pending') + if exclude_options.get('exclude_reprocess'): + query = query.filter(Episode.processing_status != 'reprocess') + + episodes = query.all() + finally: + session.close() + + for episode in episodes: + print(getattr(episode, list_type)) + +def select_episode(session, podcast_id, guid=None, title=None, link=None): + query = session.query(Episode).filter(Episode.podcast_id == podcast_id) + if guid: + query = query.filter(Episode.article_guid == guid) + elif title: + query = query.filter(Episode.title.like(f"%{title}%")) + else: + query = query.filter(Episode.link.like(f"%{link}%")) + return query.first() + +def mark_episode_skipped(db_url, podcast_id, guid=None, title=None, link=None, reprocess=False): + if not guid and not title and not link: + print("You must provide either an article GUID, title, or link to identify the episode.") + sys.exit(1) + + engine = get_engine(db_url) + Session = sessionmaker(bind=engine) + session = Session() + create_tables(engine) + + try: + episode = select_episode(session, podcast_id, guid, title, link) + if not episode: + print("Episode not found.") + sys.exit(1) + + if reprocess: + episode.processing_status = 'reprocess' + episode.skipped = False + status = 'reprocess' + else: + episode.processing_status = 'skipped' + episode.skipped = True + status = 'skipped' + session.commit() + print(f"Episode '{episode.title}' status set to '{status}'.") + finally: + session.close() + +def delete_episode(db_url, podcast_id, guid=None, title=None, link=None): + if not guid and not title and not link: + print("You must provide either an article GUID, title, or link to identify the episode.") + sys.exit(1) + + engine = get_engine(db_url) + Session = sessionmaker(bind=engine) + session = Session() + + try: + episode = select_episode(session, podcast_id, guid, title, link) + if not episode: + print("Episode not found.") + sys.exit(1) + session.delete(episode) + session.commit() + print(f"Episode '{episode.title}' has been deleted from the database.") + finally: + session.close() + +def add_new_episode(db_url, args, config): + content = sys.stdin.read() + if not content.strip(): + print("No content provided. 
Please provide content via stdin.") + sys.exit(1) + + podcast_id = config.get('podcast_id') + if not podcast_id: + podcast_id = config.get('output_rss_feed', {}).get('atom_link', {}).get('href', 'default_podcast_id') + + engine = get_engine(db_url) + Session = sessionmaker(bind=engine) + session = Session() + create_tables(engine) + + try: + # Generate GUID + if args.guid: + article_guid = args.guid + elif args.link: + article_guid = slugify(args.link) + else: + date_str = datetime.utcnow().strftime('%Y%m%d') + title_slug = slugify(args.title) + article_guid = f"{date_str}-{title_slug}" + + pub_date = args.date or datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S +0000') + + # Determine content format + if args.html: + is_markdown = False + else: + is_markdown = True # default is markdown + + content = clean_and_convert_content(content, is_markdown=is_markdown) + + episode = Episode( + podcast_id=podcast_id, + article_guid=article_guid, + title=args.title, + link=args.link or '', + pub_date=pub_date, + description=args.description or '', + content=content, + processing_status='pending', + skipped=False + ) + session.add(episode) + session.commit() + print(f"New episode '{args.title}' added to the database.") + finally: + session.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Manage episodes in the database.") + parser.add_argument("--config", default="config.json", help="Path to configuration file") + + group = parser.add_mutually_exclusive_group() + group.add_argument("--new-episode", action="store_true", help="Add a new episode to the database") + group.add_argument("--list-guids", action="store_true", help="List all episode GUIDs") + group.add_argument("--list-links", action="store_true", help="List all episode links") + group.add_argument("--list-titles", action="store_true", help="List all episode titles") + group.add_argument("--skip", action="store_true", help="Mark an episode as skipped") + group.add_argument("--reprocess", action="store_true", help="Set processing_status to 'reprocess'") + group.add_argument("--delete", action="store_true", help="Delete an episode from the database") + + # Exclude for list + parser.add_argument("--exclude-skipped", action="store_true", help="Exclude episodes marked for skipping from the list") + parser.add_argument("--exclude-processed", action="store_true", help="Exclude processed episodes from the list") + parser.add_argument("--exclude-pending", action="store_true", help="Exclude pending episodes from the list") + parser.add_argument("--exclude-reprocess", action="store_true", help="Exclude episodes marked for reprocessing from the list") + + parser.add_argument("--guid", help="GUID of the episode to update, add, or delete") + parser.add_argument("--title", help="Title of the episode to update, add, or delete") + parser.add_argument("--link", help="Link of the episode to update, add, or delete") + parser.add_argument("--description", help="Description of the new episode") + parser.add_argument("--date", help="Publication date of the new episode") + parser.add_argument("--db", help="Database filename or connection string") + + format_group = parser.add_mutually_exclusive_group() + format_group.add_argument("--markdown", action="store_true", help="Content is in Markdown format (default)") + format_group.add_argument("--html", action="store_true", help="Content is in HTML format") + + args = parser.parse_args() + + # Load configuration + try: + with open(args.config, 'r') as f: + config = json.load(f) + 
config['config_file_path'] = args.config + except Exception as e: + print(f"Error loading configuration file: {e}") + sys.exit(1) + + # Set default db filename or connection string from config if not provided + db_url = args.db or config.get('database', 'episodes.db') + + podcast_id = config.get('podcast_id') + if not podcast_id: + podcast_id = config.get('output_rss_feed', {}).get('atom_link', {}).get('href', 'default_podcast_id') + + exclude_options = { + 'exclude_skipped': args.exclude_skipped, + 'exclude_processed': args.exclude_processed, + 'exclude_pending': args.exclude_pending, + 'exclude_reprocess': args.exclude_reprocess + } + + if args.list_guids: + list_episodes(db_url, 'article_guid', exclude_options, podcast_id) + elif args.list_links: + list_episodes(db_url, 'link', exclude_options, podcast_id) + elif args.list_titles: + list_episodes(db_url, 'title', exclude_options, podcast_id) + elif args.new_episode: + add_new_episode(db_url, args, config) + elif args.skip or args.reprocess: + mark_episode_skipped(db_url, podcast_id, guid=args.guid, title=args.title, link=args.link, reprocess=args.reprocess) + elif args.delete: + delete_episode(db_url, podcast_id, guid=args.guid, title=args.title, link=args.link) + else: + print("Error: No command supplied. Please use one of the available options.") + parser.print_help() + sys.exit(1) diff --git a/episode_processing_utils.py b/episode_processing_utils.py new file mode 100644 index 0000000..9c829b5 --- /dev/null +++ b/episode_processing_utils.py @@ -0,0 +1,188 @@ +import os +import re +import tempfile +import subprocess +from datetime import datetime +from content_processing import prepare_for_speech +from markdown_to_speech import convert_markdown_to_speech +from sqlalchemy.orm import sessionmaker +from utils import slugify, get_engine, Episode +from colorama import init, Fore, Style + +# Initialize colorama +init(autoreset=True) + +def process_episode(episode, config): + title = episode.title + content = episode.content + + # Apply preprocess_regexps to content and title if available + if 'preprocess_regexps' in config: + content = apply_preprocess_regexps(content, config['preprocess_regexps']) + title = apply_preprocess_regexps(title, config['preprocess_regexps']) + + print(f"{Fore.GREEN}Processing episode:{Style.RESET_ALL} {title}") + + content = prepare_for_speech(content, config) + + audio_basename = slugify(title) + + # Convert Title to Speech + title_audio_wav = os.path.join(tempfile.gettempdir(), f"title_audio_{audio_basename}.wav") + convert_markdown_to_speech(title, title_audio_wav, **config['tts_options']) + + # Convert Markdown to Speech + content_audio_wav = os.path.join(tempfile.gettempdir(), f"content_audio_{audio_basename}.wav") + convert_markdown_to_speech(content, content_audio_wav, **config['tts_options']) + + # Apply audio speedup if configured + if config.get('audio_speedup') and config['audio_speedup'] != 1: + content_audio_wav = speedup_wav_file(content_audio_wav, config['audio_speedup']) + title_audio_wav = speedup_wav_file(title_audio_wav, config['audio_speedup']) + + # Combine Audio Files + final_audio_wav = os.path.join(tempfile.gettempdir(), f"final_audio_{audio_basename}.wav") + combine_audio_files( + config.get('prefix_audio_files', []), + title_audio_wav, + content_audio_wav, + config.get('postfix_audio_files', []), + final_audio_wav + ) + + # Convert to MP3 + mp3_filename = f"{audio_basename}.mp3" + os.makedirs(config['audio_output_directory'], exist_ok=True) + mp3_file_path = 
os.path.join(config['audio_output_directory'], mp3_filename) + convert_to_mp3(final_audio_wav, mp3_file_path, config['mp3_conversion']) + duration, file_size = get_mp3_duration_and_size(mp3_file_path) + + # Update Episode Metadata + episode.processing_status = 'processed' + episode.processed_date = datetime.utcnow().isoformat() + episode.mp3_file_path = mp3_filename # Store the filename instead of full path + episode.duration = duration + episode.file_size = file_size + update_episode_in_db(episode, config['database']) + + # Clean Up Temporary Files + os.remove(content_audio_wav) + os.remove(title_audio_wav) + os.remove(final_audio_wav) + +def speedup_wav_file(wav_file, audio_speedup): + output_wav_file = os.path.join(tempfile.gettempdir(), f"speedup_{os.path.basename(wav_file)}") + subprocess.run([ + 'ffmpeg', '-y', '-i', wav_file, '-filter:a', + f"atempo={audio_speedup}", output_wav_file + ], check=True) + os.remove(wav_file) # Remove the original file + return output_wav_file + +def combine_audio_files(prefix_files, title_audio_file, content_audio_file, postfix_files, output_file): + audio_files = [] + + # Handle prefix files + if len(prefix_files) == 0: + # No prefix files + pass + elif len(prefix_files) == 1: + # One prefix file: prefix + content + audio_files.extend([prefix_files[0]]) + elif len(prefix_files) == 2: + # Two prefix files: first prefix + title + second prefix + content + audio_files.extend([prefix_files[0], title_audio_file, prefix_files[1]]) + else: + raise ValueError("Prefix files should be either 0, 1, or 2 files.") + + # Add the content audio + audio_files.append(content_audio_file) + + # Handle postfix files + if len(postfix_files) == 0: + # No postfix files + pass + elif len(postfix_files) == 1: + # One postfix file: content + postfix + audio_files.extend([postfix_files[0]]) + elif len(postfix_files) == 2: + # Two postfix files: content + first postfix + title + second postfix + audio_files.extend([postfix_files[0], title_audio_file, postfix_files[1]]) + else: + raise ValueError("Postfix files should be either 0, 1, or 2 files.") + + # Create a temporary file listing the audio files + concat_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt') + for audio_file in audio_files: + concat_file.write(f"file '{audio_file}'\n") + concat_file.close() + + # Use ffmpeg to concatenate audio files + subprocess.run([ + 'ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', + concat_file.name, '-c', 'copy', output_file + ], check=True) + + os.remove(concat_file.name) + +def convert_to_mp3(wav_file, mp3_file, mp3_config): + os.makedirs(os.path.dirname(mp3_file), exist_ok=True) + subprocess.run([ + 'ffmpeg', '-y', '-i', wav_file, '-codec:a', + mp3_config.get('codec', 'libmp3lame'), + '-b:a', mp3_config.get('bitrate', '192k'), mp3_file + ], check=True) + +def get_mp3_duration_and_size(mp3_file_path): + if not os.path.isfile(mp3_file_path): + return None, None + + # Get duration using ffprobe + cmd = [ + 'ffprobe', '-v', 'error', '-show_entries', + 'format=duration', '-of', + 'default=noprint_wrappers=1:nokey=1', mp3_file_path + ] + try: + duration = float(subprocess.check_output(cmd).strip()) + file_size = os.path.getsize(mp3_file_path) + return int(duration), file_size + except Exception as e: + print(f"Error processing {mp3_file_path}: {e}") + return None, None + +def update_episode_in_db(episode, db_url): + """ + Updates the episode's metadata in the database. 
+ """ + engine = get_engine(db_url) + Session = sessionmaker(bind=engine) + session = Session() + + try: + existing_episode = session.query(Episode).filter( + Episode.podcast_id == episode.podcast_id, + Episode.article_guid == episode.article_guid + ).first() + if existing_episode: + existing_episode.processing_status = episode.processing_status + existing_episode.mp3_file_path = episode.mp3_file_path + existing_episode.processed_date = episode.processed_date + existing_episode.duration = episode.duration + existing_episode.file_size = episode.file_size + session.commit() + print(f"Episode '{episode.title}' updated in the database.") + else: + print(f"Episode '{episode.title}' not found in the database.") + except Exception as e: + session.rollback() + print(f"Error updating episode in DB: {e}") + finally: + session.close() + +def apply_preprocess_regexps(text, regexps): + for item in regexps: + regexp = item['regexp'] + replacement = item['replacement'] + text = re.sub(regexp, replacement, text) + return text diff --git a/episode_processor.py b/episode_processor.py new file mode 100644 index 0000000..741808d --- /dev/null +++ b/episode_processor.py @@ -0,0 +1,39 @@ +from episode_processing_utils import process_episode +from utils import create_tables, parse_pub_date, get_engine, Episode +from sqlalchemy.orm import sessionmaker + +def process_pending_episodes(config, reprocess=False, episode_limit=None): + engine = get_engine(config['database']) + Session = sessionmaker(bind=engine) + session = Session() + create_tables(engine) + + try: + podcast_id = config.get('podcast_id') + if not podcast_id: + podcast_id = config.get('output_rss_feed', {}).get('atom_link', {}).get('href', 'default_podcast_id') + + if reprocess: + status_clause = Episode.processing_status.in_(['processed', 'pending', 'reprocess']) + else: + status_clause = Episode.processing_status.in_(['pending', 'reprocess']) + + episodes_query = session.query(Episode).filter( + status_clause, + Episode.skipped == False, + Episode.podcast_id == podcast_id + ) + + episodes = episodes_query.all() + + finally: + session.close() + + # Sort episodes by parsed_pub_date, newest first + episodes.sort(key=lambda episode: parse_pub_date(episode.pub_date), reverse=True) + + if episode_limit: + episodes = episodes[:episode_limit] + + for episode in episodes: + process_episode(episode, config) diff --git a/feed_downloader.py b/feed_downloader.py new file mode 100644 index 0000000..2ff7fae --- /dev/null +++ b/feed_downloader.py @@ -0,0 +1,123 @@ +import feedparser +import re +from utils import create_tables, slugify, get_engine, Episode +from datetime import datetime +from content_processing import clean_and_convert_content +from sqlalchemy.orm import sessionmaker + +def download_and_insert_articles(config, episode_limit=None, specific_episode_guid=None, reprocess=False): + feed_url = config.get('source_rss_feed_url') + if not feed_url: + print("No source RSS feed URL provided. Skipping feed download.") + return + + process_articles_since = config.get('process_articles_since') + if process_articles_since: + try: + process_date = parse_config_date(process_articles_since) + except ValueError as e: + print(f"Error parsing process_articles_since date: {e}. 
Allowed formats: YYYY, MM-YYYY, DD-MM-YYYY or RSS pubDate format") + return + + podcast_id = config.get('podcast_id') + if not podcast_id: + podcast_id = config.get('output_rss_feed', {}).get('atom_link', {}).get('href', 'default_podcast_id') + + feed = feedparser.parse(feed_url) + engine = get_engine(config['database']) + Session = sessionmaker(bind=engine) + session = Session() + create_tables(engine) + + try: + episodes_added = 0 + + for entry in feed.entries: + article_guid = entry.get('id') or entry.get('guid') or entry.link + + if specific_episode_guid and article_guid != specific_episode_guid: + continue + + pub_date = entry.get('published', datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S +0000')) + try: + entry_date = datetime.strptime(pub_date, '%a, %d %b %Y %H:%M:%S %z') + except ValueError: + # Try ISO 8601 format + entry_date = datetime.strptime(pub_date, '%Y-%m-%dT%H:%M:%S%z') + if process_articles_since: + if entry_date.date() < process_date.date(): + continue + + existing_episode = session.query(Episode).filter( + Episode.podcast_id == podcast_id, + Episode.article_guid == article_guid + ).first() + + # If an episode already exists and is marked as skipped, simply continue. + if existing_episode and existing_episode.skipped: + continue + + # Check if the episode should be skipped based on title matching any skip_regexps + skip_regexps = config.get('skip_regexps', []) + if skip_regexps and any(re.search(pattern, entry.title) for pattern in skip_regexps): + print(f"Skipping article '{entry.title}' because it matches a skip pattern.") + continue + + content = entry.get('content', [{'value': ''}])[0]['value'] + if not content: + content = entry.get('description') or '' + is_markdown = False # Assume content from feed is HTML + content = clean_and_convert_content(content, is_markdown) + + if existing_episode: + if existing_episode.processing_status == 'reprocess' or reprocess: + # Update episode with new metadata and mark as pending + print(f"Will reprocess article '{entry.title}'.") + existing_episode.title = entry.title + existing_episode.link = entry.link + existing_episode.pub_date = pub_date + existing_episode.description = entry.get('summary', '') + existing_episode.content = content + existing_episode.processing_status = 'pending' + session.commit() + print(f"Episode '{existing_episode.title}' updated and marked for reprocessing.") + else: + continue # Episode already exists and is not marked for reprocessing + else: + episode = Episode( + podcast_id=podcast_id, + article_guid=article_guid, + title=entry.title, + link=entry.link, + pub_date=pub_date, + description=entry.get('summary', ''), + content=content, + processing_status='pending', + skipped=False + ) + session.add(episode) + session.commit() + episodes_added += 1 + print(f"Episode '{episode.title}' added to the database.") + + if episode_limit and episodes_added >= episode_limit: + break + finally: + session.close() + +def parse_config_date(date_string): + formats = [ + '%Y', # YYYY + '%m-%Y', # MM-YYYY + '%d-%m-%Y', # DD-MM-YYYY + '%a, %d %b %Y %H:%M:%S %z', # RSS pubDate format + '%Y-%m-%dT%H:%M:%S%z' # ISO8601 format + ] + + for fmt in formats: + try: + return datetime.strptime(date_string, fmt) + except ValueError: + continue + + raise ValueError(f"Unable to parse date: {date_string}") diff --git a/feed_generator.py b/feed_generator.py new file mode 100644 index 0000000..331790c --- /dev/null +++ b/feed_generator.py @@ -0,0 +1,93 @@ +from xml.etree.ElementTree import Element, SubElement, tostring +import 
xml.dom.minidom +from datetime import datetime +import os +from utils import create_tables, format_duration, add_channel_metadata, parse_pub_date, get_engine, Episode +from sqlalchemy.orm import sessionmaker + +def generate_output_rss_feed(config): + engine = get_engine(config['database']) + Session = sessionmaker(bind=engine) + session = Session() + create_tables(engine) + + podcast_id = config.get('podcast_id') + if not podcast_id: + podcast_id = config.get('output_rss_feed', {}).get('atom_link', {}).get('href', 'default_podcast_id') + + episodes = session.query(Episode).filter( + Episode.processing_status == 'processed', + Episode.skipped == False, + Episode.podcast_id == podcast_id + ).all() + + session.close() + + episodes.sort(key=lambda episode: parse_pub_date(episode.pub_date), reverse=True) + + if not episodes: + print("No processed episodes found. Skipping RSS feed generation.") + return + + rss = Element('rss', version='2.0', attrib={ + 'xmlns:content': "http://purl.org/rss/1.0/modules/content/", + 'xmlns:wfw': "http://wellformedweb.org/CommentAPI/", + 'xmlns:dc': "http://purl.org/dc/elements/1.1/", + 'xmlns:atom': "http://www.w3.org/2005/Atom", + 'xmlns:sy': "http://purl.org/rss/1.0/modules/syndication/", + 'xmlns:slash': "http://purl.org/rss/1.0/modules/slash/", + 'xmlns:itunes': "http://www.itunes.com/dtds/podcast-1.0.dtd", + 'xmlns:podcast': "https://podcastindex.org/namespace/1.0", + 'xmlns:rawvoice': "https://blubrry.com/developer/rawvoice-rss/", + 'xmlns:googleplay': "http://www.google.com/schemas/play-podcasts/1.0" + }) + + channel = SubElement(rss, 'channel') + add_channel_metadata(channel, config['output_rss_feed']) + + SubElement(channel, 'lastBuildDate').text = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S +0000') + + for episode in episodes: + article_guid = episode.article_guid + title = episode.title + link = episode.link + pub_date = episode.pub_date + description = episode.description + mp3_file_path = episode.mp3_file_path + duration = episode.duration + file_size = episode.file_size + + mp3_url = config['audio_url_prefix'] + if not mp3_url.endswith('/'): + mp3_url += '/' + mp3_url += mp3_file_path + + item = SubElement(channel, 'item') + SubElement(item, 'title').text = title + SubElement(item, 'link').text = link + SubElement(item, 'guid', isPermaLink="false").text = article_guid + SubElement(item, 'pubDate').text = pub_date + description_text = config['episode_description_template'].format( + episode_link=link + ) + SubElement(item, 'description').text = description_text + + enclosure_attribs = { + 'url': mp3_url, + 'type': "audio/mpeg" + } + if file_size: + enclosure_attribs['length'] = str(file_size) + SubElement(item, 'enclosure', **enclosure_attribs) + + if duration: + SubElement(item, 'itunes:duration').text = format_duration(duration) + + rough_string = tostring(rss, 'utf-8') + reparsed = xml.dom.minidom.parseString(rough_string) + pretty_xml = reparsed.toprettyxml(indent=" ") + + with open(config['feed_output_filename'], 'w', encoding='utf-8') as f: + f.write(pretty_xml) + + print(f"RSS feed generated at {config['feed_output_filename']}") diff --git a/main.py b/main.py new file mode 100755 index 0000000..457d02e --- /dev/null +++ b/main.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +import argparse +import json +import sys +from feed_downloader import download_and_insert_articles +from episode_processor import process_pending_episodes +from feed_generator import generate_output_rss_feed + +def main(): + parser = 
argparse.ArgumentParser(description="Convert blog RSS feed to podcast")
+    parser.add_argument('--config', default='config.json', help='Path to configuration file')
+    parser.add_argument('--episode-limit', type=int, help='Limit the number of episodes to process')
+    parser.add_argument('--episode-guid', help='GUID of a specific episode to process')
+    parser.add_argument('--reprocess', action='store_true', help='Reprocess episodes even if already processed')
+    parser.add_argument('--only-feed', action='store_true', help='Only generate the RSS feed without processing episodes')
+
+    args = parser.parse_args()
+
+    try:
+        with open(args.config, 'r') as f:
+            config = json.load(f)
+        config['config_file_path'] = args.config
+    except Exception as e:
+        print(f"Error loading configuration file: {e}")
+        sys.exit(1)
+
+    if 'podcast_id' not in config:
+        config['podcast_id'] = config.get('output_rss_feed', {}).get('atom_link', {}).get('href', 'default_podcast_id')
+
+    if not args.only_feed:
+        if config.get('source_rss_feed_url'):
+            download_and_insert_articles(config, episode_limit=args.episode_limit, specific_episode_guid=args.episode_guid, reprocess=args.reprocess)
+        process_pending_episodes(config, reprocess=args.reprocess, episode_limit=args.episode_limit)
+
+    generate_output_rss_feed(config)
+
+if __name__ == '__main__':
+    main()
diff --git a/ollama_client.py b/ollama_client.py
new file mode 100644
index 0000000..6ee1a0f
--- /dev/null
+++ b/ollama_client.py
@@ -0,0 +1,13 @@
+import ollama
+
+class OllamaClient:
+    def __init__(self, model, url='http://localhost:11434'):
+        self.model = model
+        self.client = ollama.Client(host=url)
+
+    def generate(self, prompt):
+        try:
+            response = self.client.generate(model=self.model, prompt=prompt)
+            return response['response']
+        except Exception as e:
+            raise RuntimeError(f"Ollama API request failed: {e}") from e
diff --git a/process_website_queue.py b/process_website_queue.py
new file mode 100644
index 0000000..8ce43fa
--- /dev/null
+++ b/process_website_queue.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+import time
+import json
+import argparse
+import subprocess
+from episode_processor import process_episode
+from utils import create_tables, get_engine, Episode
+from feed_generator import generate_output_rss_feed
+from web_utils import generate_config
+from sqlalchemy.orm import sessionmaker
+
+def process_website_queue(config_template_file, after_command=None):
+    # Load config template once at the start
+    with open(config_template_file, 'r') as f:
+        config_template = json.load(f)
+
+    # Set up the engine once; only the session is opened per iteration
+    db_url = config_template.get('database', 'web-episodes.db')
+    engine = get_engine(db_url)
+    Session = sessionmaker(bind=engine)
+    create_tables(engine)
+
+    while True:
+        session = Session()
+        try:
+            pending_episodes = session.query(Episode).filter(
+                Episode.processing_status == 'pending'
+            ).all()
+        finally:
+            session.close()
+
+        if pending_episodes:
+            print(f"Found {len(pending_episodes)} episode(s) to process.")
+            for episode in pending_episodes:
+                podcast_id = episode.podcast_id
+                config = generate_config(config_template, podcast_id)
+                process_episode(episode, config)
+                # After processing each episode, regenerate the feed
+                generate_output_rss_feed(config)
+                # If after_command is specified, execute it
+                if after_command:
+                    subprocess.run(after_command, shell=True)
+        else:
+            print("No episodes to process. 
Sleeping for 60 seconds.") + time.sleep(60) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Process website queue") + parser.add_argument('--config', default='web-config.json', help='Path to configuration template file') + parser.add_argument('--after-command', help='Command to execute after each feed is generated') + args = parser.parse_args() + process_website_queue(args.config, args.after_command) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b6b7e63 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,35 @@ +[tool.poetry] +name = "rss2podcast" +version = "0.1.0" +description = "" +authors = ["Juraj Bednar "] +readme = "README.md" +package-mode = false + +[tool.poetry.dependencies] +python = "^3.11" +requests = "^2.32.3" +feedparser = "^6.0.11" +beautifulsoup4 = "^4.12.3" +markdownify = "^0.13.1" +lxml = "^5.3.0" +pytz = "^2024.2" +soundfile = "^0.12.1" +nltk = "^3.9.1" +pywhispercpp = "^1.2.0" +colorama = "^0.4.6" +styletts2 = { git = "https://github.com/jooray/StyleTTS2", branch = "main" } +markdown_to_speech = { git = "https://github.com/jooray/markdown2audio", branch = "main" } +phonemizer = "^3.3.0" +ollama = "^0.3.3" +trafilatura = "^1.12.2" +sqlalchemy = "^2.0.36" + +[tool.poetry.group.web.dependencies] +flask = "^3.1.0" +flask-login = "^0.6.3" +psycopg2-binary = "^2.9.10" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/static/icons/icon-192x192.png b/static/icons/icon-192x192.png new file mode 100644 index 0000000..7d7adae Binary files /dev/null and b/static/icons/icon-192x192.png differ diff --git a/static/icons/icon-512x512.png b/static/icons/icon-512x512.png new file mode 100644 index 0000000..eed4014 Binary files /dev/null and b/static/icons/icon-512x512.png differ diff --git a/static/manifest.json b/static/manifest.json new file mode 100644 index 0000000..9d5449b --- /dev/null +++ b/static/manifest.json @@ -0,0 +1,20 @@ +{ + "name": "Load It Up!", + "short_name": "LoadItUp", + "start_url": "/", + "display": "standalone", + "background_color": "#fdf6e3", + "theme_color": "#268bd2", + "icons": [ + { + "src": "/static/icons/icon-192x192.png", + "sizes": "192x192", + "type": "image/png" + }, + { + "src": "/static/icons/icon-512x512.png", + "sizes": "512x512", + "type": "image/png" + } + ] +} diff --git a/static/style.css b/static/style.css new file mode 100644 index 0000000..5da4d27 --- /dev/null +++ b/static/style.css @@ -0,0 +1,122 @@ +/* Solarized Color Scheme */ +:root { + --base03: #002b36; + --base02: #073642; + --base01: #586e75; + --base00: #657b83; + --base0: #839496; + --base1: #93a1a1; + --base2: #eee8d5; + --base3: #fdf6e3; + --yellow: #b58900; + --orange: #cb4b16; + --red: #dc322f; + --magenta: #d33682; + --violet: #6c71c4; + --blue: #268bd2; + --cyan: #2aa198; + --green: #859900; +} + +@media (prefers-color-scheme: dark) { + :root { + --background-color: var(--base03); + --foreground-color: var(--base0); + --link-color: var(--orange); + } +} + +@media (prefers-color-scheme: light) { + :root { + --background-color: var(--base3); + --foreground-color: var(--base00); + --link-color: var(--orange); + } +} + +body { + background-color: var(--background-color); + color: var(--foreground-color); + font-family: 'Open Sans', sans-serif; +} + +.container { + width: 80%; + max-width: 600px; + margin: auto; + padding: 20px; +} + +h1 { + text-align: center; + color: var(--orange); +} + +label { + display: block; + margin-top: 20px; + font-size: 16px; +} + 
+input[type="text"], input[type="url"], input[type="password"] { + width: 100%; + padding: 10px; + background-color: var(--background-color); + color: var(--foreground-color); + border: 1px solid var(--foreground-color); + font-family: 'Open Sans', sans-serif; + font-size: 16px; + box-sizing: border-box; +} + +button { + background-color: var(--orange); + color: var(--background-color); + border: none; + padding: 10px 20px; + cursor: pointer; + font-family: 'Open Sans', sans-serif; + font-size: 16px; + margin-top: 10px; +} + +button:hover { + background-color: var(--foreground-color); + color: var(--background-color); +} + +.messages { + list-style-type: none; + padding: 0; +} + +.messages li { + background-color: var(--red); + color: var(--background-color); + padding: 10px; + margin-bottom: 5px; + font-size: 14px; +} + +a { + color: var(--link-color); +} + +.feed-url { + display: flex; + align-items: center; +} + +.feed-url input { + flex: 1; + margin-right: 10px; +} + +.title-image { + display: block; + max-width: 100%; + height: auto; + margin: 20px auto; + border-radius: 10px; + box-shadow: 0px 4px 6px rgba(0, 0, 0, 0.1); +} diff --git a/static/sw.js b/static/sw.js new file mode 100644 index 0000000..b09c0bc --- /dev/null +++ b/static/sw.js @@ -0,0 +1,28 @@ +self.addEventListener('install', function(event) { + event.waitUntil( + caches.open('rss2podcast-cache-v1') + .then(function(cache) { + console.log('Opened cache'); + return cache.addAll([ + '/', + '/static/style.css', + '/static/manifest.json', + '/static/icons/icon-192x192.png', + '/static/icons/icon-512x512.png' + ]); + }) + ); +}); + +self.addEventListener('fetch', function(event) { + event.respondWith( + caches.match(event.request) + .then(function(response) { + // Cache hit - return response + if (response) { + return response; + } + return fetch(event.request); + }) + ); +}); diff --git a/templates/dashboard.html b/templates/dashboard.html new file mode 100644 index 0000000..d3d19bf --- /dev/null +++ b/templates/dashboard.html @@ -0,0 +1,149 @@ + + + + Loaditfor.me + + + + + + + +
+

Welcome, {{ current_user.username }}

+
+ + + +
+
+ + + +
+ {% with messages = get_flashed_messages() %} + {% if messages %} +
    + {% for message in messages %} +
  • {{ message }}
  • + {% endfor %} +
+ {% endif %} + {% endwith %} +

If you've found value in this project, consider reciprocating with a contribution in Bitcoin. Your support directly reflects the value you've received!

+ +
+ + +
+
+ + + +
+ +
+ +
+ +
+ Title Image +
+ + + diff --git a/templates/login.html b/templates/login.html new file mode 100644 index 0000000..7d48aa7 --- /dev/null +++ b/templates/login.html @@ -0,0 +1,34 @@ + + + + Loaditfor.me + + + + + + + +
+

Login

+
+ + + + + +
+

Don't have an account? Register here.

+ {% with messages = get_flashed_messages() %} + {% if messages %} +
    + {% for message in messages %} +
  • {{ message }}
  • + {% endfor %} +
+ {% endif %} + {% endwith %} + Title Image +
+ + diff --git a/templates/register.html b/templates/register.html new file mode 100644 index 0000000..0a9dfe2 --- /dev/null +++ b/templates/register.html @@ -0,0 +1,33 @@ + + + + Loaditfor.me + + + + + + + +
+

Register

+
+ + + + + +
+ {% with messages = get_flashed_messages() %} + {% if messages %} +
    + {% for message in messages %} +
  • {{ message }}
  • + {% endfor %} +
+ {% endif %} + {% endwith %} + Title Image +
+</body>
+</html>
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..2a9ebf0
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,155 @@
+import re
+from datetime import datetime
+from xml.etree.ElementTree import SubElement
+from sqlalchemy import (
+    create_engine, Column, Integer, String, Text, DateTime, Boolean, UniqueConstraint, Float
+)
+# declarative_base lives in sqlalchemy.orm as of SQLAlchemy 2.0
+from sqlalchemy.orm import declarative_base, sessionmaker
+
+Base = declarative_base()
+
+# Database models
+class Episode(Base):
+    __tablename__ = 'episodes'
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    podcast_id = Column(String)
+    article_guid = Column(String)
+    title = Column(String)
+    link = Column(String)
+    pub_date = Column(String)
+    description = Column(String)
+    content = Column(Text)
+    processing_status = Column(String, default='pending')
+    mp3_file_path = Column(String)
+    processed_date = Column(String)
+    skipped = Column(Boolean, default=False)
+    duration = Column(Integer)
+    file_size = Column(Integer)
+    __table_args__ = (UniqueConstraint('podcast_id', 'article_guid', name='_podcast_article_uc'),)
+
+class User(Base):
+    __tablename__ = 'users'
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    username = Column(String, unique=True)
+    password = Column(Text)
+
+# Function to get the SQLAlchemy engine
+def get_engine(db_url):
+    if db_url.startswith('postgresql://'):
+        engine = create_engine(db_url)
+    else:
+        engine = create_engine(f'sqlite:///{db_url}', connect_args={"check_same_thread": False})
+    return engine
+
+# Function to create tables
+def create_tables(engine):
+    Base.metadata.create_all(engine)
+
+def format_duration(seconds):
+    hours = seconds // 3600
+    minutes = (seconds % 3600) // 60
+    seconds = seconds % 60
+    if hours > 0:
+        return f"{hours}:{minutes:02}:{seconds:02}"
+    else:
+        return f"{minutes}:{seconds:02}"
+
+def add_channel_metadata(channel, metadata):
+    for key, value in metadata.items():
+        if key == 'image':
+            image = SubElement(channel, 'image')
+            for img_key, img_value in value.items():
+                SubElement(image, img_key).text = str(img_value)
+        elif key == 'atom_link':
+            SubElement(channel, 'atom:link', attrib=value)
+        elif key == 'itunes':
+            for itunes_key, itunes_value in value.items():
+                if itunes_key == 'image':
+                    SubElement(channel, 'itunes:image', href=str(itunes_value['href']))
+                elif itunes_key == 'owner':
+                    owner = SubElement(channel, 'itunes:owner')
+                    SubElement(owner, 'itunes:name').text = str(itunes_value['name'])
+                    SubElement(owner, 'itunes:email').text = str(itunes_value['email'])
+                elif itunes_key == 'category':
+                    for category in itunes_value:
+                        itunes_category = SubElement(channel, 'itunes:category', text=str(category['text']))
+                        if 'subcategory' in category:
+                            SubElement(itunes_category, 'itunes:category', text=str(category['subcategory']))
+                else:
+                    SubElement(channel, f"itunes:{itunes_key}").text = str(itunes_value)
+        elif key == 'googleplay':
+            googleplay_category = SubElement(channel, 'googleplay:category')
+            googleplay_category.text = str(value['category'])
+        elif key == 'podcast':
+            add_podcast_namespace(channel, value)
+        else:
+            SubElement(channel, key).text = str(value)
+
+def add_podcast_namespace(channel, podcast_metadata_input):
+    static_recipient = {
+        "name": "rss2podcast tool",
+        "type": "node",
+        "address": "02f1246b8fe904a5c5193504d8069532b1fb8692b84fb3eb64318b201238f60ff1",
+    }
+
+    # normalize input
+    if not isinstance(podcast_metadata_input, dict):
+        processed = {}
+    else:
+        processed = podcast_metadata_input.copy()
+
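+    # The block below fills in podcast:value (value-for-value) defaults,
+    # lightning keysend with a suggested amount, and prepends the static
+    # rss2podcast recipient to whatever recipients the config provides.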
+    # get or init value block
+    val = processed.get('value', {}) or {}
+    if not isinstance(val, dict):
+        val = {}
+    val.setdefault('type', "lightning")
+    val.setdefault('method', "keysend")
+    val.setdefault('suggested', "0.00000005000")
+    val.setdefault('recipients', [])
+    orig = [r for r in val['recipients'] if isinstance(r, dict)]
+
+    final = []
+    if not orig:
+        final.append({**static_recipient, 'split': 100})
+    else:
+        total = sum(int(r.get('split', 0)) for r in orig)
+        if total <= 0:
+            final.append({**static_recipient, 'split': 100})
+        else:
+            # Give the static recipient total/9 on top of the existing splits,
+            # i.e. roughly a 10% share of the new combined total.
+            static_split = max(1, round(total / 9))
+            final.append({**static_recipient, 'split': static_split})
+            for r in orig:
+                final.append({**r, 'split': int(r.get('split', 0))})
+    val['recipients'] = final
+    processed['value'] = val
+
+    # build XML
+    for k, v in processed.items():
+        if k == 'value':
+            attrs = {'type': v['type'], 'method': v['method']}
+            if v.get('suggested') is not None:
+                attrs['suggested'] = str(v['suggested'])
+            tag = SubElement(channel, 'podcast:value', **attrs)
+            for r in v['recipients']:
+                SubElement(tag, 'podcast:valueRecipient',
+                           **{rk: str(rv) for rk, rv in r.items() if rv is not None})
+        elif isinstance(v, dict):
+            SubElement(channel, f"podcast:{k}",
+                       **{k2: str(v2) for k2, v2 in v.items() if v2 is not None})
+        elif v is not None:
+            SubElement(channel, f"podcast:{k}").text = str(v)
+
+def slugify(value):
+    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
+    return re.sub(r'[-\s]+', '-', value)
+
+def parse_pub_date(pub_date_str):
+    try:
+        return datetime.strptime(pub_date_str, "%a, %d %b %Y %H:%M:%S %z")
+    except ValueError:
+        try:
+            # fromisoformat may return a naive datetime; attach the local
+            # timezone so callers can sort all returned dates together.
+            parsed = datetime.fromisoformat(pub_date_str)
+            return parsed if parsed.tzinfo else parsed.astimezone()
+        except ValueError:
+            # Unparseable date: fall back to "now" as an aware datetime.
+            return datetime.now().astimezone()
diff --git a/web-config.json-sample b/web-config.json-sample
new file mode 100644
index 0000000..9cde512
--- /dev/null
+++ b/web-config.json-sample
@@ -0,0 +1,114 @@
+{
+  "podcast_id": "{podcast_id}",
+  "database": "postgresql://rss2podcast@localhost/rss2podcast",
+  "audio_output_directory": "podcast_audio/{podcast_id}",
+  "audio_url_prefix": "https://yourdomain.com/podcast_audio/{podcast_id}/",
+  "feed_output_filename": "podcast_audio/{podcast_id}/feed.xml",
+  "reprocess_episode": false,
+  "episode_description_template": "This episode was generated from your provided URL.",
+  "output_rss_feed": {
+    "title": "Your Personal Podcast Feed",
+    "link": "https://yourdomain.com/",
+    "description": "A personalized podcast feed generated by the RSS-to-Podcast Converter.",
+    "language": "en-US",
+    "generator": "rss2podcast",
+    "image": {
+      "url": "https://yourdomain.com/static/images/podcast_cover.jpg",
+      "title": "Podcast Cover",
+      "link": "https://yourdomain.com",
+      "width": 1400,
+      "height": 1400
+    },
+    "atom_link": {
+      "href": "https://yourdomain.com/podcast_audio/{podcast_id}/feed.xml",
+      "rel": "self",
+      "type": "application/rss+xml"
+    },
+    "itunes": {
+      "summary": "Your personalized podcast feed.",
+      "author": "RSS-to-Podcast Converter",
+      "explicit": false,
+      "image": {
+        "href": "https://yourdomain.com/static/images/podcast_cover.jpg"
+      },
+      "owner": {
+        "name": "Your Name",
+        "email": "you@example.com"
+      },
+      "subtitle": "Personalized Feed",
+      "category": [
+        {
+          "text": "Technology",
+          "subcategory": "Software How-To"
+        }
+      ]
+    },
+    "googleplay": {
+      "category": "Education"
+    },
+    "podcast": {
+      "locked": "no",
+      "license": "V4V",
+      "medium": "podcast",
+      "podping": {
+        "usesPodping": "true"
+      },
+      "value": {
+        "type": "lightning",
+        "method": "keysend",
+        "suggested": "0.00000005000",
+        "recipients": [
+          
{ + "type": "node", + "split": "99", + "address": "02f1246b8fe904a5c5193504d8069532b1fb8692b84fb3eb64318b201238f60ff1", + "name": "Main Recipient" + }, + { + "name": "boostbot@fountain.fm", + "type": "node", + "split": "1", + "address": "03b6f613e88bd874177c28c6ad83b3baba43c4c656f56be1f8df84669556054b79", + "customKey": "906608", + "customValue": "01IMQkt4BFzAiSynxcQQqd", + "fee": "true" + } + ] + } + } + }, + "blog_to_speech_llm": { + "enabled": true, + "endpoint": "http://127.0.0.1:11434", + "model": "dolphin-llama3:70b", + "prompt": "Please convert the text that begins after TEXT: below into a format suitable for an audio narration, with focus on minimal wording changes. Preserve markdown formatting. The text should be easy to understand for listeners without visual cues, such as parentheses or list formatting. Maintain the original meaning and details but adapt it for spoken language, do not summarize, simplify, keep the content as close to the original as possible. Break up long sentences if necessary to enhance clarity and listenability. If you add numbered bullet points, write them in words instead of numbers (\"First\" instead of \"1.\"). Change words into pronounced forms. \"fountain.fm\" would become \"fountain dot fm\", \"2.0\" would be \"two point oh\". Only output converted text, nothing else. TEXT:", + "max_chunk_size": 1000, + "max_length_difference": 0.25 + }, + "llm_verifier": { + "enabled": true, + "endpoint": "http://localhost:11434", + "model": "gemma3:27b", + "prompt": "Verify that the output text maintains the same meaning and content as the input text, without adding extra information or omitting important details. The output should be adapted for spoken language but should not significantly alter the original content. Respond with 'Valid' if the output meets these criteria, do not output anything else. If it does not meet this criteria, output 'Wrong:' and a justification on why. ", + "expected_output": "Valid" + }, + "tts_options": { + "ref_audio": "voices/reference.wav", + "pause": 0.5, + "alpha": 0.3, + "beta": 0.7, + "diffusion_steps": 5, + "embedding_scale": 1.0, + "min_similarity": 0.9, + "split_at_headings": false, + "max_retries": 1 + }, + "prefix_audio_files": ["voices/01-intro.wav", + "voices/02-intro.wav"], + "postfix_audio_files": [ "voices/03-outro.wav" ], + "audio_speedup": 1.3, + "mp3_conversion": { + "bitrate": "192k", + "codec": "libmp3lame" + } +} diff --git a/web_utils.py b/web_utils.py new file mode 100644 index 0000000..1cbf030 --- /dev/null +++ b/web_utils.py @@ -0,0 +1,15 @@ +import json +import re + +def generate_config(config_template, podcast_id): + config_str = json.dumps(config_template) + config_str = config_str.replace('{podcast_id}', podcast_id) + config = json.loads(config_str) + config['podcast_id'] = podcast_id + return config + +def sanitize_username(username): + if re.match(r'^[a-zA-Z0-9_-]+$', username): + return True + else: + return False