rss2podcast/add_website.py

#!/usr/bin/env python3
import argparse
import json
import sys
from datetime import datetime, timezone

import trafilatura
from sqlalchemy.orm import sessionmaker

from utils import get_engine, create_tables, Episode

def add_website_to_db(db_url, url, podcast_id):
    """Download a web page, extract its text and store it as a pending episode.

    Args:
        db_url: Database filename or connection string handed to ``get_engine``.
        url: Address of the page to fetch. It is also stored as the episode's
            ``article_guid`` and ``link``.
        podcast_id: Identifier of the podcast the new episode is filed under.

    Raises:
        SystemExit: With status 1 when the page cannot be downloaded or no
            text can be extracted from it.
        Exception: Any error raised while writing to the database is re-raised
            after the transaction has been rolled back.
    """
    # Fetch and extract content using trafilatura
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        print(f"Failed to download content from URL: {url}")
        sys.exit(1)

    content = trafilatura.extract(downloaded, include_comments=False)
    # extract() yields None when nothing usable is found, so test for that
    # before calling .strip() on the result.
    if not content or not content.strip():
        print("No content extracted from the URL.")
        sys.exit(1)

    metadata = trafilatura.extract_metadata(downloaded)
    title = metadata.title if metadata and metadata.title else "Untitled"

    # The URL doubles as the GUID of the episode.
    article_guid = url

    # Timestamp in UTC; an aware datetime replaces the deprecated utcnow()
    # while producing the same formatted string.
    pub_date = datetime.now(timezone.utc).strftime('%a, %d %b %Y %H:%M:%S +0000')

    # Connect to the database and insert the episode
    engine = get_engine(db_url)
    create_tables(engine)
    session = sessionmaker(bind=engine)()
    try:
        session.add(
            Episode(
                podcast_id=podcast_id,
                article_guid=article_guid,
                title=title,
                link=url,
                pub_date=pub_date,
                description='',
                content=content,
                processing_status='pending',
                skipped=False,
            )
        )
        session.commit()
    except Exception:
        # Undo the partial transaction, then let the caller see the error.
        session.rollback()
        raise
    finally:
        # Release the connection whether or not the commit succeeded.
        session.close()

    print(f"Website '{title}' added to the database with GUID '{article_guid}' under podcast_id '{podcast_id}'.")

if __name__ == "__main__":
    # Command-line entry point: read the configuration, work out which
    # database and podcast to use, then add the given URL as an episode.
    parser = argparse.ArgumentParser(description="Add a website to the episodes database.")
    parser.add_argument("url", help="URL of the website to add.")
    parser.add_argument("--config", default="config.json", help="Path to configuration file.")
    parser.add_argument("--db", help="Database filename or connection string (overrides config).")

    args = parser.parse_args()

    # Load configuration
    config = {}
    if args.config:
        try:
            # Read as UTF-8 explicitly rather than relying on the platform
            # default encoding.
            with open(args.config, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
            # Only a JSON object can be queried with .get() below; reject
            # anything else so ``config`` is never left as a list or scalar.
            if not isinstance(loaded, dict):
                raise ValueError("top-level JSON value must be an object")
            config = loaded
            config['config_file_path'] = args.config
        except (OSError, ValueError) as e:
            # ValueError also covers json.JSONDecodeError and decoding errors.
            print(f"Warning: Could not load configuration file: {e}")
            # Without a config file the database must come from --db.
            if not args.db:
                sys.exit(1)

    # Set database filename or connection string
    db_url = args.db or config.get('database', 'episodes.db')

    # Get podcast_id from config, falling back to the feed's atom link and
    # finally to a fixed default. ``or`` guards against null/empty entries.
    podcast_id = config.get('podcast_id')
    if not podcast_id:
        feed_settings = config.get('output_rss_feed') or {}
        atom_link = feed_settings.get('atom_link') or {}
        podcast_id = atom_link.get('href') or 'default_podcast_id'

    # Add website to the database
    add_website_to_db(db_url, args.url, podcast_id)
initial commit 2024-11-05 14:45:19 +01:00			`#!/usr/bin/env python3`
			`import argparse`
			`import sys`
			`import json`
			`from datetime import datetime`
			`import trafilatura`
			`from utils import get_engine, create_tables, Episode`
			`from sqlalchemy.orm import sessionmaker`

			`def add_website_to_db(db_url, url, podcast_id):`
			`# Fetch and extract content using trafilatura`
			`downloaded = trafilatura.fetch_url(url)`
			`if not downloaded:`
			`print(f"Failed to download content from URL: {url}")`
			`sys.exit(1)`

			`content = trafilatura.extract(downloaded, include_comments=False)`
			`metadata = trafilatura.extract_metadata(downloaded)`
			`title = metadata.title if metadata and metadata.title else "Untitled"`

			`if not content.strip():`
			`print("No content extracted from the URL.")`
			`sys.exit(1)`

			`# Generate GUID using the URL`
			`article_guid = url`

			`# Get current date and time in the required format`
			`pub_date = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S +0000')`

			`# Connect to the database and insert the episode`
			`engine = get_engine(db_url)`
			`Session = sessionmaker(bind=engine)`
			`session = Session()`
			`create_tables(engine)`

			`episode = Episode(`
			`podcast_id=podcast_id,`
			`article_guid=article_guid,`
			`title=title,`
			`link=url,`
			`pub_date=pub_date,`
			`description='',`
			`content=content,`
			`processing_status='pending',`
			`skipped=False`
			`)`
			`session.add(episode)`
			`session.commit()`
			`session.close()`
			`print(f"Website '{title}' added to the database with GUID '{article_guid}' under podcast_id '{podcast_id}'.")`

			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser(description="Add a website to the episodes database.")`
			`parser.add_argument("url", help="URL of the website to add.")`
			`parser.add_argument("--config", default="config.json", help="Path to configuration file.")`
			`parser.add_argument("--db", help="Database filename or connection string (overrides config).")`

			`args = parser.parse_args()`

			`# Load configuration`
			`config = {}`
			`if args.config:`
			`try:`
			`with open(args.config, 'r') as f:`
			`config = json.load(f)`
			`config['config_file_path'] = args.config`
			`except Exception as e:`
			`print(f"Warning: Could not load configuration file: {e}")`
			`if not args.db:`
			`sys.exit(1)`

			`# Set database filename or connection string`
			`db_url = args.db or config.get('database', 'episodes.db')`

			`# Get podcast_id from config`
			`podcast_id = config.get('podcast_id')`
			`if not podcast_id:`
			`podcast_id = config.get('output_rss_feed', {}).get('atom_link', {}).get('href', 'default_podcast_id')`

			`# Add website to the database`
			`add_website_to_db(db_url, args.url, podcast_id)`