# rss2podcast/feed_downloader.py

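"""Download articles from a source RSS feed and insert them into the
database as pending podcast episodes, honouring the date cut-off, skip
patterns, and reprocessing flags from the configuration."""
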
import re
from datetime import datetime

import feedparser
from sqlalchemy.orm import sessionmaker

from content_processing import clean_and_convert_content
from utils import create_tables, slugify, get_engine, Episode

def download_and_insert_articles(config, episode_limit=None, specific_episode_guid=None, reprocess=False):
    """Fetch the source RSS feed and insert its articles as pending episodes.

    Optionally limits the number of new episodes, restricts processing to a
    single article GUID, or forces reprocessing of existing episodes.
    """
    feed_url = config.get('source_rss_feed_url')
    if not feed_url:
        print("No source RSS feed URL provided. Skipping feed download.")
        return

    process_articles_since = config.get('process_articles_since')
    process_date = None
    if process_articles_since:
        try:
            process_date = parse_config_date(process_articles_since)
        except ValueError as e:
            print(f"Error parsing process_articles_since date: {e}. "
                  f"Allowed formats: YYYY, MM-YYYY, DD-MM-YYYY, RSS pubDate, or ISO 8601.")
            return

    # Fall back to the output feed's atom link when no explicit podcast_id is configured.
    podcast_id = config.get('podcast_id')
    if not podcast_id:
        podcast_id = config.get('output_rss_feed', {}).get('atom_link', {}).get('href', 'default_podcast_id')

    feed = feedparser.parse(feed_url)
    engine = get_engine(config['database'])
    Session = sessionmaker(bind=engine)
    session = Session()
    create_tables(engine)
    try:
        episodes_added = 0
        for entry in feed.entries:
            article_guid = entry.get('id') or entry.get('guid') or entry.link
            if specific_episode_guid and article_guid != specific_episode_guid:
                continue

            pub_date = entry.get('published', datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S +0000'))
            try:
                entry_date = datetime.strptime(pub_date, '%a, %d %b %Y %H:%M:%S %z')
            except ValueError:
                try:
                    # Fall back to ISO 8601 format.
                    entry_date = datetime.strptime(pub_date, '%Y-%m-%dT%H:%M:%S%z')
                except ValueError:
                    print(f"Could not parse pubDate '{pub_date}' for '{entry.title}'; skipping.")
                    continue

            if process_articles_since and entry_date.date() < process_date.date():
                continue
            existing_episode = session.query(Episode).filter(
                Episode.podcast_id == podcast_id,
                Episode.article_guid == article_guid
            ).first()

            # If an episode already exists and is marked as skipped, simply continue.
            if existing_episode and existing_episode.skipped:
                continue

            # Check if the episode should be skipped based on its title matching any skip_regexps.
            skip_regexps = config.get('skip_regexps', [])
            if skip_regexps and any(re.search(pattern, entry.title) for pattern in skip_regexps):
                print(f"Skipping article '{entry.title}' because it matches a skip pattern.")
                continue

            # Prefer the full content body; fall back to the description.
            content = entry.get('content', [{'value': ''}])[0]['value']
            if not content:
                content = entry.get('description') or ''
            is_markdown = False  # Assume content from the feed is HTML.
            content = clean_and_convert_content(content, is_markdown)
            if existing_episode:
                if existing_episode.processing_status == 'reprocess' or reprocess:
                    # Update episode with new metadata and mark as pending.
                    print(f"Will reprocess article '{entry.title}'.")
                    existing_episode.title = entry.title
                    existing_episode.link = entry.link
                    existing_episode.pub_date = pub_date
                    existing_episode.description = entry.get('summary', '')
                    existing_episode.content = content
                    existing_episode.processing_status = 'pending'
                    session.commit()
                    print(f"Episode '{existing_episode.title}' updated and marked for reprocessing.")
                else:
                    continue  # Episode already exists and is not marked for reprocessing.
            else:
                episode = Episode(
                    podcast_id=podcast_id,
                    article_guid=article_guid,
                    title=entry.title,
                    link=entry.link,
                    pub_date=pub_date,
                    description=entry.get('summary', ''),
                    content=content,
                    processing_status='pending',
                    skipped=False
                )
                session.add(episode)
                session.commit()
                episodes_added += 1
                print(f"Episode '{episode.title}' added to the database.")

            if episode_limit and episodes_added >= episode_limit:
                break
    finally:
        session.close()

def parse_config_date(date_string):
    """Parse a configuration date string, trying each accepted format in turn."""
    formats = [
        '%Y',                        # YYYY
        '%m-%Y',                     # MM-YYYY
        '%d-%m-%Y',                  # DD-MM-YYYY
        '%a, %d %b %Y %H:%M:%S %z',  # RSS pubDate format
        '%Y-%m-%dT%H:%M:%S%z',       # ISO 8601 format
    ]
    for fmt in formats:
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unable to parse date: {date_string}")
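
# A minimal usage sketch (not part of the original module). The config keys
# below mirror the ones read by download_and_insert_articles(); the value of
# the 'database' entry is an assumption here -- its actual shape is whatever
# utils.get_engine() expects.
if __name__ == '__main__':
    example_config = {
        'source_rss_feed_url': 'https://example.com/feed.xml',
        'podcast_id': 'my-podcast',              # optional; falls back to atom_link href
        'process_articles_since': '01-01-2024',  # DD-MM-YYYY, see parse_config_date()
        'skip_regexps': [r'^Sponsored:'],        # articles with matching titles are skipped
        'database': 'sqlite:///rss2podcast.db',  # assumed shape; passed to get_engine()
    }
    download_and_insert_articles(example_config, episode_limit=5)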