import feedparser
import re
from datetime import datetime

from sqlalchemy.orm import sessionmaker

from content_processing import clean_and_convert_content
from utils import create_tables, slugify, get_engine, Episode


def download_and_insert_articles(config, episode_limit=None, specific_episode_guid=None, reprocess=False):
    """Download articles from the configured RSS feed and insert them as pending episodes."""
    feed_url = config.get('source_rss_feed_url')
    if not feed_url:
        print("No source RSS feed URL provided. Skipping feed download.")
        return

    process_articles_since = config.get('process_articles_since')
    if process_articles_since:
        try:
            process_date = parse_config_date(process_articles_since)
        except ValueError as e:
            print(f"Error parsing process_articles_since date: {e}. "
                  "Allowed formats: YYYY, MM-YYYY, DD-MM-YYYY, RSS pubDate, or ISO 8601.")
            return

    podcast_id = config.get('podcast_id')
    if not podcast_id:
        podcast_id = config.get('output_rss_feed', {}).get('atom_link', {}).get('href', 'default_podcast_id')

    feed = feedparser.parse(feed_url)

    engine = get_engine(config['database'])
    Session = sessionmaker(bind=engine)
    session = Session()
    create_tables(engine)

    try:
        episodes_added = 0
        for entry in feed.entries:
            article_guid = entry.get('id') or entry.get('guid') or entry.link
            if specific_episode_guid and article_guid != specific_episode_guid:
                continue

            pub_date = entry.get('published', datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S +0000'))
            try:
                entry_date = datetime.strptime(pub_date, '%a, %d %b %Y %H:%M:%S %z')
            except ValueError:
                # Try ISO 8601 format
                entry_date = datetime.strptime(pub_date, '%Y-%m-%dT%H:%M:%S%z')

            if process_articles_since and entry_date.date() < process_date.date():
                continue

            existing_episode = session.query(Episode).filter(
                Episode.podcast_id == podcast_id,
                Episode.article_guid == article_guid
            ).first()

            # If an episode already exists and is marked as skipped, simply continue.
            if existing_episode and existing_episode.skipped:
                continue

            # Check if the episode should be skipped based on its title matching any skip_regexps
            skip_regexps = config.get('skip_regexps', [])
            if skip_regexps and any(re.search(pattern, entry.title) for pattern in skip_regexps):
                print(f"Skipping article '{entry.title}' because it matches a skip pattern.")
                continue

            content = entry.get('content', [{'value': ''}])[0]['value']
            if not content:
                content = entry.get('description') or ''

            is_markdown = False  # Assume content from the feed is HTML
            content = clean_and_convert_content(content, is_markdown)

            if existing_episode:
                if existing_episode.processing_status == 'reprocess' or reprocess:
                    # Update the episode with new metadata and mark it as pending
                    print(f"Will reprocess article '{entry.title}'.")
                    existing_episode.title = entry.title
                    existing_episode.link = entry.link
                    existing_episode.pub_date = pub_date
                    existing_episode.description = entry.get('summary', '')
                    existing_episode.content = content
                    existing_episode.processing_status = 'pending'
                    session.commit()
                    print(f"Episode '{existing_episode.title}' updated and marked for reprocessing.")
                else:
                    continue  # Episode already exists and is not marked for reprocessing
            else:
                episode = Episode(
                    podcast_id=podcast_id,
                    article_guid=article_guid,
                    title=entry.title,
                    link=entry.link,
                    pub_date=pub_date,
                    description=entry.get('summary', ''),
                    content=content,
                    processing_status='pending',
                    skipped=False
                )
                session.add(episode)
                session.commit()
                episodes_added += 1
                print(f"Episode '{episode.title}' added to the database.")

                if episode_limit and episodes_added >= episode_limit:
                    break
    finally:
        session.close()


def parse_config_date(date_string):
    """Parse a config date string, trying each supported format in turn."""
    formats = [
        '%Y',                        # YYYY
        '%m-%Y',                     # MM-YYYY
        '%d-%m-%Y',                  # DD-MM-YYYY
        '%a, %d %b %Y %H:%M:%S %z',  # RSS pubDate format
        '%Y-%m-%dT%H:%M:%S%z'        # ISO 8601 format
    ]
    for fmt in formats:
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unable to parse date: {date_string}")
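

# Example invocation (a minimal sketch): the config keys below mirror the ones read
# above ('source_rss_feed_url', 'process_articles_since', 'podcast_id', 'skip_regexps',
# 'database'), but the concrete values are hypothetical, and the exact shape of
# config['database'] depends on get_engine() in utils.
if __name__ == '__main__':
    example_config = {
        'source_rss_feed_url': 'https://example.com/feed.xml',  # hypothetical feed URL
        'process_articles_since': '01-01-2024',                 # DD-MM-YYYY, per parse_config_date
        'podcast_id': 'example-podcast',                        # hypothetical identifier
        'skip_regexps': [r'^Sponsored:'],                       # skip articles whose titles match
        'database': 'sqlite:///episodes.db',                    # assumes get_engine accepts a DB URL
    }
    download_and_insert_articles(example_config, episode_limit=5)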