# mirror of https://github.com/jooray/rss2podcast.git
# synced 2025-05-23 07:52:00 +00:00
# 124 lines · 5.0 KiB · Python
import re
from datetime import datetime, timezone

import feedparser
from sqlalchemy.orm import sessionmaker

from content_processing import clean_and_convert_content
from utils import create_tables, slugify, get_engine, Episode

def download_and_insert_articles(config, episode_limit=None, specific_episode_guid=None, reprocess=False):
    """Download articles from the configured source RSS feed and store them as episodes.

    Args:
        config: Dict-like configuration. Reads 'source_rss_feed_url',
            'database', and optionally 'process_articles_since',
            'podcast_id', 'output_rss_feed', 'skip_regexps'.
        episode_limit: Stop after adding this many NEW episodes
            (reprocessed existing episodes do not count toward the limit).
        specific_episode_guid: When set, only the feed entry whose GUID
            matches is considered.
        reprocess: When True, existing non-skipped episodes are refreshed
            from the feed and re-queued as 'pending'.

    Returns:
        None. Prints progress/diagnostic messages and returns early on
        missing feed URL or an unparseable 'process_articles_since' date.
    """
    feed_url = config.get('source_rss_feed_url')
    if not feed_url:
        print("No source RSS feed URL provided. Skipping feed download.")
        return

    process_articles_since = config.get('process_articles_since')
    process_date = None
    if process_articles_since:
        try:
            process_date = parse_config_date(process_articles_since)
        except ValueError as e:
            print(f"Error parsing process_articles_since date: {e}. Allowed formats: YYYY, MM-YYYY, DD-MM-YYYY or RSS pubDate format")
            return

    podcast_id = config.get('podcast_id')
    if not podcast_id:
        # Fall back to the output feed's atom link href as a stable identifier.
        podcast_id = config.get('output_rss_feed', {}).get('atom_link', {}).get('href', 'default_podcast_id')

    feed = feedparser.parse(feed_url)
    engine = get_engine(config['database'])
    Session = sessionmaker(bind=engine)
    session = Session()
    create_tables(engine)

    # Loop-invariant: fetch the skip patterns once instead of per entry.
    skip_regexps = config.get('skip_regexps', [])

    try:
        episodes_added = 0

        for entry in feed.entries:
            article_guid = entry.get('id') or entry.get('guid') or entry.link

            if specific_episode_guid and article_guid != specific_episode_guid:
                continue

            # Default to "now" (UTC, RFC-822 style) when the entry has no pubDate.
            pub_date = entry.get('published', datetime.now(timezone.utc).strftime('%a, %d %b %Y %H:%M:%S +0000'))

            # Only parse the pub date when a cutoff is configured. Previously
            # an entry whose date matched neither known format raised an
            # unhandled ValueError and aborted the entire import.
            if process_date is not None:
                entry_date = _parse_entry_date(pub_date)
                if entry_date is None:
                    print(f"Could not parse pubDate '{pub_date}'; processing entry without date filtering.")
                elif entry_date.date() < process_date.date():
                    continue

            existing_episode = session.query(Episode).filter(
                Episode.podcast_id == podcast_id,
                Episode.article_guid == article_guid
            ).first()

            # If an episode already exists and is marked as skipped, simply continue.
            if existing_episode and existing_episode.skipped:
                continue

            # Check if the episode should be skipped based on title matching any skip_regexps
            if skip_regexps and any(re.search(pattern, entry.title) for pattern in skip_regexps):
                print(f"Skipping article '{entry.title}' because it matches a skip pattern.")
                continue

            content = entry.get('content', [{'value': ''}])[0]['value']
            if not content:
                content = entry.get('description') or ''
            is_markdown = False  # Assume content from feed is HTML
            content = clean_and_convert_content(content, is_markdown)

            if existing_episode:
                if existing_episode.processing_status == 'reprocess' or reprocess:
                    # Update episode with new metadata and mark as pending
                    print(f"Will reprocess article '{entry.title}'.")
                    existing_episode.title = entry.title
                    existing_episode.link = entry.link
                    existing_episode.pub_date = pub_date
                    existing_episode.description = entry.get('summary', '')
                    existing_episode.content = content
                    existing_episode.processing_status = 'pending'
                    session.commit()
                    print(f"Episode '{existing_episode.title}' updated and marked for reprocessing.")
                else:
                    continue  # Episode already exists and is not marked for reprocessing
            else:
                episode = Episode(
                    podcast_id=podcast_id,
                    article_guid=article_guid,
                    title=entry.title,
                    link=entry.link,
                    pub_date=pub_date,
                    description=entry.get('summary', ''),
                    content=content,
                    processing_status='pending',
                    skipped=False
                )
                session.add(episode)
                session.commit()
                episodes_added += 1
                print(f"Episode '{episode.title}' added to the database.")

            if episode_limit and episodes_added >= episode_limit:
                break
    finally:
        session.close()


def _parse_entry_date(pub_date):
    """Parse an RSS entry pub date in RFC-822 or ISO 8601 form.

    Returns an aware datetime, or None when neither format matches
    (callers treat None as "no usable date" instead of crashing).
    """
    for fmt in ('%a, %d %b %Y %H:%M:%S %z', '%Y-%m-%dT%H:%M:%S%z'):
        try:
            return datetime.strptime(pub_date, fmt)
        except ValueError:
            continue
    return None
def parse_config_date(date_string):
    """Parse a date string from the configuration file.

    Accepted formats, tried in order: YYYY, MM-YYYY, DD-MM-YYYY,
    RFC-822 RSS pubDate, and ISO 8601 with a timezone offset.

    Args:
        date_string: The raw date text from the config.

    Returns:
        datetime: The first successful parse of *date_string*.

    Raises:
        ValueError: If none of the accepted formats match.
    """
    accepted_formats = (
        '%Y',                         # YYYY
        '%m-%Y',                      # MM-YYYY
        '%d-%m-%Y',                   # DD-MM-YYYY
        '%a, %d %b %Y %H:%M:%S %z',   # RSS pubDate format
        '%Y-%m-%dT%H:%M:%S%z',        # ISO8601 format
    )

    for candidate in accepted_formats:
        try:
            parsed = datetime.strptime(date_string, candidate)
        except ValueError:
            continue
        return parsed

    raise ValueError(f"Unable to parse date: {date_string}")