import re
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from ollama_client import OllamaClient
import nltk
from nltk.tokenize import sent_tokenize
from colorama import init, Fore, Style

# Initialize colorama (autoreset restores default colors after each print)
init(autoreset=True)
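# sent_tokenize() requires NLTK's 'punkt' tokenizer data. Fetching it lazily
# here is an assumption about deployment; the original project may expect the
# data to be installed ahead of time.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
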
def clean_and_convert_content(content, is_markdown):
    if not is_markdown:
        # Clean HTML and convert to Markdown
        clean_html = clean_html_content(content)
        markdown_content = convert_html_to_markdown(clean_html)
    else:
        markdown_content = content

    # Remove images and links
    markdown_content = remove_images_and_links(markdown_content)
    return markdown_content

def prepare_for_speech(markdown_content, config):
    # Optional LLM processing
    if config['blog_to_speech_llm']['enabled']:
        markdown_content = process_with_llm(markdown_content, config)
    return markdown_content

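# The config dict is expected to look roughly like this (shape inferred from
# the lookups in this module; the key names are the module's own, the values
# are purely illustrative):
#
# config = {
#     'blog_to_speech_llm': {
#         'enabled': True,
#         'model': '...',                # Ollama model name
#         'endpoint': '...',             # Ollama server URL
#         'max_chunk_size': 2000,        # characters per LLM chunk
#         'max_length_difference': 0.3,  # allowed relative length drift
#         'prompt': '...',               # rewrite instruction prepended to each chunk
#     },
#     'llm_verifier': {
#         'enabled': False,
#         'model': '...',
#         'endpoint': '...',
#         'prompt': '...',               # verification instruction
#         'expected_output': 'OK',       # exact reply that counts as a pass
#     },
# }
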
def clean_html_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Drop <script> and <style> tags entirely; their text is not content
    for tag in soup(['script', 'style']):
        tag.decompose()
    return str(soup)

def convert_html_to_markdown(html_content):
    return md(html_content, strip=['a', 'img', 'b', 'i'], heading_style="ATX", escape_asterisks=False,
              escape_underscores=False, escape_misc=False, bullets='*')

def remove_images_and_links(markdown_content):
    # Strip image embeds entirely: ![alt](url) -> ''
    markdown_content = re.sub(r'!\[.*?\]\(.*?\)', '', markdown_content)
    # Unwrap links, keeping only the link text: [text](url) -> 'text'
    markdown_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', markdown_content)
    return markdown_content

def process_with_llm(markdown_content, config):
    try:
        ollama_client = OllamaClient(config['blog_to_speech_llm']['model'],
                                     url=config['blog_to_speech_llm']['endpoint'])
        max_chunk_size = config['blog_to_speech_llm']['max_chunk_size']

        chunks = split_content(markdown_content, max_chunk_size)
        processed_chunks = []

        for chunk in chunks:
            prompt = f"{config['blog_to_speech_llm']['prompt']}\n\n{chunk}"
            print(f"{Fore.GREEN}Processing chunk with LLM:{Style.RESET_ALL} \"{chunk}\"")
            response = ollama_client.generate(prompt)
            print(f"{Fore.GREEN}Processed chunk:{Style.RESET_ALL} \"{response}\"")

            # Keep the LLM's rewrite only if it passes verification;
            # otherwise fall back to the original chunk
            if response and isinstance(response, str):
                verified = verify_output(chunk, response, config)
                if verified:
                    processed_chunks.append(response)
                else:
                    processed_chunks.append(chunk)
            else:
                processed_chunks.append(chunk)

        if len(processed_chunks) == len(chunks):
            return '\n\n'.join(processed_chunks)

        print(f"{Fore.RED}LLM processing failed. Using original content.{Style.RESET_ALL}")
        return markdown_content
    except Exception as e:
        print(f"{Fore.RED}Error in LLM processing: {e}{Style.RESET_ALL}")
        return markdown_content

def split_content(content, max_chunk_size):
    # Split on sentence boundaries so chunks stay coherent for the LLM.
    # Note: a single sentence longer than max_chunk_size still becomes its
    # own (oversized) chunk; sentences are never split mid-way.
    sentences = sent_tokenize(content)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_size:
            current_chunk += sentence + "\n"
        else:
            # Guard against emitting an empty chunk when the very first
            # sentence already exceeds max_chunk_size
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + "\n"

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

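# Illustrative behaviour (assuming sent_tokenize splits on the periods):
#   split_content("One. Two. Three.", max_chunk_size=10)
#   -> ["One.\nTwo.", "Three."]
# "One." and "Two." fit within 10 characters including the joining newline;
# adding "Three." would exceed the limit, so it starts a new chunk.
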
def verify_output(input_text, output_text, config):
    max_length_difference = config['blog_to_speech_llm']['max_length_difference']
    verifier_config = config['llm_verifier']

    # Length check: reject rewrites that shrink or grow the text too much.
    # max(input_length, 1) guards against division by zero on empty input.
    input_length = len(input_text)
    output_length = len(output_text)
    length_difference = abs(input_length - output_length) / max(input_length, 1)

    if length_difference > max_length_difference:
        print(f"{Fore.RED}Length difference ({length_difference:.2%}) exceeds maximum allowed ({max_length_difference:.2%}){Style.RESET_ALL}")
        return False

    # If the LLM verifier is not enabled, the length check alone decides
    if not verifier_config['enabled']:
        return True

    ollama_client = OllamaClient(verifier_config['model'], url=verifier_config['endpoint'])

    prompt = f"{verifier_config['prompt']}\n\nInput: {input_text}\n\nOutput: {output_text}"

    response = ollama_client.generate(prompt)
    if response and response.strip() == verifier_config['expected_output']:
        print(f"{Fore.GREEN}LLM validated the output, yay!{Style.RESET_ALL}")
        return True

    print(f"{Fore.RED}LLM verification failed:{Style.RESET_ALL} {response.strip() if response else '(no response)'}")
    return False
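
# Minimal usage sketch (hypothetical: the real rss2podcast pipeline builds
# `config` from its own configuration file; this inline dict only mirrors the
# keys read above, with illustrative values, and keeps LLM processing off so
# the sketch runs without an Ollama server).
if __name__ == "__main__":
    sample_config = {
        'blog_to_speech_llm': {
            'enabled': False,  # set True to rewrite chunks via Ollama
            'model': 'llama3',
            'endpoint': 'http://localhost:11434',
            'max_chunk_size': 2000,
            'max_length_difference': 0.3,
            'prompt': 'Rewrite the following text so it reads well aloud:',
        },
        'llm_verifier': {
            'enabled': False,
            'model': 'llama3',
            'endpoint': 'http://localhost:11434',
            'prompt': 'Reply OK if the output preserves the input meaning.',
            'expected_output': 'OK',
        },
    }

    html = '<p>Hello <a href="https://example.com">world</a>!</p>'
    text = clean_and_convert_content(html, is_markdown=False)
    print(prepare_for_speech(text, sample_config))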