# rss2podcast/content_processing.py

import re
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from ollama_client import OllamaClient
import nltk
from nltk.tokenize import sent_tokenize
from colorama import init, Fore, Style

# sent_tokenize requires the punkt tokenizer data; fetch it once if missing
# (newer NLTK releases use 'punkt_tab' instead)
nltk.download('punkt', quiet=True)

# Initialize colorama (autoreset restores default colors after each print)
init(autoreset=True)


def clean_and_convert_content(content, is_markdown):
    if not is_markdown:
        # Clean HTML and convert to Markdown
        clean_html = clean_html_content(content)
        markdown_content = convert_html_to_markdown(clean_html)
    else:
        markdown_content = content
    # Remove images and links
    markdown_content = remove_images_and_links(markdown_content)
    return markdown_content


def prepare_for_speech(markdown_content, config):
    # Optional LLM processing before text-to-speech
    if config['blog_to_speech_llm']['enabled']:
        markdown_content = process_with_llm(markdown_content, config)
    return markdown_content


def clean_html_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Drop <script> and <style> elements entirely; they carry nothing to read aloud
    for tag in soup(['script', 'style']):
        tag.decompose()
    return str(soup)
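
# Illustrative example (hypothetical input):
#   clean_html_content('<p>Hi</p><script>track()</script>')
#   -> '<p>Hi</p>'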


def convert_html_to_markdown(html_content):
    return md(html_content, strip=['a', 'img', 'b', 'i'], heading_style="ATX",
              escape_asterisks=False, escape_underscores=False,
              escape_misc=False, bullets='*')


def remove_images_and_links(markdown_content):
    # Drop Markdown image embeds entirely, then unwrap links to their anchor text
    markdown_content = re.sub(r'!\[.*?\]\(.*?\)', '', markdown_content)
    markdown_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', markdown_content)
    return markdown_content
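
# Illustrative example (hypothetical input): images vanish, links keep only
# their anchor text.
#   remove_images_and_links('See ![alt](img.png) and [docs](https://x)')
#   -> 'See  and docs'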


def process_with_llm(markdown_content, config):
    try:
        ollama_client = OllamaClient(config['blog_to_speech_llm']['model'],
                                     url=config['blog_to_speech_llm']['endpoint'])
        max_chunk_size = config['blog_to_speech_llm']['max_chunk_size']
        chunks = split_content(markdown_content, max_chunk_size)
        processed_chunks = []
        for chunk in chunks:
            prompt = f"{config['blog_to_speech_llm']['prompt']}\n\n{chunk}"
            print(f"{Fore.GREEN}Processing chunk with LLM:{Style.RESET_ALL} \"{chunk}\"")
            response = ollama_client.generate(prompt)
            print(f"{Fore.GREEN}Processed chunk:{Style.RESET_ALL} \"{response}\"")
            if response and isinstance(response, str):
                verified = verify_output(chunk, response, config)
                if verified:
                    processed_chunks.append(response)
                else:
                    # Verification failed: fall back to the original chunk
                    processed_chunks.append(chunk)
            else:
                # Empty or non-string response: keep the original chunk
                processed_chunks.append(chunk)
        # Sanity check: every chunk should have produced exactly one result
        if len(processed_chunks) == len(chunks):
            return '\n\n'.join(processed_chunks)
        print(f"{Fore.RED}LLM processing failed. Using original content.{Style.RESET_ALL}")
        return markdown_content
    except Exception as e:
        print(f"{Fore.RED}Error in LLM processing: {e}{Style.RESET_ALL}")
        return markdown_content


def split_content(content, max_chunk_size):
    # Greedily pack whole sentences into chunks of at most max_chunk_size
    # characters (a single sentence longer than the limit still becomes its
    # own oversized chunk)
    sentences = sent_tokenize(content)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_size:
            current_chunk += sentence + "\n"
        else:
            # Skip the append when the very first sentence already exceeds the
            # limit, to avoid emitting an empty chunk
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + "\n"
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
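
# Illustrative behavior (hypothetical values): with max_chunk_size=20 and the
# text "A b c. D e f. G h i.", the sentences pack into two chunks:
#   ["A b c.\nD e f.", "G h i."]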


def verify_output(input_text, output_text, config):
    max_length_difference = config['blog_to_speech_llm']['max_length_difference']
    verifier_config = config['llm_verifier']
    # Length check: reject outputs whose length deviates too much from the input
    input_length = len(input_text)
    output_length = len(output_text)
    if input_length == 0:
        # Nothing to compare against; treat an empty input as unverifiable
        return False
    length_difference = abs(input_length - output_length) / input_length
    if length_difference > max_length_difference:
        print(f"{Fore.RED}Length difference ({length_difference:.2%}) exceeds maximum allowed ({max_length_difference:.2%}){Style.RESET_ALL}")
        return False
    # If the LLM verifier is not enabled, the length check alone decides
    if not verifier_config['enabled']:
        return True
    ollama_client = OllamaClient(verifier_config['model'], url=verifier_config['endpoint'])
    prompt = f"{verifier_config['prompt']}\n\nInput: {input_text}\n\nOutput: {output_text}"
    response = ollama_client.generate(prompt)
    # Guard against an empty/None response before comparing
    if response and response.strip() == verifier_config['expected_output']:
        print(f"{Fore.GREEN}LLM validated the output, yay!{Style.RESET_ALL}")
        return True
    print(f"{Fore.RED}LLM verification failed:{Style.RESET_ALL} {response.strip() if isinstance(response, str) else response}")
    return False
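

# Illustrative usage sketch, not part of the module's API. The config keys
# mirror the ones read above; the model name and endpoint are assumptions
# (any local Ollama model and URL would do), and the prompts are placeholders.
if __name__ == "__main__":
    example_config = {
        'blog_to_speech_llm': {
            'enabled': False,  # set True to rewrite chunks with the LLM
            'model': 'llama3',  # assumed model name
            'endpoint': 'http://localhost:11434',  # assumed Ollama URL
            'max_chunk_size': 2000,
            'prompt': 'Rewrite the following text so it reads well aloud:',
            'max_length_difference': 0.5,
        },
        'llm_verifier': {
            'enabled': False,
            'model': 'llama3',
            'endpoint': 'http://localhost:11434',
            'prompt': 'Answer YES if the output preserves the input meaning:',
            'expected_output': 'YES',
        },
    }
    html = '<p>Read <a href="https://example.com">this post</a>.</p>'
    markdown = clean_and_convert_content(html, is_markdown=False)
    # With both LLM stages disabled this runs without an Ollama server
    print(prepare_for_speech(markdown, example_config))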