Swellのブログ記事内容をHTMLタグも含めて取得するPythonスクリプト

2024-12-30 / 2024-12-31

備忘録として残しておきます。

Swellのブログ記事内容をHTMLタグも含めて取得するPythonスクリプト

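URLを入力すると、記事タイトル(h1)と本文(class「post_content」内の見出し・段落・リンク・コード)を、HTMLタグ付きのテキストとして取得して表示します。実行には requests と beautifulsoup4 のインストールが必要です。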

import tkinter as tk
from tkinter import scrolledtext, messagebox
import requests
from bs4 import BeautifulSoup

# Seconds to wait for the server before giving up. Without a timeout,
# requests can block indefinitely, which would freeze the Tk event loop
# because this callback runs on the GUI thread.
_REQUEST_TIMEOUT_SECONDS = 10


def fetch_post_content():
    """Fetch the URL in ``url_entry`` and show the post as tagged text.

    Downloads the page, extracts the post title
    (``h1.c-postTitle__ttl``) and the ``h1``-``h5``, ``p``, ``a`` and
    ``code`` elements found inside the first ``post_content`` element,
    and writes each one to ``output_box`` as ``<tag>text</tag>``.

    Nested matches are listed individually as well as inside their parent
    (for example, the text of an ``<a>`` inside a ``<p>`` appears in both
    the ``<p>`` line and its own ``<a>`` line).

    Errors are reported through a message box; nothing is raised for an
    empty URL or a failed request, and the output box is left unchanged
    in those cases.
    """
    # Strip so that whitespace-only input counts as empty and a pasted URL
    # with stray spaces/newlines still works.
    url = url_entry.get().strip()
    if not url:
        messagebox.showerror("Error", "Please enter a URL")
        return

    # Keep the try body limited to the calls that can actually raise
    # RequestException.
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an error for bad HTTP responses
    except requests.exceptions.RequestException as e:
        messagebox.showerror("Error", f"Failed to fetch the page: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    post_content = soup.find(class_='post_content')
    h1_title = soup.find('h1', class_='c-postTitle__ttl')

    # NOTE(review): get_text() returns decoded text, so characters such as
    # "<" or "&" inside an element are written out unescaped. Confirm that
    # is acceptable before feeding the output to an HTML parser.
    parts = []
    if h1_title:
        parts.append(f"<h1>{h1_title.get_text(strip=True)}</h1>\n\n")

    if post_content:
        wanted_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'p', 'a', 'code']
        parts.extend(
            f"<{element.name}>{element.get_text(strip=True)}</{element.name}>\n\n"
            for element in post_content.find_all(wanted_tags)
        )
    else:
        parts.append("Class 'post_content' not found on this page.")

    # Replace whatever the previous fetch displayed.
    output_box.delete("1.0", tk.END)
    output_box.insert(tk.END, "".join(parts).strip())

# --- Build the main window -------------------------------------------------
root = tk.Tk()
root.title("Post Content Extractor")

# --- Create the widgets (top to bottom: prompt, URL field, button, output) --
url_label = tk.Label(master=root, text="Enter URL:")
url_entry = tk.Entry(master=root, width=50)
fetch_button = tk.Button(
    master=root,
    text="Fetch Post Content",
    command=fetch_post_content,
)
output_box = scrolledtext.ScrolledText(
    master=root,
    width=80,
    height=20,
    wrap=tk.WORD,
)

# --- Lay them out; pack order determines the vertical order on screen -------
url_label.pack(pady=5)
url_entry.pack(pady=5)
fetch_button.pack(pady=10)
output_box.pack(pady=10)

# --- Hand control to Tk; blocks until the window is closed ------------------
root.mainloop()