# ジッピーにモゴミのwikiのapi叩いて記事のhtmlをbootstrap風に出力しろと命令した結果出てきたゴミ
# 紛うことなきゴミなので扱いには要注意
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, quote
import re
import html

WIKI_BASE = "https://wiki.yjsnpi.nu/wiki"
API_ENDPOINT = "https://wiki.yjsnpi.nu/w/api.php"

# A page name is required on the command line.
if len(sys.argv) < 2:
    print("usage: {} ページ名".format(sys.argv[0]), file=sys.stderr)
    sys.exit(1)

page_name = sys.argv[1]
def fetch_parsed_html(page):
    """Fetch the rendered HTML of *page* via the MediaWiki ``action=parse`` API.

    Returns a ``(title, html)`` tuple where *title* falls back to the page
    name when the API returns no ``displaytitle``.  Raises ``RuntimeError``
    when the API reports an error or the page does not exist.
    """
    query = {
        "action": "parse",
        "page": page,
        "prop": "text|displaytitle",
        "format": "json",
    }
    resp = requests.get(API_ENDPOINT, params=query, timeout=15)
    resp.raise_for_status()
    payload = resp.json()
    if "error" in payload:
        raise RuntimeError("API error: {}".format(payload["error"]))
    if "parse" not in payload:
        raise RuntimeError("ページが見つかりません: " + page)
    parsed = payload["parse"]
    # "text" -> "*" is where the parse API puts the rendered HTML body.
    return parsed.get("displaytitle") or page, parsed["text"]["*"]
# Simple inline-style -> Bootstrap utility-class mapping.
# Patterns are evaluated in order, each against one CSS declaration
# (and, once, against the full style string — see below).
STYLE_CLASS_MAP = [
    (re.compile(r'^\s*float\s*:\s*right\s*;?\s*$', re.I), "float-end"),
    (re.compile(r'^\s*float\s*:\s*left\s*;?\s*$', re.I), "float-start"),
    (re.compile(r'^\s*display\s*:\s*block\s*;?\s*$', re.I), "d-block"),
    (re.compile(r'^\s*display\s*:\s*inline-block\s*;?\s*$', re.I), "d-inline-block"),
    (re.compile(r'^\s*text-align\s*:\s*center\s*;?\s*$', re.I), "text-center"),
    (re.compile(r'^\s*text-align\s*:\s*right\s*;?\s*$', re.I), "text-end"),
    (re.compile(r'^\s*text-align\s*:\s*left\s*;?\s*$', re.I), "text-start"),
    (re.compile(r'^\s*width\s*:\s*100%\s*;?\s*$', re.I), "w-100"),
    (re.compile(r'^\s*max-width\s*:\s*100%\s*;?\s*$', re.I), "img-fluid"),
    # Fixed: trailing ';' is optional, consistent with every other pattern.
    (re.compile(r'^\s*margin\s*:\s*0\s*;?\s*$', re.I), "m-0"),
    # Two declarations in one pattern — only matchable as a whole style string.
    (re.compile(r'^\s*margin-left\s*:\s*auto\s*;\s*margin-right\s*:\s*auto\s*;?\s*$', re.I), "mx-auto"),
    # Rough px -> Bootstrap spacing step (4px per step).
    (re.compile(r'^\s*margin-top\s*:\s*(\d+)px\s*;?\s*$', re.I), lambda m: f"mt-{int(int(m.group(1))/4)}"),
    (re.compile(r'^\s*padding\s*:\s*(\d+)px\s*;?\s*$', re.I), lambda m: f"p-{int(int(m.group(1))/4)}"),
]


def style_to_bootstrap_classes(style_value):
    """Map an inline ``style`` attribute value to Bootstrap utility classes.

    Returns a space-separated class string (duplicates removed, order kept),
    or ``None`` when nothing in the style value has a known mapping.
    """
    if not style_value:
        return None
    style_value = style_value.strip()

    def _match_one(text):
        # First mapping whose pattern matches wins; a callable mapping
        # builds the class name from the regex match (px values).
        for pattern, cls in STYLE_CLASS_MAP:
            m = pattern.match(text)
            if m:
                return cls(m) if callable(cls) else cls
        return None

    # Bug fix: try the whole style string first.  The per-declaration split
    # below can never present two declarations together, which made the
    # "margin-left:auto;margin-right:auto" -> mx-auto mapping unreachable.
    whole = _match_one(style_value)
    if whole:
        return whole

    classes = []
    for part in (p.strip() for p in style_value.split(';') if p.strip()):
        # Re-append the ';' the split consumed; patterns accept it as ';?'.
        mapped = _match_one(part + ';')
        if mapped and mapped not in classes:
            classes.append(mapped)
    # Unknown-only styles (color, font-size, ...) map to nothing.
    return " ".join(classes) if classes else None
def rewrite_images_links_styles(html_str):
    """Rewrite images, internal links, and inline styles in rendered wiki HTML.

    * ``<img src>`` is made absolute and routed through DuckDuckGo's
      external-content image proxy; ``srcset`` is dropped so it cannot
      bypass the proxy.
    * Internal links are rewritten: ``/wiki/...`` -> ``/w/...``,
      ``/w/index.php?title=Foo`` -> ``/w/Foo``, and absolute wiki URLs
      -> site-relative ``/w`` paths.
    * Every inline ``style`` attribute is removed; when a Bootstrap
      utility-class mapping exists, the classes are appended instead.

    Returns the modified HTML as a string.
    """
    # Hoisted out of the link loop (was re-imported on every iteration).
    from urllib.parse import urlparse, parse_qs

    soup = BeautifulSoup(html_str, "html.parser")

    def _style_to_classes(tag):
        # Drop the style attribute either way; add mapped classes if any.
        cls = style_to_bootstrap_classes(tag["style"])
        del tag["style"]
        if cls:
            tag["class"] = tag.get("class", []) + cls.split()

    # Images: absolutize and proxy the source.
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            abs_src = urljoin(WIKI_BASE, src)
            encoded = quote(abs_src, safe=":/?&=#%")
            img["src"] = f"https://external-content.duckduckgo.com/iu/?u={encoded}"
        if img.has_attr("srcset"):
            del img["srcset"]

    # Links: rewrite the three internal-URL shapes.
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("/wiki/") or "/wiki/" in href:
            a["href"] = href.replace("/wiki/", "/w/")
        elif href.startswith("/w/index.php") and "title=" in href:
            title = parse_qs(urlparse(href).query).get("title", [None])[0]
            if title:
                a["href"] = "/w/" + quote(title, safe="/#?&=%")
        elif href.startswith(WIKI_BASE):
            a["href"] = href.replace(WIKI_BASE, "/w")

    # Style -> class on every element.  This pass covers <img> and <a> too,
    # so the per-tag duplicates the original carried are not needed.
    for tag in soup.find_all(True):
        if tag.has_attr("style"):
            _style_to_classes(tag)

    return str(soup)
def remove_mw_editsection(html_str):
    """Strip MediaWiki "[edit]" links (``span.mw-editsection``) from HTML."""
    tree = BeautifulSoup(html_str, "html.parser")
    for edit_link in tree.find_all("span", class_="mw-editsection"):
        edit_link.decompose()  # remove the whole node from the tree
    return str(tree)
def save_html(title, body_html, out_filename):
    """Write *body_html* to *out_filename* as UTF-8.

    NOTE(review): *title* is accepted but currently unused — kept for
    interface compatibility with callers.
    """
    with open(out_filename, "w", encoding="utf-8") as fh:
        fh.write(body_html)
# --- script body: fetch the page, rewrite it, and save to disk ---
try:
    title, html_content = fetch_parsed_html(page_name)
except Exception as exc:
    print("parse API に失敗:", exc, file=sys.stderr)
    sys.exit(2)

rendered = rewrite_images_links_styles(html_content)
rendered = remove_mw_editsection(rendered)
# '/' in page names would otherwise create subdirectories.
out_filename = "{}.html".format(page_name.replace("/", "_"))
save_html(title, rendered, out_filename)
print("保存完了:", out_filename)