# ジッピーにモゴミのwikiのapi叩いて記事のhtmlをbootstrap風に出力しろと命令した結果出てきたゴミ
# 紛うことなきゴミなので扱いには要注意
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, quote
import re
import html

WIKI_BASE = "https://wiki.yjsnpi.nu/wiki"
API_ENDPOINT = "https://wiki.yjsnpi.nu/w/api.php"

# A page name is required on the command line.
if len(sys.argv) < 2:
    print("usage: {} ページ名".format(sys.argv[0]), file=sys.stderr)
    sys.exit(1)

page_name = sys.argv[1]
def fetch_parsed_html(page):
    """Fetch the rendered HTML of *page* via the MediaWiki ``action=parse`` API.

    Returns a ``(title, html)`` tuple where *title* falls back to the page
    name when the API returns no ``displaytitle``.  Raises ``RuntimeError``
    when the API reports an error or the page does not exist.
    """
    query = {
        "action": "parse",
        "page": page,
        "prop": "text|displaytitle",
        "format": "json",
    }
    resp = requests.get(API_ENDPOINT, params=query, timeout=15)
    resp.raise_for_status()
    payload = resp.json()
    if "error" in payload:
        raise RuntimeError("API error: {}".format(payload["error"]))
    if "parse" not in payload:
        raise RuntimeError("ページが見つかりません: " + page)
    parsed = payload["parse"]
    # "text" -> "*" is where the parse API puts the rendered HTML body.
    return parsed.get("displaytitle") or page, parsed["text"]["*"]
# Simple inline-style -> Bootstrap utility-class mapping.
# Patterns are evaluated in order, each against one CSS declaration
# (and, once, against the full style string — see below).
STYLE_CLASS_MAP = [
    (re.compile(r'^\s*float\s*:\s*right\s*;?\s*$', re.I), "float-end"),
    (re.compile(r'^\s*float\s*:\s*left\s*;?\s*$', re.I), "float-start"),
    (re.compile(r'^\s*display\s*:\s*block\s*;?\s*$', re.I), "d-block"),
    (re.compile(r'^\s*display\s*:\s*inline-block\s*;?\s*$', re.I), "d-inline-block"),
    (re.compile(r'^\s*text-align\s*:\s*center\s*;?\s*$', re.I), "text-center"),
    (re.compile(r'^\s*text-align\s*:\s*right\s*;?\s*$', re.I), "text-end"),
    (re.compile(r'^\s*text-align\s*:\s*left\s*;?\s*$', re.I), "text-start"),
    (re.compile(r'^\s*width\s*:\s*100%\s*;?\s*$', re.I), "w-100"),
    (re.compile(r'^\s*max-width\s*:\s*100%\s*;?\s*$', re.I), "img-fluid"),
    # Fixed: trailing ';' is optional, consistent with every other pattern.
    (re.compile(r'^\s*margin\s*:\s*0\s*;?\s*$', re.I), "m-0"),
    # Two declarations in one pattern — only matchable as a whole style string.
    (re.compile(r'^\s*margin-left\s*:\s*auto\s*;\s*margin-right\s*:\s*auto\s*;?\s*$', re.I), "mx-auto"),
    # Rough px -> Bootstrap spacing step (4px per step).
    (re.compile(r'^\s*margin-top\s*:\s*(\d+)px\s*;?\s*$', re.I), lambda m: f"mt-{int(int(m.group(1))/4)}"),
    (re.compile(r'^\s*padding\s*:\s*(\d+)px\s*;?\s*$', re.I), lambda m: f"p-{int(int(m.group(1))/4)}"),
]


def style_to_bootstrap_classes(style_value):
    """Map an inline ``style`` attribute value to Bootstrap utility classes.

    Returns a space-separated class string (duplicates removed, order kept),
    or ``None`` when nothing in the style value has a known mapping.
    """
    if not style_value:
        return None
    style_value = style_value.strip()

    def _match_one(text):
        # First mapping whose pattern matches wins; a callable mapping
        # builds the class name from the regex match (px values).
        for pattern, cls in STYLE_CLASS_MAP:
            m = pattern.match(text)
            if m:
                return cls(m) if callable(cls) else cls
        return None

    # Bug fix: try the whole style string first.  The per-declaration split
    # below can never present two declarations together, which made the
    # "margin-left:auto;margin-right:auto" -> mx-auto mapping unreachable.
    whole = _match_one(style_value)
    if whole:
        return whole

    classes = []
    for part in (p.strip() for p in style_value.split(';') if p.strip()):
        # Re-append the ';' the split consumed; patterns accept it as ';?'.
        mapped = _match_one(part + ';')
        if mapped and mapped not in classes:
            classes.append(mapped)
    # Unknown-only styles (color, font-size, ...) map to nothing.
    return " ".join(classes) if classes else None
def rewrite_images_links_styles(html_str):
    """Rewrite images, internal links, and inline styles in rendered wiki HTML.

    * ``<img src>`` is made absolute and routed through DuckDuckGo's
      external-content image proxy; ``srcset`` is dropped so it cannot
      bypass the proxy.
    * Internal links are rewritten: ``/wiki/...`` -> ``/w/...``,
      ``/w/index.php?title=Foo`` -> ``/w/Foo``, and absolute wiki URLs
      -> site-relative ``/w`` paths.
    * Every inline ``style`` attribute is removed; when a Bootstrap
      utility-class mapping exists, the classes are appended instead.

    Returns the modified HTML as a string.
    """
    # Hoisted out of the link loop (was re-imported on every iteration).
    from urllib.parse import urlparse, parse_qs

    soup = BeautifulSoup(html_str, "html.parser")

    def _style_to_classes(tag):
        # Drop the style attribute either way; add mapped classes if any.
        cls = style_to_bootstrap_classes(tag["style"])
        del tag["style"]
        if cls:
            tag["class"] = tag.get("class", []) + cls.split()

    # Images: absolutize and proxy the source.
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            abs_src = urljoin(WIKI_BASE, src)
            encoded = quote(abs_src, safe=":/?&=#%")
            img["src"] = f"https://external-content.duckduckgo.com/iu/?u={encoded}"
        if img.has_attr("srcset"):
            del img["srcset"]

    # Links: rewrite the three internal-URL shapes.
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("/wiki/") or "/wiki/" in href:
            a["href"] = href.replace("/wiki/", "/w/")
        elif href.startswith("/w/index.php") and "title=" in href:
            title = parse_qs(urlparse(href).query).get("title", [None])[0]
            if title:
                a["href"] = "/w/" + quote(title, safe="/#?&=%")
        elif href.startswith(WIKI_BASE):
            a["href"] = href.replace(WIKI_BASE, "/w")

    # Style -> class on every element.  This pass covers <img> and <a> too,
    # so the per-tag duplicates the original carried are not needed.
    for tag in soup.find_all(True):
        if tag.has_attr("style"):
            _style_to_classes(tag)

    return str(soup)
def remove_mw_editsection(html_str):
    """Strip MediaWiki "[edit]" links (``span.mw-editsection``) from HTML."""
    tree = BeautifulSoup(html_str, "html.parser")
    for edit_link in tree.find_all("span", class_="mw-editsection"):
        edit_link.decompose()  # remove the whole node from the tree
    return str(tree)
def save_html(title, body_html, out_filename):
    """Write *body_html* to *out_filename* as UTF-8.

    NOTE(review): *title* is accepted but currently unused — kept for
    interface compatibility with callers.
    """
    with open(out_filename, "w", encoding="utf-8") as fh:
        fh.write(body_html)
# --- script body: fetch the page, rewrite it, and save to disk ---
try:
    title, html_content = fetch_parsed_html(page_name)
except Exception as exc:
    print("parse API に失敗:", exc, file=sys.stderr)
    sys.exit(2)

rendered = rewrite_images_links_styles(html_content)
rendered = remove_mw_editsection(rendered)
# '/' in page names would otherwise create subdirectories.
out_filename = "{}.html".format(page_name.replace("/", "_"))
save_html(title, rendered, out_filename)
print("保存完了:", out_filename)