The garbage Zippy spat out when I told it to hit the Mogomi wiki's API and output article HTML in Bootstrap style

@ntxIii7WlrtWGxtxxpSH / Updated: 2026/01/04 23:06

It is unmistakably garbage, so handle with care.

TEXT 5.96KB
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import re
import sys
from urllib.parse import urljoin, quote, urlparse, parse_qs

import requests
from bs4 import BeautifulSoup

# stock MediaWiki URL layout: /wiki/ for pretty page URLs, /w/api.php for the API
WIKI_BASE = "https://wiki.yjsnpi.nu/wiki"
API_ENDPOINT = "https://wiki.yjsnpi.nu/w/api.php"
def fetch_parsed_html(page):
    # ask MediaWiki's action=parse API for the rendered HTML and display title
    params = {
        "action": "parse",
        "page": page,
        "prop": "text|displaytitle",
        "format": "json",
    }
    r = requests.get(API_ENDPOINT, params=params, timeout=15)
    r.raise_for_status()
    data = r.json()
    if "error" in data:
        raise RuntimeError("API error: {}".format(data["error"]))
    if "parse" not in data:
        raise RuntimeError("Page not found: " + page)
    return data["parse"].get("displaytitle") or page, data["parse"]["text"]["*"]
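# For reference, a successful action=parse response (default formatversion=1)
# has roughly this shape, which is why the HTML comes out of ["text"]["*"]:
#   {"parse": {"title": "...", "pageid": 123,
#              "displaytitle": "...", "text": {"*": "<div class=...>...</div>"}}}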
# Minimal inline-style -> Bootstrap class mapping
STYLE_CLASS_MAP = [
    # representative substitution patterns (evaluated in order)
    (re.compile(r'^\s*float\s*:\s*right\s*;?\s*$', re.I), "float-end"),
    (re.compile(r'^\s*float\s*:\s*left\s*;?\s*$', re.I), "float-start"),
    (re.compile(r'^\s*display\s*:\s*block\s*;?\s*$', re.I), "d-block"),
    (re.compile(r'^\s*display\s*:\s*inline-block\s*;?\s*$', re.I), "d-inline-block"),
    (re.compile(r'^\s*text-align\s*:\s*center\s*;?\s*$', re.I), "text-center"),
    (re.compile(r'^\s*text-align\s*:\s*right\s*;?\s*$', re.I), "text-end"),
    (re.compile(r'^\s*text-align\s*:\s*left\s*;?\s*$', re.I), "text-start"),
    (re.compile(r'^\s*width\s*:\s*100%\s*;?\s*$', re.I), "w-100"),
    (re.compile(r'^\s*max-width\s*:\s*100%\s*;?\s*$', re.I), "img-fluid"),
    (re.compile(r'^\s*margin\s*:\s*0\s*;?\s*$', re.I), "m-0"),
    (re.compile(r'^\s*margin-left\s*:\s*auto\s*;\s*margin-right\s*:\s*auto\s*;?\s*$', re.I), "mx-auto"),
    # rough px -> spacer-scale conversion, clamped to Bootstrap's 0-5 range
    (re.compile(r'^\s*margin-top\s*:\s*(\d+)px\s*;?\s*$', re.I), lambda m: f"mt-{min(5, int(m.group(1)) // 4)}"),
    (re.compile(r'^\s*padding\s*:\s*(\d+)px\s*;?\s*$', re.I), lambda m: f"p-{min(5, int(m.group(1)) // 4)}"),
]
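# Rough illustration of the two lambda entries above (assuming ~4px per
# Bootstrap spacer step, clamped to the 0-5 scale):
#   "margin-top: 12px;" -> "mt-3"
#   "padding: 8px;"     -> "p-2"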
def style_to_bootstrap_classes(style_value):
    if not style_value:
        return None
    style_value = style_value.strip()
    # try the whole attribute value first, so multi-declaration patterns
    # (the margin-left/margin-right auto pair) get a chance to match
    for pattern, cls in STYLE_CLASS_MAP:
        m = pattern.match(style_value)
        if m:
            return cls(m) if callable(cls) else cls
    # otherwise split into individual declarations and map each one
    parts = [p.strip() + ";" for p in style_value.split(";") if p.strip()]
    classes = []
    for part in parts:
        for pattern, cls in STYLE_CLASS_MAP:
            m = pattern.match(part)
            if m:
                mapped = cls(m) if callable(cls) else cls
                if mapped and mapped not in classes:
                    classes.append(mapped)
                break
    # declarations with no mapping (color, background, font-size, ...)
    # are silently dropped
    return " ".join(classes) if classes else None
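# Quick sanity checks (expected outputs under the mapping above):
#   style_to_bootstrap_classes("text-align: center;")
#       -> "text-center"
#   style_to_bootstrap_classes("margin-left: auto; margin-right: auto;")
#       -> "mx-auto"
#   style_to_bootstrap_classes("color: red;")
#       -> None  (no mapping, so the declaration is dropped)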
def rewrite_images_links_styles(html_str):
    soup = BeautifulSoup(html_str, "html.parser")
    # images: absolutize the src and route it through DuckDuckGo's image proxy
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            abs_src = urljoin(WIKI_BASE, src)
            # fully percent-encode so the URL survives as a query parameter
            encoded = quote(abs_src, safe="")
            img["src"] = f"https://external-content.duckduckgo.com/iu/?u={encoded}"
        if img.has_attr("srcset"):
            # the srcset variants are not proxied, so drop the attribute
            del img["srcset"]
    # links: rewrite wiki URLs onto the local /w/ scheme
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "/wiki/" in href:
            a["href"] = href.replace("/wiki/", "/w/")
        elif href.startswith("/w/index.php") and "title=" in href:
            # index.php?title=Foo -> /w/Foo
            qs = parse_qs(urlparse(href).query)
            title = qs.get("title", [None])[0]
            if title:
                a["href"] = "/w/" + quote(title, safe="/#?&=%")
        elif href.startswith(WIKI_BASE):
            a["href"] = href.replace(WIKI_BASE, "/w")
    # every element (including the <img> and <a> handled above): convert
    # inline style -> Bootstrap classes where a mapping exists
    for tag in soup.find_all(True):
        if tag.has_attr("style"):
            cls = style_to_bootstrap_classes(tag["style"])
            del tag["style"]
            if cls:
                tag["class"] = tag.get("class", []) + cls.split()
    return str(soup)
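# Note: external-content.duckduckgo.com/iu/ is DuckDuckGo's public image
# proxy; routing every <img> through it keeps readers' requests off the
# wiki's image host, at the cost of depending on a third-party service.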
def remove_mw_editsection(html_str):
    soup = BeautifulSoup(html_str, "html.parser")
    # mw-editsection spans are the "[edit]" links MediaWiki puts next to
    # each heading; remove the whole node
    for span in soup.find_all("span", class_="mw-editsection"):
        span.decompose()
    return str(soup)
def save_html(title, body_html, out_filename):
    # NOTE: `title` is accepted but unused; the body HTML is written as-is
    with open(out_filename, "w", encoding="utf-8") as f:
        f.write(body_html)

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("usage: {} PAGE_NAME".format(sys.argv[0]), file=sys.stderr)
        sys.exit(1)
    page_name = sys.argv[1]
    try:
        title, html_content = fetch_parsed_html(page_name)
    except Exception as e:
        print("parse API call failed:", e, file=sys.stderr)
        sys.exit(2)
    modified = rewrite_images_links_styles(html_content)
    modified = remove_mw_editsection(modified)
    out_filename = "{}.html".format(page_name.replace("/", "_"))
    save_html(title, modified, out_filename)
    print("saved:", out_filename)
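Since the entry point sits behind a __main__ guard, the pieces can also be imported and driven from another script. A minimal sketch, assuming this paste is saved as zippy_wiki.py (made-up name) and the wiki is reachable; the page name is only an example:

    # zippy_wiki is a hypothetical module name for the paste above
    from zippy_wiki import (fetch_parsed_html, rewrite_images_links_styles,
                            remove_mw_editsection)

    title, raw = fetch_parsed_html("メインページ")  # example page name
    body = remove_mw_editsection(rewrite_images_links_styles(raw))
    print(title, len(body))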
Tags: #garbage #淫夢