野獣大百科のコメント一覧アーカイブ化スクリプト

@korosu_gmnt / 更新: 2026/01/10 00:29

HTMLとdatを生成します

TEXT 5.77KB
1
  1. from bs4 import BeautifulSoup
  2. import html, requests, time, os, json
# Working directory for all per-article output; created as a module-level
# side effect so every function below can assume it exists.
ws_dir = "./yajudic"
os.makedirs(ws_dir, exist_ok=True)

# Route all HTTP(S) traffic through a local SOCKS5 proxy (port 9150 is the
# Tor Browser default).  The "socks5h" scheme makes DNS resolution happen
# through the proxy as well, not locally.
proxies = {
    "http": "socks5h://127.0.0.1:9150",
    "https": "socks5h://127.0.0.1:9150",
}
  9. def normalize_comment(node):
  10. parts = []
  11. for s in node.stripped_strings:
  12. parts.append(s)
  13. return "\n".join(parts)
  14. def save_json(wid):
  15. output_dir = os.path.join(ws_dir, f"{wid}")
  16. os.makedirs(output_dir, exist_ok=True)
  17. page = 1
  18. while True:
  19. output_file = os.path.join(output_dir, f"{page}.json")
  20. url = f"https://dic.yajuvideo.in/comments?article_id={wid}&page={page}"
  21. if os.path.exists(output_file):
  22. with open(output_file, "r", encoding="utf-8") as f:
  23. data = json.load(f)
  24. if len(data["comments"]) >= 50:
  25. print(f"exists: {output_file}")
  26. page += 1
  27. continue
  28. else:
  29. print(f"update: {output_file}")
  30. else:
  31. print(url)
  32. r = requests.get(url, proxies=proxies, timeout=30)
  33. r.raise_for_status()
  34. soup = BeautifulSoup(r.text, "html.parser")
  35. thread = (
  36. soup.body
  37. .find("main")
  38. .select_one(".container .row")
  39. .find_all(recursive=False)[1]
  40. .select_one(".card")
  41. )
  42. title = thread.select_one(".card-header").get_text(strip=True)
  43. comments = thread.select(".card-body .comments-list > div[id^='comment-']")
  44. if len(comments) < 1:
  45. with open(os.path.join(output_dir, f"info.json"), "w", encoding="utf-8") as f:
  46. info = {
  47. "page_num_max": page - 1
  48. }
  49. json.dump(info, f, indent=2, ensure_ascii=False)
  50. break
  51. data = {
  52. "title": title,
  53. "comments": []
  54. }
  55. for i, c in enumerate(comments):
  56. comment = {}
  57. header = c.select_one(".comment-header")
  58. spans = header.find_all("span")
  59. name = spans[1].get_text(strip=True)
  60. date = spans[2].get_text(strip=True)
  61. body = c.select_one(".article-comment-body")
  62. text = normalize_comment(body)
  63. text = html.escape(text)
  64. text = text.replace("\n", "<br>")
  65. comment = {
  66. "name": name,
  67. "date": date,
  68. "text": text,
  69. }
  70. data["comments"].append(comment)
  71. with open(output_file, "w", encoding="utf-8") as f:
  72. json.dump(data, f, indent=2, ensure_ascii=False)
  73. time.sleep(0.1)
  74. page += 1
  75. def merge_json(wid):
  76. input_dir = os.path.join(ws_dir, f"{wid}")
  77. data_merged = {}
  78. with open(os.path.join(input_dir, "info.json"), "r", encoding="utf-8") as f:
  79. info = json.load(f)
  80. for i in range(info["page_num_max"]):
  81. with open(os.path.join(input_dir, f"{i+1}.json"), "r", encoding="utf-8") as f:
  82. data = json.load(f)
  83. if i == 0:
  84. data_merged = data
  85. else:
  86. data_merged["comments"] += data["comments"]
  87. with open(os.path.join(input_dir, "merged.json"), "w", encoding="utf-8") as f:
  88. json.dump(data_merged, f, indent=2, ensure_ascii=False)
  89. return
  90. def json_to_dat(wid):
  91. input_dir = os.path.join(ws_dir, f"{wid}")
  92. output_file = os.path.join(input_dir, f"all.dat")
  93. with open(os.path.join(input_dir, "merged.json"), "r", encoding="utf-8") as f:
  94. data_merged = json.load(f)
  95. lines = []
  96. for i, comment in enumerate(data_merged["comments"]):
  97. line = f"{comment["name"]}<> <>{comment["date"]}<> {comment["text"]}<> {data_merged["title"] if i == 0 else ""}"
  98. lines.append(line)
  99. with open(output_file, "w", encoding="utf-8") as f:
  100. f.write("\n".join(lines) + "\n")
  101. def json_to_html(wid):
  102. input_dir = os.path.join(ws_dir, f"{wid}")
  103. output_file = os.path.join(input_dir, f"all.html")
  104. with open(os.path.join(input_dir, "merged.json"), "r", encoding="utf-8") as f:
  105. data_merged = json.load(f)
  106. li_items = []
  107. for i, comment in enumerate(data_merged["comments"]):
  108. li_items.append(f""" <div id="comment-1" class="article-comment-item border-bottom p-3">
  109. <div class="comment-header small text-muted">
  110. <span class="fw-bold">{i+1}:</span>
  111. <span class="ms-1">{comment["name"]}</span>
  112. <span class="ms-2">{comment["date"]}</span>
  113. </div>
  114. <div class="article-comment-body mt-2">
  115. <p>{comment["text"]}</p>
  116. </div>
  117. </div>""")
  118. li_html = "\n".join(li_items)
  119. html = f"""<div class="container py-4">
  120. <div class="row">
  121. <div class="col-12 mb-4">
  122. <div class="card shadow-sm">
  123. <div class="card-header bg-white">
  124. <h2 class="card-title h4 mb-0">
  125. <i class="bi bi-book me-1"></i>
  126. {data_merged["title"]}
  127. </h2>
  128. </div>
  129. <div class="card-body p-0">
  130. <div class="comments-list">
  131. {li_html}
  132. </div>
  133. </div>
  134. </div>
  135. </div>
  136. </div>
  137. </div>"""
  138. with open(output_file, "w", encoding="utf-8") as f:
  139. f.write(html)
  140. def main():
  141. wid = 5
  142. save_json(wid)
  143. merge_json(wid)
  144. json_to_dat(wid)
  145. json_to_html(wid)
  146. main()