- from bs4 import BeautifulSoup
- import html, requests, time, os, json
# Workspace directory: all scraped data lives here, one sub-directory
# per article id.
ws_dir = "./yajudic"
os.makedirs(ws_dir, exist_ok=True)

# Route all HTTP(S) traffic through a local SOCKS5 proxy (port 9150 is
# the Tor Browser default); the "socks5h" scheme resolves DNS through
# the proxy as well.
proxies = dict.fromkeys(("http", "https"), "socks5h://127.0.0.1:9150")
def normalize_comment(node):
    """Flatten a BeautifulSoup node into plain text.

    Joins every whitespace-stripped string fragment inside *node* with
    newlines, one fragment per line.
    """
    return "\n".join(node.stripped_strings)
def save_json(wid):
    """Scrape every comment page of article *wid* into per-page JSON files.

    Fetches https://dic.yajuvideo.in/comments pages sequentially (through
    the module-level Tor proxy) and writes each page to
    <ws_dir>/<wid>/<page>.json.  A page already cached with >= 50 comments
    (a full page) is skipped; a partially filled cached page is re-fetched.
    Stops at the first empty page and records the number of saved pages in
    <ws_dir>/<wid>/info.json.  Raises requests.HTTPError on a non-2xx
    response.
    """
    output_dir = os.path.join(ws_dir, f"{wid}")
    os.makedirs(output_dir, exist_ok=True)
    page = 1
    while True:
        output_file = os.path.join(output_dir, f"{page}.json")
        url = f"https://dic.yajuvideo.in/comments?article_id={wid}&page={page}"
        if os.path.exists(output_file):
            with open(output_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            # 50 comments looks like a full page, so the cache is final;
            # fewer means the page may have grown since it was saved.
            if len(data["comments"]) >= 50:
                print(f"exists: {output_file}")
                page += 1
                continue
            else:
                print(f"update: {output_file}")
        else:
            print(url)
        r = requests.get(url, proxies=proxies, timeout=30)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        # Navigate to the comment-thread card: the second direct child of
        # the first .container .row under <main>.  NOTE(review): this
        # mirrors the site's current markup and breaks if the layout changes.
        thread = (
            soup.body
            .find("main")
            .select_one(".container .row")
            .find_all(recursive=False)[1]
            .select_one(".card")
        )
        title = thread.select_one(".card-header").get_text(strip=True)
        comments = thread.select(".card-body .comments-list > div[id^='comment-']")
        # An empty page marks the end of the thread: persist the number of
        # pages actually saved and stop.
        if len(comments) < 1:
            with open(os.path.join(output_dir, f"info.json"), "w", encoding="utf-8") as f:
                info = {
                    "page_num_max": page - 1
                }
                json.dump(info, f, indent=2, ensure_ascii=False)
            break
        data = {
            "title": title,
            "comments": []
        }
        for i, c in enumerate(comments):
            comment = {}
            header = c.select_one(".comment-header")
            spans = header.find_all("span")
            # spans[0] is presumably the comment number; only the name
            # (spans[1]) and date (spans[2]) are kept — TODO confirm against
            # the live markup.
            name = spans[1].get_text(strip=True)
            date = spans[2].get_text(strip=True)
            body = c.select_one(".article-comment-body")
            # Store the body HTML-escaped with explicit <br> line breaks so
            # json_to_html() can re-embed it verbatim.
            text = normalize_comment(body)
            text = html.escape(text)
            text = text.replace("\n", "<br>")
            comment = {
                "name": name,
                "date": date,
                "text": text,
            }
            data["comments"].append(comment)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        # Be polite to the server between page fetches.
        time.sleep(0.1)
        page += 1
def merge_json(wid):
    """Concatenate the per-page JSON files for article *wid* into merged.json.

    Reads <ws_dir>/<wid>/info.json (written by save_json) for the page
    count, then appends every page's "comments" list, in page order, onto
    the first page's data and writes the result to <ws_dir>/<wid>/merged.json.
    """
    input_dir = os.path.join(ws_dir, f"{wid}")
    with open(os.path.join(input_dir, "info.json"), "r", encoding="utf-8") as f:
        info = json.load(f)
    # Start from an empty-but-valid shape so downstream consumers
    # (json_to_dat / json_to_html) do not KeyError when there are no pages.
    data_merged = {"title": "", "comments": []}
    for page in range(1, info["page_num_max"] + 1):
        with open(os.path.join(input_dir, f"{page}.json"), "r", encoding="utf-8") as f:
            data = json.load(f)
        if page == 1:
            # The first page supplies the title as well as its comments.
            data_merged = data
        else:
            data_merged["comments"] += data["comments"]
    with open(os.path.join(input_dir, "merged.json"), "w", encoding="utf-8") as f:
        json.dump(data_merged, f, indent=2, ensure_ascii=False)
def json_to_dat(wid):
    """Render merged.json for article *wid* as a 2ch-style .dat file.

    Each comment becomes one line of "name<> <>date<> text<> title": the
    mail field is left empty and the thread title appears only on the
    first line, per the 2ch/5ch dat convention.  Output goes to
    <ws_dir>/<wid>/all.dat.
    """
    input_dir = os.path.join(ws_dir, f"{wid}")
    output_file = os.path.join(input_dir, "all.dat")
    with open(os.path.join(input_dir, "merged.json"), "r", encoding="utf-8") as f:
        data_merged = json.load(f)
    lines = []
    for i, comment in enumerate(data_merged["comments"]):
        # Title only on the first line, per the dat format.
        title = data_merged["title"] if i == 0 else ""
        # Single quotes inside the f-string: nested double quotes are a
        # SyntaxError on Python < 3.12 (PEP 701 relaxed this in 3.12).
        line = f"{comment['name']}<> <>{comment['date']}<> {comment['text']}<> {title}"
        lines.append(line)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")
def json_to_html(wid):
    """Render merged.json for article *wid* as a Bootstrap HTML fragment.

    Rebuilds the source site's comment-list markup: one numbered
    .article-comment-item per comment inside a card headed by the thread
    title.  Comment "text" is already HTML-escaped (see save_json), so it
    is embedded verbatim.  Output goes to <ws_dir>/<wid>/all.html.
    """
    input_dir = os.path.join(ws_dir, f"{wid}")
    output_file = os.path.join(input_dir, "all.html")
    with open(os.path.join(input_dir, "merged.json"), "r", encoding="utf-8") as f:
        data_merged = json.load(f)
    li_items = []
    for i, comment in enumerate(data_merged["comments"]):
        # BUG FIX: the id was hard-coded to "comment-1" for every item,
        # producing duplicate HTML ids; number them sequentially instead.
        # (Single quotes in the f-string fields keep this valid on
        # Python < 3.12.)
        li_items.append(f"""          <div id="comment-{i + 1}" class="article-comment-item border-bottom p-3">
            <div class="comment-header small text-muted">
              <span class="fw-bold">{i + 1}:</span>
              <span class="ms-1">{comment['name']}</span>
              <span class="ms-2">{comment['date']}</span>
            </div>
            <div class="article-comment-body mt-2">
              <p>{comment['text']}</p>
            </div>
          </div>""")
    li_html = "\n".join(li_items)
    # Named html_out (not "html") so the imported html module used by
    # save_json() is not shadowed.
    html_out = f"""<div class="container py-4">
  <div class="row">
    <div class="col-12 mb-4">
      <div class="card shadow-sm">
        <div class="card-header bg-white">
          <h2 class="card-title h4 mb-0">
            <i class="bi bi-book me-1"></i>
            {data_merged['title']}
          </h2>
        </div>
        <div class="card-body p-0">
          <div class="comments-list">
{li_html}
          </div>
        </div>
      </div>
    </div>
  </div>
</div>"""
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_out)
def main():
    """Run the full pipeline for one article: scrape, merge, export."""
    wid = 5  # target article id on dic.yajuvideo.in
    save_json(wid)
    merge_json(wid)
    json_to_dat(wid)
    json_to_html(wid)


# Guard the entry point so importing this module does not kick off a
# network scrape as a side effect.
if __name__ == "__main__":
    main()
タグ:
#野獣大百科