- from bs4 import BeautifulSoup
- import html, requests, time, os, json
# Workspace directory: all scraped data lives here, one sub-directory
# per article id.
ws_dir = "./yajudic"
os.makedirs(ws_dir, exist_ok=True)

# Route all HTTP(S) traffic through a local SOCKS5 proxy (port 9150 is
# the Tor Browser default); the "socks5h" scheme resolves DNS through
# the proxy as well.
proxies = dict.fromkeys(("http", "https"), "socks5h://127.0.0.1:9150")
def normalize_comment(node):
    """Flatten a BeautifulSoup node into plain text.

    Joins every whitespace-stripped string fragment inside *node* with
    newlines, one fragment per line.
    """
    return "\n".join(node.stripped_strings)
def save_json(wid):
    """Scrape every comment page of article *wid* into per-page JSON files.

    Fetches https://dic.yajuvideo.in/comments pages sequentially (through
    the module-level Tor proxy) and writes each page to
    <ws_dir>/<wid>/<page>.json.  A page already cached with >= 50 comments
    (a full page) is skipped; a partially filled cached page is re-fetched.
    Stops at the first empty page and records the number of saved pages in
    <ws_dir>/<wid>/info.json.  Raises requests.HTTPError on a non-2xx
    response.
    """
    output_dir = os.path.join(ws_dir, f"{wid}")
    os.makedirs(output_dir, exist_ok=True)
    page = 1
    while True:
        output_file = os.path.join(output_dir, f"{page}.json")
        url = f"https://dic.yajuvideo.in/comments?article_id={wid}&page={page}"
        if os.path.exists(output_file):
            with open(output_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            # 50 comments looks like a full page, so the cache is final;
            # fewer means the page may have grown since it was saved.
            if len(data["comments"]) >= 50:
                print(f"exists: {output_file}")
                page += 1
                continue
            else:
                print(f"update: {output_file}")
        else:
            print(url)
        r = requests.get(url, proxies=proxies, timeout=30)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        # Navigate to the comment-thread card: the second direct child of
        # the first .container .row under <main>.  NOTE(review): this
        # mirrors the site's current markup and breaks if the layout changes.
        thread = (
            soup.body
            .find("main")
            .select_one(".container .row")
            .find_all(recursive=False)[1]
            .select_one(".card")
        )
        title = thread.select_one(".card-header").get_text(strip=True)
        comments = thread.select(".card-body .comments-list > div[id^='comment-']")
        # An empty page marks the end of the thread: persist the number of
        # pages actually saved and stop.
        if len(comments) < 1:
            with open(os.path.join(output_dir, f"info.json"), "w", encoding="utf-8") as f:
                info = {
                    "page_num_max": page - 1
                }
                json.dump(info, f, indent=2, ensure_ascii=False)
            break
        data = {
            "title": title,
            "comments": []
        }
        for i, c in enumerate(comments):
            comment = {}
            header = c.select_one(".comment-header")
            spans = header.find_all("span")
            # spans[0] is presumably the comment number; only the name
            # (spans[1]) and date (spans[2]) are kept — TODO confirm against
            # the live markup.
            name = spans[1].get_text(strip=True)
            date = spans[2].get_text(strip=True)
            body = c.select_one(".article-comment-body")
            # Store the body HTML-escaped with explicit <br> line breaks so
            # json_to_html() can re-embed it verbatim.
            text = normalize_comment(body)
            text = html.escape(text)
            text = text.replace("\n", "<br>")
            comment = {
                "name": name,
                "date": date,
                "text": text,
            }
            data["comments"].append(comment)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        # Be polite to the server between page fetches.
        time.sleep(0.1)
        page += 1
def merge_json(wid):
    """Concatenate the per-page JSON files for article *wid* into merged.json.

    Reads <ws_dir>/<wid>/info.json (written by save_json) for the page
    count, then appends every page's "comments" list, in page order, onto
    the first page's data and writes the result to <ws_dir>/<wid>/merged.json.
    """
    input_dir = os.path.join(ws_dir, f"{wid}")
    with open(os.path.join(input_dir, "info.json"), "r", encoding="utf-8") as f:
        info = json.load(f)
    # Start from an empty-but-valid shape so downstream consumers
    # (json_to_dat / json_to_html) do not KeyError when there are no pages.
    data_merged = {"title": "", "comments": []}
    for page in range(1, info["page_num_max"] + 1):
        with open(os.path.join(input_dir, f"{page}.json"), "r", encoding="utf-8") as f:
            data = json.load(f)
        if page == 1:
            # The first page supplies the title as well as its comments.
            data_merged = data
        else:
            data_merged["comments"] += data["comments"]
    with open(os.path.join(input_dir, "merged.json"), "w", encoding="utf-8") as f:
        json.dump(data_merged, f, indent=2, ensure_ascii=False)
def json_to_dat(wid):
    """Render merged.json for article *wid* as a 2ch-style .dat file.

    Each comment becomes one line of "name<> <>date<> text<> title": the
    mail field is left empty and the thread title appears only on the
    first line, per the 2ch/5ch dat convention.  Output goes to
    <ws_dir>/<wid>/all.dat.
    """
    input_dir = os.path.join(ws_dir, f"{wid}")
    output_file = os.path.join(input_dir, "all.dat")
    with open(os.path.join(input_dir, "merged.json"), "r", encoding="utf-8") as f:
        data_merged = json.load(f)
    lines = []
    for i, comment in enumerate(data_merged["comments"]):
        # Title only on the first line, per the dat format.
        title = data_merged["title"] if i == 0 else ""
        # Single quotes inside the f-string: nested double quotes are a
        # SyntaxError on Python < 3.12 (PEP 701 relaxed this in 3.12).
        line = f"{comment['name']}<> <>{comment['date']}<> {comment['text']}<> {title}"
        lines.append(line)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")
def json_to_html(wid):
    """Render merged.json for article *wid* as a Bootstrap HTML fragment.

    Rebuilds the source site's comment-list markup: one numbered
    .article-comment-item per comment inside a card headed by the thread
    title.  Comment "text" is already HTML-escaped (see save_json), so it
    is embedded verbatim.  Output goes to <ws_dir>/<wid>/all.html.
    """
    input_dir = os.path.join(ws_dir, f"{wid}")
    output_file = os.path.join(input_dir, "all.html")
    with open(os.path.join(input_dir, "merged.json"), "r", encoding="utf-8") as f:
        data_merged = json.load(f)
    li_items = []
    for i, comment in enumerate(data_merged["comments"]):
        # BUG FIX: the id was hard-coded to "comment-1" for every item,
        # producing duplicate HTML ids; number them sequentially instead.
        # (Single quotes in the f-string fields keep this valid on
        # Python < 3.12.)
        li_items.append(f"""          <div id="comment-{i + 1}" class="article-comment-item border-bottom p-3">
            <div class="comment-header small text-muted">
              <span class="fw-bold">{i + 1}:</span>
              <span class="ms-1">{comment['name']}</span>
              <span class="ms-2">{comment['date']}</span>
            </div>
            <div class="article-comment-body mt-2">
              <p>{comment['text']}</p>
            </div>
          </div>""")
    li_html = "\n".join(li_items)
    # Named html_out (not "html") so the imported html module used by
    # save_json() is not shadowed.
    html_out = f"""<div class="container py-4">
  <div class="row">
    <div class="col-12 mb-4">
      <div class="card shadow-sm">
        <div class="card-header bg-white">
          <h2 class="card-title h4 mb-0">
            <i class="bi bi-book me-1"></i>
            {data_merged['title']}
          </h2>
        </div>
        <div class="card-body p-0">
          <div class="comments-list">
{li_html}
          </div>
        </div>
      </div>
    </div>
  </div>
</div>"""
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_out)
def main():
    """Run the full pipeline for one article: scrape, merge, export."""
    wid = 5  # target article id on dic.yajuvideo.in
    save_json(wid)
    merge_json(wid)
    json_to_dat(wid)
    json_to_html(wid)


# Guard the entry point so importing this module does not kick off a
# network scrape as a side effect.
if __name__ == "__main__":
    main()
タグ:
#野獣大百科