TA的每日心情 | 开心 2021-1-4 22:53 |
---|
签到天数: 5 天 [LV.2]偶尔看看I
童生
- 积分
- 258
|
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- import os
- import requests
- import re
- import time
- import sqlite3
- from string import ascii_lowercase
- from pathlib import Path
- from urllib.parse import urljoin
- from html.parser import HTMLParser
- from bs4 import BeautifulSoup
# Browse-index roots for several dictionaries; only the Cambridge
# English-Chinese index is actually crawled by the __main__ block below.
webster_url = "https://www.merriam-webster.com/browse/dictionary/"
oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"
macmillan_url = "https://www.macmillandictionary.com/browse/british/"
cambridge_url = "https://dictionary.cambridge.org/browse/english/"
cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"
# Direct-search endpoint and request boilerplate (base_url/payload are
# currently unused by the code below).
base_url = "https://dictionary.cambridge.org/search/direct/"
header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}
payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}
# Local cache database: one row per dictionary entry -- the headword, its
# source URL, the raw page HTML, and (optionally) the rendered LaTeX.
conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")
conn.execute("""
CREATE TABLE IF NOT EXISTS cambridge (
    entry TEXT PRIMARY KEY
               UNIQUE
               NOT NULL,
    url   TEXT NOT NULL,
    html  TEXT NOT NULL,
    latex TEXT
);
""")
conn.commit()
- def latexify(string):
- result = ""
- trimmed_string = re.sub(r"\s{2,}", " ", string)
- for char in re.sub(r"\s+([^\w'"({[])", r"\1", trimmed_string):
- if char == "%":
- result += r"\%"
- elif char == "\":
- result += r"\textbackslash{}"
- elif char == "$":
- result += r"\$"
- elif char == "#":
- result += r"\#"
- elif char == "&":
- result += r"\&"
- elif char == "{":
- result += r"\{"
- elif char == "}":
- result += r"\}"
- elif char == "^":
- result += r"\^"
- elif char == "_":
- result += r"\_"
- elif char == "~":
- result += "r\textasciitilde{}"
- else:
- result += char
- return result
def latexify_html(beautifulsoup_object):
    """Flatten a BeautifulSoup node into LaTeX-escaped plain text.

    Newlines become spaces and runs of whitespace collapse before escaping.
    If the argument has no ``get_text`` method (i.e. it is already a plain
    string), it is escaped as-is.
    """
    try:
        text = beautifulsoup_object.get_text()
    except AttributeError:
        # Not a soup node -- assume it is already a plain string.  The
        # original bare ``except:`` would also have hidden real errors.
        return latexify(beautifulsoup_object)
    return latexify(re.sub(r"\s{2,}", " ", text.replace("\n", " ")).strip())
class CambridgeDictionaryScraper:
    """Scraper for the Cambridge Dictionary browse index.

    Walks the letter/word index pages and stores each entry's page HTML in
    the module-level ``cambridge`` sqlite table via ``conn``.
    """

    url_set = set()  # Shared by all instances: URLs already stored in the DB.

    def __init__(self):
        # Seed the cache with everything already downloaded so a rerun
        # resumes instead of re-fetching.
        for item in conn.execute("SELECT url FROM cambridge;"):
            self.url_set.add(item[0])
        print("Already Downloaded " + str(len(self.url_set)) + " Words!")

    def __del__(self):
        # Best-effort final flush.  NOTE(review): relying on __del__ for a
        # commit is fragile; an explicit close() would be safer.
        conn.commit()

    def get_word_page(self, url):
        """Fetch *url* and return the HTML of the dictionary entry body.

        Tries progressively broader containers ("entry", "di-body", then
        "entryContent" for beta words) and falls back to the whole page.
        <script> tags are stripped before serializing.  The original code
        repeated the whole-page fallback twice; the ``or`` chain below is
        equivalent without the duplication.
        """
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        entry_tag = (
            bs_obj.find("div", {"class": "entry"})
            or bs_obj.find("div", {"class": "di-body"})
            or bs_obj.find("div", {"id": "entryContent"})  # beta words
            or bs_obj
        )
        for tag in entry_tag.find_all("script"):
            tag.extract()
        return str(entry_tag)

    def start(self, url):
        """Walk the top-level browse page, descending into each index group."""
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.lpr-10"):
            child_url = urljoin(cambridge_url, li_tag.a.attrs["href"])
            print(child_url)
            self.find_child_entry(child_url)

    def find_child_entry(self, url):
        """Recursively collect entries below *url*.

        An item whose text contains "..." is a sub-index page and is
        recursed into; anything else is a word entry, fetched and inserted
        unless its URL was already stored.
        """
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.t-i"):
            child_url = urljoin(url, li_tag.a.attrs["href"]).strip()
            child_text = li_tag.get_text().strip()
            if "..." in child_text:
                self.find_child_entry(child_url)
                continue
            if child_url in self.url_set:
                continue
            print(child_text + "\t" + child_url)
            conn.execute(
                "INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                (child_text, child_url, self.get_word_page(child_url)))
            conn.commit()  # commit per word so progress survives a crash
            self.url_set.add(child_url)
class CambridgeDictionaryExtractor:
    """Converts one stored Cambridge entry's HTML into LaTeX markup.

    The rendered LaTeX accumulates in ``self.result``; call
    :meth:`extract` after construction to populate it.
    """

    def __init__(self, entry, entry_html=""):
        self.entry = latexify(entry)   # headword, already LaTeX-escaped
        self.entry_html = entry_html   # raw HTML saved by the scraper
        self.result = ""

    def extract(self):
        """Render the whole entry into ``self.result``.

        Expected idiom markup::

            <div class="pr idiom-block">
                <div class="idiom-block"></div>
            </div>
        """
        bs_obj = BeautifulSoup(self.entry_html, "html.parser")
        self.result += "\\begin{entry}{" + self.entry + "}"
        for part_of_speech_tag in bs_obj.find_all("div", {"class": "entry-body__el"}):
            self.result += ("\n\\begin{Partofspeech}\n"
                            + self.process_part_of_speech(part_of_speech_tag)
                            + "\n\\end{Partofspeech}")
        idiom_block = bs_obj.find("div", {"class": "idiom-block"})
        if idiom_block:
            # The outer "pr idiom-block" wraps one inner "idiom-block" per idiom.
            for inner_block in idiom_block.find_all("div", {"class": "idiom-block"}):
                self.result += ("\n\\begin{idiom}"
                                + self.process_idiom_block(inner_block)
                                + "\n\\end{idiom}")
        self.result += "\n\\end{entry}\n\n"

    def process_idiom_block(self, idiom_block):
        """Render every sense inside an idiom's ``span.idiom-body``."""
        result = ""
        idiom_body = idiom_block.find("span", {"class": "idiom-body"})
        if idiom_body:
            for sense_tag in idiom_body.find_all("div", {"class": "dsense"}):
                result += self.process_sense_tag(sense_tag)
        return result

    def get_smart_vocabulary(self, smart_vocabulary_tag):
        """Render a "Smart Vocabulary" list as a smartvocabulary environment."""
        result = ""
        for li_tag in smart_vocabulary_tag.find_all("li"):
            result += "\\smart{" + latexify_html(li_tag) + "}\n"
        return "\n\\begin{smartvocabulary}\n" + result + "\\end{smartvocabulary}\n"

    def process_part_of_speech(self, part_of_speech_tag):
        """Render one part-of-speech section.

        Expected markup::

            <div class="entry-body__el">
                <div class="pos-header"></div>
                <div class="pos-body"></div>
                <div class="pr relativDiv"></div>
            </div>
        """
        result = ""
        header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})
        if header_tag:
            result += self.process_part_of_speech_header(header_tag)
        body_tag = part_of_speech_tag.find("div", {"class": "pos-body"})
        if body_tag:
            result += self.process_part_of_speech_body(body_tag)
        pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})
        if pv_block_tag:
            result += self.process_pv_block(pv_block_tag)
        return result.strip()

    def process_pv_block(self, tag):
        """Render a phrasal-verb block.

        Expected markup::

            <div class="pv-block">
                <div class="di-title"></div>
                <span class="di-info"></span>
                <span class="pv-body dpv-body">
                    <div class="pr dsense dsense-noh"></div>
                </span>
            </div>
        """
        result = ""
        # BUG FIX: the original passed the SET {"class", "sense-body"} to
        # find_all, which raises at runtime; a dict attribute filter is
        # what BeautifulSoup expects.
        for item in tag.find_all("div", {"class": "sense-body"}):
            result += self.process_sense_body(item)
        return result

    def process_part_of_speech_header(self, header_tag):
        """Render the POS header: grammar label plus pronunciations."""
        result = ""
        posgram_tag = header_tag.find("div", {"class": "posgram"})
        if posgram_tag:
            result += self.process_part_of_speech_grammar(posgram_tag)
        for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
            result += self.process_pronunciation(pronunciation_tag)
        return result.strip()

    def process_header_title(self, title_tag):
        """Render the headword as an \\entry macro.

        NOTE(review): currently unused -- extract() emits the headword from
        ``self.entry`` instead.

        Expected markup: ``<span class="hw dhw">record</span>``.
        """
        result = ""
        headword_tag = title_tag.find("span", {"class": "hw"})
        if headword_tag:
            result += "\\entry{" + latexify_html(headword_tag) + "}\n"
        else:
            result += "\\entry{" + latexify_html(title_tag) + "}\n"
        return result

    def process_part_of_speech_grammar(self, posgram_tag):
        """Render the part of speech (\\pos) and grammar code (\\posgram)."""
        result = ""
        part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
        if part_of_speech_tag:
            result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
        gram_tag = posgram_tag.find("span", {"class": "gc"})
        if gram_tag:
            result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
        return result

    def process_pronunciation(self, pronunciation_tag):
        """Render IPA (\\ipaus / \\ipauk by region) and the audio URL."""
        is_us_pronunciation = "us" in pronunciation_tag.attrs["class"]
        result = ""
        audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})
        ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
        if ipa_tag:
            if is_us_pronunciation:
                result += "\n\\ipaus{" + latexify_html(ipa_tag) + "}"
            else:
                result += "\n\\ipauk{" + latexify_html(ipa_tag) + "}"
        if audio_tag:
            audio_url = urljoin("https://dictionary.cambridge.org/", audio_tag.attrs["src"])
            # NOTE(review): "\pronuniation" looks like a typo for
            # "\pronunciation", but the LaTeX preamble is not visible here
            # and may define the misspelled macro -- kept as-is; verify.
            result += "\n\\pronuniation{" + audio_url + "}"
        return result

    def process_sense_head(self, tag):
        """Render the short gloss: the text between the first ( and )."""
        text = latexify_html(tag)
        if "(" in text:
            left_bracket_index = text.index("(")
        else:
            left_bracket_index = 0
        if ")" in text:
            right_bracket_index = text.index(")")
        else:
            right_bracket_index = len(text)
        return "\n\\shortmeaning{" + text[left_bracket_index + 1: right_bracket_index].strip() + "}"

    def get_definition(self, tag):
        """Concatenate the definition info and definition text of a sense."""
        result = ""
        # NOTE(review): "def-into" is likely a typo for "def-info" and
        # currently matches nothing; kept byte-for-byte -- confirm against
        # the live site's markup before changing.
        for def_info_tag in tag.select("span.def-into"):
            result += latexify_html(def_info_tag)
        for def_tag in tag.select("div.def"):
            result += latexify_html(def_tag)
        return result

    def process_def_block(self, tag):
        """Render one definition block: English meaning, Chinese translation,
        and example sentences."""
        result = ""
        def_tag = tag.find("div", {"class": "ddef_h"})
        if def_tag:
            result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"
        # The translation span is optional; the original used a bare
        # try/except around an unchecked [0] index.
        trans_tags = tag.select("span.trans")
        if trans_tags:
            result += "\n\\meaningcn{" + latexify_html(trans_tags[0]) + "}"
        for example_tag in tag.select("span.eg"):
            result += "\n\\example{" + latexify_html(example_tag) + "}"
        return result

    def process_phrase_block(self, phrase_block_tag):
        """Render a phrase block's title (body content is not yet rendered).

        Expected markup::

            <div class="phrase-head dphrase_h">...</div>
            <div class="phrase-body dphrase_b">...</div>
            <div class="bb hax">...</div>
        """
        result = "\\begin{phrase}{"
        # Guard the title lookup; the original indexed [0] unconditionally.
        title_tags = phrase_block_tag.select("span.phrase-title")
        if title_tags:
            result += title_tags[0].get_text().strip()
        result += "}"
        # BUG FIX: the original closed with "\end{pharse}", mismatching the
        # opening "\begin{phrase}" and producing invalid LaTeX.
        return result + "\\end{phrase}\n"

    def process_sense_body(self, tag):
        """Render a sense body: its definition blocks, then phrase blocks.

        Expected markup::

            <div class="pr phrase-block dphrase-block">...</div>
            <div class="def-block ddef_block">...</div>
            <div class="bb hax">...</div>
        """
        result = ""
        for def_block in tag.select("div.def-block"):
            result += self.process_def_block(def_block)
        # BUG FIX: the original selector "div.pharse-block" (typo) matched
        # nothing, so phrase blocks were silently skipped.
        for phrase_block in tag.select("div.phrase-block"):
            result += self.process_phrase_block(phrase_block)
        return result

    def process_sense_tag(self, sense_tag):
        """Render one sense: head, bodies, then Smart Vocabulary lists.

        Expected markup::

            <h3 class="dsense_h">...</h3>
            <div class="sense-body dsense_b">...</div>
            <div class="smartt daccord">...</div>  <!-- Smart Vocabulary -->
            <div class="bb hax">...</div>
        """
        result = ""
        sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
        if sense_head_tag:
            result += self.process_sense_head(sense_head_tag)
        for sense_body_tag in sense_tag.select("div.sense-body"):
            result += self.process_sense_body(sense_body_tag)
        for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
            result += self.get_smart_vocabulary(smart_vocabulary_tag)
        return result

    def process_part_of_speech_body(self, body_tag):
        """Render every sense ("div.dsense") inside a POS body."""
        result = ""
        for sense_tag in body_tag.select("div.dsense"):
            result += self.process_sense_tag(sense_tag)
        return result
-
if __name__ == "__main__":
    # Crawl (or resume crawling) the English-Chinese browse index, then
    # render every cached entry to LaTeX.
    CambridgeDictionaryScraper().start(cambridge_english_chinese_url)
    # Collect pieces in a list and join once -- repeated ``+=`` on a string
    # is quadratic over thousands of entries.  (Also avoids shadowing the
    # ``string`` module name.)
    parts = []
    for entry, url, html in conn.execute("SELECT entry, url, html FROM cambridge;"):
        print(entry)
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        parts.append(record.result + "\n")
    output = "".join(parts)
    # Insert a \section heading before the first entry of each letter.
    for char in ascii_lowercase:
        output = output.replace(
            "\\begin{entry}{" + char,
            "\\section{" + char + "}\n\n\\begin{entry}{" + char,
            1)
    # NOTE(review): the original ended with ``string.replace("", "")`` -- a
    # no-op whose search text was evidently garbled in transit (it likely
    # stripped some special character).  Dropped here; restore the intended
    # replacement if the original source can be recovered.
    with open("./final.tex", "w", encoding="utf-8") as f:
        f.write(output)
-
复制代码 |
|