TA的每日心情 | 开心 2021-1-4 22:53 |
---|
签到天数: 5 天 [LV.2]偶尔看看I
童生
- 积分
- 258
|
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- import os
- import requests
- import re
- import time
- import sqlite3
- from string import ascii_lowercase
- from pathlib import Path
- from urllib.parse import urljoin
- from html.parser import HTMLParser
- from bs4 import BeautifulSoup
# Browse-index roots for several dictionaries; only the Cambridge
# English-Chinese index is actually crawled by the __main__ block below.
webster_url = "https://www.merriam-webster.com/browse/dictionary/"
oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"
macmillan_url = "https://www.macmillandictionary.com/browse/british/"
cambridge_url = "https://dictionary.cambridge.org/browse/english/"
cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"
# Direct-search endpoint and request boilerplate (base_url/payload are
# currently unused by the code below).
base_url = "https://dictionary.cambridge.org/search/direct/"
header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}
payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}
# Local cache database: one row per dictionary entry -- the headword, its
# source URL, the raw page HTML, and (optionally) the rendered LaTeX.
conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")
conn.execute("""
CREATE TABLE IF NOT EXISTS cambridge (
    entry TEXT PRIMARY KEY
               UNIQUE
               NOT NULL,
    url   TEXT NOT NULL,
    html  TEXT NOT NULL,
    latex TEXT
);
""")
conn.commit()
- def latexify(string):
- result = ""
- trimmed_string = re.sub(r"\s{2,}", " ", string)
- for char in re.sub(r"\s+([^\w'"({[])", r"\1", trimmed_string):
- if char == "%":
- result += r"\%"
- elif char == "\":
- result += r"\textbackslash{}"
- elif char == "$":
- result += r"\$"
- elif char == "#":
- result += r"\#"
- elif char == "&":
- result += r"\&"
- elif char == "{":
- result += r"\{"
- elif char == "}":
- result += r"\}"
- elif char == "^":
- result += r"\^"
- elif char == "_":
- result += r"\_"
- elif char == "~":
- result += "r\textasciitilde{}"
- else:
- result += char
- return result
def latexify_html(beautifulsoup_object):
    """Flatten a BeautifulSoup node into LaTeX-escaped plain text.

    Newlines become spaces and runs of whitespace collapse before escaping.
    If the argument has no ``get_text`` method (i.e. it is already a plain
    string), it is escaped as-is.
    """
    try:
        text = beautifulsoup_object.get_text()
    except AttributeError:
        # Not a soup node -- assume it is already a plain string.  The
        # original bare ``except:`` would also have hidden real errors.
        return latexify(beautifulsoup_object)
    return latexify(re.sub(r"\s{2,}", " ", text.replace("\n", " ")).strip())
class CambridgeDictionaryScraper:
    """Scraper for the Cambridge Dictionary browse index.

    Walks the letter/word index pages and stores each entry's page HTML in
    the module-level ``cambridge`` sqlite table via ``conn``.
    """

    url_set = set()  # Shared by all instances: URLs already stored in the DB.

    def __init__(self):
        # Seed the cache with everything already downloaded so a rerun
        # resumes instead of re-fetching.
        for item in conn.execute("SELECT url FROM cambridge;"):
            self.url_set.add(item[0])
        print("Already Downloaded " + str(len(self.url_set)) + " Words!")

    def __del__(self):
        # Best-effort final flush.  NOTE(review): relying on __del__ for a
        # commit is fragile; an explicit close() would be safer.
        conn.commit()

    def get_word_page(self, url):
        """Fetch *url* and return the HTML of the dictionary entry body.

        Tries progressively broader containers ("entry", "di-body", then
        "entryContent" for beta words) and falls back to the whole page.
        <script> tags are stripped before serializing.  The original code
        repeated the whole-page fallback twice; the ``or`` chain below is
        equivalent without the duplication.
        """
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        entry_tag = (
            bs_obj.find("div", {"class": "entry"})
            or bs_obj.find("div", {"class": "di-body"})
            or bs_obj.find("div", {"id": "entryContent"})  # beta words
            or bs_obj
        )
        for tag in entry_tag.find_all("script"):
            tag.extract()
        return str(entry_tag)

    def start(self, url):
        """Walk the top-level browse page, descending into each index group."""
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.lpr-10"):
            child_url = urljoin(cambridge_url, li_tag.a.attrs["href"])
            print(child_url)
            self.find_child_entry(child_url)

    def find_child_entry(self, url):
        """Recursively collect entries below *url*.

        An item whose text contains "..." is a sub-index page and is
        recursed into; anything else is a word entry, fetched and inserted
        unless its URL was already stored.
        """
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.t-i"):
            child_url = urljoin(url, li_tag.a.attrs["href"]).strip()
            child_text = li_tag.get_text().strip()
            if "..." in child_text:
                self.find_child_entry(child_url)
                continue
            if child_url in self.url_set:
                continue
            print(child_text + "\t" + child_url)
            conn.execute(
                "INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                (child_text, child_url, self.get_word_page(child_url)))
            conn.commit()  # commit per word so progress survives a crash
            self.url_set.add(child_url)
class CambridgeDictionaryExtractor:
    """Converts one stored Cambridge entry's HTML into LaTeX markup.

    The rendered LaTeX accumulates in ``self.result``; call
    :meth:`extract` after construction to populate it.
    """

    def __init__(self, entry, entry_html=""):
        self.entry = latexify(entry)   # headword, already LaTeX-escaped
        self.entry_html = entry_html   # raw HTML saved by the scraper
        self.result = ""

    def extract(self):
        """Render the whole entry into ``self.result``.

        Expected idiom markup::

            <div class="pr idiom-block">
                <div class="idiom-block"></div>
            </div>
        """
        bs_obj = BeautifulSoup(self.entry_html, "html.parser")
        self.result += "\\begin{entry}{" + self.entry + "}"
        for part_of_speech_tag in bs_obj.find_all("div", {"class": "entry-body__el"}):
            self.result += ("\n\\begin{Partofspeech}\n"
                            + self.process_part_of_speech(part_of_speech_tag)
                            + "\n\\end{Partofspeech}")
        idiom_block = bs_obj.find("div", {"class": "idiom-block"})
        if idiom_block:
            # The outer "pr idiom-block" wraps one inner "idiom-block" per idiom.
            for inner_block in idiom_block.find_all("div", {"class": "idiom-block"}):
                self.result += ("\n\\begin{idiom}"
                                + self.process_idiom_block(inner_block)
                                + "\n\\end{idiom}")
        self.result += "\n\\end{entry}\n\n"

    def process_idiom_block(self, idiom_block):
        """Render every sense inside an idiom's ``span.idiom-body``."""
        result = ""
        idiom_body = idiom_block.find("span", {"class": "idiom-body"})
        if idiom_body:
            for sense_tag in idiom_body.find_all("div", {"class": "dsense"}):
                result += self.process_sense_tag(sense_tag)
        return result

    def get_smart_vocabulary(self, smart_vocabulary_tag):
        """Render a "Smart Vocabulary" list as a smartvocabulary environment."""
        result = ""
        for li_tag in smart_vocabulary_tag.find_all("li"):
            result += "\\smart{" + latexify_html(li_tag) + "}\n"
        return "\n\\begin{smartvocabulary}\n" + result + "\\end{smartvocabulary}\n"

    def process_part_of_speech(self, part_of_speech_tag):
        """Render one part-of-speech section.

        Expected markup::

            <div class="entry-body__el">
                <div class="pos-header"></div>
                <div class="pos-body"></div>
                <div class="pr relativDiv"></div>
            </div>
        """
        result = ""
        header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})
        if header_tag:
            result += self.process_part_of_speech_header(header_tag)
        body_tag = part_of_speech_tag.find("div", {"class": "pos-body"})
        if body_tag:
            result += self.process_part_of_speech_body(body_tag)
        pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})
        if pv_block_tag:
            result += self.process_pv_block(pv_block_tag)
        return result.strip()

    def process_pv_block(self, tag):
        """Render a phrasal-verb block.

        Expected markup::

            <div class="pv-block">
                <div class="di-title"></div>
                <span class="di-info"></span>
                <span class="pv-body dpv-body">
                    <div class="pr dsense dsense-noh"></div>
                </span>
            </div>
        """
        result = ""
        # BUG FIX: the original passed the SET {"class", "sense-body"} to
        # find_all, which raises at runtime; a dict attribute filter is
        # what BeautifulSoup expects.
        for item in tag.find_all("div", {"class": "sense-body"}):
            result += self.process_sense_body(item)
        return result

    def process_part_of_speech_header(self, header_tag):
        """Render the POS header: grammar label plus pronunciations."""
        result = ""
        posgram_tag = header_tag.find("div", {"class": "posgram"})
        if posgram_tag:
            result += self.process_part_of_speech_grammar(posgram_tag)
        for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
            result += self.process_pronunciation(pronunciation_tag)
        return result.strip()

    def process_header_title(self, title_tag):
        """Render the headword as an \\entry macro.

        NOTE(review): currently unused -- extract() emits the headword from
        ``self.entry`` instead.

        Expected markup: ``<span class="hw dhw">record</span>``.
        """
        result = ""
        headword_tag = title_tag.find("span", {"class": "hw"})
        if headword_tag:
            result += "\\entry{" + latexify_html(headword_tag) + "}\n"
        else:
            result += "\\entry{" + latexify_html(title_tag) + "}\n"
        return result

    def process_part_of_speech_grammar(self, posgram_tag):
        """Render the part of speech (\\pos) and grammar code (\\posgram)."""
        result = ""
        part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
        if part_of_speech_tag:
            result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
        gram_tag = posgram_tag.find("span", {"class": "gc"})
        if gram_tag:
            result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
        return result

    def process_pronunciation(self, pronunciation_tag):
        """Render IPA (\\ipaus / \\ipauk by region) and the audio URL."""
        is_us_pronunciation = "us" in pronunciation_tag.attrs["class"]
        result = ""
        audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})
        ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
        if ipa_tag:
            if is_us_pronunciation:
                result += "\n\\ipaus{" + latexify_html(ipa_tag) + "}"
            else:
                result += "\n\\ipauk{" + latexify_html(ipa_tag) + "}"
        if audio_tag:
            audio_url = urljoin("https://dictionary.cambridge.org/", audio_tag.attrs["src"])
            # NOTE(review): "\pronuniation" looks like a typo for
            # "\pronunciation", but the LaTeX preamble is not visible here
            # and may define the misspelled macro -- kept as-is; verify.
            result += "\n\\pronuniation{" + audio_url + "}"
        return result

    def process_sense_head(self, tag):
        """Render the short gloss: the text between the first ( and )."""
        text = latexify_html(tag)
        if "(" in text:
            left_bracket_index = text.index("(")
        else:
            left_bracket_index = 0
        if ")" in text:
            right_bracket_index = text.index(")")
        else:
            right_bracket_index = len(text)
        return "\n\\shortmeaning{" + text[left_bracket_index + 1: right_bracket_index].strip() + "}"

    def get_definition(self, tag):
        """Concatenate the definition info and definition text of a sense."""
        result = ""
        # NOTE(review): "def-into" is likely a typo for "def-info" and
        # currently matches nothing; kept byte-for-byte -- confirm against
        # the live site's markup before changing.
        for def_info_tag in tag.select("span.def-into"):
            result += latexify_html(def_info_tag)
        for def_tag in tag.select("div.def"):
            result += latexify_html(def_tag)
        return result

    def process_def_block(self, tag):
        """Render one definition block: English meaning, Chinese translation,
        and example sentences."""
        result = ""
        def_tag = tag.find("div", {"class": "ddef_h"})
        if def_tag:
            result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"
        # The translation span is optional; the original used a bare
        # try/except around an unchecked [0] index.
        trans_tags = tag.select("span.trans")
        if trans_tags:
            result += "\n\\meaningcn{" + latexify_html(trans_tags[0]) + "}"
        for example_tag in tag.select("span.eg"):
            result += "\n\\example{" + latexify_html(example_tag) + "}"
        return result

    def process_phrase_block(self, phrase_block_tag):
        """Render a phrase block's title (body content is not yet rendered).

        Expected markup::

            <div class="phrase-head dphrase_h">...</div>
            <div class="phrase-body dphrase_b">...</div>
            <div class="bb hax">...</div>
        """
        result = "\\begin{phrase}{"
        # Guard the title lookup; the original indexed [0] unconditionally.
        title_tags = phrase_block_tag.select("span.phrase-title")
        if title_tags:
            result += title_tags[0].get_text().strip()
        result += "}"
        # BUG FIX: the original closed with "\end{pharse}", mismatching the
        # opening "\begin{phrase}" and producing invalid LaTeX.
        return result + "\\end{phrase}\n"

    def process_sense_body(self, tag):
        """Render a sense body: its definition blocks, then phrase blocks.

        Expected markup::

            <div class="pr phrase-block dphrase-block">...</div>
            <div class="def-block ddef_block">...</div>
            <div class="bb hax">...</div>
        """
        result = ""
        for def_block in tag.select("div.def-block"):
            result += self.process_def_block(def_block)
        # BUG FIX: the original selector "div.pharse-block" (typo) matched
        # nothing, so phrase blocks were silently skipped.
        for phrase_block in tag.select("div.phrase-block"):
            result += self.process_phrase_block(phrase_block)
        return result

    def process_sense_tag(self, sense_tag):
        """Render one sense: head, bodies, then Smart Vocabulary lists.

        Expected markup::

            <h3 class="dsense_h">...</h3>
            <div class="sense-body dsense_b">...</div>
            <div class="smartt daccord">...</div>  <!-- Smart Vocabulary -->
            <div class="bb hax">...</div>
        """
        result = ""
        sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
        if sense_head_tag:
            result += self.process_sense_head(sense_head_tag)
        for sense_body_tag in sense_tag.select("div.sense-body"):
            result += self.process_sense_body(sense_body_tag)
        for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
            result += self.get_smart_vocabulary(smart_vocabulary_tag)
        return result

    def process_part_of_speech_body(self, body_tag):
        """Render every sense ("div.dsense") inside a POS body."""
        result = ""
        for sense_tag in body_tag.select("div.dsense"):
            result += self.process_sense_tag(sense_tag)
        return result
-
if __name__ == "__main__":
    # Crawl (or resume crawling) the English-Chinese browse index, then
    # render every cached entry to LaTeX.
    CambridgeDictionaryScraper().start(cambridge_english_chinese_url)
    # Collect pieces in a list and join once -- repeated ``+=`` on a string
    # is quadratic over thousands of entries.  (Also avoids shadowing the
    # ``string`` module name.)
    parts = []
    for entry, url, html in conn.execute("SELECT entry, url, html FROM cambridge;"):
        print(entry)
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        parts.append(record.result + "\n")
    output = "".join(parts)
    # Insert a \section heading before the first entry of each letter.
    for char in ascii_lowercase:
        output = output.replace(
            "\\begin{entry}{" + char,
            "\\section{" + char + "}\n\n\\begin{entry}{" + char,
            1)
    # NOTE(review): the original ended with ``string.replace("", "")`` -- a
    # no-op whose search text was evidently garbled in transit (it likely
    # stripped some special character).  Dropped here; restore the intended
    # replacement if the original source can be recovered.
    with open("./final.tex", "w", encoding="utf-8") as f:
        f.write(output)
-
复制代码 |
|