查看: 526|回复: 4
打印 上一主题 下一主题

[求助] 有没有合适的PYTHON抓字典的案例可参考?

[复制链接]
  • TA的每日心情

    2020-11-25 15:28
  • 签到天数: 86 天

    [LV.6]常住居民II

    1

    主题

    125

    回帖

    5490

    积分

    会元

    Rank: 7Rank: 7Rank: 7

    积分
    5490
    跳转到指定楼层
    1
    发表于 2020-9-13 15:19:03 | 只看该作者 回帖奖励 |倒序浏览 |阅读模式
    懂点 Python,想学学前人是怎么爬字典的。目前只看到 BT4BAIDU 大大的开源代码,写得很典范,可是代码已经有四五年了,网站似乎也改版了,具体细节的作用比较难揣摩。不知道还有没有其它比较好的案例可供参考学习?
  • TA的每日心情
    开心
    2023-2-17 08:38
  • 签到天数: 321 天

    [LV.8]以坛为家I

    4

    主题

    785

    回帖

    1万

    积分

    状元

    Rank: 9Rank: 9Rank: 9

    积分
    11494

    QQ 章

    3
    发表于 2020-9-13 22:19:26 | 只看该作者
    我也想知道这个。
  • TA的每日心情
    开心
    2023-2-17 08:38
  • 签到天数: 321 天

    [LV.8]以坛为家I

    4

    主题

    785

    回帖

    1万

    积分

    状元

    Rank: 9Rank: 9Rank: 9

    积分
    11494

    QQ 章

    4
    发表于 2020-9-13 22:19:38 | 只看该作者
    我也想知道这个。
  • TA的每日心情
    开心
    2021-1-4 22:53
  • 签到天数: 5 天

    [LV.2]偶尔看看I

    0

    主题

    19

    回帖

    258

    积分

    童生

    Rank: 2

    积分
    258
    5
    发表于 2021-1-4 02:18:57 | 只看该作者
    1. #!/usr/bin/env python3
    2. # -*- coding: utf-8 -*-

    3. import os
    4. import requests
    5. import re
    6. import time
    7. import sqlite3

    8. from string import ascii_lowercase

    9. from pathlib import Path
    10. from urllib.parse import urljoin
    11. from html.parser import HTMLParser
    12. from bs4 import BeautifulSoup

    13. webster_url = "https://www.merriam-webster.com/browse/dictionary/"  # not referenced by the code visible in this listing
    14. oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"  # not referenced by the code visible in this listing
    15. macmillan_url = "https://www.macmillandictionary.com/browse/british/"  # not referenced by the code visible in this listing
    16. cambridge_url ="https://dictionary.cambridge.org/browse/english/"  # used as the urljoin() base in CambridgeDictionaryScraper.start

    17. cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"  # browse index handed to start() by the __main__ block

    18. base_url = "https://dictionary.cambridge.org/search/direct/"  # not referenced by the code visible in this listing
    19. header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}  # passed as headers= to every requests.get() call below
    20. payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}  # not referenced by the code visible in this listing

    21. conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")  # module-level connection, opened at import time and shared by the scraper and the __main__ block

    22. conn.execute("""
    23. CREATE TABLE IF NOT EXISTS cambridge (
    24.     entry TEXT PRIMARY KEY
    25.                UNIQUE
    26.                NOT NULL,
    27.     url   TEXT NOT NULL,
    28.     html  TEXT NOT NULL,
    29.     latex TEXT
    30. );
    31. """)  # one row per dictionary entry, keyed by the entry text; the visible code only ever writes entry, url and html

    32. conn.commit()  # persist the schema before any scraping starts


    def latexify(string):
        """Return *string* with whitespace tidied and LaTeX special characters escaped.

        Runs of two or more whitespace characters are collapsed to one space,
        and whitespace standing directly before punctuation is removed (it is
        kept before word characters, quotes and the opening brackets ``(``,
        ``{`` and ``[``). The ten characters LaTeX reserves are then replaced
        by printable equivalents.

        Args:
            string: Plain text to prepare for inclusion in a LaTeX document.

        Returns:
            The escaped text; an empty string stays empty.
        """
        # The replacement is done in a single pass over the characters, so the
        # braces and backslashes introduced by one escape are never escaped
        # again by another.
        replacements = {
            "%": r"\%",
            "\\": r"\textbackslash{}",
            "$": r"\$",
            "#": r"\#",
            "&": r"\&",
            "{": r"\{",
            "}": r"\}",
            # ``\^`` is the circumflex *accent* command and would be placed on
            # top of the following letter, so use the text symbol instead.
            "^": r"\textasciicircum{}",
            "_": r"\_",
            "~": r"\textasciitilde{}",
        }
        collapsed = re.sub(r"\s{2,}", " ", string)
        tightened = re.sub(r"""\s+([^\w'"({\[])""", r"\1", collapsed)
        return "".join(replacements.get(char, char) for char in tightened)

    def latexify_html(beautifulsoup_object):
        """Return the LaTeX-escaped visible text of a BeautifulSoup node.

        Args:
            beautifulsoup_object: Any object with a ``get_text()`` method
                (a BeautifulSoup tag), or a plain string.

        Returns:
            For a tag: its text with newlines turned into spaces, whitespace
            runs collapsed, both ends stripped, and the result passed through
            ``latexify``. For an object without ``get_text`` (a plain string):
            ``latexify`` applied to it directly.
        """
        try:
            text = beautifulsoup_object.get_text()
        except AttributeError:
            # Not a tag; treat the argument itself as the text. Only the
            # missing method is handled here so that genuine errors raised
            # while escaping are not hidden.
            return latexify(beautifulsoup_object)
        flattened = re.sub(r"\s{2,}", " ", text.replace("\n", " "))
        return latexify(flattened.strip())


    class CambridgeDictionaryScraper:
        """Crawl Cambridge Dictionary browse pages and store each word page in SQLite.

        Downloaded pages are written to the ``cambridge`` table through the
        module-level ``conn``. Every insert is committed immediately, so an
        interrupted crawl can be resumed: URLs already in the table are loaded
        on construction and skipped.
        """

        # Deliberately a class attribute: all instances share one set of
        # already-downloaded URLs.
        url_set = set()

        # Seconds to wait for the server. Without a timeout a stalled
        # connection would block the crawl forever.
        request_timeout = 30

        def __init__(self):
            """Load the URLs already present in the database into ``url_set``."""
            self.url_set.update(row[0] for row in conn.execute("SELECT url FROM cambridge;"))
            print("Already Downloaded " + str(len(self.url_set)) + " Words!")

        def _fetch_soup(self, url):
            """Download *url* and return it parsed with ``html.parser``.

            Raises:
                requests.RequestException: On network failure, timeout, or an
                    HTTP error status.
            """
            response = requests.get(url, headers=header, timeout=self.request_timeout)
            # Fail loudly on 4xx/5xx so an error page is never mistaken for
            # dictionary content.
            response.raise_for_status()
            return BeautifulSoup(response.text, "html.parser")

        def get_word_page(self, url):
            """Return the HTML of the dictionary entry found at *url*.

            The entry container is looked up as ``div.entry``, then
            ``div.di-body``, then ``div#entryContent`` (beta words); when none
            is present the whole document is used. ``<script>`` elements are
            removed before serialising.

            Args:
                url: Address of a single word page.

            Returns:
                The entry container serialised as a string.

            Raises:
                requests.RequestException: If the page cannot be downloaded.
            """
            bs_obj = self._fetch_soup(url)
            entry_tag = (
                bs_obj.find("div", {"class": "entry"})
                or bs_obj.find("div", {"class": "di-body"})
                or bs_obj.find("div", {"id": "entryContent"})
                or bs_obj
            )
            for script_tag in entry_tag.find_all("script"):
                script_tag.extract()
            return str(entry_tag)

        def start(self, url):
            """Crawl every section linked from the browse index at *url*.

            Args:
                url: Address of a browse index page whose ``li.lpr-10`` items
                    link to the per-section listings.

            Raises:
                requests.RequestException: If a listing page cannot be
                    downloaded.
            """
            bs_obj = self._fetch_soup(url)
            for li_tag in bs_obj.select("li.lpr-10"):
                anchor = li_tag.a
                if anchor is None or not anchor.get("href"):
                    continue
                # Resolve against the page the link came from so that relative
                # links work for whichever dictionary is being crawled.
                child_url = urljoin(url, anchor.get("href"))
                print(child_url)
                self.find_child_entry(child_url)

        def find_child_entry(self, url):
            """Walk one listing page, descending into ranges and saving words.

            Items whose text contains ``...`` are treated as sub-ranges and
            crawled recursively; every other item is a word page, which is
            downloaded and inserted unless its URL is already known.

            Args:
                url: Address of a listing page containing ``li.t-i`` items.

            Raises:
                requests.RequestException: If a listing page cannot be
                    downloaded. A failure on an individual word page is
                    reported and skipped instead, so it is retried on the
                    next run.
            """
            bs_obj = self._fetch_soup(url)
            for li_tag in bs_obj.select("li.t-i"):
                anchor = li_tag.a
                if anchor is None or not anchor.get("href"):
                    continue
                child_url = urljoin(url, anchor.get("href")).strip()
                child_text = li_tag.get_text().strip()
                if "..." in child_text:
                    self.find_child_entry(child_url)
                    continue
                if child_url in self.url_set:
                    continue
                print(child_text + "\t" + child_url)
                try:
                    page_html = self.get_word_page(child_url)
                except requests.RequestException as error:
                    print("Skipped " + child_url + ": " + str(error))
                    continue
                try:
                    conn.execute(
                        "INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                        (child_text, child_url, page_html),
                    )
                except sqlite3.IntegrityError as error:
                    # ``entry`` is the primary key, so the same entry text
                    # reached through a second URL cannot be stored again.
                    print("Skipped " + child_url + ": " + str(error))
                    continue
                conn.commit()
                self.url_set.add(child_url)


    class CambridgeDictionaryExtractor:
        """Convert one stored Cambridge Dictionary entry page into LaTeX markup.

        The page HTML is parsed with BeautifulSoup and walked top-down
        (entry -> part of speech -> sense -> definition block); each level
        contributes its own LaTeX macro or environment.

        Attributes:
            entry: Headword, escaped with ``latexify``.
            entry_html: HTML of the entry page.
            result: LaTeX accumulated by ``extract``; empty until it is called.
        """

        def __init__(self, entry, entry_html=""):
            self.entry = latexify(entry)
            self.entry_html = entry_html
            self.result = ""

        def extract(self):
            """Append the LaTeX ``entry`` environment for this page to ``self.result``.

            Every ``div.entry-body__el`` becomes a ``Partofspeech`` environment.
            Idioms are read from the first ``div.idiom-block`` wrapper, which is
            expected to look like::

                <div class="pr idiom-block">
                    <div class="idiom-block"></div>
                </div>

            and each nested ``div.idiom-block`` becomes an ``idiom`` environment.
            """
            bs_obj = BeautifulSoup(self.entry_html, "html.parser")
            pieces = ["\\begin{entry}{" + self.entry + "}"]
            for part_of_speech_tag in bs_obj.find_all("div", {"class": "entry-body__el"}):
                pieces.append(
                    "\n\\begin{Partofspeech}\n"
                    + self.process_part_of_speech(part_of_speech_tag)
                    + "\n\\end{Partofspeech}"
                )
            idiom_wrapper = bs_obj.find("div", {"class": "idiom-block"})
            if idiom_wrapper is not None:
                for idiom_block in idiom_wrapper.find_all("div", {"class": "idiom-block"}):
                    pieces.append(
                        "\n\\begin{idiom}"
                        + self.process_idiom_block(idiom_block)
                        + "\n\\end{idiom}"
                    )
            pieces.append("\n\\end{entry}\n\n")
            self.result += "".join(pieces)

        def process_idiom_block(self, idiom_block):
            """Return the LaTeX for every ``div.dsense`` in the block's ``span.idiom-body``.

            Returns an empty string when the block has no ``span.idiom-body``.
            """
            idiom_body = idiom_block.find("span", {"class": "idiom-body"})
            if idiom_body is None:
                return ""
            return "".join(
                self.process_sense_tag(sense_tag)
                for sense_tag in idiom_body.find_all("div", {"class": "dsense"})
            )

        def get_smart_vocabulary(self, smart_vocabulary_tag):
            """Return a ``smartvocabulary`` environment with one ``\\smart`` per ``<li>``."""
            items = "".join(
                "\\smart{" + latexify_html(li_tag) + "}\n"
                for li_tag in smart_vocabulary_tag.find_all("li")
            )
            return "\n\\begin{smartvocabulary}\n" + items + "\\end{smartvocabulary}\n"

        def process_part_of_speech(self, part_of_speech_tag):
            """Return the LaTeX for one part-of-speech section.

            Expected structure::

                <div class="entry-body__el">
                    <div class="pos-header"></div>
                    <div class="pos-body"></div>
                    <div class="pr relativDiv"></div>
                </div>

            The header, the body and an optional ``div.pv-block`` are rendered
            in that order; missing parts are skipped. The result is stripped.
            """
            result = ""
            header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})
            if header_tag is not None:
                result += self.process_part_of_speech_header(header_tag)
            body_tag = part_of_speech_tag.find("div", {"class": "pos-body"})
            if body_tag is not None:
                result += self.process_part_of_speech_body(body_tag)
            pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})
            if pv_block_tag is not None:
                result += self.process_pv_block(pv_block_tag)
            return result.strip()

        def process_pv_block(self, tag):
            """Return the LaTeX for every ``div.sense-body`` inside a ``pv-block``.

            Expected structure::

                <div class="pv-block">
                    <div class="di-title"></div>
                    <span class="di-info"></span>
                    <span class="pv-body dpv-body">
                        <div class="pr dsense dsense-noh">
                    </span>
                </div>
            """
            return "".join(
                self.process_sense_body(item)
                for item in tag.find_all("div", {"class": "sense-body"})
            )

        def process_part_of_speech_header(self, header_tag):
            """Return grammar and pronunciation markup for a ``pos-header``.

            Renders the first ``div.posgram`` (if any) followed by every
            ``span.dpron-i``; the result is stripped.
            """
            result = ""
            posgram_tag = header_tag.find("div", {"class": "posgram"})
            if posgram_tag is not None:
                result += self.process_part_of_speech_grammar(posgram_tag)
            for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
                result += self.process_pronunciation(pronunciation_tag)
            return result.strip()

        def process_header_title(self, title_tag):
            """Return an ``\\entry{...}`` line for a title element.

            Uses the ``span.hw`` headword (for example
            ``<span class="hw dhw">record</span>``) when present, otherwise
            the text of the whole title element.
            """
            headword_tag = title_tag.find("span", {"class": "hw"})
            source_tag = headword_tag if headword_tag is not None else title_tag
            return "\\entry{" + latexify_html(source_tag) + "}\n"

        def process_part_of_speech_grammar(self, posgram_tag):
            """Return ``\\pos`` and ``\\posgram`` lines for a ``posgram`` element."""
            result = ""
            part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
            if part_of_speech_tag is not None:
                result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
            gram_tag = posgram_tag.find("span", {"class": "gc"})
            if gram_tag is not None:
                result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
            return result

        def process_pronunciation(self, pronunciation_tag):
            """Return IPA and audio markup for one pronunciation element.

            The IPA goes into ``\\ipaus`` when the element carries the class
            ``us`` and into ``\\ipauk`` otherwise. When an MPEG ``<source>``
            with a ``src`` is present, its absolute URL follows in a
            ``\\pronuniation`` macro.
            """
            css_classes = pronunciation_tag.get("class") or []
            ipa_macro = "\n\\ipaus{" if "us" in css_classes else "\n\\ipauk{"
            result = ""
            ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
            if ipa_tag is not None:
                result += ipa_macro + latexify_html(ipa_tag) + "}"
            audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})
            if audio_tag is not None and audio_tag.get("src"):
                audio_url = urljoin("https://dictionary.cambridge.org/", audio_tag.get("src"))
                # NOTE(review): the macro name is spelled "pronuniation"; it is
                # left as is because it presumably has to match the macro
                # defined in the LaTeX preamble - confirm before renaming.
                result += "\n\\pronuniation{" + audio_url + "}"
            return result

        def process_sense_head(self, tag):
            """Return ``\\shortmeaning{...}`` for a sense heading.

            The guide word is the text between the first ``(`` and the next
            ``)``. A missing ``(`` means the text is taken from its start, a
            missing ``)`` means it runs to the end.
            """
            text = latexify_html(tag)
            # str.find returns -1 when there is no "(", so adding one yields 0
            # and the slice starts at the first character.
            start = text.find("(") + 1
            end = text.find(")", start)
            if end == -1:
                end = len(text)
            return "\n\\shortmeaning{" + text[start:end].strip() + "}"

        def get_definition(self, tag):
            """Return the definition text of a ``ddef_h`` element.

            The text of every ``span.def-info`` (labels shown before the
            definition) comes first, then every ``div.def``, separated by
            single spaces.
            """
            parts = [latexify_html(info_tag) for info_tag in tag.select("span.def-info")]
            parts.extend(latexify_html(def_tag) for def_tag in tag.select("div.def"))
            return " ".join(part for part in parts if part)

        def process_def_block(self, tag):
            """Return the LaTeX for one definition block.

            Emits ``\\meaningen`` for the ``div.ddef_h`` definition,
            ``\\meaningcn`` for the first ``span.trans`` translation, and one
            ``\\example`` per ``span.eg``; absent parts are skipped.
            """
            result = ""
            def_tag = tag.find("div", {"class": "ddef_h"})
            if def_tag is not None:
                result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"
            translation_tags = tag.select("span.trans")
            if translation_tags:
                result += "\n\\meaningcn{" + latexify_html(translation_tags[0]) + "}"
            for example_tag in tag.select("span.eg"):
                result += "\n\\example{" + latexify_html(example_tag) + "}"
            return result

        def process_phrase_block(self, phrase_block_tag):
            """Return a ``phrase`` environment for one phrase block.

            Expected children::

                <div class="phrase-head dphrase_h">...</div>
                <div class="phrase-body dphrase_b">...</div>
                <div class="bb hax">...</div>

            The escaped ``span.phrase-title`` is the environment argument
            (empty when the title is missing); the block's definition blocks
            form the body.
            """
            title_tags = phrase_block_tag.select("span.phrase-title")
            title = latexify_html(title_tags[0]) if title_tags else ""
            body = "".join(
                self.process_def_block(def_block)
                for def_block in phrase_block_tag.select("div.def-block")
            )
            return "\n\\begin{phrase}{" + title + "}" + body + "\n\\end{phrase}\n"

        def process_sense_body(self, tag):
            """Return the LaTeX for a sense body.

            Expected children::

                <div class="pr phrase-block dphrase-block">...</div>
                <div class="def-block ddef_block">...</div>
                <div class="bb hax">...</div>

            Definition blocks that are not part of a phrase come first,
            followed by one ``phrase`` environment per phrase block.
            """
            phrase_blocks = tag.select("div.phrase-block")
            # Definitions inside a phrase block are rendered by that phrase's
            # environment, so leave them out of the plain definition list to
            # avoid emitting them twice.
            phrase_def_ids = {
                id(def_block)
                for phrase_block in phrase_blocks
                for def_block in phrase_block.select("div.def-block")
            }
            result = ""
            for def_block in tag.select("div.def-block"):
                if id(def_block) not in phrase_def_ids:
                    result += self.process_def_block(def_block)
            for phrase_block in phrase_blocks:
                result += self.process_phrase_block(phrase_block)
            return result

        def process_sense_tag(self, sense_tag):
            """Return the LaTeX for one sense.

            Expected children::

                <h3 class="dsense_h">...</h3>
                <div class="sense-body dsense_b">...</div>
                <div class="smartt daccord">...</div>   (Smart Vocabulary)
                <div class="bb hax">...</div>

            The heading, every sense body and every Smart Vocabulary box are
            rendered in that order.
            """
            result = ""
            sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
            if sense_head_tag is not None:
                result += self.process_sense_head(sense_head_tag)
            for sense_body_tag in sense_tag.select("div.sense-body"):
                result += self.process_sense_body(sense_body_tag)
            for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
                result += self.get_smart_vocabulary(smart_vocabulary_tag)
            return result

        def process_part_of_speech_body(self, body_tag):
            """Return the LaTeX for every ``div.dsense`` inside a ``pos-body``.

            Expected children::

                <div class="pr dsense">...</div>
                <div class="pr dsense">...</div>
            """
            return "".join(
                self.process_sense_tag(sense_tag)
                for sense_tag in body_tag.select("div.dsense")
            )

    if __name__ == "__main__":
        # Step 1: download every word page that is not yet in the database.
        CambridgeDictionaryScraper().start(cambridge_english_chinese_url)

        # Step 2: convert each stored page to LaTeX. The pieces are collected
        # in a list and joined once, which stays linear in the total size.
        records = []
        for entry, _url, html in conn.execute("SELECT entry, url, html FROM cambridge;"):
            print(entry)
            record = CambridgeDictionaryExtractor(entry, entry_html=html)
            record.extract()
            records.append(record.result + "\n")
        document = "".join(records)

        # Step 3: put a \section heading in front of the first entry that
        # starts with each lowercase letter (count=1 limits it to the first).
        for char in ascii_lowercase:
            document = document.replace(
                "\\begin{entry}{" + char,
                "\\section{" + char + "}\n\n\\begin{entry}{" + char,
                1,
            )

        # Step 4: open the output only once the text is complete, so a failure
        # in the steps above cannot leave an empty or truncated file behind.
        with open("./final.tex", "w", encoding="utf-8") as f:
            f.write(document)
    复制代码