|
推荐
楼主 |
发表于 2014-9-15 13:20:51
|
只看该作者
本帖最后由 bt4baidu 于 2014-9-15 13:26 编辑
附XML->HTML转换用代码(需要逐词条转换,以免标签不匹配造成大范围影响)
# Regex alternation of every OALD XML element name that is rewritten into an
# HTML <span>.  It is spliced verbatim into the patterns below, so it must
# stay a plain "a|b|c" fragment.
tags = (
    'b|u|ab|alt|althead|arbd1|awlsym|block-g|bullet|'
    'cc|cf|chn|cl|clsym|cm|collsubhead|core_block|coresym|coresym2|crosssym|'
    'd|db|def_block|dhb|di|dr|dr-g|drsym|'
    'eb|ei-g|etym_tr|etym_i|etymology|etymsym|'
    'ff|fm|gi|gl|'
    'h|helpsym|h-g|hide|hm|hs|hsrch|'
    'id|id-g|ids-g|idsym|idsym-g|if|if-g|ifs-g|infl|inflection|ipa_block|'
    'label-g|lang|ndv|n-g|notesym|np|'
    'opp|opp-g|oppsym|'
    'para|patterns|p-g|ph|pos|posschn|pron-g|pv|pv-g|pvs-g|pvsym|pvsym-g|pvsymopp|'
    'r|ref|revout|'
    'schn|sd|sd-g|sense-g|side-g|side-panel|small_coresym|sn|subhead|subject|swung-dash|syn|synsep|synsym|'
    'tab|ticksym|title|top-g|tx|'
    'unbox|unei|unfm|ungi|unsyn|unx|unxh|'
    'v|vp|vphdr|vs|vs-g|'
    'wb|wd|wf-g|wfw|wfw-g|wx|'
    'x|x_in_cl_g|x-g|xh|xhm|xp|xr|xr-g|xs|xsym|xw|'
    'y|z|z_ab|z_ei-g|z_g|z_g_er|z_g_it|z_gr|z_gr_br|z_il|z_n|z_p|z_p_in_p|z_p_in_p-g|z_phon-gb|z_phon-us|z_r|z_r_er|z_s|z_side_panel|z_synsep2|z_wfp|z_xr|zd|z_core_h'
)

# Matches a complete "<tag ...>...</tag>" pair (case-insensitive).
# Group 1 is the tag name; group 2 is everything after the name up to, but not
# including, the closing tag: attributes, ">", and the non-greedy content.
# The attribute part must not end in "/", so "<tag .../>" is not treated as
# an opening tag.  No DOTALL flag is set, so the content cannot span lines.
g_p = re.compile(r'<(' + tags + r')((?:\s[^>]*[^\/])?\s*>.*?)</\1>', re.I)

# Matches a lone opening "<tag ...>" (no closing tag required) so that it can
# be closed with "</span>".  Groups mirror g_p, minus the element content.
g_q = re.compile(r'<(' + tags + r')((?:\s[^>]*[^\/])?\s*>)', re.I)
def reptag(m):
    """Return the ``<span>`` replacement for one matched XML tag.

    Args:
        m: A regex match whose group 1 is the tag name and whose group 2 is
            the text that followed the name in the source (it is emitted
            unchanged right after the generated ``class`` attribute).

    Returns:
        ``<span class="NAME_o"`` + group 2 + ``</span>``, where NAME is the
        tag name with every ``-`` turned into ``_``.
    """
    tag_name, remainder = m.group(1, 2)
    css_class = tag_name.replace('-', '_') + '_o'
    return '<span class="%s"%s</span>' % (css_class, remainder)
# XML->HTML
def fmtOALD(key, text):
    """Convert one dictionary entry from OALD XML markup to HTML spans.

    Args:
        key: Not used by this function.
        text: The XML text of a single entry.

    Returns:
        The text with every tag matched by ``g_p`` / ``g_q`` rewritten by
        ``reptag``.
    """
    # Rename the existing class attribute on <pos>; presumably this avoids a
    # duplicate "class" once reptag adds its own -- confirm against the CSS.
    text = text.replace('<pos class="', '<pos c_="')
    # Normalise "/  >" to "/>" so the tag patterns see self-closing tags
    # in a single form.
    text = re.sub(r'\/\s+>', r'/>', text)
    # First rewrite complete <tag>...</tag> pairs, then any opening tags that
    # are left over.  Each pattern is re-applied until it no longer matches,
    # because a replacement can expose further (nested) tags.
    for pattern in (g_p, g_q):
        while True:
            text, hits = pattern.subn(reptag, text)
            if not hits:
                break
    return text
# CSS
def convertCSS(src='D:/OALD8.css', dst='D:/OALD8_cvt.css', tag_pattern=None):
    """Rewrite element selectors in a stylesheet to the ``.name_o`` classes.

    Works line by line: on every line that contains ``{``, the text before
    the first ``{`` is treated as the selector list.  Each bare tag name in
    it becomes ``.name_o``, and a hyphen inside such a class name is turned
    into ``_``.  Lines without ``{`` (blank lines, declarations, ``}``) are
    copied through unchanged, so selectors that continue onto a following
    line are not converted.

    Args:
        src: Path of the stylesheet to read.
        dst: Path of the converted stylesheet to write.
        tag_pattern: Regex alternation of tag names (``"a|b|c"``).  Defaults
            to the module-level ``tags``.
    """
    if tag_pattern is None:
        tag_pattern = tags
    # A tag name counts only when it stands alone: not preceded by a word
    # character, "-", "." or "[", and not followed by a word character, "-"
    # or "]" -- so existing classes and attribute selectors are left alone.
    tag_re = re.compile(''.join([r'(^|[^\w\-\.\[])(', tag_pattern, r')(?=[^\w\-\]]|$)']))
    # ".foo-bar_o" -> ".foo_bar_o"
    hyphen_re = re.compile(r'(\.\w+)\-(\w+_o)')

    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11 and
    # universal newlines are already the text-mode default on Python 3.
    with open(src, 'r') as f:
        lines = f.read().split('\n')

    out = []
    for ln in lines:
        # partition() never raises, unlike unpacking split('{'), which failed
        # on lines with zero or several braces.
        selector, brace, rest = ln.partition('{')
        if not brace:
            out.append(ln)
            continue
        selector = tag_re.sub(r'\1.\2_o', selector)
        selector = hyphen_re.sub(r'\1_\2', selector)
        out.append(''.join([selector, brace, rest]))

    with open(dst, 'w') as f:
        f.write('\n'.join(out))
复制代码
其实本来也可以这样转HTML,不过原数据里已经有<a> <table> <span>等标签,不易区分,只好作罢。
# Generic variant of g_p / g_q that accepts ANY tag name instead of the fixed
# list.  The post notes it was abandoned because the source data already
# contains real HTML tags (<a>, <table>, <span>, ...) that this would also
# rewrite.
# Tag name: word characters, optionally with inner hyphens.
_ANY_TAG = r'<(\w+[\w\-]*\w+|\w+)'
g_p = re.compile(_ANY_TAG + r'((?:\s[^>]*[^\/])?\s*>.*?)</\1>', re.I)
g_q = re.compile(_ANY_TAG + r'((?:\s[^>]*[^\/])?\s*>)', re.I)
复制代码 |
|