|
推荐
楼主 |
发表于 2016-7-18 12:08:06
|
只看该作者
本帖最后由 fnaviwwo1 于 2016-7-19 16:52 编辑
用 Scala 和 Stanford 的 NLP 工具重写了一遍,生成的质量和上面的 Python/Nltk 各有千秋。
import java.io.File
import java.io.FileReader
import java.io.PrintWriter
import java.io.Reader
import java.io.StringReader

import scala.annotation.migration
import scala.collection.JavaConversions.asScalaBuffer
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.io.Source

import org.apache.lucene.morphology.english.EnglishLuceneMorphology

import edu.stanford.nlp.ling.CoreLabel
import edu.stanford.nlp.ling.HasWord
import edu.stanford.nlp.process.CoreLabelTokenFactory
import edu.stanford.nlp.process.Morphology
import edu.stanford.nlp.process.PTBTokenizer
import edu.stanford.nlp.tagger.maxent.MaxentTagger
object ConvDoc extends App {
  // Set to true for verbose per-sentence tracing on a small (20-sentence) sample.
  val DEBUG = false

  /**
   * Builds a vocabulary-list HTML report for one plain-text file.
   *
   * The file is sentence-split and POS-tagged with Stanford's MaxentTagger,
   * content words are lemmatized (comparatives/superlatives normalized via
   * Lucene morphology), stop words and proper nouns are dropped, and the
   * result is written to `<filename>.wordlist.htm`: every surviving headword
   * that occurs more than once and appears in `userdata/words.txt`, together
   * with one example sentence with the word highlighted.
   *
   * @param filename path of the plain-text input file
   * @param title    HTML fragment emitted at the top of the report
   */
  def run(filename: String, title: String) = {
    val luceneMorph = new EnglishLuceneMorphology()
    // Shortest normal form approximates the positive adjective/adverb
    // (e.g. "better" -> "good"). Fall back to the lowercased word itself
    // if Lucene yields no normal forms (avoids the original apply(0) crash).
    def getAdjOrAdv(w: String) =
      luceneMorph.getNormalForms(w.toLowerCase()).asScala
        .sortBy(_.length).headOption.getOrElse(w.toLowerCase())

    // Comparative/superlative POS tags; these need Lucene normalization
    // because Stanford's Morphology.lemma does not reduce them.
    val comp = Set("JJR", "JJS", "RBR", "RBS")
    val stopwords = Set("xi", "x", "_", "a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any", "are", "aren", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "ca", "can", "couldn", "d", "did", "didn", "do", "does", "doesn", "doing", "don", "down", "during", "each", "few", "for", "from", "further", "had", "hadn", "has", "hasn", "have", "haven", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "isn", "it", "its", "itself", "just", "ll", "m", "ma", "me", "mightn", "more", "most", "much", "mustn", "my", "myself", "needn", "no", "nor", "not", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan", "she", "should", "shouldn", "so", "some", "such", "t", "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "we", "were", "weren", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "wouldn", "y", "you", "your", "yours", "yourself", "yourselves")
    // Whitelist of headwords worth reporting (one word per line).
    val words = Source.fromFile("userdata/words.txt").getLines().toSet

    // invertible=true keeps original text + following whitespace on each
    // token so sentences can be reconstructed verbatim (see getSent).
    val ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true")
    val m = new Morphology()

    // Collapse the file to one line so sentence splitting is not confused
    // by hard line wraps.
    val text = Source.fromFile(filename).getLines.mkString(" ").replaceAll("(\\s)+", " ")
    val r: Reader = new StringReader(text)
    val ss = MaxentTagger.tokenizeText(r, ptbTokenizerFactory).asScala
    println(ss.length)

    // "english-left3words-distsim.tagger" is faster; the bidirectional
    // model is more accurate, so it is the one actually used.
    val model = "english-bidirectional-distsim.tagger"
    val tagger = new MaxentTagger(model)

    // lemma -> occurrences as (sentence index, surface form)
    val gg = new HashMap[String, ArrayBuffer[(Int, String)]]()
    // First letters of the POS tags we keep: verbs, nouns, adjectives,
    // adverbs, prepositions, modals, wh-words.
    val posT = Set('V', 'N', 'J', 'R', 'I', 'M', 'W')
    println()

    // Keep content words only: interesting POS, not a proper noun (NNP*),
    // not a stop word.
    def filterWord(word: String, tag: String) =
      posT.contains(tag(0)) && !tag.startsWith("NNP") && !stopwords.contains(word.toLowerCase())

    // Tag one sentence and return (lemma, surface form) pairs for the
    // words that pass the filter.
    def findWords(sent: java.util.List[HasWord]) = {
      val s = tagger.tagSentence(sent)
      if (DEBUG) println(s)
      s.asScala
        .filter(x => filterWord(x.word(), x.tag()))
        .map { x =>
          if (comp.contains(x.tag())) (getAdjOrAdv(x.word()), x.word())
          else (m.lemma(x.word(), x.tag()), x.word())
        }
    }

    // Reconstruct the original sentence text from an invertible token list.
    def getSent(xs: java.util.List[HasWord]) =
      xs.asScala.flatMap { x =>
        val o = x.asInstanceOf[CoreLabel]
        Seq(o.originalText(), o.after())
      }.mkString.trim().replaceAll("(\\s)+", " ")

    val limit = if (DEBUG) Math.min(20, ss.size) else ss.length
    for (i <- 0 until limit) {
      if (i % 100 == 1) println(i) // coarse progress indicator
      if (DEBUG) println("[" + getSent(ss(i)) + "]")
      val found = findWords(ss(i))
      if (DEBUG) println(found)
      found.foreach { case (lemma, surface) =>
        gg.getOrElseUpdate(lemma, new ArrayBuffer()) += ((i, surface))
      }
    }
    println(gg.keys.toList)
    // Spot check; .get so a text lacking the word does not crash the run.
    gg.get("behaviour").foreach(println)

    if (!DEBUG) {
      val out = new PrintWriter(new File(filename + ".wordlist.htm"))
      try {
        // Report only whitelisted headwords seen at least twice.
        val wordlist = gg.keys.map(x => (x, gg(x).length))
          .filter(_._2 > 1)
          .filter(x => words.contains(x._1))
          .toList.sortBy(_._1)
        println(wordlist.size)
        out.println("<html><body>")
        out.println(title)
        // Minimal HTML escaping; '&' must be replaced first so the other
        // substitutions are not double-escaped.
        val escape = (x: String) =>
          x.replaceAll("&", "&amp;").replaceAll("<", "&lt;").replaceAll(">", "&gt;")
        for (((headword, count), i) <- wordlist.zipWithIndex) {
          out.println(s"<h3>${i + 1} <font color=blue>$headword</font> (${count})</h3>")
          // First occurrence of this headword: (sentence index, surface form).
          // Safe: every reported headword has length > 1 occurrences.
          val t = gg(headword)(0)
          out.println("<p>")
          // Quote the surface form so regex metacharacters are literal,
          // then highlight its whole-word occurrences in the sentence.
          val quoted = java.util.regex.Pattern.quote(t._2)
          out.println(escape(getSent(ss(t._1)))
            .replaceAll(s"\\b($quoted)\\b", "<font color=red><b>$1</b></font>"))
          out.println("</p>")
        }
        out.println("</body></html>")
      } finally out.close() // always release the file handle
      println("ok")
    }
  }

  run("userdata/Pride and Prejudice.txt", "<h1>Pride and Prejudice</h1>")
}
复制代码
注:感觉nlp4j这个库的Morphology模块更符合需求啊...有个叫LanguageTool的库也很有趣...
附件为按照字典序的输出文件。 |
|