|
推荐
楼主 |
发表于 2016-7-18 12:08:06
|
只看该作者
本帖最后由 fnaviwwo1 于 2016-7-19 16:52 编辑
用 Scala 和 Stanford 的 NLP 工具重写了一遍,生成的质量和上面的 Python/Nltk 各有千秋。
import java.io.File
import java.io.FileReader
import java.io.PrintWriter
import java.io.Reader
import java.io.StringReader

import scala.annotation.migration
import scala.collection.JavaConversions.asScalaBuffer
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.io.Source

import org.apache.lucene.morphology.english.EnglishLuceneMorphology

import edu.stanford.nlp.ling.CoreLabel
import edu.stanford.nlp.ling.HasWord
import edu.stanford.nlp.process.CoreLabelTokenFactory
import edu.stanford.nlp.process.Morphology
import edu.stanford.nlp.process.PTBTokenizer
import edu.stanford.nlp.tagger.maxent.MaxentTagger
object ConvDoc extends App {
  // Set to true for verbose per-sentence tracing on a small (20-sentence) sample.
  val DEBUG = false

  /**
   * Builds a vocabulary-list HTML report for one plain-text file.
   *
   * The file is sentence-split and POS-tagged with Stanford's MaxentTagger,
   * content words are lemmatized (comparatives/superlatives normalized via
   * Lucene morphology), stop words and proper nouns are dropped, and the
   * result is written to `<filename>.wordlist.htm`: every surviving headword
   * that occurs more than once and appears in `userdata/words.txt`, together
   * with one example sentence with the word highlighted.
   *
   * @param filename path of the plain-text input file
   * @param title    HTML fragment emitted at the top of the report
   */
  def run(filename: String, title: String) = {
    val luceneMorph = new EnglishLuceneMorphology()
    // Shortest normal form approximates the positive adjective/adverb
    // (e.g. "better" -> "good"). Fall back to the lowercased word itself
    // if Lucene yields no normal forms (avoids the original apply(0) crash).
    def getAdjOrAdv(w: String) =
      luceneMorph.getNormalForms(w.toLowerCase()).asScala
        .sortBy(_.length).headOption.getOrElse(w.toLowerCase())

    // Comparative/superlative POS tags; these need Lucene normalization
    // because Stanford's Morphology.lemma does not reduce them.
    val comp = Set("JJR", "JJS", "RBR", "RBS")
    val stopwords = Set("xi", "x", "_", "a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any", "are", "aren", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "ca", "can", "couldn", "d", "did", "didn", "do", "does", "doesn", "doing", "don", "down", "during", "each", "few", "for", "from", "further", "had", "hadn", "has", "hasn", "have", "haven", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "isn", "it", "its", "itself", "just", "ll", "m", "ma", "me", "mightn", "more", "most", "much", "mustn", "my", "myself", "needn", "no", "nor", "not", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan", "she", "should", "shouldn", "so", "some", "such", "t", "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "we", "were", "weren", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "wouldn", "y", "you", "your", "yours", "yourself", "yourselves")
    // Whitelist of headwords worth reporting (one word per line).
    val words = Source.fromFile("userdata/words.txt").getLines().toSet

    // invertible=true keeps original text + following whitespace on each
    // token so sentences can be reconstructed verbatim (see getSent).
    val ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true")
    val m = new Morphology()

    // Collapse the file to one line so sentence splitting is not confused
    // by hard line wraps.
    val text = Source.fromFile(filename).getLines.mkString(" ").replaceAll("(\\s)+", " ")
    val r: Reader = new StringReader(text)
    val ss = MaxentTagger.tokenizeText(r, ptbTokenizerFactory).asScala
    println(ss.length)

    // "english-left3words-distsim.tagger" is faster; the bidirectional
    // model is more accurate, so it is the one actually used.
    val model = "english-bidirectional-distsim.tagger"
    val tagger = new MaxentTagger(model)

    // lemma -> occurrences as (sentence index, surface form)
    val gg = new HashMap[String, ArrayBuffer[(Int, String)]]()
    // First letters of the POS tags we keep: verbs, nouns, adjectives,
    // adverbs, prepositions, modals, wh-words.
    val posT = Set('V', 'N', 'J', 'R', 'I', 'M', 'W')
    println()

    // Keep content words only: interesting POS, not a proper noun (NNP*),
    // not a stop word.
    def filterWord(word: String, tag: String) =
      posT.contains(tag(0)) && !tag.startsWith("NNP") && !stopwords.contains(word.toLowerCase())

    // Tag one sentence and return (lemma, surface form) pairs for the
    // words that pass the filter.
    def findWords(sent: java.util.List[HasWord]) = {
      val s = tagger.tagSentence(sent)
      if (DEBUG) println(s)
      s.asScala
        .filter(x => filterWord(x.word(), x.tag()))
        .map { x =>
          if (comp.contains(x.tag())) (getAdjOrAdv(x.word()), x.word())
          else (m.lemma(x.word(), x.tag()), x.word())
        }
    }

    // Reconstruct the original sentence text from an invertible token list.
    def getSent(xs: java.util.List[HasWord]) =
      xs.asScala.flatMap { x =>
        val o = x.asInstanceOf[CoreLabel]
        Seq(o.originalText(), o.after())
      }.mkString.trim().replaceAll("(\\s)+", " ")

    val limit = if (DEBUG) Math.min(20, ss.size) else ss.length
    for (i <- 0 until limit) {
      if (i % 100 == 1) println(i) // coarse progress indicator
      if (DEBUG) println("[" + getSent(ss(i)) + "]")
      val found = findWords(ss(i))
      if (DEBUG) println(found)
      found.foreach { case (lemma, surface) =>
        gg.getOrElseUpdate(lemma, new ArrayBuffer()) += ((i, surface))
      }
    }
    println(gg.keys.toList)
    // Spot check; .get so a text lacking the word does not crash the run.
    gg.get("behaviour").foreach(println)

    if (!DEBUG) {
      val out = new PrintWriter(new File(filename + ".wordlist.htm"))
      try {
        // Report only whitelisted headwords seen at least twice.
        val wordlist = gg.keys.map(x => (x, gg(x).length))
          .filter(_._2 > 1)
          .filter(x => words.contains(x._1))
          .toList.sortBy(_._1)
        println(wordlist.size)
        out.println("<html><body>")
        out.println(title)
        // Minimal HTML escaping; '&' must be replaced first so the other
        // substitutions are not double-escaped.
        val escape = (x: String) =>
          x.replaceAll("&", "&amp;").replaceAll("<", "&lt;").replaceAll(">", "&gt;")
        for (((headword, count), i) <- wordlist.zipWithIndex) {
          out.println(s"<h3>${i + 1} <font color=blue>$headword</font> (${count})</h3>")
          // First occurrence of this headword: (sentence index, surface form).
          // Safe: every reported headword has length > 1 occurrences.
          val t = gg(headword)(0)
          out.println("<p>")
          // Quote the surface form so regex metacharacters are literal,
          // then highlight its whole-word occurrences in the sentence.
          val quoted = java.util.regex.Pattern.quote(t._2)
          out.println(escape(getSent(ss(t._1)))
            .replaceAll(s"\\b($quoted)\\b", "<font color=red><b>$1</b></font>"))
          out.println("</p>")
        }
        out.println("</body></html>")
      } finally out.close() // always release the file handle
      println("ok")
    }
  }

  run("userdata/Pride and Prejudice.txt", "<h1>Pride and Prejudice</h1>")
}
复制代码
注:感觉nlp4j这个库的Morphology模块更符合需求啊...有个叫LanguageTool的库也很有趣...
附件为按照字典序的输出文件。 |
|