Inspecting TreeTagger models
The following Groovy script prints some information about a given TreeTagger model, e.g. version and dictionary sizes. Additionally, it extracts the dictionaries and tagset from the model and saves them to disk.
#!/usr/bin/env groovy
/**
 * SYNOPSIS: ttmodelinfo.groovy [file] [encoding]
 *
 * EXAMPLE: ./ttmodelinfo.groovy english-par-linux-3.2.bin.gz ISO-8859-1
 *
 * Gets information from a TreeTagger model. Dumps dictionaries to disk.
 */
@Grab ( group = 'org.annolab.tt4j' , module = 'org.annolab.tt4j' , version = '1.1.2' )
import static org . annolab . tt4j . TreeTaggerModelUtil .*;
// Script body: reads a TreeTagger model, prints its header and dictionary
// sizes, then writes the tags, lemmas and tokens to "<model path>.tags",
// "<model path>.lemmas" and "<model path>.tokens", one entry per line.

// Both positional arguments are read below, so fail early with a usage
// message instead of an ArrayIndexOutOfBoundsException when one is missing.
if (args.length < 2) {
    System.err.println "Usage: ttmodelinfo.groovy <file> <encoding>"
    System.exit(1)
}

def modelFile = new File(args[0])
def encoding = args[1]

// readModel is statically imported from TreeTaggerModelUtil (tt4j).
// NOTE(review): the encoding is presumably the charset of the strings stored
// in the model file -- confirm against the tt4j documentation.
def model = readModel(modelFile, encoding)

println "== Header =="
println " Source : ${model.source}"
println " Version : ${model.version}"
println " Encoding : ${encoding}"
println "== Dictionaries =="
println " Tags : ${model.tags.size()}"
println " Lemmas : ${model.lemmas.size()}"
println " Tokens : ${model.tokens.size()}"
println "Writing dictionaries may take a while..."

// Writes every entry of `entries` on its own line, UTF-8 encoded, to a file
// named after the model file plus `suffix`. Any previous file of that name is
// deleted first. `label` is only used for the progress message.
def dumpDictionary = { String label, String suffix, entries ->
    println "... writing ${label} ..."
    def target = new File(modelFile.getPath() + suffix)
    target.delete()
    target.withPrintWriter("UTF-8") { out ->
        entries.each { out.println "${it}" }
    }
}

dumpDictionary("tags", ".tags", model.tags)
dumpDictionary("lemmas", ".lemmas", model.lemmas)
dumpDictionary("tokens", ".tokens", model.tokens)

println "... done."