The following Groovy script prints some information about a given TreeTagger model, e.g. version and dictionary sizes. Additionally, it extracts the dictionaries and tagset from the model and saves them to disk.
#!/usr/bin/env groovy
/**
* SYNOPSIS: ttmodelinfo.groovy [file] [encoding]
*
* EXAMPLE: ./ttmodelinfo.groovy english-par-linux-3.2.bin.gz ISO-8859-1
*
* Gets information from a TreeTagger model. Dumps dictionaries to disk.
*/
@Grab(group='org.annolab.tt4j', module='org.annolab.tt4j', version='1.1.2')
import static org.annolab.tt4j.TreeTaggerModelUtil.*;
// Both positional arguments are required. Without this guard a missing
// argument surfaces as a bare ArrayIndexOutOfBoundsException from args[n].
if (args.length < 2) {
    System.err.println "Usage: ttmodelinfo.groovy [file] [encoding]"
    System.exit(1)
}

// args[0]: path to the model file; args[1]: encoding name handed to readModel.
def modelFile = new File(args[0])
// readModel is statically imported from TreeTaggerModelUtil.
def model = readModel(modelFile, args[1])

// Output files are created next to the model file: its path plus a
// ".tags" / ".lemmas" / ".tokens" suffix.
def tags = new File(modelFile.getPath() + ".tags")
def lemmas = new File(modelFile.getPath() + ".lemmas")
def tokens = new File(modelFile.getPath() + ".tokens")
// Prints one " <label> : <value>" line of the summary report.
def printRow = { String label, value -> println " ${label} : ${value}" }

// Model header: source and version come from the model, the encoding is the
// one given on the command line.
println "== Header =="
printRow('Source', model.source)
printRow('Version', model.version)
printRow('Encoding', args[1])

// Number of entries in each dictionary held by the model.
println "== Dictionaries =="
printRow('Tags', model.tags.size())
printRow('Lemmas', model.lemmas.size())
printRow('Tokens', model.tokens.size())
// Writes every entry of one dictionary to `target`, one entry per line, as UTF-8.
//   label   - dictionary name shown in the progress message
//   target  - output file; removed first, then (re)created by withPrintWriter
//   entries - the dictionary's entries, each rendered via string interpolation
def writeDictionary = { String label, File target, entries ->
    println "... writing ${label} ..."
    // Kept from the original script: drop any previous dump before writing.
    // The boolean result is deliberately ignored (the file may not exist yet).
    target.delete()
    target.withPrintWriter("UTF-8") { out ->
        entries.each { entry -> out.println "${entry}" }
    }
}

println "Writing dictionaries may take a while..."
writeDictionary('tags', tags, model.tags)
writeDictionary('lemmas', lemmas, model.lemmas)
writeDictionary('tokens', tokens, model.tokens)
println "... done."