Skip to content

Commit

Permalink
project refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
remstef committed Sep 1, 2016
1 parent 49d8859 commit 9344791
Show file tree
Hide file tree
Showing 46 changed files with 622 additions and 232 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@

**/.cache-main
**/.cache-tests
**/.pydevproject
38 changes: 10 additions & 28 deletions lt.lm/pom.xml
Original file line number Diff line number Diff line change
@@ -1,29 +1,17 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>de.tudarmstadt</groupId>

<parent>
<groupId>de.tudarmstadt</groupId>
<artifactId>lt.kd-suite</artifactId>
<version>0.7.0</version>
</parent>

<artifactId>lt.lm</artifactId>
<version>0.4.1h</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<repositories>
<repository>
<id>lt.lm-local-repository</id>
<url>file://${project.basedir}/repo</url>
</repository>
</repositories>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.4</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
Expand Down Expand Up @@ -108,16 +96,10 @@
<artifactId>berkeleylm</artifactId>
<version>1.1.6</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version><!--$NO-MVN-MAN-VER$ -->
<scope>test</scope>
</dependency>
<dependency>
<groupId>de.tudarmstadt</groupId>
<artifactId>lt.utilities</artifactId>
<version>0.3.7</version>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
Expand All @@ -142,7 +124,7 @@
<dependency>
<groupId>de.tudarmstadt</groupId>
<artifactId>lt.seg</artifactId>
<version>0.5.1</version>
<version>${project.version}</version>
</dependency>
<!-- <dependency>
<groupId>org.codehaus.janino</groupId>
Expand Down
22 changes: 14 additions & 8 deletions lt.lm/src/main/java/de/tudarmstadt/lt/lm/app/PerpDoc.java
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,8 @@ void run(Reader r) {
_oov_terms = 0;
_num_ngrams = 0;
long l = 0;
String ts = null;
String s = null;
for(LineIterator liter = new LineIterator(r); liter.hasNext(); ){
if(++l % 5000 == 0)
LOG.info("{}: processing line {}.", _rmi_string, l);
Expand All @@ -210,30 +212,34 @@ void run(Reader r) {
String[] splits = line.split("\t");
if(splits.length < 3)
continue;
if(docid == null)
if(docid == null){
docid = splits[2];
ts = splits[0];
}

if(!splits[2].equals(docid)){
double perplexity = _perplexity_doc.get();
if(perplexity > _max_perp)
_max_perp = perplexity;
if(perplexity < _min_perp)
_min_perp = perplexity;
String o = String.format("%s\t%s\tPerplexity: %6.3e \tMax: %6.3e \tMin: %6.3e \tngrams: %d \tOov-terms: %d \tOov-ngrams: %d",
_rmi_string, docid, perplexity, _max_perp, _min_perp,
String o = String.format("%s\t%s\t%s\tPerplexity: %6.3e \tMax: %6.3e \tMin: %6.3e \tngrams: %d \tOov-terms: %d \tOov-ngrams: %d",
_rmi_string, ts, docid, perplexity, _max_perp, _min_perp,
_num_ngrams, _oov_terms, _oov_ngrams);
LOG.info(o);
if(!_quiet)
write(String.format("%s%n", o));
else
write(String.format("%s\t%s\t%6.3e%n", _rmi_string, docid, perplexity));
write(String.format("%s\t%s\t%s\t%6.3e%n", _rmi_string, ts, docid, perplexity));
_perplexity_doc.reset();
docid = splits[2];
ts = splits[0];
}

s = splits[1];
List<String>[] ngrams;
try {
ngrams = _lm_prvdr.getNgrams(line);
ngrams = _lm_prvdr.getNgrams(s);
if(ngrams == null || ngrams.length == 0)
continue;
} catch (Exception e) {
Expand Down Expand Up @@ -268,14 +274,14 @@ void run(Reader r) {
_max_perp = perplexity;
if(perplexity < _min_perp)
_min_perp = perplexity;
String o = String.format("%s\t%s\tPerplexity: %6.3e \tMax: %6.3e \tMin: %6.3e \tngrams: %d \tOov-terms: %d \tOov-ngrams: %d",
_rmi_string, docid, perplexity, _max_perp, _min_perp,
String o = String.format("%s\t%s\t%s\tPerplexity: %6.3e \tMax: %6.3e \tMin: %6.3e \tngrams: %d \tOov-terms: %d \tOov-ngrams: %d",
_rmi_string, ts, docid, perplexity, _max_perp, _min_perp,
_num_ngrams, _oov_terms, _oov_ngrams);
LOG.info(o);
if(!_quiet)
write(String.format("%s%n", o));
else
write(String.format("%s\t%s\t%6.3e%n", _rmi_string, docid, _perplexity_doc.get()));
write(String.format("%s\t%s\t%s\t%6.3e%n", _rmi_string, ts, docid, _perplexity_doc.get()));
}

}
Expand Down
46 changes: 35 additions & 11 deletions lt.lm/src/main/java/de/tudarmstadt/lt/lm/app/PerplexityClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ public PerplexityClient(String args[]) {
opts.addOption(OptionBuilder.withLongOpt("port").withArgName("port-number").hasArg().withDescription(String.format("Specifies the port on which the rmi registry listens (default: %d).", Registry.REGISTRY_PORT)).create("p"));
opts.addOption(OptionBuilder.withLongOpt("selftest").withDescription("Run a selftest, compute perplexity of ngrams in specified LM.").create("s"));
opts.addOption(OptionBuilder.withLongOpt("quiet").withDescription("Run with minimum outout on stdout.").create("q"));
opts.addOption(OptionBuilder.withLongOpt("noov").hasOptionalArg().withArgName("{true|false}").withDescription("Do not consider oov terms, i.e. ngrams that end in an oov term. (default: false)").create());
opts.addOption(OptionBuilder.withLongOpt("skipoov").hasOptionalArg().withArgName("{true|false}").withDescription("Do not consider oov terms, i.e. ngrams that end in an oov term. (default: false)").create());
opts.addOption(OptionBuilder.withLongOpt("skipoovreflm").hasOptionalArg().withArgName("{true|false}").withDescription("Do not consider oov terms regarding the oovreflm, i.e. ngrams that end in an oov term. (default: false)").create());
opts.addOption(OptionBuilder.withLongOpt("oovreflm").withArgName("identifier").hasArg().withDescription("Do not consider oov terms with respect to the provided lm, i.e. ngrams that end in an oov term in the referenced lm. (default use current lm)").create());
opts.addOption(OptionBuilder.withLongOpt("host").withArgName("hostname").hasArg().withDescription("Specifies the hostname on which the rmi registry listens (default: localhost).").create("h"));
opts.addOption(OptionBuilder.withLongOpt("file").withArgName("name").hasArg().withDescription("Specify the file or directory that contains '.txt' files that are used as source for testing perplexity with the specified language model. Specify '-' to pipe from stdin. (default: '-').").create("f"));
Expand All @@ -89,9 +90,13 @@ public PerplexityClient(String args[]) {
_host = cmd.getOptionValue("host", "localhost");
_selftest = cmd.hasOption("selftest");
_quiet = cmd.hasOption("quiet");
_no_oov = cmd.hasOption("noov");
if(_no_oov && cmd.getOptionValue("noov") != null)
_no_oov = Boolean.parseBoolean(cmd.getOptionValue("noov"));
_no_oov = cmd.hasOption("skipoov");
if(_no_oov && cmd.getOptionValue("skipoov") != null)
_no_oov = Boolean.parseBoolean(cmd.getOptionValue("skipoov"));
_no_oov_reflm = cmd.hasOption("skipoovreflm");
if(_no_oov_reflm && cmd.getOptionValue("skipoovreflm") != null)
_no_oov_reflm = Boolean.parseBoolean(cmd.getOptionValue("skipoovreflm"));

_one_ngram_per_line = cmd.hasOption("one_ngram_per_line");
if(_one_ngram_per_line && cmd.getOptionValue("one_ngram_per_line") != null)
_one_ngram_per_line = Boolean.parseBoolean(cmd.getOptionValue("one_ngram_per_line"));
Expand All @@ -114,6 +119,7 @@ public PerplexityClient(String args[]) {
boolean _selftest;
boolean _quiet;
boolean _no_oov;
boolean _no_oov_reflm;
boolean _one_ngram_per_line;
PrintStream _pout;

Expand All @@ -126,6 +132,8 @@ public PerplexityClient(String args[]) {
ModelPerplexity<String> _perplexity_all = null;
ModelPerplexity<String> _perplexity_file = null;

long _oovreflm_oov_terms = 0;
long _oovreflm_oov_ngrams = 0;
long _oov_terms = 0;
long _oov_ngrams = 0;
long _num_ngrams = 0;
Expand Down Expand Up @@ -192,22 +200,30 @@ public boolean accept(File f) {
try{ run(new InputStreamReader(new FileInputStream(f), "UTF-8")); }catch(Exception e){LOG.error("{}: Could not compute perplexity from file '{}'.", _rmi_string, f.getAbsolutePath(), e);}
String o = String.format("%s: (intermediate results) \t %s \tPerplexity (file): %6.3e \tPerplexity (cum): %6.3e \tMax: log_10(p(%s))=%6.3e \tMin: log_10(p(%s))=%6.3e \tngrams (cum): %d \tOov-terms (cum): %d \tOov-ngrams (cum): %d",
_rmi_string, f.getAbsoluteFile(), _perplexity_file.get(), _perplexity_all.get(), _max_ngram, _max_prob, _min_ngram, _min_prob,
_num_ngrams, _oov_terms, _oov_ngrams);
_num_ngrams, _oovreflm_oov_terms, _oovreflm_oov_ngrams);
LOG.info(o);
if(!_quiet)
write(String.format("%s%n", o));
}
}
}

String o = String.format("%s\t%s\tPerplexity: %6.3e \tMax: log_10(p(%s))=%6.3e \tMin: log_10(p(%s))=%6.3e \tngrams: %d \tOov-terms: %d \tOov-ngrams: %d",
String o = String.format("%s\t%s\tPerplexity: %6.3e \tMax: log_10(p(%s))=%6.3e \tMin: log_10(p(%s))=%6.3e \tngrams: %d \toov-handling: %s \tOov-terms: %d \tOov-ngrams: %d \toov-reflm-handling: %s \tOov-reflm-terms: %d \tOov-reflm-ngrams: %d",
_rmi_string, _file, _perplexity_all.get(), _max_ngram, _max_prob, _min_ngram, _min_prob,
_num_ngrams, _oov_terms, _oov_ngrams);
_num_ngrams,
_no_oov ? "oov excluded" : "oov included", _oov_terms, _oov_ngrams,
_no_oov_reflm ? "oov-reflm excluded" : "oov-reflm included", _oovreflm_oov_terms, _oovreflm_oov_ngrams);
LOG.info(o);
if(!_quiet)
write(String.format("%s%n", o));
else
write(String.format("%s\t%s\t%6.3e%n", _rmi_string, _file, _perplexity_all.get()));
write(String.format("%s\t%s\t%6.3e\t%d\t%s\t%d\t%d\t%s\t%d\t%d%n", _rmi_string, _file, _perplexity_all.get(), _num_ngrams,
_no_oov ? "oov excluded" : "oov included",
_oov_ngrams,
_oov_terms,
_no_oov_reflm ? "oov-reflm excluded" : "oov-reflm included",
_oovreflm_oov_ngrams,
_oovreflm_oov_terms));
}

@SuppressWarnings("unchecked")
Expand Down Expand Up @@ -239,12 +255,20 @@ void run(Reader r) {
continue;
_num_ngrams++;
try{
boolean oov = false;
if(_lm_prvdr.ngramContainsOOV(ngram)){
_oov_ngrams++;
if(_lm_prvdr.ngramEndsWithOOV(ngram)){
_oov_terms++;
oov = true;
}
}

if(_lm_prvdr_oovref.ngramContainsOOV(ngram)){
_oov_ngrams++;
_oovreflm_oov_ngrams++;
if(_lm_prvdr_oovref.ngramEndsWithOOV(ngram)){
_oov_terms++;
if(_no_oov)
_oovreflm_oov_terms++;
if(_no_oov_reflm || (_no_oov && oov))
continue;
}
}
Expand Down
15 changes: 11 additions & 4 deletions lt.lm/src/main/sh/lm
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ cp=${cp:1} # remove heading colon

# skip all -D.. and -X.. parameters before the actual main class and add them later to JAVA_OPTS
DX=''
while [[ "${1}" == -D* || "${1}" == -X* ]]; do
DX="${DX} \"${1}\""
while [[ $1 == -D* || $1 == -X* ]]; do
DX="$DX $1"
shift
done

Expand All @@ -75,7 +75,8 @@ if [[ ! $JAVA_OPTS == *"-Dproject.properties="* && -e ${lmhome}/project.properti
# add logback.xml if not already in JAVA_OPTS
if [[ ! $JAVA_OPTS == *"-Dlogback.configurationFile="* && -e ${lmhome}/logback.xml ]]; then JAVA_OPTS="$JAVA_OPTS -Dlogback.configurationFile=\"${lmhome}/logback.xml\"" ; fi
# add Xmx and Xms if not set
if [[ ! $JAVA_OPTS == *"-Xmx"* && ! $JAVA_OPTS == *"-Xms"* ]]; then JAVA_OPTS="$JAVA_OPTS -Xmx2g -Xms2g" ; fi
if [[ ! $JAVA_OPTS == *"-Xmx"* ]]; then JAVA_OPTS="$JAVA_OPTS -Xmx2g" ; fi
if [[ ! $JAVA_OPTS == *"-Xms"* ]]; then JAVA_OPTS="$JAVA_OPTS -Xms2g" ; fi

# try to find JAVA_HOME and set it accordingly
if [ -z ${JAVA_HOME} ]; then
Expand Down Expand Up @@ -110,7 +111,13 @@ fi
if [ $test_var == 'y' -o $test_var == 'Y' ]
then
echo "Start: `date`." >&2
eval "time ${command}"
if [ $(which rlwrap) ]; then
echo "found rlwrap" >&2
eval "time rlwrap ${command}"
else
echo "rlwrap not found." >&2
eval "time ${command}"
fi
echo "Finished: `date`." >&2
else
echo "Command execution cancelled." >&2
Expand Down
14 changes: 14 additions & 0 deletions lt.lm/src/main/sh/lm-nightly
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
##
#
# Copy this file into your PATH
#
# Locates an 'lm' launcher script below a 'dist/' directory inside the
# lt.lm module's 'target' folder and runs it, forwarding all command line
# arguments unchanged.
#
# The suite checkout is read from KD_SUITE_HOME; when that variable is unset
# or empty, ${HOME}/git/lt.kd is used instead.
#
##

# ':-' covers both the unset and the empty case without an unquoted test.
kdhome="${KD_SUITE_HOME:-${HOME}/git/lt.kd}"

lmsrc="${kdhome}/lt.lm"

# First regular file named 'lm' whose path contains 'dist/'.
tgt=$(find "${lmsrc}/target" -type f -name "lm" | grep "dist/" | head -n1)

# Without a launcher there is nothing to run; fail loudly instead of
# accidentally executing the user's arguments as a command.
if [[ -z "${tgt}" ]]; then
  echo "lm-nightly: no 'lm' script found below '${lmsrc}/target'." >&2
  exit 1
fi

# Invoke directly with "$@" so every argument reaches the launcher exactly as
# given (no second round of word splitting or expansion as with eval).
"${tgt}" "$@"
3 changes: 2 additions & 1 deletion lt.ltbot/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ target
**/*.pyc

# ipython notebook related
**/.ipynb_checkpoints*
.ipynb_checkpoints
__pycache__

# java compiled
**/*.class
Expand Down
40 changes: 11 additions & 29 deletions lt.ltbot/pom.xml
Original file line number Diff line number Diff line change
@@ -1,29 +1,17 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>de.tudarmstadt</groupId>

<parent>
<groupId>de.tudarmstadt</groupId>
<artifactId>lt.kd-suite</artifactId>
<version>0.7.0</version>
</parent>

<artifactId>lt.ltbot</artifactId>
<version>0.4.1a</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<repositories>
<repository>
<id>local-repository</id>
<url>file:///${project.basedir}/repo</url>
</repository>
</repositories>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
Expand Down Expand Up @@ -140,19 +128,13 @@
<dependency>
<groupId>de.tudarmstadt</groupId>
<artifactId>lt.lm</artifactId>
<version>0.4.1h</version>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.3</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<!-- <dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.treetagger-asl</artifactId>
Expand Down Expand Up @@ -253,12 +235,12 @@
<dependency>
<groupId>de.tudarmstadt</groupId>
<artifactId>lt.seg</artifactId>
<version>0.5.1</version>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt</groupId>
<artifactId>lt.utilities</artifactId>
<version>0.3.7</version>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.syncthemall</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,14 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException {
*/
protected void writeplaintext(CrawlURI curi, String cleaned_plaintext) {
String perplexity_value_as_string = "null";
if(curi != null && curi.getData() != null){
Object obj = curi.getData().get(SharedConstants.EXTRA_INFO_PERPLEXITY);
if(obj != null)
perplexity_value_as_string = (String)obj;
}
if(curi == null || curi.getData() == null)
return;
if(StringUtils.isEmpty(cleaned_plaintext))
return;

Object obj = curi.getData().get(SharedConstants.EXTRA_INFO_PERPLEXITY);
if(obj != null)
perplexity_value_as_string = (String)obj;

String time = TimeUtils.get_ISO_8601_UTC();
synchronized (_lck) {
Expand Down
Loading

0 comments on commit 9344791

Please sign in to comment.