Skip to content

Commit

Permalink
several bugfixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
remstef committed Jun 29, 2016
1 parent 0077b31 commit ba19a16
Show file tree
Hide file tree
Showing 21 changed files with 452 additions and 51 deletions.
3 changes: 0 additions & 3 deletions lt.lm/src/test/java/META-INF/MANIFEST.MF

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ seeds.textSource.path=seed.txt

<bean id="metadata" class="org.archive.modules.CrawlMetadata" autowire="byName" />

<bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
<bean id="seeds" class="de.tudarmstadt.lt.ltbot.seed.TextPrioSeedModule">
<property name="textSource">
<bean class="org.archive.spring.ConfigFile">
<property name="path">
Expand Down Expand Up @@ -204,7 +204,7 @@ seeds.textSource.path=seed.txt

<!-- PRIORITY -->
<bean id="sentenceMaker" class="de.tudarmstadt.lt.ltbot.writer.SentenceMaker">
<property name="minLength" value="5" />
<property name="minLength" value="3" />
<property name="targetLanguageCode" value="default" />
</bean>
<bean id="perplexityProducer" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValueProducerPerplexity" autowire="byName">
Expand Down Expand Up @@ -246,7 +246,7 @@ seeds.textSource.path=seed.txt
</bean>
</property>
</bean>
<bean id="perplexityPrioritizer" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValuePrioritizer" />
<bean id="perplexityPrioritizer" class="de.tudarmstadt.lt.ltbot.prefetch.DecesiveValuePrioritizer" />
<bean id="perplexityLoggerDispositionChain" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValueLogger" />

<!-- CANDIDATE CHAIN -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ http://127.0.0.1/test

<!-- BEANS BEANS BEANS -->
<bean id="metadata" class="org.archive.modules.CrawlMetadata" autowire="byName" />
<bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
<bean id="seeds" class="de.tudarmstadt.lt.ltbot.seed.TextPrioSeedModule">
<property name="textSource">
<bean class="org.archive.spring.ConfigString">
<property name="value">
Expand Down Expand Up @@ -203,7 +203,7 @@ http://127.0.0.1/test

<!-- PRIORITY -->
<bean id="sentenceMaker" class="de.tudarmstadt.lt.ltbot.writer.SentenceMaker">
<property name="minLength" value="5" />
<property name="minLength" value="3" />
<property name="targetLanguageCode" value="default" />
</bean>
<bean id="perplexityProducer" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValueProducerPerplexity" autowire="byName">
Expand Down Expand Up @@ -245,7 +245,7 @@ http://127.0.0.1/test
</bean>
</property>
</bean>
<bean id="perplexityPrioritizer" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValuePrioritizer" />
<bean id="perplexityPrioritizer" class="de.tudarmstadt.lt.ltbot.prefetch.DecesiveValuePrioritizer" />
<bean id="perplexityLoggerDispositionChain" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValueLogger" />

<!-- CANDIDATE CHAIN -->
Expand Down
2 changes: 1 addition & 1 deletion lt.ltbot/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>de.tudarmstadt</groupId>
<artifactId>lt.ltbot</artifactId>
<version>0.4.0d</version>
<version>0.4.1</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ protected void innerProcess(CrawlURI uri) throws InterruptedException {

private String getLogString(CrawlURI curi){
String timestamp = TimeUtils.get_ISO_8601_UTC();
String value_as_str = curi.getData().containsKey(getExtraInfoValueFieldName()) ? curi.getData().get(getExtraInfoValueFieldName()).toString() : "null";
String value_as_str = curi.getData().get(getExtraInfoValueFieldName()) != null ? curi.getData().get(getExtraInfoValueFieldName()).toString() : "null";
String current_scheduling_directive = String.valueOf(curi.getSchedulingDirective());
String current_precedence = String.valueOf(curi.getPrecedence());
String assigned_scheduling_directive = "_";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,8 @@ public double computePerplexity(String text) throws Exception{
ModelPerplexity<String> perp = new ModelPerplexity<String>(_lmprvdr.get());
for(String sentence : _sentenceMakerInstance.getSentences(text)){
List<String>[] ngrams = _lmprvdr.get().getNgrams(sentence);
if(ngrams.length <= 1) // at least 2 ngrams
// LOG.finest(String.format("ngrams: %s", ngrams));
if(ngrams.length < 1) // at least 1 ngrams
continue;
if(ngrams[ngrams.length-1].size() < _lmprvdr.get().getLmOrder()) // at least one ngram with cardinality of lm
continue;
Expand Down Expand Up @@ -264,7 +265,7 @@ protected void innerProcess(CrawlURI uri) throws InterruptedException {


synchronized (_lck) {
if(Double.isInfinite(perplexity)){
if(!Double.isFinite(perplexity) || perplexity <= 1){
_num_inf_values.incrementAndGet();
}else{
double temp = (_perplexity_avg * _num_values) + perplexity;
Expand Down Expand Up @@ -306,14 +307,14 @@ static void addExtraInfo(CrawlURI uri, String key, Object value) {
double perplexity = Double.POSITIVE_INFINITY;
try {
String docid = "#" + Integer.toHexString(cleaned_plaintext.hashCode());
LOG.fine(String.format("Sending text with id '%s' to StringProvider: '%s' (length %d).", docid, cleaned_plaintext_abbr, cleaned_plaintext.length()));
LOG.finest(String.format("Sending text with id '%s' to StringProvider: '%s' (length %d).", docid, cleaned_plaintext_abbr, cleaned_plaintext.length()));
perplexity = computePerplexity(cleaned_plaintext);
// if (Double.isNaN(perplexity)) {
// double perplexity_new = -1d;
// LOG.log(Level.WARNING, String.format("[%s '%s'] failed to get meaningful perplexity: %g. Setting perplexity to %g.", uri.toString(), cleaned_plaintext_abbr, perplexity, perplexity_new));
// perplexity = perplexity_new;
// }
LOG.fine(String.format("[%s, '%s'] perplexity: %g.", uri.toString(), cleaned_plaintext_abbr, perplexity));
LOG.finest(String.format("[%s, '%s'] perplexity: %g.", uri.toString(), cleaned_plaintext_abbr, perplexity));
} catch (Throwable t) {
for (int i = 1; t != null && i < 10; i++) {
LOG.log(Level.SEVERE,
Expand All @@ -330,7 +331,7 @@ static void addExtraInfo(CrawlURI uri, String key, Object value) {
_paused_due_to_error = true;
}
}
if(Double.isInfinite(perplexity)){
if(!Double.isFinite(perplexity) || perplexity <= 1){
LOG.log(Level.FINE, String.format("[%s '%s'] resetting infinite perplexity to predefined maximum perplexity value (-1).", uri.toString(), cleaned_plaintext_abbr));
perplexity = -1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ public class SharedConstants {
private SharedConstants(){ /* DO NOT INSTANTIATE */ }

public final static String EXTRA_INFO_PERPLEXITY = "perp";
public final static String EXTRA_INFO_PERPLEXITY_VIA = "perp-via";
public final static String EXTRA_INFO_ASSIGNED_SCHEDULING_DIRECTIVE = "asgnd-sched-drctve";
public final static String EXTRA_INFO_ASSIGNED_COST_PRECEDENCE = "asgnd-cost-precedence";
public final static String EXTRA_INFO_PLAINTEXT_ABBREVIATED = "plain-abbrv";
Expand Down
Loading

0 comments on commit ba19a16

Please sign in to comment.