Skip to content

Commit

Permalink
Merge pull request #44 from whelk-io/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
zteater authored Jan 6, 2020
2 parents 130efab + de7ea32 commit 2523910
Show file tree
Hide file tree
Showing 12 changed files with 538 additions and 209 deletions.
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,5 +133,48 @@ source: https://en.wikipedia.org/wiki/Flesch–Kincaid_readability_tests#Flesch
</repositories>
````

### Provided Dependencies

`whelk-flesch-kincaid` offers two options for parsing String content to requisite sentences and word tokens.

#### Stanford Core NLP

Stanford Core NLP offers a range of NLP features; however, the trained models must be loaded as a separate dependency. The models are used to parse POS tags from sentences.

````xml
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.9.2</version>
</dependency>

<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.9.2</version>
<classifier>models</classifier>
</dependency>

````

#### AWS Comprehend and Stanford Core NLP hybrid

Stanford NLP models are nearly 370 MB in size. As of today, AWS Lambda functions and layers have an upper limit of 250 MB. Since the models are only used to parse POS tags from words, AWS Comprehend can be used instead to parse `SyntaxTokens` using `BatchDetectSyntax` or `DetectSyntax`. See the AWS documentation for more information: https://github.com/awsdocs/amazon-comprehend-developer-guide

````xml
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.9.2</version>
</dependency>

<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-comprehend</artifactId>
<version>1.11.700</version>
</dependency>

````

More information on authenticating with GitHub packages: https://help.github.com/en/github/managing-packages-with-github-packages/configuring-apache-maven-for-use-with-github-packages#authenticating-to-github-packages

14 changes: 13 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

<groupId>io.whelk.flesch.kincaid</groupId>
<artifactId>whelk-flesch-kincaid</artifactId>
<version>0.0.19-RELEASE</version>
<version>0.0.20-RELEASE</version>
<name>Flesch Kincaid</name>
<description>Java OSS library for performing Flesch-Kincaid readability tests.</description>
<url>https://github.com/whelk-io/flesch-kincaid</url>
Expand Down Expand Up @@ -45,15 +45,27 @@
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.9.2</version>
<scope>provided</scope>
<optional>true</optional>
</dependency>

<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.9.2</version>
<classifier>models</classifier>
<scope>provided</scope>
<optional>true</optional>
</dependency>

<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-comprehend</artifactId>
<version>1.11.700</version>
<scope>provided</scope>
<optional>true</optional>
</dependency>

<dependency>
<groupId>io.whelk.hy.phen</groupId>
<artifactId>whelk-hy-phen-a-tion</artifactId>
Expand Down
59 changes: 59 additions & 0 deletions src/main/java/io/whelk/flesch/kincaid/ComprehendValidator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package io.whelk.flesch.kincaid;

import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.ADJ;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.ADP;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.ADV;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.AUX;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.CCONJ;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.CONJ;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.DET;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.INTJ;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.NOUN;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.NUM;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.PART;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.PRON;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.PROPN;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.SCONJ;
import static com.amazonaws.services.comprehend.model.PartOfSpeechTagType.VERB;
import java.util.Arrays;
import java.util.List;
import com.amazonaws.services.comprehend.model.SyntaxToken;
import lombok.experimental.UtilityClass;

@UtilityClass
public class ComprehendValidator {

  // Part-of-speech tag names that qualify a token as a word. Unknown tags,
  // punctuation, and symbols are deliberately excluded.
  private static final List<String> WORD_TAG_TYPES =
      Arrays.asList(
          ADJ.name(), ADP.name(), ADV.name(), AUX.name(),
          CONJ.name(), CCONJ.name(), DET.name(), INTJ.name(),
          NOUN.name(), NUM.name(), PART.name(), PRON.name(),
          PROPN.name(), SCONJ.name(), VERB.name());

  /**
   * Determine whether {@code syntaxToken} is a valid word. Tokens with unknown
   * tags, punctuation, or symbols are not considered valid words.
   *
   * @param syntaxToken the Comprehend syntax token to inspect; may be {@code null}
   * @return {@code true} if the token carries a part-of-speech tag naming a word type,
   *         {@code false} otherwise
   */
  public static boolean isWord(SyntaxToken syntaxToken) {
    if (syntaxToken == null || syntaxToken.getPartOfSpeech() == null) {
      return false;
    }
    String tag = syntaxToken.getPartOfSpeech().getTag();
    return tag != null && WORD_TAG_TYPES.contains(tag);
  }

}
53 changes: 0 additions & 53 deletions src/main/java/io/whelk/flesch/kincaid/POSTag.java

This file was deleted.

71 changes: 71 additions & 0 deletions src/main/java/io/whelk/flesch/kincaid/PennTreebankValidator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package io.whelk.flesch.kincaid;

import static io.whelk.flesch.kincaid.PennTreebankValidator.PennTreebankTag.parse;
import java.util.Arrays;
import java.util.List;
import edu.stanford.nlp.simple.Token;
import lombok.experimental.UtilityClass;

@UtilityClass
public class PennTreebankValidator {

  // Tags that never count as words: anything unrecognized (UNKNOWN),
  // possessive endings (POS), and symbols (SYM).
  private static final List<PennTreebankTag> invalidWordTags =
      Arrays.asList(
          PennTreebankTag.UNKNOWN,
          PennTreebankTag.POS,
          PennTreebankTag.SYM);

  /**
   * Determine whether {@code token} is a valid word. Tokens whose POS tag is
   * unknown, a possessive ending, or a symbol are not considered valid words.
   *
   * @param token the CoreNLP token to inspect; may be {@code null}
   * @return {@code true} if the token's POS tag maps to a word tag, {@code false} otherwise
   */
  public static boolean isWord(Token token) {
    return token != null && !invalidWordTags.contains(parse(token.posTag()));
  }

  /**
   * Penn Treebank part-of-speech tag set, plus an {@code UNKNOWN} sentinel for
   * tags outside the set (e.g. punctuation).
   */
  enum PennTreebankTag {

    CC,
    CD,
    DT,
    EX,
    FW,
    IN,
    JJ,
    JJR,
    JJS,
    LS,
    MD,
    NN,
    NNS,
    NNP,
    NNPS,
    PDT,
    POS,
    PRP,
    PRP$,
    RB,
    RBR,
    RBS,
    RP,
    SYM,
    TO,
    UH,
    VB,
    VBD,
    VBG,
    VBN,
    VBP,
    VBZ,
    WDT,
    WP,
    WP$,
    WRB,
    UNKNOWN;

    /**
     * Resolve {@code posTag} to its matching enum constant.
     *
     * @param posTag the raw POS tag string; may be {@code null}
     * @return the matching tag, or {@link #UNKNOWN} when {@code posTag} is
     *         {@code null} or not a Penn Treebank tag
     */
    public static PennTreebankTag parse(String posTag) {
      if (posTag == null) {
        return UNKNOWN;
      }
      try {
        // Enum.valueOf uses the enum's internal name lookup — constant time,
        // instead of cloning values() and streaming over them on every call.
        return valueOf(posTag);
      } catch (IllegalArgumentException e) {
        // Punctuation and other tags outside the Penn Treebank word set.
        return UNKNOWN;
      }
    }
  }

}
Loading

0 comments on commit 2523910

Please sign in to comment.