Skip to content

Commit

Permalink
use JHazm tokenization
Browse files Browse the repository at this point in the history
  • Loading branch information
kariminf committed Jan 28, 2015
1 parent b76726d commit dde9433
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 18 deletions.
4 changes: 3 additions & 1 deletion src/aak/as/preProcess/persian/FaNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@

package aak.as.preProcess.persian;

import hazm.jhazm.PersianNormalizer;

import java.util.HashMap;

import aak.as.preProcess.lang.Normalizer;

public class FaNormalizer implements Normalizer {

JHazm.Normalizer normalizer = new JHazm.Normalizer();
PersianNormalizer normalizer = new PersianNormalizer();
/*
* This function is used to delete new lines
*/
Expand Down
44 changes: 38 additions & 6 deletions src/aak/as/preProcess/persian/FaSegmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@

package aak.as.preProcess.persian;

import hazm.jhazm.PersianSentTokenizer;
import hazm.jhazm.PersianWordTokenizer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

Expand All @@ -31,18 +35,38 @@

public class FaSegmenter implements Segmenter {

// Characters treated as punctuation: deletePunctuation drops any token whose
// first character occurs in this string. The lookup is a plain String.contains,
// so the regex-looking characters here (backslash, brackets, '+', '|') are
// matched literally.
private final String punctuation = "\\.،؛:\"\'؟([?]+|:»)}\"«{";
// Sentence tokenizer that splitToSentences returns its result from.
PersianSentTokenizer sentSegmenter = new PersianSentTokenizer();
// Word tokenizer assigned by the constructor; it is set to null when its
// creation throws an IOException, and segmentWords checks for that case.
PersianWordTokenizer wordTokenizer;

/**
 * Creates a segmenter and tries to set up the Persian word tokenizer.
 *
 * If the tokenizer cannot be created because of an {@link IOException},
 * {@code wordTokenizer} is left {@code null}; segmentWords checks for that
 * and falls back to the default splitting.
 */
public FaSegmenter(){
    PersianWordTokenizer created = null;
    try {
        created = new PersianWordTokenizer();
    } catch (IOException ignored) {
        // Best effort: a null tokenizer selects the fallback path in segmentWords.
    }
    wordTokenizer = created;
}

/**
 * Splits a text into sentences using the JHazm sentence tokenizer.
 *
 * The earlier regex-based splitting loop has been dropped: it returned before
 * the tokenizer call, leaving that call unreachable.
 *
 * @param text the text to split
 * @return the sentences produced by {@code sentSegmenter.Tokenize(text)}
 */
@Override
public List<String> splitToSentences(String text) {
    return sentSegmenter.Tokenize(text);
}


/**
 * Splits a text into words.
 *
 * When the Persian word tokenizer is available, its tokens are returned after
 * deletePunctuation has filtered them. Otherwise a notice is printed to
 * standard output and the result of segmentWordsDef is returned instead.
 *
 * @param text the text to tokenize
 * @return the list of words
 */
@Override
public List<String> segmentWords(String text) {
    // Guard: no tokenizer was created, so use the default segmentation.
    if (wordTokenizer == null) {
        System.out.println("no persian tokenizer");
        return segmentWordsDef(text);
    }

    List<String> tokens = wordTokenizer.Tokenize(text);
    deletePunctuation(tokens);
    return tokens;
}


private List<String> segmentWordsDef(String text){
List<String> ret = new ArrayList<String>();
for(String word: text.split("[\\.،؛:\"\'؟\\!]?\\s+|\\.$")){
//word = word.replace(" ", "");
Expand All @@ -55,6 +79,14 @@ public List<String> segmentWords(String text) {
return ret;
}

/**
 * Removes, in place, every token whose first character occurs in
 * {@code punctuation}.
 *
 * Null and empty tokens have no first character to inspect; they are left in
 * the list instead of triggering an exception.
 *
 * @param words the token list to filter; modified by this call
 */
private void deletePunctuation (List<String> words){
    // Walk backwards so a removal never shifts an index still to be visited.
    for (int i = words.size() - 1; i >= 0; i--) {
        String word = words.get(i);
        if (word == null || word.isEmpty())
            continue;
        // Same membership test as contains(substring(0, 1)), without the temporary string.
        if (punctuation.indexOf(word.charAt(0)) >= 0)
            words.remove(i);
    }
}

public static void main(String[] args) {

Segmenter segmenter = new FaSegmenter();
Expand Down
15 changes: 4 additions & 11 deletions src/aak/as/preProcess/persian/FaStemmer.java
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@

package aak.as.preProcess.persian;

import hazm.jhazm.PersianLemmatizer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import JHazm.Lemmatizer;

import aak.as.preProcess.lang.Stemmer;

public class FaStemmer implements Stemmer {

Lemmatizer lematizer;
PersianLemmatizer lematizer;

public FaStemmer(){
try {
lematizer = new Lemmatizer();
lematizer = new PersianLemmatizer();
} catch (IOException e) {
System.out.println("no persian lematizer");
lematizer = null;
Expand Down Expand Up @@ -46,17 +46,10 @@ public static void main(String[] args) {
Stemmer Stemmer=new FaStemmer();
List<String> tstList = new ArrayList<String>();
tstList.add("قسمتی");
tstList.add("آبشار");
tstList.add("نیاگارا");
tstList.add("بین");
tstList.add("آمریکا");
tstList.add("کانادا");
tstList.add("قرار");
tstList.add("دارد");
tstList.add("ازجاذبه‌های");
tstList.add("طبیعی");
tstList.add("توریستی");
tstList.add("آمریکا");
tstList.add("شمار");
tstList.add("می‌رود");

Expand Down

0 comments on commit dde9433

Please sign in to comment.