Skip to content

Commit

Permalink
Remove language 'Unknown' from the set of input languages (#7)
Browse files Browse the repository at this point in the history
  • Loading branch information
pemistahl committed Nov 28, 2021
1 parent c67de11 commit f4abe93
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 30 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ Erroneously classified as Dutch: 0.20%, Latin: 0.10%

## 7. <a name="library-dependency"></a> How to add it to your project? <sup>[Top ▲](#table-of-contents)</sup>

go get github.com/pemistahl/lingua-go@v1.0.3
go get github.com/pemistahl/lingua-go@v1.0.4

## 8. <a name="library-build"></a> How to build? <sup>[Top ▲](#table-of-contents)</sup>

Expand Down Expand Up @@ -337,7 +337,9 @@ input text. The longer the input text, the larger the distance between the langu
want to classify very short text phrases, do not set the minimum relative distance too high.
Otherwise [`Unknown`](https://github.com/pemistahl/lingua-go/blob/main/language.go#L106) will be
returned most of the time as in the example above. This is the return value for cases where
language detection is not reliably possible.
language detection is not reliably possible. This value is not meant to be included in the set
of input languages when building the language detector. If you include it, it will be
automatically removed from the set of input languages.

### 9.3 Confidence values

Expand Down
9 changes: 9 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## Lingua 1.0.4 (released on 28 Nov 2021)

### Bug Fixes

- It was possible to include `lingua.Unknown` in the set of input languages
for building the language detector. It is only meant as a return value,
so it is now automatically removed from the set of input languages.
Thanks to @marians for identifying this problem. (#7)

## Lingua 1.0.3 (released on 20 Oct 2021)

### Improvements
Expand Down
18 changes: 18 additions & 0 deletions builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,13 +166,25 @@ func (builder *languageDetectorBuilder) FromAllLanguagesWithout(languages ...Lan
}

// FromLanguages configures the builder to use only the given languages.
//
// Unknown is only meant as a return value, so every occurrence of it is
// dropped from the input before the languages are handed on. The filtering
// writes into a freshly allocated slice: when a caller passes an existing
// slice via `langs...`, the variadic parameter shares its backing array, and
// removing elements in place would silently rewrite the caller's data.
//
// Panics with missingLanguageMessage if fewer than two languages remain
// after Unknown has been removed.
func (builder *languageDetectorBuilder) FromLanguages(languages ...Language) LanguageDetectorBuilder {
	filtered := make([]Language, 0, len(languages))
	for _, language := range languages {
		if language != Unknown {
			filtered = append(filtered, language)
		}
	}
	if len(filtered) < 2 {
		panic(missingLanguageMessage)
	}
	return builder.from(filtered)
}

func (builder *languageDetectorBuilder) FromIsoCodes639_1(isoCodes ...IsoCode639_1) LanguageDetectorBuilder {
for i, isoCode := range isoCodes {
if isoCode == UnknownIsoCode639_1 {
isoCodes = append(isoCodes[:i], isoCodes[i+1:]...)
break
}
}
if len(isoCodes) < 2 {
panic(missingLanguageMessage)
}
Expand All @@ -184,6 +196,12 @@ func (builder *languageDetectorBuilder) FromIsoCodes639_1(isoCodes ...IsoCode639
}

func (builder *languageDetectorBuilder) FromIsoCodes639_3(isoCodes ...IsoCode639_3) LanguageDetectorBuilder {
for i, isoCode := range isoCodes {
if isoCode == UnknownIsoCode639_3 {
isoCodes = append(isoCodes[:i], isoCodes[i+1:]...)
break
}
}
if len(isoCodes) < 2 {
panic(missingLanguageMessage)
}
Expand Down
125 changes: 97 additions & 28 deletions builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,56 +145,125 @@ func TestLanguageDetectorBuilder_FromAllLanguagesWithout(t *testing.T) {
// TestLanguageDetectorBuilder_FromAllLanguagesWithout_Panics excludes every
// language except the first one returned by AllLanguages() and expects the
// builder to panic.
// NOTE(review): both the string literal and missingLanguageMessage appear in
// the expected-value position; this looks like the removed and the added diff
// line shown together — confirm that only one of them is meant to remain.
func TestLanguageDetectorBuilder_FromAllLanguagesWithout_Panics(t *testing.T) {
assert.PanicsWithValue(
t,
"LanguageDetector needs at least 2 languages to choose from",
missingLanguageMessage,
func() {
NewLanguageDetectorBuilder().FromAllLanguagesWithout(AllLanguages()[1:]...)
},
)
}

// TestLanguageDetectorBuilder_FromLanguages checks which languages the
// builder reports after FromLanguages, including that Unknown is dropped
// from the input set.
// NOTE(review): the first two statements look like the pre-change version of
// this test rendered next to the new table-driven version — confirm.
func TestLanguageDetectorBuilder_FromLanguages(t *testing.T) {
builder := NewLanguageDetectorBuilder().FromLanguages(German, English)
assert.ElementsMatch(t, []Language{German, English}, builder.getLanguages())
// Each case pairs the input languages with the set the builder should hold.
testCases := []struct {
languages []Language
expectedLanguages []Language
}{
{
[]Language{German, English},
[]Language{German, English},
},
{
// Unknown in the input is expected to be filtered out.
[]Language{German, English, Unknown},
[]Language{German, English},
},
}
for _, testCase := range testCases {
builder := NewLanguageDetectorBuilder().FromLanguages(testCase.languages...)
// Order is not significant, only the set of elements.
assert.ElementsMatch(t, testCase.expectedLanguages, builder.getLanguages())
}
}

// TestLanguageDetectorBuilder_FromLanguages_Panics expects FromLanguages to
// panic when fewer than two usable languages are supplied.
// NOTE(review): the leading PanicsWithValue call with the string literal
// looks like the pre-change version of this test rendered next to the new
// table-driven version — confirm.
func TestLanguageDetectorBuilder_FromLanguages_Panics(t *testing.T) {
assert.PanicsWithValue(
t,
"LanguageDetector needs at least 2 languages to choose from",
func() {
NewLanguageDetectorBuilder().FromLanguages(German)
},
)
testCases := []struct {
languages []Language
}{
{[]Language{German}},
// Unknown does not count towards the minimum of two languages.
{[]Language{German, Unknown}},
}
for _, testCase := range testCases {
assert.PanicsWithValue(
t,
missingLanguageMessage,
func() {
NewLanguageDetectorBuilder().FromLanguages(testCase.languages...)
},
)
}
}

// TestLanguageDetectorBuilder_FromIsoCodes639_1 checks which languages the
// builder reports for a set of ISO 639-1 codes, including that
// UnknownIsoCode639_1 is dropped from the input.
// NOTE(review): the first two statements look like the pre-change version of
// this test rendered next to the new table-driven version — confirm.
func TestLanguageDetectorBuilder_FromIsoCodes639_1(t *testing.T) {
builder := NewLanguageDetectorBuilder().FromIsoCodes639_1(DE, SV)
assert.ElementsMatch(t, []Language{German, Swedish}, builder.getLanguages())
// Each case pairs the input ISO codes with the languages the builder should hold.
testCases := []struct {
isoCodes []IsoCode639_1
expectedLanguages []Language
}{
{
[]IsoCode639_1{DE, EN},
[]Language{German, English},
},
{
// The unknown code is expected to be filtered out.
[]IsoCode639_1{DE, EN, UnknownIsoCode639_1},
[]Language{German, English},
},
}
for _, testCase := range testCases {
builder := NewLanguageDetectorBuilder().FromIsoCodes639_1(testCase.isoCodes...)
assert.ElementsMatch(t, testCase.expectedLanguages, builder.getLanguages())
}
}

// TestLanguageDetectorBuilder_FromIsoCodes639_1_Panics expects
// FromIsoCodes639_1 to panic when fewer than two usable ISO 639-1 codes are
// supplied.
// NOTE(review): the leading PanicsWithValue call with the string literal
// looks like the pre-change version of this test rendered next to the new
// table-driven version — confirm.
func TestLanguageDetectorBuilder_FromIsoCodes639_1_Panics(t *testing.T) {
assert.PanicsWithValue(
t,
"LanguageDetector needs at least 2 languages to choose from",
func() {
NewLanguageDetectorBuilder().FromIsoCodes639_1(DE)
},
)
testCases := []struct {
isoCodes []IsoCode639_1
}{
{[]IsoCode639_1{DE}},
// The unknown code does not count towards the minimum of two.
{[]IsoCode639_1{DE, UnknownIsoCode639_1}},
}
for _, testCase := range testCases {
assert.PanicsWithValue(
t,
missingLanguageMessage,
func() {
NewLanguageDetectorBuilder().FromIsoCodes639_1(testCase.isoCodes...)
},
)
}
}

// TestLanguageDetectorBuilder_FromIsoCodes639_3 checks which languages the
// builder reports for a set of ISO 639-3 codes, including that
// UnknownIsoCode639_3 is dropped from the input.
// NOTE(review): the first two statements look like the pre-change version of
// this test rendered next to the new table-driven version — confirm.
func TestLanguageDetectorBuilder_FromIsoCodes639_3(t *testing.T) {
builder := NewLanguageDetectorBuilder().FromIsoCodes639_3(DEU, SWE)
assert.ElementsMatch(t, []Language{German, Swedish}, builder.getLanguages())
// Each case pairs the input ISO codes with the languages the builder should hold.
testCases := []struct {
isoCodes []IsoCode639_3
expectedLanguages []Language
}{
{
[]IsoCode639_3{DEU, ENG},
[]Language{German, English},
},
{
// The unknown code is expected to be filtered out.
[]IsoCode639_3{DEU, ENG, UnknownIsoCode639_3},
[]Language{German, English},
},
}
for _, testCase := range testCases {
builder := NewLanguageDetectorBuilder().FromIsoCodes639_3(testCase.isoCodes...)
assert.ElementsMatch(t, testCase.expectedLanguages, builder.getLanguages())
}
}

// TestLanguageDetectorBuilder_FromIsoCodes639_3_Panics expects
// FromIsoCodes639_3 to panic when fewer than two usable ISO 639-3 codes are
// supplied.
// NOTE(review): the leading PanicsWithValue call with the string literal
// looks like the pre-change version of this test rendered next to the new
// table-driven version — confirm.
func TestLanguageDetectorBuilder_FromIsoCodes639_3_Panics(t *testing.T) {
assert.PanicsWithValue(
t,
"LanguageDetector needs at least 2 languages to choose from",
func() {
NewLanguageDetectorBuilder().FromIsoCodes639_3(DEU)
},
)
testCases := []struct {
isoCodes []IsoCode639_3
}{
{[]IsoCode639_3{DEU}},
// The unknown code does not count towards the minimum of two.
{[]IsoCode639_3{DEU, UnknownIsoCode639_3}},
}
for _, testCase := range testCases {
assert.PanicsWithValue(
t,
missingLanguageMessage,
func() {
NewLanguageDetectorBuilder().FromIsoCodes639_3(testCase.isoCodes...)
},
)
}
}

func TestLanguageDetectorBuilder_WithMinimumRelativeDistance_Panics_1(t *testing.T) {
Expand Down

0 comments on commit f4abe93

Please sign in to comment.