From 22d362d863ac6771351c4dd072dbf53dc20ce5b5 Mon Sep 17 00:00:00 2001
From: Wayne Cheng <waynethecheng@gmail.com>
Date: Fri, 12 Jun 2020 17:58:45 -0400
Subject: [PATCH] Fix dictionary index case-sensitivity inconsistencies

---
 Cargo.lock                                    |  11 +
 Cargo.toml                                    |   1 +
 src/dictionary/indexing.rs                    | 276 +++++++++++++++---
 src/dictionary/mod.rs                         | 109 ++++++-
 .../testdata/case_insensitive_dict.dict       |  26 ++
 .../testdata/case_insensitive_dict.index      |  11 +
 .../testdata/case_sensitive_dict.dict         |  27 ++
 .../testdata/case_sensitive_dict.index        |  12 +
 8 files changed, 421 insertions(+), 52 deletions(-)
 create mode 100644 src/dictionary/testdata/case_insensitive_dict.dict
 create mode 100644 src/dictionary/testdata/case_insensitive_dict.index
 create mode 100644 src/dictionary/testdata/case_sensitive_dict.dict
 create mode 100644 src/dictionary/testdata/case_sensitive_dict.index
diff --git a/Cargo.lock b/Cargo.lock
index a9debbc1..37a701a3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -104,6 +104,15 @@ dependencies = [
  "pkg-config 0.3.17 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
+[[package]]
+name = "caseless"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "regex 1.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unicode-normalization 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
 [[package]]
 name = "cc"
 version = "1.0.54"
@@ -661,6 +670,7 @@ dependencies = [
  "anyhow 1.0.31 (registry+https://github.com/rust-lang/crates.io-index)",
  "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "byteorder 1.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "caseless 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "chrono 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)",
  "crossbeam-channel 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "downcast-rs 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -1330,6 +1340,7 @@ dependencies = [
 "checksum bytes 0.5.4 (registry+https://github.com/rust-lang/crates.io-index)" = "130aac562c0dd69c56b3b1cc8ffd2e17be31d0b6c25b61c96b76231aa23e39e1"
 "checksum bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "42b7c3cbf0fa9c1b82308d57191728ca0256cb821220f4e2fd410a72ade26e3b"
 "checksum bzip2-sys 0.1.9+1.0.8 (registry+https://github.com/rust-lang/crates.io-index)" = "ad3b39a260062fca31f7b0b12f207e8f2590a67d32ec7d59c20484b07ea7285e"
+"checksum caseless 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "808dab3318747be122cb31d36de18d4d1c81277a76f8332a02b81a3d73463d7f"
 "checksum cc 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311"
 "checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
 "checksum chrono 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)" = "80094f509cf8b5ae86a4966a39b3ff66cd7e2a3e594accec3743ff3fabeab5b2"
diff --git a/Cargo.toml b/Cargo.toml
index afe9bff5..8ead2a2f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -54,6 +54,7 @@ filetime = "0.2.10"
 fxhash = "0.2.1"
 rand_core = "0.5.1"
 rand_xoshiro = "0.4.0"
+caseless = "0.2.1"
 
 [dependencies.getopts]
 version = "0.2.21"
diff --git a/src/dictionary/indexing.rs b/src/dictionary/indexing.rs
index 2643054c..2e9c6f66 100644
--- a/src/dictionary/indexing.rs
+++ b/src/dictionary/indexing.rs
@@ -25,11 +25,13 @@ use levenshtein::levenshtein;
 
 use super::errors::DictError;
 use super::errors::DictError::*;
+use caseless::default_case_fold_str;
 
 /// The index is partially loaded if `state` isn't `None`.
 pub struct Index<R: BufRead> {
     pub entries: Vec<Entry>,
     pub state: Option<R>,
+    pub settings: Settings,
 }
 
 #[derive(Debug, Clone)]
@@ -40,15 +42,24 @@ pub struct Entry {
     pub original: Option<String>,
 }
 
+/// Options from `dictfmt`: `all_characters` and `case_sensitive` are set by the index parser
+/// when the `00-database-allchars` / case-sensitive marker headwords are present.
+#[derive(Debug, Clone)]
+pub struct Settings {
+    pub all_characters: bool,
+    pub case_sensitive: bool,
+}
+
 pub trait IndexReader {
     fn load_and_find(&mut self, headword: &str, fuzzy: bool) -> Vec<Entry>;
     fn find(&self, headword: &str, fuzzy: bool) -> Vec<Entry>;
+    fn settings(&self) -> Settings;
 }
 
 impl<R: BufRead> IndexReader for Index<R> {
     fn load_and_find(&mut self, headword: &str, fuzzy: bool) -> Vec<Entry> {
         if let Some(br) = self.state.take() {
-            if let Ok(mut index) = parse_index(br, false) {
+            if let Ok(mut index) = parse_index_with_settings(br, false, Option::Some(&self.settings)) {
                 self.entries.append(&mut index.entries);
             }
         }
@@ -56,32 +67,11 @@ impl<R: BufRead> IndexReader for Index<R> {
     }
 
     fn find(&self, headword: &str, fuzzy: bool) -> Vec<Entry> {
-        if fuzzy {
-            self.entries.iter().filter(|entry| levenshtein(headword, &entry.headword) <= 1).cloned().collect()
-        } else {
-            if let Ok(mut i) = self.entries.binary_search_by_key(&headword, |entry| &entry.headword) {
-                let mut results = vec![self.entries[i].clone()];
-                let j = i;
-                while i > 0 {
-                    i -= 1;
-                    if self.entries[i].headword != headword {
-                        break;
-                    }
-                    results.insert(0, self.entries[i].clone());
-                }
-                i = j;
-                while i < self.entries.len() - 1 {
-                    i += 1;
-                    if self.entries[i].headword != headword {
-                        break;
-                    }
-                    results.push(self.entries[i].clone());
-                }
-                results
-            } else {
-                Vec::new()
-            }
-        }
+        find(self.entries.as_ref(), headword, fuzzy)
+    }
+
+    fn settings(&self) -> Settings {
+        self.settings.clone()
     }
 }
 
@@ -136,32 +126,89 @@ fn parse_line(line: &str, line_number: usize) -> Result<(&str, u64, u64, Option<
 
 /// Parse the index for a dictionary from a given BufRead compatible object.
 /// When `lazy` is `true`, the loop stops once all the metadata entries are parsed.
-pub fn parse_index<B: BufRead>(mut br: B, lazy: bool) -> Result<Index<B>, DictError> {
-    let mut info = false;
+pub fn parse_index<B: BufRead>(br: B, lazy: bool) -> Result<Index<B>, DictError> {
+    parse_index_with_settings(br, lazy, None)
+}
+
+// parse_index_with_settings accounts for the following possibilities:
+// - lazy parse -> parse index metadata (00-database-*)
+// - full parse -> parse whole index
+// - resume parse -> resume from lazy parse
+fn parse_index_with_settings<B: BufRead>(mut br: B, lazy: bool, settings: Option<&Settings>) -> Result<Index<B>, DictError> {
+    let mut found_metadata = false;
+    let mut settings_created = false;
     let mut entries = Vec::new();
     let mut line_number = 0;
     let mut line = String::new();
 
+    let mut s = Settings{all_characters: false, case_sensitive: false};
+
+    if let Some(settings) = settings {
+        s = settings.clone();
+        found_metadata = true;
+        settings_created = true;
+    }
+
     while let Ok(nb) = br.read_line(&mut line) {
         if nb == 0 {
             break;
         }
         let (headword, offset, size, original) = parse_line(line.trim_end(), line_number)?;
-        if lazy {
-            if !info && (headword.starts_with("00-database-") || headword.starts_with("00database")) {
-                info = true;
-            } else if info && !headword.starts_with("00-database-") && !headword.starts_with("00database") {
-                break;
+
+        if !found_metadata && (headword.starts_with("00-database-") || headword.starts_with("00database")) {
+            found_metadata = true;
+        } else if found_metadata && !settings_created && !headword.starts_with("00-database-") && !headword.starts_with("00database") {
+
+            // A DICT index may not be case-sensitive, but the indexed headwords may not have been casefolded
+            // Therefore if the index is not case-sensitive, we will have to casefold all headwords ourselves along with the query
+            let all_chars = !find(entries.as_ref(), "00-database-allchars", false).is_empty();
+
+            let word = if all_chars {
+                "00-database-case-sensitive"
+            } else {
+                "00databasecasesensitive"
+            };
+
+            let case_sensitive = !find(entries.as_ref(),word, false).is_empty();
+            s.all_characters = all_chars;
+            s.case_sensitive = case_sensitive;
+
+            settings_created = true;
+
+            // Headwords may precede the 00-database- entries; they were stored verbatim while
+            // the settings were unknown, so fold them now if the index is case-insensitive.
+            if !s.case_sensitive {
+                for entry in entries.iter_mut() {
+                    entry.headword = default_case_fold_str(&entry.headword);
+                }
             }
         }
+
+        // Fold only once the settings are known, so a case-sensitive index keeps its casing.
+        let formatted_word = if settings_created && !s.case_sensitive {
+            default_case_fold_str(headword)
+        } else {
+            headword.to_string()
+        };
+
         entries.push(Entry {
-            headword: headword.to_string(),
+            headword: formatted_word,
             offset,
             size,
             original: original.map(String::from),
         });
         line_number += 1;
         line.clear();
+
+        // Break *after* current headword is committed for lazy load
+        if lazy && settings_created {
+            break;
+        }
     }
+
+    // No settings were detected (no metadata, or nothing after it): keep the default folding.
+    if !settings_created && !s.case_sensitive {
+        entries.iter_mut().for_each(|e| e.headword = default_case_fold_str(&e.headword));
+    }
 
     let state = if lazy {
@@ -170,12 +217,167 @@ pub fn parse_index<B: BufRead>(mut br: B, lazy: bool) -> Result<Index<B>, DictEr
         None
     };
 
-    Ok(Index { entries, state })
+    Ok(Index{entries, state, settings: s})
 }
 
 /// Parse the index for a dictionary from a given path.
-pub fn parse_index_from_file<P: AsRef<Path>>(path: P, lazy: bool) -> Result<Index<BufReader<File>>, DictError> {
+pub fn parse_index_from_file(path: impl AsRef<Path>, lazy: bool) -> Result<Index<BufReader<File>>, DictError> {
     let file = File::open(path)?;
     let reader = BufReader::new(file);
     parse_index(reader, lazy)
 }
+
+/// Look up `headword` in `entries`.
+///
+/// With `fuzzy` set, every entry whose headword is within a Levenshtein distance of 1 of
+/// `headword` is returned. Otherwise a binary search is used, so `entries` is expected to
+/// be sorted by headword; all adjacent entries with an equal headword are returned in
+/// index order. An empty vector means no match.
+fn find(entries: &[Entry], headword: &str, fuzzy: bool) -> Vec<Entry> {
+    if fuzzy {
+        // Fuzzy matching scans the whole index; no ordering is required.
+        return entries
+            .iter()
+            .filter(|entry| levenshtein(headword, &entry.headword) <= 1)
+            .cloned()
+            .collect();
+    }
+
+    let hit = match entries.binary_search_by(|entry| entry.headword.as_str().cmp(headword)) {
+        Ok(i) => i,
+        Err(_) => return Vec::new(),
+    };
+
+    // The search may land anywhere inside a run of duplicates: widen it in both directions.
+    let start = hit - entries[..hit].iter().rev().take_while(|e| e.headword == headword).count();
+    let end = hit + entries[hit..].iter().take_while(|e| e.headword == headword).count();
+
+    // Copy the run in one go instead of repeated `insert(0, ..)`, which is quadratic.
+    entries[start..end].to_vec()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::io::Empty;
+
+    const PATH_CASE_SENSITIVE_INDEX: &str = "src/dictionary/testdata/case_sensitive_dict.index";
+    const PATH_CASE_INSENSITIVE_INDEX: &str = "src/dictionary/testdata/case_insensitive_dict.index";
+
+    #[test]
+    fn test_index_find() {
+        let words = vec![
+            Entry{
+                headword: String::from("bar"),
+                offset: 0,
+                size: 8,
+                original: None,
+            },
+            Entry{
+                headword: String::from("baz"),
+                offset: 8,
+                size: 4,
+                original: None,
+            },
+            Entry{
+                headword: String::from("foo"),
+                offset: 12,
+                size: 4,
+                original: None,
+            },
+        ];
+
+        let index: Index<Empty> = Index{
+            entries: words,
+            state: None,
+            settings: Settings{ all_characters: false, case_sensitive: false },
+        };
+
+        let r = index.find("apples", false);
+        assert!(r.is_empty());
+
+        let r = index.find("baz", false);
+        assert!(!r.is_empty());
+        assert_eq!(r.len(), 1);
+        assert_eq!(r.first().unwrap().headword, "baz");
+
+        let r = index.find("bas", true);
+        assert!(!r.is_empty());
+        assert_eq!(r.len(), 2);
+        assert_eq!(r.first().unwrap().headword, "bar");
+    }
+
+    #[test]
+    // Make sure that a lazy load does not inadvertently skip a word when it returns to BufRead
+    fn test_index_load_and_find() {
+        let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, true);
+        assert!(r.is_ok());
+
+        let mut index = r.unwrap();
+        assert_eq!(index.entries[0].headword, "00-database-allchars");
+        assert_eq!(index.entries.last().unwrap().headword, "bar");
+
+        let r = index.load_and_find("bar", false);
+        assert!(!r.is_empty());
+
+        let r = index.load_and_find("foo", false);
+        assert!(!r.is_empty());
+    }
+
+    #[test]
+    fn test_parse_index_from_file() {
+        let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, false);
+        assert!(r.is_ok());
+
+        let index = r.unwrap();
+        assert_eq!(index.entries[0].headword, "00-database-allchars");
+        assert_eq!(index.entries.last().unwrap().headword, "あいおい");
+    }
+
+    #[test]
+    fn test_parse_index_from_file_lazy() {
+        let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, true);
+        assert!(r.is_ok());
+
+        let index = r.unwrap();
+        assert_eq!(index.entries[0].headword, "00-database-allchars");
+        assert_eq!(index.entries.last().unwrap().headword, "bar");
+    }
+
+    #[test]
+    fn test_parse_index_from_file_handles_case_insensitivity() {
+        let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, false);
+        assert!(r.is_ok());
+
+        let index = r.unwrap();
+
+        let r = index.find("bar", false);
+        assert!(!r.is_empty());
+        assert_eq!(r.first().unwrap().headword, "bar");
+
+        // straße should fold to strasse
+        // https://www.w3.org/International/wiki/Case_folding
+        let r = index.find("strasse", false);
+        assert!(!r.is_empty());
+        assert_eq!(r.first().unwrap().headword, "strasse");
+
+    }
+
+    #[test]
+    fn test_parse_index_from_file_handles_case_sensitivity() {
+        let r = parse_index_from_file(PATH_CASE_SENSITIVE_INDEX, false);
+        assert!(r.is_ok());
+
+        let index = r.unwrap();
+
+        let r = index.find("Bar", false);
+        assert!(!r.is_empty());
+        assert_eq!(r.first().unwrap().headword, "Bar");
+
+        let r = index.find("straße", false);
+        assert!(!r.is_empty());
+        assert_eq!(r.first().unwrap().headword, "straße");
+
+    }
+}
+
diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs
index f4b75bda..1a788fe1 100644
--- a/src/dictionary/mod.rs
+++ b/src/dictionary/mod.rs
@@ -10,7 +10,8 @@ mod indexing;
 use std::path::Path;
 
 use self::dictreader::DictReader;
-use self::indexing::IndexReader;
+use self::indexing::{IndexReader,Settings};
+use caseless::default_case_fold_str;
 
 /// A dictionary wrapper.
 ///
@@ -21,8 +22,7 @@ use self::indexing::IndexReader;
 pub struct Dictionary {
     content: Box<dyn DictReader>,
     index: Box<dyn IndexReader>,
-    all_chars: bool,
-    case_sensitive: bool,
+    settings: Settings,
 }
 
 impl Dictionary {
@@ -32,10 +32,11 @@ impl Dictionary {
     /// found, the returned vector is empty. Errors result from the parsing of the underlying files.
     pub fn lookup(&mut self, word: &str, fuzzy: bool) -> Result<Vec<[String; 2]>, errors::DictError> {
         let mut query = word.to_string();
-        if !self.case_sensitive {
-            query = query.to_lowercase();
+        if !self.settings.case_sensitive {
+            // https://www.w3.org/International/wiki/Case_folding
+            query = default_case_fold_str(&query);
         }
-        if !self.all_chars {
+        if !self.settings.all_characters {
             query = query.chars().filter(|c| c.is_alphanumeric() || c.is_whitespace()).collect();
         }
         let entries = self.index.load_and_find(&query, fuzzy);
@@ -52,7 +53,7 @@ impl Dictionary {
     /// The metadata headwords start with `00-database-` or `00database`.
     pub fn metadata(&mut self, name: &str) -> Result<String, errors::DictError> {
         let mut query = format!("00-database-{}", name);
-        if !self.all_chars {
+        if !self.settings.all_characters {
             query = query.replace(|c: char| !c.is_alphanumeric(), "");
         }
         let entries = self.index.find(&query, false);
@@ -100,12 +101,90 @@ pub fn load_dictionary_from_file<P: AsRef<Path>>(content_path: P, index_path: P)
 /// `dictReader` as trait object. This way, dictionaries from RAM or similar can be
 /// implemented.
 pub fn load_dictionary(content: Box<dyn DictReader>, index: Box<dyn IndexReader>) -> Dictionary {
-    let all_chars = !index.find("00-database-allchars", false).is_empty();
-    let word = if all_chars {
-        "00-database-case-sensitive"
-    } else {
-        "00databasecasesensitive"
-    };
-    let case_sensitive = !index.find(word, false).is_empty();
-    Dictionary { content, index, all_chars, case_sensitive }
+    let settings = index.settings();
+    Dictionary { content, index, settings }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const PATH_CASE_SENSITIVE_DICT: &str = "src/dictionary/testdata/case_sensitive_dict.dict";
+    const PATH_CASE_SENSITIVE_INDEX: &str = "src/dictionary/testdata/case_sensitive_dict.index";
+    const PATH_CASE_INSENSITIVE_DICT: &str = "src/dictionary/testdata/case_insensitive_dict.dict";
+    const PATH_CASE_INSENSITIVE_INDEX: &str = "src/dictionary/testdata/case_insensitive_dict.index";
+
+    fn assert_dict_word_exists(mut dict: Dictionary, headword: &str, definition: &str) -> Dictionary {
+        let r = dict.lookup(headword, false);
+        assert!(r.is_ok());
+        let search = r.unwrap();
+        assert_eq!(search.len(), 1);
+        assert!(search[0][1].contains(definition));
+
+        dict
+    }
+
+    #[test]
+    fn test_load_dictionary_from_file() {
+
+        let r = load_dictionary_from_file(PATH_CASE_INSENSITIVE_DICT, PATH_CASE_INSENSITIVE_INDEX);
+        assert!(r.is_ok());
+    }
+
+    #[test]
+    fn test_dictionary_lookup_case_insensitive() {
+
+        let r = load_dictionary_from_file(PATH_CASE_INSENSITIVE_DICT, PATH_CASE_INSENSITIVE_INDEX);
+        let mut dict = r.unwrap();
+
+        dict = assert_dict_word_exists(dict, "bar", "test for case-sensitivity");
+        dict = assert_dict_word_exists(dict, "Bar", "test for case-sensitivity");
+        dict = assert_dict_word_exists(dict, "straße", "test for non-latin case-sensitivity");
+        assert_dict_word_exists(dict, "strasse", "test for non-latin case-sensitivity");
+    }
+
+    #[test]
+    fn test_dictionary_lookup_case_insensitive_fuzzy() {
+
+        let r = load_dictionary_from_file(PATH_CASE_INSENSITIVE_DICT, PATH_CASE_INSENSITIVE_INDEX);
+        let mut dict = r.unwrap();
+
+        let r = dict.lookup("ba", true);
+        assert!(r.is_ok());
+        let search = r.unwrap();
+        assert_eq!(search.len(), 1);
+        assert_eq!(search[0][0], "bar");
+        assert!(search[0][1].contains("test for case-sensitivity"));
+    }
+
+    #[test]
+    fn test_dictionary_lookup_case_sensitive() {
+
+        let r = load_dictionary_from_file(PATH_CASE_SENSITIVE_DICT, PATH_CASE_SENSITIVE_INDEX);
+        let mut dict = r.unwrap();
+
+        dict = assert_dict_word_exists(dict, "Bar", "test for case-sensitivity");
+        dict = assert_dict_word_exists(dict, "straße", "test for non-latin case-sensitivity");
+
+        let r = dict.lookup("bar", false);
+        assert!(r.unwrap().is_empty());
+
+        let r = dict.lookup("strasse", false);
+        assert!(r.unwrap().is_empty());
+    }
+
+    #[test]
+    fn test_dictionary_lookup_case_sensitive_fuzzy() {
+
+        let r = load_dictionary_from_file(PATH_CASE_SENSITIVE_DICT, PATH_CASE_SENSITIVE_INDEX);
+        let mut dict = r.unwrap();
+
+        let r = dict.lookup("Ba", true);
+        assert!(r.is_ok());
+        let search = r.unwrap();
+        assert_eq!(search.len(), 1);
+        assert_eq!(search[0][0], "Bar");
+        assert!(search[0][1].contains("test for case-sensitivity"));
+    }
+}
+
diff --git a/src/dictionary/testdata/case_insensitive_dict.dict b/src/dictionary/testdata/case_insensitive_dict.dict
new file mode 100644
index 00000000..51891df8
--- /dev/null
+++ b/src/dictionary/testdata/case_insensitive_dict.dict
@@ -0,0 +1,26 @@
+
+
+00-database-dictfmt-1.12.1
+00-database-short
+     Test Dict
+This file was converted from the original database on:
+          Fri Jun 12 12:16:13 2020
+
+The original data is available from:
+     unknown
+
+The original data was distributed with the notice shown below. No
+additional restrictions are claimed.  Please redistribute this changed
+version under the same conditions and restriction that apply to the
+original version.
+
+foo
+definition
+Bar
+test for case-sensitivity
+あいおい
+test for non-latin characters
+straße
+test for non-latin case-sensitivity
+unknown
+abeforstßあいお
diff --git a/src/dictionary/testdata/case_insensitive_dict.index b/src/dictionary/testdata/case_insensitive_dict.index
new file mode 100644
index 00000000..d10bb212
--- /dev/null
+++ b/src/dictionary/testdata/case_insensitive_dict.index
@@ -0,0 +1,11 @@
+00-database-allchars	B	B
+00-database-alphabet	I4	U
+00-database-dictfmt-1.12.1	C	b
+00-database-info	+	Fu
+00-database-short	d	h
+00-database-url	Iw	I
+00-database-utf8	A	B
+bar	G7	e
+foo	Gs	P
+straße	IE	s
+あいおい	HZ	r
diff --git a/src/dictionary/testdata/case_sensitive_dict.dict b/src/dictionary/testdata/case_sensitive_dict.dict
new file mode 100644
index 00000000..cbe065d8
--- /dev/null
+++ b/src/dictionary/testdata/case_sensitive_dict.dict
@@ -0,0 +1,27 @@
+
+
+
+00-database-dictfmt-1.12.1
+00-database-short
+     Case Sensitive Test Dict
+This file was converted from the original database on:
+          Fri Jun 12 15:24:14 2020
+
+The original data is available from:
+     unknown
+
+The original data was distributed with the notice shown below. No
+additional restrictions are claimed.  Please redistribute this changed
+version under the same conditions and restriction that apply to the
+original version.
+
+foo
+definition
+Bar
+test for case-sensitivity
+あいおい
+test for non-latin characters
+straße
+test for non-latin case-sensitivity
+unknown
+Baeforstßあいお
diff --git a/src/dictionary/testdata/case_sensitive_dict.index b/src/dictionary/testdata/case_sensitive_dict.index
new file mode 100644
index 00000000..abb08481
--- /dev/null
+++ b/src/dictionary/testdata/case_sensitive_dict.index
@@ -0,0 +1,12 @@
+00-database-allchars	B	B
+00-database-alphabet	JI	U
+00-database-case-sensitive	C	B
+00-database-dictfmt-1.12.1	D	b
+00-database-info	BO	Fu
+00-database-short	e	w
+00-database-url	JA	I
+00-database-utf8	A	B
+Bar	HL	e
+foo	G8	P
+straße	IU	s
+あいおい	Hp	r