diff --git a/examples/duplicates/1.java b/examples/duplicates/1.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/1.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/10.java b/examples/duplicates/10.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/10.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/11.java b/examples/duplicates/11.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/11.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/12.java b/examples/duplicates/12.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/12.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/13.java b/examples/duplicates/13.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/13.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/14.java b/examples/duplicates/14.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/14.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/15.java b/examples/duplicates/15.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/15.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/16.java b/examples/duplicates/16.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/16.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/17.java b/examples/duplicates/17.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/17.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/18.java b/examples/duplicates/18.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/18.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/19.java b/examples/duplicates/19.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/19.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/2.java b/examples/duplicates/2.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/2.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/20.java b/examples/duplicates/20.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/20.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/3.java b/examples/duplicates/3.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/3.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/4.java b/examples/duplicates/4.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/4.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/5.java b/examples/duplicates/5.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/5.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/6.java b/examples/duplicates/6.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/6.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/7.java b/examples/duplicates/7.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/7.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/8.java b/examples/duplicates/8.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/8.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/examples/duplicates/9.java b/examples/duplicates/9.java new file mode 100644 index 000000000..21fee454b --- /dev/null +++ b/examples/duplicates/9.java @@ -0,0 +1,186 @@ +package com.boyter.SpellingCorrector; + +import java.util.*; +import java.util.stream.Stream; + +/** + * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and + * the like. Attempts to be highly performing by never changing the first character since we can assume that the + * user got that correct. + */ +public class SpellingCorrector implements ISpellingCorrector { + + // word to count map - how may times a word is present - or a weight attached to a word + private Map dictionary = null; + + public SpellingCorrector(int lruCount) { + this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount)); + } + + @Override + public void putWord(String word) { + word = word.toLowerCase(); + if (dictionary.containsKey(word)) { + dictionary.put(word, (dictionary.get(word) + 1)); + } + else { + dictionary.put(word, 1); + } + } + + @Override + public String correct(String word) { + if (word == null || word.trim().isEmpty()) { + return word; + } + + word = word.toLowerCase(); + + // If the word exists in our dictionary then return + if (dictionary.containsKey(word)) { + return word; + } + + Map possibleMatches = new HashMap<>(); + + List closeEdits = wordEdits(word); + for (String closeEdit: closeEdits) { + if (dictionary.containsKey(closeEdit)) { + possibleMatches.put(closeEdit, this.dictionary.get(closeEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + // Ok we did't find anything, so lets run the edits function on the previous results and use those + // this gives us results which are 2 characters away from whatever was entered + List furtherEdits = new ArrayList<>(); + for(String closeEdit: closeEdits) { + furtherEdits.addAll(this.wordEdits(closeEdit)); + } + + for (String futherEdit: furtherEdits) { + if (dictionary.containsKey(futherEdit)) { + possibleMatches.put(futherEdit, this.dictionary.get(futherEdit)); + } + } + + if (!possibleMatches.isEmpty()) { + // Sorted least likely first + Object[] matches = this.sortByValue(possibleMatches).keySet().toArray(); + + // Try to match anything of the same length first + String bestMatch = ""; + for(Object o: matches) { + if (o.toString().length() == word.length()) { + bestMatch = o.toString(); + } + } + + if (!bestMatch.trim().isEmpty()) { + return bestMatch; + } + + // Just return whatever is the best match + return matches[matches.length - 1].toString(); + } + + + // If unable to find something better return the same string + return word; + } + + @Override + public boolean containsWord(String word) { + if (dictionary.containsKey(word)) { + return true; + } + + return false; + } + + + /** + * Return a list of strings which are words similar to our one which could potentially be misspellings + * Abuse the fact that a char can be used as an integer + * Assume that they got the first letter correct for all edits to cut on CPU burn time + */ + private List wordEdits(String word) { + List closeWords = new ArrayList(); + + for (int i = 1; i < word.length() + 1; i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they forgot to type a letter? Try adding one + StringBuilder sb = new StringBuilder(word); + sb.insert(i, character); + closeWords.add(sb.toString()); + } + } + + for (int i = 1; i < word.length(); i++) { + for (char character = 'a'; character <= 'z'; character++) { + // Maybe they mistyped a single letter? Try replacing them all + StringBuilder sb = new StringBuilder(word); + sb.setCharAt(i, character); + closeWords.add(sb.toString()); + + // Maybe they added an extra letter? Try deleting one + sb = new StringBuilder(word); + sb.deleteCharAt(i); + closeWords.add(sb.toString()); + } + } + + return closeWords; + } + + + /** + * Sorts a map by value taken from + * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java + */ + public static > Map sortByValue( Map map ) { + Map result = new LinkedHashMap<>(); + Stream> st = map.entrySet().stream(); + + st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) ); + + return result; + } + + /** + * A very simple LRU cache implementation that can be used for random data types. + */ + public class LruCache extends LinkedHashMap { + private final int maxEntries; + + public LruCache(final int maxEntries) { + super(maxEntries + 1, 1.0f, true); + this.maxEntries = maxEntries; + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return super.size() > maxEntries; + } + } + +} diff --git a/processor/structs.go b/processor/structs.go index 917537f3c..36881d964 100644 --- a/processor/structs.go +++ b/processor/structs.go @@ -87,17 +87,15 @@ type OpenClose struct { Close []byte } -// CheckDuplicates is used to hold hashes if duplicate detection is enabled +// CheckDuplicates is used to hold hashes if duplicate detection is enabled it comes with a mutex +// that should be locked while a check is being performed then added type CheckDuplicates struct { hashes map[int64][][]byte mux sync.Mutex } -// Add concurrent safe add a key into the duplicates check +// Non thread safe add a key into the duplicates check need to use mutex inside struct before calling this func (c *CheckDuplicates) Add(key int64, hash []byte) { - c.mux.Lock() - defer c.mux.Unlock() - hashes, ok := c.hashes[key] if ok { c.hashes[key] = append(hashes, hash) @@ -106,11 +104,8 @@ func (c *CheckDuplicates) Add(key int64, hash []byte) { } } -// Check concurrent safe check to see if the key exists already +// Non thread safe check to see if the key exists already need to use mutex inside struct before calling this func (c *CheckDuplicates) Check(key int64, hash []byte) bool { - c.mux.Lock() - defer c.mux.Unlock() - hashes, ok := c.hashes[key] if ok { for _, h := range hashes { diff --git a/processor/workers.go b/processor/workers.go index 879aba524..50b504f7c 100644 --- a/processor/workers.go +++ b/processor/workers.go @@ -526,15 +526,18 @@ func fileProcessorWorker(input chan *FileJob, output chan *FileJob) { CountStats(res) if Duplicates { + duplicates.mux.Lock() if duplicates.Check(res.Bytes, res.Hash) { if Verbose { printWarn(fmt.Sprintf("skipping duplicate file: %s", res.Location)) } - wg.Done() - return + + duplicates.mux.Unlock() + continue } duplicates.Add(res.Bytes, res.Hash) + duplicates.mux.Unlock() } if Trace { diff --git a/test-all.sh b/test-all.sh index 7ba66109d..c9fbddb38 100755 --- a/test-all.sh +++ b/test-all.sh @@ -114,6 +114,20 @@ else exit fi +# Try out duplicates +for i in {1..100} +do + if ./scc -d "examples/duplicates/" | grep -e "Java" | grep -q -e " 1 "; then + : + else + echo -e "${RED}=======================================================" + echo -e "FAILED Duplicates should be consistent" + echo -e "=======================================================" + exit + fi +done +echo -e "${GREEN}PASSED duplicates test" + echo -e "${NC}Cleaning up..." rm ./scc