From ccdb537a886d640c1d1fd437414d037ec615562e Mon Sep 17 00:00:00 2001 From: Akash K Date: Thu, 29 Nov 2018 21:23:17 +0000 Subject: [PATCH] disregard word boundaries; add bestMatchIndex --- README.md | 26 +++++++---- compare-strings.js | 83 ++++++++++++++++++++++++------------ package-lock.json | 16 +++---- package.json | 4 +- spec/compare-strings.spec.js | 53 ++++++++++++++--------- 5 files changed, 115 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index d509531..672a325 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ Finds degree of similarity between two strings, based on [Dice's Coefficient](ht * [Examples](#examples-1) * [Release Notes](#release-notes) * [2.0.0](#200) + * [3.0.0](#300) ## Usage @@ -41,7 +42,7 @@ Requiring the module gives an object with two methods: ### compareTwoStrings(string1, string2) -Returns a fraction between 0 and 1, which indicates the degree of similarity between the two strings. 0 indicates completely different strings, 1 indicates identical strings. The comparison is case-insensitive. +Returns a fraction between 0 and 1, which indicates the degree of similarity between the two strings. 0 indicates completely different strings, 1 indicates identical strings. The comparison is case-sensitive. 
##### Arguments @@ -62,15 +63,15 @@ stringSimilarity.compareTwoStrings('healed', 'sealed'); stringSimilarity.compareTwoStrings('Olive-green table for sale, in extremely good condition.', 'For sale: table in very good condition, olive green in colour.'); -// → 0.7073170731707317 +// → 0.6060606060606061 stringSimilarity.compareTwoStrings('Olive-green table for sale, in extremely good condition.', 'For sale: green Subaru Impreza, 210,000 miles'); -// → 0.3013698630136986 +// → 0.2558139534883721 stringSimilarity.compareTwoStrings('Olive-green table for sale, in extremely good condition.', 'Wanted: mountain bike with at least 21 gears.'); -// → 0.11267605633802817 +// → 0.1411764705882353 ``` ### findBestMatch(mainString, targetStrings) @@ -83,7 +84,7 @@ Compares `mainString` against each string in `targetStrings`. 2. targetStrings (Array): Each string in this array will be matched against the main string. ##### Returns -(Object): An object with a `ratings` property, which gives a similarity rating for each target string, and a `bestMatch` property, which specifies which target string was most similar to the main string. +(Object): An object with a `ratings` property, which gives a similarity rating for each target string, a `bestMatch` property, which specifies which target string was most similar to the main string, and a `bestMatchIndex` property, which specifies the index of `bestMatch` in the `targetStrings` array. 
##### Examples ```javascript @@ -95,14 +96,16 @@ stringSimilarity.findBestMatch('Olive-green table for sale, in extremely good co // → { ratings: [ { target: 'For sale: green Subaru Impreza, 210,000 miles', - rating: 0.3013698630136986 }, + rating: 0.2558139534883721 }, { target: 'For sale: table in very good condition, olive green in colour.', - rating: 0.7073170731707317 }, + rating: 0.6060606060606061 }, { target: 'Wanted: mountain bike with at least 21 gears.', - rating: 0.11267605633802817 } ], + rating: 0.1411764705882353 } ], bestMatch: { target: 'For sale: table in very good condition, olive green in colour.', - rating: 0.7073170731707317 } } + rating: 0.6060606060606061 }, + bestMatchIndex: 1 +} ``` ## Release Notes @@ -111,6 +114,11 @@ stringSimilarity.findBestMatch('Olive-green table for sale, in extremely good co * Removed production dependencies * Updated to ES6 (this breaks backward-compatibility for pre-ES6 apps) +### 3.0.0 +* Performance improvement for `compareTwoStrings(..)`: now O(n) instead of O(n^2) +* The comparison is now case-sensitive (strings that differ only in letter case are no longer rated 1). The algorithm has also been tweaked slightly to disregard spaces and word boundaries. 
This will change the rating values slightly but not enough to make a significant difference +* Adding a `bestMatchIndex` to the results for `findBestMatch(..)` to point to the best match in the supplied `targetStrings` array + ![Build status](https://codeship.com/projects/2aa453d0-0959-0134-8a76-4abcb29fe9b4/status?branch=master) [![Known Vulnerabilities](https://snyk.io/test/github/aceakash/string-similarity/badge.svg)](https://snyk.io/test/github/aceakash/string-similarity) diff --git a/compare-strings.js b/compare-strings.js index aa65181..9cb5c44 100644 --- a/compare-strings.js +++ b/compare-strings.js @@ -3,39 +3,68 @@ module.exports = { findBestMatch }; -function compareTwoStrings (str1, str2) { - if (!str1.length && !str2.length) return 1; // if both are empty strings - if (!str1.length || !str2.length) return 0; // if only one is empty string - if (str1.toUpperCase() === str2.toUpperCase()) return 1; // identical - if (str1.length === 1 && str2.length === 1) return 0; // both are 1-letter strings - - const pairs1 = wordLetterPairs(str1); - const pairs2 = wordLetterPairs(str2); - const union = pairs1.length + pairs2.length; - let intersection = 0; - pairs1.forEach(pair1 => { - for (let i = 0, pair2; pair2 = pairs2[i]; i++) { - if (pair1 !== pair2) continue; - intersection++; - pairs2.splice(i, 1); - break; +function compareTwoStrings(first, second) { + first = first.replace(/\s+/g, '') + second = second.replace(/\s+/g, '') + + if (!first.length && !second.length) return 1; // if both are empty strings + if (!first.length || !second.length) return 0; // if only one is empty string + if (first === second) return 1; // identical + if (first.length === 1 && second.length === 1) return 0; // both are 1-letter strings + if (first.length < 2 || second.length < 2) return 0; // if either is a 1-letter string + + let firstBigrams = new Map(); + for (let i = 0; i < first.length - 1; i++) { + const bigram = first.substr(i, 2); + const count = firstBigrams.has(bigram) + 
? firstBigrams.get(bigram) + 1 + : 1; + + firstBigrams.set(bigram, count); + }; + + let intersectionSize = 0; + for (let i = 0; i < second.length - 1; i++) { + const bigram = second.substr(i, 2); + const count = firstBigrams.has(bigram) + ? firstBigrams.get(bigram) + : 0; + + if (count > 0) { + firstBigrams.set(bigram, count - 1); + intersectionSize++; } - }); - return intersection * 2 / union; + } + + return (2.0 * intersectionSize) / (first.length + second.length - 2); } -function findBestMatch (mainString, targetStrings) { +function findBestMatch(mainString, targetStrings) { if (!areArgsValid(mainString, targetStrings)) throw new Error('Bad arguments: First argument should be a string, second should be an array of strings'); - const ratings = targetStrings.map(target => ({ target, rating: compareTwoStrings(mainString, target) })); - const bestMatch = Array.from(ratings).sort((a, b) => b.rating - a.rating)[0]; - return { ratings, bestMatch }; + + const ratings = []; + let bestMatchIndex = 0; + + for (let i = 0; i < targetStrings.length; i++) { + const currentTargetString = targetStrings[i]; + const currentRating = compareTwoStrings(mainString, currentTargetString) + ratings.push({target: currentTargetString, rating: currentRating}) + if (currentRating > ratings[bestMatchIndex].rating) { + bestMatchIndex = i + } + } + + + const bestMatch = ratings[bestMatchIndex] + + return { ratings, bestMatch, bestMatchIndex }; } -function flattenDeep (arr) { - return Array.isArray(arr) ? arr.reduce((a, b) => a.concat(flattenDeep(b)) , []) : [arr]; +function flattenDeep(arr) { + return Array.isArray(arr) ? 
arr.reduce((a, b) => a.concat(flattenDeep(b)), []) : [arr]; } -function areArgsValid (mainString, targetStrings) { +function areArgsValid(mainString, targetStrings) { if (typeof mainString !== 'string') return false; if (!Array.isArray(targetStrings)) return false; if (!targetStrings.length) return false; @@ -43,13 +72,13 @@ function areArgsValid (mainString, targetStrings) { return true; } -function letterPairs (str) { +function letterPairs(str) { const pairs = []; for (let i = 0, max = str.length - 1; i < max; i++) pairs[i] = str.substring(i, i + 2); return pairs; } -function wordLetterPairs (str) { +function wordLetterPairs(str) { const pairs = str.toUpperCase().split(' ').map(letterPairs); return flattenDeep(pairs); } diff --git a/package-lock.json b/package-lock.json index d6fec0e..fcf2327 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "string-similarity", - "version": "1.2.2", + "version": "3.0.0", "lockfileVersion": 1, "requires": true, "dependencies": { @@ -63,19 +63,19 @@ "dev": true }, "jasmine": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/jasmine/-/jasmine-3.2.0.tgz", - "integrity": "sha512-qv6TZ32r+slrQz8fbx2EhGbD9zlJo3NwPrpLK1nE8inILtZO9Fap52pyHk7mNTh4tG50a+1+tOiWVT3jO5I0Sg==", + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/jasmine/-/jasmine-3.3.0.tgz", + "integrity": "sha512-haZzMvmoWSI2VCKfDgPqyEOPBQA7C1fgtIMgKNU4hVMcrVkWU5NPOWQqOTA6mVFyKcSUUrnkXu/ZEgY0bRnd6A==", "dev": true, "requires": { "glob": "^7.0.6", - "jasmine-core": "~3.2.0" + "jasmine-core": "~3.3.0" } }, "jasmine-core": { - "version": "3.2.1", - "resolved": "https://registry.npmjs.org/jasmine-core/-/jasmine-core-3.2.1.tgz", - "integrity": "sha512-pa9tbBWgU0EE4SWgc85T4sa886ufuQdsgruQANhECYjwqgV4z7Vw/499aCaP8ZH79JDS4vhm8doDG9HO4+e4sA==", + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/jasmine-core/-/jasmine-core-3.3.0.tgz", + "integrity": 
"sha512-3/xSmG/d35hf80BEN66Y6g9Ca5l/Isdeg/j6zvbTYlTzeKinzmaTM4p9am5kYqOmE05D7s1t8FGjzdSnbUbceA==", "dev": true }, "minimatch": { diff --git a/package.json b/package.json index f571169..ceffcd5 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "string-similarity", - "version": "2.0.0", + "version": "3.0.0", "description": "Finds degree of similarity between strings, based on Dice's Coefficient, which is mostly better than Levenshtein distance.", "main": "compare-strings.js", "scripts": { @@ -29,6 +29,6 @@ "author": "Akash Kurdekar (http://untilfalse.com/)", "license": "ISC", "devDependencies": { - "jasmine": "^3.2.0" + "jasmine": "^3.3.0" } } diff --git a/spec/compare-strings.spec.js b/spec/compare-strings.spec.js index 220f750..448acc0 100644 --- a/spec/compare-strings.spec.js +++ b/spec/compare-strings.spec.js @@ -4,27 +4,32 @@ describe('compareTwoStrings', function () { var compareTwoStrings = stringSimilarity.compareTwoStrings; it('is a function', function () { - expect(typeof compareTwoStrings).toBe('function'); + expect(typeof compareTwoStrings).toEqual('function'); }); it('returns the correct value for different inputs:', function () { const testData = [ - {first: 'french', second: 'quebec', expected: 0}, - {first: 'france', second: 'france', expected: 1}, - {first: 'fRaNce', second: 'france', expected: 1}, - {first: 'healed', second: 'sealed', expected: 0.8}, - {first: 'web applications', second: 'applications of the web', expected: 0.896551724137931}, - {first: 'this will have a typo somewhere', second: 'this will huve a typo somewhere', expected: 0.9}, - {first: 'this has one extra word', second: 'this has one word', expected: 0.8333333333333334}, - {first: 'a', second: 'a', expected: 1}, - {first: 'a', second: 'b', expected: 0}, - {first: '', second: '', expected: 1}, - {first: 'a', second: '', expected: 0}, - {first: '', second: 'a', expected: 0} - ]; + { first: 'french', second: 'quebec', expected: 0 }, + { first: 'france', second: 
'france', expected: 1 }, + { first: 'fRaNce', second: 'france', expected: 0.2 }, + { first: 'healed', second: 'sealed', expected: 0.8 }, + { first: 'web applications', second: 'applications of the web', expected: 0.7878787878787878 }, + { first: 'this will have a typo somewhere', second: 'this will huve a typo somewhere', expected: 0.92 }, + { first: 'Olive-green table for sale, in extremely good condition.', second: 'For sale: table in very good condition, olive green in colour.', expected: 0.6060606060606061 }, + { first: 'Olive-green table for sale, in extremely good condition.', second: 'For sale: green Subaru Impreza, 210,000 miles', expected: 0.2558139534883721 }, + { first: 'Olive-green table for sale, in extremely good condition.', second: 'Wanted: mountain bike with at least 21 gears.', expected: 0.1411764705882353 }, + { first: 'this has one extra word', second: 'this has one word', expected: 0.7741935483870968 }, + { first: 'a', second: 'a', expected: 1 }, + { first: 'a', second: 'b', expected: 0 }, + { first: '', second: '', expected: 1 }, + { first: 'a', second: '', expected: 0 }, + { first: '', second: 'a', expected: 0 }, + { first: 'apple event', second: 'apple event', expected: 1 }, + { first: 'iphone', second: 'iphone x', expected: 0.9090909090909091 } + ]; testData.forEach(td => { - expect(compareTwoStrings(td.first, td.second)).toEqual(td.expected); + expect(compareTwoStrings(td.first, td.second)).toBe(td.expected, td); }); }); }); @@ -78,16 +83,22 @@ describe('findBestMatch', function () { var matches = findBestMatch('healed', ['mailed', 'edward', 'sealed', 'theatre']); expect(matches.ratings).toEqual([ - {target: 'mailed', rating: 0.4}, - {target: 'edward', rating: 0.2}, - {target: 'sealed', rating: 0.8}, - {target: 'theatre', rating: 0.36363636363636365} + { target: 'mailed', rating: 0.4 }, + { target: 'edward', rating: 0.2 }, + { target: 'sealed', rating: 0.8 }, + { target: 'theatre', rating: 0.36363636363636365 } ]); }); - it("returns the 
best match and it's similarity rating", function () { + it("returns the best match and its similarity rating", function () { var matches = findBestMatch('healed', ['mailed', 'edward', 'sealed', 'theatre']); - expect(matches.bestMatch).toEqual({target: 'sealed', rating: 0.8}); + expect(matches.bestMatch).toEqual({ target: 'sealed', rating: 0.8 }); + }); + + it("returns the index of best match from the target strings", function () { + var matches = findBestMatch('healed', ['mailed', 'edward', 'sealed', 'theatre']); + + expect(matches.bestMatchIndex).toBe(2); }); });