Merge branch 'master' of https://github.com/bert-w/sqomplexity
bert-w committed Feb 26, 2024
2 parents 3637a32 + 3f474f7 commit 60774d4
Showing 6 changed files with 146 additions and 17 deletions.
17 changes: 16 additions & 1 deletion .github/workflows/tests.yml
@@ -22,12 +22,27 @@ jobs:
        with:
          cache: 'npm'
          node-version: ${{ matrix.node-version }}

      # Use separate run commands so command status is handled correctly on Windows
      - name: npm install
        run: npm ci

      - name: npm test
        run: npm test

      - name: npm run lint
        run: npm run lint

      - name: npm run build
        run: npm run build

      # Run some simple CLI tool tests.
      - name: CLI test
        run: |
          node dist/sqomplexity.js "SELECT * FROM users"
          node dist/sqomplexity.js "SELECT * FROM users" -s
          node dist/sqomplexity.js "SELECT * FROM users" -a
          node dist/sqomplexity.js "SELECT * FROM users" -p
          node dist/sqomplexity.js "U0VMRUNUICogRlJPTSB1c2Vycw==" -b
          node dist/sqomplexity.js "U0VMRUNUICogRlJPTSB1c2Vycw==" -b -s
126 changes: 120 additions & 6 deletions README.md
@@ -1,4 +1,5 @@
# SQompLexity
[![Build Status](https://github.com/bert-w/sqomplexity/actions/workflows/tests.yml/badge.svg)](https://github.com/bert-w/sqomplexity/actions)
[![NPM Version](https://img.shields.io/npm/v/sqomplexity.svg?style=flat)](https://www.npmjs.org/package/sqomplexity)
[![NPM Downloads](https://img.shields.io/npm/dm/sqomplexity.svg?style=flat)](https://npmcharts.com/compare/sqomplexity?minimal=true)
[![Install Size](https://packagephobia.now.sh/badge?p=sqomplexity)](https://packagephobia.now.sh/result?p=sqomplexity)
@@ -16,21 +17,134 @@

This is a product of my thesis on complexity progression and correlations on Stack Overflow.

-SQompLexity is a Node.js program that assigns a complexity score to SELECT queries, based on a data and cognitive complexity score.
-It is specifically made to work with MySQL queries, but other dialects of SQL will likely work as well.
+SQompLexity is a metric that assigns a complexity score to SELECT queries. It is specifically tailored to work with
+MySQL queries, but other dialects of SQL will likely work as well. It needs no knowledge of the database schema and
+quantifies each query in a vacuum.

## Installation
```shell
npm i sqomplexity
```
Alternatively, download the `dist/sqomplexity.js` file from the repository to use it as a standalone CLI application.
Node.js is required to run this tool.

## Defining a complexity metric

The scoring of an SQL query is based on two major components:

**Data complexity** (see prefix **D** in the table below), also called [_Computational complexity_](https://en.wikipedia.org/wiki/Computational_complexity), which takes into account elements like the _number of rows_
that a query operates on (relatively speaking), the _computation paths_ a query may take, and the usage of
_table indexes_ (_indices_). All of these determine the computational cost of a given component.

**Cognitive complexity** (see prefix **C** in the table below), which describes the mental effort and the concepts a
person must understand in order to parse the query. This includes understanding of [_First-order logic_](https://en.wikipedia.org/wiki/First-order_logic),
understanding of _grouping_, _filtering_ and _sorting_ (common SQL concepts), and [_Domain knowledge_](https://en.wikipedia.org/wiki/Domain_knowledge)
such as how the query relates to its database schema.

## Complexity indicators
| Code | Explanation |
|----------------------|------------------------------------------------------------------------------------------------|
| *Indexing behavior* | |
| D1-A | No possibility to affect the chosen index |
| D1-B | Low possibility to affect the chosen index |
| D1-C | High possibility to affect the chosen index |
| | |
| *Running time* | |
| D2-A | $O(0)$ (negligible) running time w.r.t. the number of rows |
| D2-B | $O(1)$ (constant) running time w.r.t. the number of rows |
| D2-C | $O(\log n)$ (logarithmic) running time w.r.t. the number of rows |
| D2-D | $O(n)$ (linear) running time w.r.t. the number of rows |
| D2-E | $O(n \log n)$ (linearithmic) running time w.r.t. the number of rows |
| D2-F | $O(x)$ (highly variable) running time w.r.t. the number of rows |
| | |
| *Relational algebra* | |
| C1 | Requires understanding of *projection* (selection of columns) |
| C2 | Requires understanding of *selection* (e.g. boolean logic like (in)equalities and comparisons) |
| C3 | Requires understanding of *composition* (multiple tables, column relations, set theory) |
| C4 | Requires understanding of *grouping* |
| C5 | Requires understanding of *aggregation* |
| | |
| *Programming* | |
| C6 | Requires understanding of *data types* (e.g. integers, decimals, booleans, dates, times) |
| C7 | Requires understanding of variable *scopes* |
| C8 | Requires understanding of *nesting* |
| | |
| *Usage* | |
| C9-A | One parameter |
| C9-B                 | Low number of parameters                                                                       |
| C9-C                 | High number of parameters                                                                      |
| C10 | Requires understanding of the *database schema* |
| C11 | Requires understanding of the *RDBMS* toolset (e.g. function support and differences) |

What follows is the assignment of each of these indicators to the components of an SQL query; the table below shows
the result of this process. For each component, the presence and combination of indicators yield a final weighting:
**Low**, **Medium** or **High**.

## Complexity scoring
| Component                   | Data Complexity | Indicators    | Cognitive Complexity | Indicators                    |
|-----------------------------|-----------------|---------------|----------------------|-------------------------------|
| **Clause:SELECT** | Low | D1-A, D2-D | Low | C1, C6, C9-B, C10 |
| **Clause:FROM** | Medium | D1-B, D2-D | Low | C3, C7, C9-A, C10 |
| **Clause:JOIN** | Medium | D1-C, D2-F | Medium | C2, C3, C7, C9-B, C10 |
| **Clause:WHERE** | High | D1-C, D2-C/D | Medium | C2, C6, C9-B, C10 |
| **Clause:GROUP BY** | High | D1-C, D2-D/E | High | C2, C4, C5, C9-B, C10 |
| **Clause:HAVING** | Medium | D1-A, D2-D | High | C2, C4, C5, C9-C, C10 |
| **Clause:ORDER BY** | Low | D1-C, D2-D/E | Medium | C6, C9-B, C10 |
| **Clause:LIMIT** | Low | D1-A, D2-B | Low | C9-A |
| **Clause:OFFSET** | Low | D1-A, D2-B | Low | C9-A |
| **Expression:Table** | Medium | D1-B, D2-A | Medium | C9-A, C10 |
| **Expression:Column** | Medium | D1-B, D2-A | Medium | C6, C9-A, C10 |
| **Expression:String** | Low | D1-A, D2-A | Low | C6, C9-A |
| **Expression:Number** | Low | D1-A, D2-A | Low | C6, C9-A |
| **Expression:Null** | Low | D1-A, D2-A | Low | C6, C9-A |
| **Expression:Star** | Low | D1-A, D2-A | Low | C1, C9-A |
| **Expression:Unary** | Low | D1-A, D2-A | Medium | C2, C6, C9-A |
| **Expression:Binary** | Low | D1-A, D2-A | Medium | C2, C6, C9-B |
| **Expression:Function** | High | D1-B, D2-D | Medium | C6, C9-A, C11 |
| **Expression:List** | Low | D1-C, D2-A | Low | C6, C9-C |
| **Expression:Agg-Function** | High | D1-B, D2-F | High | C4, C5, C9-A, C10, C11 |
| **Operator** | Low | D1-C, D2-A | Medium | C2, C6, C9-B |
| **Emergent:Cycle** | Medium | D1-B, D2-F | High | C2, C3, C9-C, C10 |
| **Emergent:Mixed-Style** | None | D1-A, D2-A | Medium | C9-C |
| **Emergent:Subquery** | High | D1-C, D2-F | High | C1, C2, C3, C7, C8, C9-C, C10 |
| **Emergent:Variety** | None | D1-A, D2-A | Medium | C9-C |

## Calculation
Each query that passes through SQompLexity is parsed into an Abstract Syntax Tree (AST), which provides the backbone of
the algorithm that sums up the weights. Each query is traversed fully (including subqueries), and the scores are summed
to result in a final SQompLexity score for any given SQL query.
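
As a minimal sketch of this traversal (the AST shape and the per-node weights here are invented for illustration; the real parser output and weighting scheme are richer):

```javascript
// Illustrative sketch only: recursively walk a toy AST and sum weights.
// Node types and weight values are invented, not SQompLexity's real ones.
const WEIGHTS = { select: 1.0, where: 1.25, subquery: 1.5, column: 1.0 };

// Visit a node, add its weight, then recurse into all children
// (which is how subqueries also contribute to the total).
function walk(node) {
    let score = WEIGHTS[node.type] ?? 0;
    for (const child of node.children ?? []) {
        score += walk(child);
    }
    return score;
}

// Toy tree for "SELECT a FROM t WHERE x IN (SELECT ...)":
const ast = {
    type: 'select',
    children: [
        { type: 'column' },
        {
            type: 'where',
            children: [{ type: 'subquery', children: [{ type: 'select' }] }]
        }
    ]
};

console.log(walk(ast)); // 1.0 + 1.0 + 1.25 + 1.5 + 1.0 = 5.75
```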

The contribution of each complexity type and the numerical weight of each category are as follows:

| **Category** | **Numerical Score** |
|----------------------|---------------------|
| Data Complexity | 50% |
| Cognitive Complexity | 50% |
| | |
| Low | 1.0 |
| Medium | 1.25 |
| High | 1.5 |

The equal contribution of _Data Complexity_ and _Cognitive Complexity_ is arbitrary; further research could develop a
distribution that better reflects a general sense of _complexity_.

Similarly, the weights of _Low_, _Medium_ and _High_ are set to sensible defaults. All weights must be greater than
or equal to 1, however, since the algorithm may multiply them together.
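
To make the 50/50 split concrete, here is a hypothetical sketch (the component ratings are copied from the "Complexity scoring" table above, but the blending rule itself is an assumption, not the project's exact formula):

```javascript
// Hypothetical sketch: blend a component's data and cognitive ratings
// 50/50 using the category weights from the table above.
const WEIGHTS = { none: 0, low: 1.0, medium: 1.25, high: 1.5 };

// A few component ratings taken from the "Complexity scoring" table.
const COMPONENTS = {
    'clause:select': { data: 'low', cognitive: 'low' },
    'clause:where': { data: 'high', cognitive: 'medium' },
    'expression:star': { data: 'low', cognitive: 'low' }
};

// Equal (50%) contribution of data and cognitive complexity.
function componentScore(name) {
    const { data, cognitive } = COMPONENTS[name];
    return 0.5 * WEIGHTS[data] + 0.5 * WEIGHTS[cognitive];
}

// Summing over the components found in a query yields its score.
function score(components) {
    return components.reduce((sum, name) => sum + componentScore(name), 0);
}

// e.g. the components of a toy "SELECT * FROM ... WHERE ...":
console.log(score(['clause:select', 'expression:star', 'clause:where']));
// 1.0 + 1.0 + 1.375 = 3.375
```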


## Execution from JavaScript
```js
-import {Sqomplexity} from 'sqomplexity';
+import { Sqomplexity } from 'sqomplexity';

-(async () => {
+(async() => {
    // Provide one or multiple queries:
    const queries = [
        'SELECT id FROM users WHERE role = "admin"',
-        'SELECT COUNT(*) FROM users WHERE creation_date > "2023-01-01 00:00:00" GROUP BY id',
+        'SELECT COUNT(*) FROM users WHERE creation_date > "2023-01-01 00:00:00" GROUP BY id'
    ];

    // Construct SQompLexity (passing `score` only outputs the complexity score):
-    const command = (new Sqomplexity({score: true}, null, false));
+    const command = (new Sqomplexity({ score: true }, null, false));

    console.log(await command.run(queries));

    // Result: [ 7.876953, 10.001953 ]
})();
```
4 changes: 2 additions & 2 deletions app.js
@@ -1,5 +1,5 @@
import { program } from 'commander';
-import { Program } from './src/program.js';
+import { Sqomplexity } from './src/sqomplexity.js';
import path from 'path';
import { fileURLToPath } from 'url';

@@ -31,7 +31,7 @@ program
    .option('-p, --pretty-print', 'output JSON with indentation and newlines', false)
    .action(async(queries, options) => {
        try {
-            await (new Program(options, process.cwd())).run(queries);
+            await (new Sqomplexity(options, process.cwd())).run(queries);
        } catch (e) {
            program.addHelpText('after', '\n' + e.stack);
            program.help();
2 changes: 1 addition & 1 deletion dist/sqomplexity.js

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions examples/index.js
@@ -1,16 +1,16 @@
-import {Sqomplexity} from 'sqomplexity';
+import { Sqomplexity } from 'sqomplexity';

-(async () => {
+(async() => {
    // Provide one or multiple queries:
    const queries = [
        'SELECT id FROM users WHERE role = "admin"',
-        'SELECT COUNT(*) FROM users WHERE creation_date > "2023-01-01 00:00:00" GROUP BY id',
+        'SELECT COUNT(*) FROM users WHERE creation_date > "2023-01-01 00:00:00" GROUP BY id'
    ];

    // Construct SQompLexity (passing `score` only outputs the complexity score):
-    const command = (new Sqomplexity({score: true}, null, false));
+    const command = (new Sqomplexity({ score: true }, null, false));

    console.log(await command.run(queries));

    // Result: [ 7.876953, 10.001953 ]
})();
4 changes: 2 additions & 2 deletions package.json
@@ -15,8 +15,8 @@
  "type": "module",
  "scripts": {
    "test": "cross-env NODE_OPTIONS=\"$NODE_OPTIONS --experimental-vm-modules\" npx jest",
-    "lint": "eslint app.js \"src/*.js\" \"tests/**/*.js\"",
-    "lint:fix": "eslint app.js \"src/*.js\" \"tests/**/*.js\" --fix",
+    "lint": "eslint app.js \"src/*.js\" \"tests/**/*.js\" \"examples/*.js\"",
+    "lint:fix": "npm run lint -- --fix",
    "build": "webpack",
    "build:pegjs:mysql": "pegjs -o build/pegjs-parser-mysql.cjs parsers/mysql.pegjs",
    "build:pegjs:mariadb": "pegjs -o build/pegjs-parser-mariadb.cjs parsers/mariadb.pegjs"
