Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

upgrade to v4.11 #265

Merged
merged 12 commits into from
Jun 7, 2023
6 changes: 3 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,14 @@ jobs:
--health-retries 5

steps:
- uses: actions/checkout@master
- uses: actions/checkout@v3

- name: Create PostgreSQL database
run: |
PGPASSWORD=${{ secrets.POSTGRES_PASSWORD }} psql -U ${{ secrets.POSTGRES_USER }} -h 127.0.0.1 -p 5432 -d credential_digger_tests -f sql/create_table.sql

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

Expand All @@ -46,7 +46,7 @@ jobs:
sudo apt install -y build-essential python3-dev libhyperscan-dev

- name: Cache python dependencies
uses: actions/cache@v1
uses: actions/cache@v3
with:
path: ~/.cache/pip # This path is specific to Ubuntu
key: ${{ runner.os }}-pip-${{ hashFiles('./requirements.txt') }}-${{ hashFiles('./tests/tests-requirements.txt') }}
Expand Down
1 change: 1 addition & 0 deletions credentialdigger/cli/get_discoveries.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ def run(client, args):
args: `argparse.Namespace`
Arguments from command line parser.
"""
discoveries = []
try:
discoveries = client.get_discoveries(
repo_url=args.repo_url, file_name=args.filename, with_rules=args.with_rules)
Expand Down
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ python-dotenv
pyyaml
rich~=12.2
srsly>=2.4.0
tensorflow==2.9.3; python_version >= "3.8"
tensorflow==2.11.1; python_version >= "3.8"
tensorflow~=2.4; python_version < "3.8"
tensorflow-estimator==2.9.0; python_version >= "3.8"
tensorflow-estimator==2.11.0; python_version >= "3.8"
tensorflow-estimator~=2.4; python_version < "3.8"
tensorflow-text==2.9.0; python_version >= "3.8"
tensorflow-text==2.11.0; python_version >= "3.8"
tensorflow-text~=2.4; python_version < "3.8"
tf-models-official
transformers
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def requirements():

setuptools.setup(
name='credentialdigger',
version='4.10.0',
version='4.11.0',
author='SAP SE',
maintainer='Marco Rosa, Slim Trabelsi',
maintainer_email='[email protected], [email protected]',
Expand Down
33 changes: 32 additions & 1 deletion tests/functional_tests/test_get_discoveries_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,10 @@ def tearDownClass(cls):
""" Remove the repo and all its discoveries. """
cls.client.delete_repo(REPO_URL)
cls.client.delete_discoveries(REPO_URL)
os.remove(cls.csv_path)
try:
os.remove(cls.csv_path)
except OSError as ex:
print(f'Failed to cleanup {cls.csv_path}, error={ex}')

@parameterized.expand([
param(state='new', count=5),
Expand Down Expand Up @@ -142,5 +145,33 @@ def test_csv_written(self):
data_frame = pd.read_csv(self.csv_path)
try:
assert data_frame.notna().values.all()
self.assertEqual(len(data_frame.columns), 9)
self.assertFalse('rule_regex' in data_frame.columns)
self.assertFalse('rule_category' in data_frame.columns)
self.assertFalse('rule_description' in data_frame.columns)
except AssertionError:
assert False, 'CSV file contains NaN'

def test_csv_written_with_rules(self):
""" Test if the CLI command writes correctly the CSV file with the rule details. """
with self.assertRaises(SystemExit) as cm:
cli.main(
[
'',
'get_discoveries',
REPO_URL,
'--save',
self.csv_path,
'--dotenv',
self.dotenv,
'--with_rules',
]
)
data_frame = pd.read_csv(self.csv_path)
try:
self.assertEqual(len(data_frame.columns), 12)
self.assertTrue('rule_regex' in data_frame.columns)
self.assertTrue('rule_category' in data_frame.columns)
self.assertTrue('rule_description' in data_frame.columns)
except AssertionError:
assert False, 'CSV file does not contain the rule details'
28 changes: 28 additions & 0 deletions tests/functional_tests/test_get_discoveries_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,5 +132,33 @@ def test_csv_written(self):
data_frame = pd.read_csv(self.csv_path)
try:
assert data_frame.notna().values.all()
self.assertEqual(len(data_frame.columns), 9)
self.assertFalse('rule_regex' in data_frame.columns)
self.assertFalse('rule_category' in data_frame.columns)
self.assertFalse('rule_description' in data_frame.columns)
except AssertionError:
assert False, 'CSV file contains NaN'

def test_csv_written_with_rules(self):
""" Test if the CLI command writes correctly the CSV file with the rule details. """
with self.assertRaises(SystemExit):
cli.main(
[
'',
'get_discoveries',
'test_repo',
'--sqlite',
self.db_path,
'--save',
self.csv_path,
'--with_rules',
]
)
data_frame = pd.read_csv(self.csv_path)
try:
self.assertEqual(len(data_frame.columns), 12)
self.assertTrue('rule_regex' in data_frame.columns)
self.assertTrue('rule_category' in data_frame.columns)
self.assertTrue('rule_description' in data_frame.columns)
except AssertionError:
assert False, 'CSV file does not contain the rule details'
33 changes: 33 additions & 0 deletions ui/backend/client_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from credentialdigger import Client
from git import GitCommandError, InvalidGitRepositoryError, NoSuchPathError
from git import Repo as GitRepo
from credentialdigger.client import DiscoveryWithRule

FilesSummary = namedtuple(
'FilesSummary',
Expand Down Expand Up @@ -223,3 +224,35 @@ def _check_repo_commit(self, repo_url, commit_id, local_repo=False):
return False, 'WrongBranchError'

return True, None

def get_discoveries_with_rules(self, query, repo_url, file_name=None):
""" Get all the discoveries of a repository with rule details.

Parameters
----------
query: str
The query to be run, with placeholders in place of parameters
repo_url: str
The url of the repository
file_name: str, optional
The name of the file to filter discoveries on

Returns
-------
list
A list of discoveries (dictionaries)

Raises
------
TypeError
If any of the required arguments is missing
"""
cursor = self.db.cursor()
all_discoveries = []
params = (repo_url,) if not file_name else (repo_url, file_name)
cursor.execute(query, params)
result = cursor.fetchone()
while result:
all_discoveries.append(dict(DiscoveryWithRule(*result)._asdict()))
result = cursor.fetchone()
return all_discoveries
29 changes: 29 additions & 0 deletions ui/backend/client_ui_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,32 @@ def get_files_summary(self, repo_url):
" FROM discoveries WHERE repo_url=%s"
" GROUP BY file_name"
))

def get_discoveries_with_rules(self, repo_url, file_name=None):
""" Get all the discoveries of a repository with rule details.

Parameters
----------
repo_url: str
The url of the repository
file_name: str, optional
The filename to filter discoveries on

Returns
-------
list
A list of discoveries (dictionaries)
"""
query = '''
SELECT discoveries.*, r.regex as rule_regex, r.category as rule_category, r.description as rule_description
FROM discoveries
LEFT JOIN rules r
ON rule_id=r.id
WHERE repo_url=%s
'''
if file_name:
query += ' AND file_name=%s'
return super().get_discoveries_with_rules(
repo_url=repo_url,
file_name=file_name,
query=query)
29 changes: 29 additions & 0 deletions ui/backend/client_ui_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,32 @@ def get_files_summary(self, repo_url):
" FROM discoveries WHERE repo_url=?"
" GROUP BY file_name"
))

def get_discoveries_with_rules(self, repo_url, file_name=None):
""" Get all the discoveries of a repository with rule details.

Parameters
----------
repo_url: str
The url of the repository
file_name: str, optional
The filename to filter discoveries on

Returns
-------
list
A list of discoveries (dictionaries)
"""
query = '''
SELECT discoveries.*, r.regex as rule_regex, r.category as rule_category, r.description as rule_description
FROM discoveries
LEFT JOIN rules r
ON rule_id=r.id
WHERE repo_url=?
'''
if file_name:
query += ' AND file_name=?'
return super().get_discoveries_with_rules(
repo_url=repo_url,
file_name=file_name,
query=query)
60 changes: 60 additions & 0 deletions ui/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,66 @@ def update_similar_discoveries():
return 'OK', 200


@app.route('/scan_file', methods=['POST'])
def scan_file():
    """ Scan an uploaded file for leaked credentials.

    The file is saved under the configured upload folder, scanned with the
    selected rules and ML models, and deleted afterwards. The discoveries
    found (joined with their rule details) are returned as JSON.

    Returns
    -------
    flask.Response
        JSON list of discoveries on success, or a plain 500 error message.
    """
    # Get scan properties
    rules_to_use = request.form.get('rule_to_use')
    use_password_model = request.form.get('passwordModel')
    use_path_model = request.form.get('pathModel')
    force_scan = request.form.get('forceScan') == 'force'
    file = request.files['filename']
    filename = secure_filename(file.filename)
    # Save file
    # TODO: perform malware scan on the file
    # Build the destination path before the try block so the except handler
    # can always reference `file_path` (previously a failure in the path
    # construction would have raised NameError inside the handler).
    file_path = os.path.abspath(os.path.join(
        app.config['UPLOAD_FOLDER'], 'uploads', filename))
    try:
        file.save(file_path)
        app.logger.debug(f'File saved to {file_path}')
    except Exception as ex:
        # NOTE(review): the messages below previously carried an f-prefix
        # with no placeholder; the filename placeholder is restored —
        # confirm against the original intent.
        app.logger.error(
            f'Error occurred when saving file={filename}, file path={file_path}, error={ex}')
        return 'Error in saving file', 500

    # Set up models used to filter the discoveries
    models = []
    if use_path_model == 'path':
        models.append('PathModel')
    if use_password_model == 'password':
        models.append('PasswordModel')

    # Setup scan arguments: `None` category means "use every rule"
    if rules_to_use != 'all':
        app.logger.debug(f'Use rules only from {rules_to_use} category')
    else:
        rules_to_use = None

    # Scan
    try:
        discoveries = c.scan_path(scan_path=file_path, models=models, force=force_scan,
                                  similarity=False, max_depth=-1, ignore_list=[], category=rules_to_use)
    except OSError as ex:
        app.logger.error(
            f'Error occurred when scanning file={filename}, file path={file_path}, error={ex}')
        os.remove(file_path)
        return f'Error in scanning file {filename}', 500

    # Get discoveries. The cleanup is in a finally that covers every path:
    # the original code leaked the uploaded file when the scan produced no
    # discoveries (os.remove was only reached through the non-empty branch).
    discoveries_with_rules = []
    try:
        if discoveries:
            discoveries_with_rules = c.get_discoveries_with_rules(
                repo_url=file_path)
    except OSError as ex:
        app.logger.error(
            f'Error occurred when getting discoveries of file={filename}, file path={file_path}, error={ex}')
        return f'Error in getting discoveries of file {filename}', 500
    finally:
        os.remove(file_path)
    return jsonify(discoveries_with_rules)


# Attach JSON Web Token support to the Flask app (used by the auth routes)
jwt = JWTManager(app)
if __name__ == '__main__':
    # Listen on all interfaces on port 5000 — NOTE(review): presumably meant
    # to run behind the project's container/proxy setup; confirm that binding
    # to 0.0.0.0 is acceptable for the deployment environment.
    app.run(host='0.0.0.0', port=5000)