initial public release (v1.0.0)

leonmoonen committed Jul 19, 2021
1 parent 8e28a2e commit ac63605

Showing 22 changed files with 5,585 additions and 2 deletions.
Code/collect_commits.py (new file, 382 additions, 0 deletions)
import ast
import os
import re
import uuid

import pandas as pd
import configuration as cf
from guesslang import Guess
from pydriller import Repository
from utils import log_commit_urls


os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

fixes_columns = [
'cve_id',
'hash',
'repo_url',
]

commit_columns = [
'hash',
'repo_url',
'author',
'author_date',
'author_timezone',
'committer',
'committer_date',
'committer_timezone',
'msg',
'merge',
'parents',
'num_lines_added',
'num_lines_deleted',
'dmm_unit_complexity',
'dmm_unit_interfacing',
'dmm_unit_size'
]

file_columns = [
'file_change_id',
'hash',
'filename',
'old_path',
'new_path',
'change_type',
'diff',
'diff_parsed',
'num_lines_added',
'num_lines_deleted',
'code_after',
'code_before',
'nloc',
'complexity',
'token_count',
'programming_language'
]

method_columns = [
'method_change_id',
'file_change_id',
'name',
'signature',
'parameters',
'start_line',
'end_line',
'code',
'nloc',
'complexity',
'token_count',
'top_nesting_level',
'before_change',
]


def extract_project_links(df_master):
"""
    extracts all the reference urls from CVE records that match repository commit urls
"""
    rows = []
    git_url = r'(((?P<repo>(https|http):\/\/(bitbucket|github|gitlab)\.(org|com)\/(?P<owner>[^\/]+)\/(?P<project>[^\/]*))\/(commit|commits)\/(?P<hash>\w+)#?)+)'
    cf.logger.info('-' * 70)
    cf.logger.info('Extracting all the reference urls from CVE...')
    for i in range(len(df_master)):
        ref_list = ast.literal_eval(df_master['reference_json'].iloc[i])
        for ref in ref_list:
            url = dict(ref)['url']
            link = re.search(git_url, url)
            if link:
                rows.append({
                    'cve_id': df_master['cve_id'].iloc[i],
                    'hash': link.group('hash'),
                    'repo_url': link.group('repo').replace(r'http:', r'https:')
                })

    # building the frame once avoids the deprecated DataFrame.append in a loop
    df_fixes = pd.DataFrame(rows, columns=fixes_columns)
    df_fixes = df_fixes.drop_duplicates().reset_index(drop=True)
    cf.logger.info(f'Number of collected references to vulnerability fixing commits: {len(df_fixes)}')
return df_fixes
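

def _demo_extract_project_links():
    # Hedged usage sketch with a hypothetical CVE record: 'reference_json'
    # holds a stringified list of {'url': ...} dicts, as ast.literal_eval
    # expects above. Assumes cf.logger has been configured by the caller.
    df_master = pd.DataFrame({
        'cve_id': ['CVE-2021-00000'],
        'reference_json': ["[{'url': 'https://github.com/owner/project/commit/abc1234'}]"],
    })
    # expected: one row with this cve_id, hash 'abc1234', and the https repo_url
    print(extract_project_links(df_master))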


def guess_pl(code):
"""
    :return: the guessed programming language of the code
"""
if code:
return Guess().language_name(code.strip())
else:
return 'unknown'
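

def _demo_guess_pl():
    # Minimal sketch: the exact label depends on the installed guesslang model,
    # so the result below is indicative rather than guaranteed.
    snippet = 'def add(a, b):\n    return a + b\n'
    print(guess_pl(snippet))  # e.g. 'Python'
    print(guess_pl(None))     # 'unknown' for missing code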


def clean_string(signature):
return signature.strip().replace(' ','')
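

def _demo_clean_string():
    # Sketch: whitespace-insensitive comparison of method signatures, as used
    # by the commented-out matching code in get_methods below.
    print(clean_string(' int f (int a) '))  # 'intf(inta)'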


def get_method_code(source_code, start_line, end_line):
try:
if source_code is not None:
code = ('\n'.join(source_code.split('\n')[int(start_line) - 1: int(end_line)]))
return code
else:
return None
    except Exception as e:
        cf.logger.warning(f'Problem while getting method code from the file! {e}')
        return None
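

def _demo_get_method_code():
    # Sketch: start_line/end_line are 1-indexed and inclusive, matching the
    # line numbers that lizard reports for a method.
    source = 'line1\nline2\nline3\nline4'
    print(get_method_code(source, 2, 3))  # 'line2\nline3'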


def changed_methods_both(file):
"""
    Return the methods that were changed, both in the new and in the old version of the file.
    :return: tuple (methods changed in the new version, methods changed in the old version)
"""
new_methods = file.methods
old_methods = file.methods_before
added = file.diff_parsed["added"]
deleted = file.diff_parsed["deleted"]

methods_changed_new = {
y
for x in added
for y in new_methods
if y.start_line <= x[0] <= y.end_line
}
methods_changed_old = {
y
for x in deleted
for y in old_methods
if y.start_line <= x[0] <= y.end_line
}
return methods_changed_new, methods_changed_old
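

def _demo_changed_methods_both():
    # Hedged sketch using SimpleNamespace stand-ins for the small slice of
    # pydriller's ModifiedFile/Method interface that this function touches;
    # diff_parsed pairs are (line_number, line_content) tuples.
    from types import SimpleNamespace
    method = SimpleNamespace(start_line=1, end_line=5)
    file = SimpleNamespace(
        methods=[method],
        methods_before=[method],
        diff_parsed={'added': [(2, 'x = 1')], 'deleted': [(3, 'y = 2')]},
    )
    new, old = changed_methods_both(file)
    print(len(new), len(old))  # 1 1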


# --------------------------------------------------------------------------------------------------------
# extracting method_change data
def get_methods(file, file_change_id):
"""
    returns the list of method-change records of the file, or None if there are none.
"""
file_methods = []
try:
if file.changed_methods:
cf.logger.debug('-' * 70)
cf.logger.debug('\nmethods_after: ')
cf.logger.debug('- ' * 35)
for m in file.methods:
if m.name != '(anonymous)':
cf.logger.debug(m.long_name)

cf.logger.debug('\nmethods_before: ')
cf.logger.debug('- ' * 35)
for mb in file.methods_before:
if mb.name != '(anonymous)':
cf.logger.debug(mb.long_name)

cf.logger.debug('\nchanged_methods: ')
cf.logger.debug('- ' * 35)
for mc in file.changed_methods:
if mc.name != '(anonymous)':
cf.logger.debug(mc.long_name)
cf.logger.debug('-' * 70)

# for mb in file.methods_before:
# for mc in file.changed_methods:
# #if mc.name == mb.name and mc.name != '(anonymous)':
# if clean_string(mc.long_name) == clean_string(mb.long_name) and mc.name != '(anonymous)':

if file.changed_methods:
methods_after, methods_before = changed_methods_both(file) # modified methods in source_code_after/_before
if methods_before:
for mb in methods_before:
                    # filter out non-existing code and '(anonymous)' methods, because
                    # the lizard API does not classify those parts as proper functions;
                    # manual inspection showed that '(anonymous)' entries are not
                    # function code, and they are not listed among the changed functions
if file.source_code_before is not None and mb.name != '(anonymous)':
# method_before_code = ('\n'.join(file.source_code_before.split('\n')[int(mb.start_line) - 1: int(mb.end_line)]))
method_before_code = get_method_code(file.source_code_before, mb.start_line, mb.end_line)
method_before_row = {
'method_change_id': uuid.uuid4().fields[-1],
'file_change_id': file_change_id,
'name': mb.name,
'signature': mb.long_name,
'parameters': mb.parameters,
'start_line': mb.start_line,
'end_line': mb.end_line,
'code': method_before_code,
'nloc': mb.nloc,
'complexity': mb.complexity,
'token_count': mb.token_count,
'top_nesting_level': mb.top_nesting_level,
'before_change': 'True',
}
file_methods.append(method_before_row)

if methods_after:
for mc in methods_after:
if file.source_code is not None and mc.name != '(anonymous)':
# changed_method_code = ('\n'.join(file.source_code.split('\n')[int(mc.start_line) - 1: int(mc.end_line)]))
changed_method_code = get_method_code(file.source_code, mc.start_line, mc.end_line)
changed_method_row = {
'method_change_id': uuid.uuid4().fields[-1],
'file_change_id': file_change_id,
'name': mc.name,
'signature': mc.long_name,
'parameters': mc.parameters,
'start_line': mc.start_line,
'end_line': mc.end_line,
'code': changed_method_code,
'nloc': mc.nloc,
'complexity': mc.complexity,
'token_count': mc.token_count,
'top_nesting_level': mc.top_nesting_level,
'before_change': 'False',
}
file_methods.append(changed_method_row)

if file_methods:
return file_methods
else:
return None

    except Exception as e:
        cf.logger.warning(f'Problem while fetching the methods! {e}')
        return None


# ---------------------------------------------------------------------------------------------------------
# extracting file_change data of each commit
def get_files(commit):
"""
    returns the file-change and method-change records of the commit.
"""
commit_files = []
commit_methods = []
try:
cf.logger.info(f'Extracting files for {commit.hash}')
if commit.modified_files:
for file in commit.modified_files:
cf.logger.debug(f'Processing file {file.filename} in {commit.hash}')
                # programming_language = (file.filename.rsplit('.')[-1] if '.' in file.filename else None)
programming_language = guess_pl(file.source_code) # guessing the programming language of fixed code
file_change_id = uuid.uuid4().fields[-1]

file_row = {
                    'file_change_id': file_change_id,  # file_change_id: primary key
'hash': commit.hash, # hash: foreign key
'filename': file.filename,
'old_path': file.old_path,
'new_path': file.new_path,
'change_type': file.change_type, # i.e. added, deleted, modified or renamed
'diff': file.diff, # diff of the file as git presents it (e.g. @@xx.. @@)
                    'diff_parsed': file.diff_parsed,  # diff parsed into a dict of added and deleted lines
'num_lines_added': file.added_lines, # number of lines added
'num_lines_deleted': file.deleted_lines, # number of lines removed
'code_after': file.source_code,
'code_before': file.source_code_before,
'nloc': file.nloc,
'complexity': file.complexity,
'token_count': file.token_count,
'programming_language': programming_language,
}
                commit_files.append(file_row)
                file_methods = get_methods(file, file_change_id)

if file_methods is not None:
commit_methods.extend(file_methods)
else:
cf.logger.info('The list of modified_files is empty')

return commit_files, commit_methods

    except Exception as e:
        cf.logger.warning(f'Problem while fetching the files! {e}')
        return commit_files, commit_methods


def extract_commits(repo_url, hashes):
"""This function extract git commit information of only the hashes list that were specified in the
commit URL. All the commit_fields of the corresponding commit have been obtained.
Every git commit hash can be associated with one or more modified/manipulated files.
One vulnerability with same hash can be fixed in multiple files so we have created a dataset of modified files
as 'df_file' of a project.
:param repo_url: list of url links of all the projects.
:param hashes: list of hashes of the commits to collect
:return dataframes: at commit level and file level.
"""
repo_commits = []
repo_files = []
repo_methods = []

# ----------------------------------------------------------------------------------------------------------------
# extracting commit-level data
if 'github' in repo_url:
repo_url = repo_url + '.git'

cf.logger.debug(f'Extracting commits for {repo_url} with {cf.NUM_WORKERS} worker(s) looking for the following hashes:')
log_commit_urls(repo_url, hashes)

    # give priority to the 'single' parameter when there is exactly one hash, because
    # testing showed that 'single' retrieves commit information in some cases where 'only_commits' does not,
    # for example: https://github.com/hedgedoc/hedgedoc.git/35b0d39a12aa35f27fba8c1f50b1886706e7efef
single_hash = None
if len(hashes) == 1:
single_hash = hashes[0]
hashes = None

for commit in Repository(path_to_repo=repo_url,
only_commits=hashes,
single=single_hash,
num_workers=cf.NUM_WORKERS).traverse_commits():
cf.logger.debug(f'Processing {commit.hash}')
try:
commit_row = {
'hash': commit.hash,
'repo_url': repo_url,
'author': commit.author.name,
'author_date': commit.author_date,
'author_timezone': commit.author_timezone,
'committer': commit.committer.name,
'committer_date': commit.committer_date,
'committer_timezone': commit.committer_timezone,
'msg': commit.msg,
'merge': commit.merge,
'parents': commit.parents,
'num_lines_added': commit.insertions,
'num_lines_deleted': commit.deletions,
'dmm_unit_complexity': commit.dmm_unit_complexity,
'dmm_unit_interfacing': commit.dmm_unit_interfacing,
'dmm_unit_size': commit.dmm_unit_size,
}
commit_files, commit_methods = get_files(commit)

repo_commits.append(commit_row)
repo_files.extend(commit_files)
repo_methods.extend(commit_methods)

        except Exception as e:
            cf.logger.warning(f'Problem while fetching the commits! {e}')

if repo_commits:
df_repo_commits = pd.DataFrame.from_dict(repo_commits)
df_repo_commits = df_repo_commits[commit_columns] # ordering the columns
else:
df_repo_commits = None

if repo_files:
df_repo_files = pd.DataFrame.from_dict(repo_files)
df_repo_files = df_repo_files[file_columns] # ordering the columns
else:
df_repo_files = None

if repo_methods:
df_repo_methods = pd.DataFrame.from_dict(repo_methods)
        df_repo_methods = df_repo_methods[method_columns]  # ordering the columns
    else:
        df_repo_methods = None

    return df_repo_commits, df_repo_files, df_repo_methods
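

if __name__ == '__main__':
    # Hedged usage sketch: mines the single-hash example mentioned in the
    # comment inside extract_commits; it needs network access, pydriller,
    # and a configured 'configuration' module (cf.logger, cf.NUM_WORKERS).
    df_commits, df_files, df_methods = extract_commits(
        'https://github.com/hedgedoc/hedgedoc',
        ['35b0d39a12aa35f27fba8c1f50b1886706e7efef'])
    if df_commits is not None:
        print(df_commits[['hash', 'author', 'num_lines_added', 'num_lines_deleted']])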