Merge pull request #39 from KnowledgeCaptureAndDiscovery/cli
CLI Branch PR
dgarijo authored Feb 21, 2020
2 parents 6968d28 + a491bbd commit 2e71c78
Showing 7 changed files with 1,423 additions and 0 deletions.
41 changes: 41 additions & 0 deletions README.md
@@ -1,2 +1,43 @@
# SM2KG
Software Metadata 2 Knowledge Graphs: a tool for automatically extracting relevant information from README files.

Installation Instructions -

`pip3 install -r requirements.txt`

Create a config.json file using the sample file in the repository.
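The sample config.json shipped with this repository has the following shape (replace PersonalAccessToken with your own GitHub personal access token; the other entries point at the pickled classifier models):

```json
{
    "Authorization" : "token PersonalAccessToken",
    "description" : "./models/description.sk",
    "citation" : "./models/citation.sk",
    "installation" : "./models/installation.sk",
    "invocation" : "./models/invocation.sk"
}
```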

Command Line Interface -

createJSON.py generates a JSON object after extracting useful information from a GitHub repository. It classifies each excerpt of the README file into one of four categories - description, invocation, installation, citation - according to the highest classifier confidence above a given threshold.

The createJSON.py file takes as input the following parameters:

-r / --repo_url: Link to the GitHub repository to extract information from

-m / --model_path: Path to the pickled models for extraction

-o / --output: Output file name

-t / --threshold: Threshold to classify the content of the readme file

-d / --doc_src: Path of documentation file


cli.py works like createJSON.py: it generates a JSON object after extracting useful information from a GitHub repository, classifying each excerpt of the README file into four categories - description, invocation, installation, citation - and keeping every excerpt whose classifier confidence is above a given threshold. Unlike createJSON.py, it reads the model paths from config.json instead of a command-line flag.

The cli.py file takes as input the following parameters:

-r / --repo_url: Link to the GitHub repository to extract information from

-o / --output: Output file name

-t / --threshold: Threshold to classify the content of the readme file

-d / --doc_src: Path of documentation file

Example:

`python3 createJSON.py -r https://github.com/{owner}/{repository_name} -m ./models/ -o output.json -t 0.5`

`python3 cli.py -r https://github.com/{owner}/{repository_name} -o output.json -t 0.5`
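The resulting output.json combines the repository metadata fetched from the GitHub API with the classified excerpts per category. A trimmed sketch of its shape (field values here are illustrative, not verbatim output):

```json
{
    "name": "repository_name",
    "owner": "owner",
    "license": {"name": "MIT License", "url": "https://api.github.com/licenses/mit"},
    "topics": ["knowledge-graphs"],
    "languages": ["Python"],
    "readme_url": "https://github.com/owner/repository_name/blob/master/README.md",
    "installation": [
        {"excerpt": "pip3 install -r requirements.txt \n", "confidence": [0.92]}
    ],
    "citation": [
        {"excerpt": "@article{...}"}
    ]
}
```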
228 changes: 228 additions & 0 deletions cli.py
@@ -0,0 +1,228 @@
# cli.py
# parameters:
## input file: either: url to github repository OR markdown documentation file path
## output file: json with each excerpt marked with all four classification scores

import argparse
import json
import base64
from urllib.parse import urlparse
import sys
from os import path
import requests
from markdown import Markdown
from io import StringIO
import pickle
import re

## Markdown to plain text conversion: begin ##
# code snippet from https://stackoverflow.com/a/54923798
def unmark_element(element, stream=None):
if stream is None:
stream = StringIO()
if element.text:
stream.write(element.text)
for sub in element:
unmark_element(sub, stream)
if element.tail:
stream.write(element.tail)
return stream.getvalue()

# patching Markdown
Markdown.output_formats["plain"] = unmark_element
__md = Markdown(output_format="plain")
__md.stripTopLevelTags = False

def unmark(text):
return __md.convert(text)
## Markdown to plain text conversion: end ##
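# Illustrative behavior of the converter above (an assumed example, not part of the
# original file): unmark("Some *emphasised* text") -> "Some emphasised text"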

def restricted_float(x):
x = float(x)
if x < 0.0 or x > 1.0:
raise argparse.ArgumentTypeError(f"{x} not in range [0.0, 1.0]")
return x
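# For example, restricted_float("0.5") returns 0.5, while restricted_float("1.5")
# raises argparse.ArgumentTypeError, so out-of-range thresholds are rejected at parse time.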

categories = ['description','citation','installation','invocation']
keep_keys = ('description', 'name', 'owner', 'license', 'languages_url', 'forks_url')


## Function uses the repository_url provided to load required information from github.
## Information kept from the repository is written in keep_keys.
## Returns the readme text and required metadata
def load_repository_metadata(repository_url):
print("Loading Repository Information....")
## load general response of the repository
url = urlparse(repository_url)
if url.netloc != 'github.com':
sys.exit("Error: repository must come from github")
_, owner, repo_name = url.path.split('/')
general_resp = requests.get(f"https://api.github.com/repos/{owner}/{repo_name}", headers=header).json()

if 'message' in general_resp.keys() and general_resp['message']=="Not Found":
sys.exit("Error: repository name is incorrect")

## Remove extraneous data
filtered_resp = {k: general_resp[k] for k in keep_keys}

## Condense owner information
if filtered_resp['owner'] and 'login' in filtered_resp['owner'].keys():
filtered_resp['owner'] = filtered_resp['owner']['login']

## condense license information
license_info = {}
for k in ('name', 'url'):
if filtered_resp['license'] and k in filtered_resp['license'].keys():
license_info[k] = filtered_resp['license'][k]
filtered_resp['license'] = license_info

# get keywords / topics
    ## keep the auth header and add the topics preview media type
    topics_headers = {}
    topics_headers.update(header)
    topics_headers['accept'] = 'application/vnd.github.mercy-preview+json'
topics_resp = requests.get('https://api.github.com/repos/' + owner + "/" + repo_name + '/topics', headers=topics_headers).json()
if topics_resp and 'names' in topics_resp.keys():
filtered_resp['topics'] = topics_resp['names']

## get languages
filtered_resp['languages'] = list(requests.get(filtered_resp['languages_url']).json().keys())
del filtered_resp['languages_url']

## get default README
readme_info = requests.get('https://api.github.com/repos/' + owner + "/" + repo_name + '/readme', headers=topics_headers).json()
readme = base64.b64decode(readme_info['content']).decode("utf-8")
text = unmark(readme)
filtered_resp['readme_url'] = readme_info['html_url']

## get releases
releases_list = requests.get('https://api.github.com/repos/' + owner + "/" + repo_name + '/releases', headers=header).json()
    releases_list = [{'tag_name': release['tag_name'], 'name': release['name'],
                      'author_name': release['author']['login'], 'body': release['body'],
                      'tarball_url': release['tarball_url'], 'zipball_url': release['zipball_url'],
                      'html_url': release['html_url'], 'url': release['url']}
                     for release in releases_list]
    filtered_resp['releases'] = releases_list

print("Repository Information Successfully Loaded.")
return text, filtered_resp
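## Illustrative shape of the metadata returned above (example values, not real output):
##   {'name': '...', 'owner': 'login', 'description': '...', 'license': {'name': '...', 'url': '...'},
##    'topics': [...], 'languages': ['Python'], 'readme_url': '...', 'releases': [...]}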

## Function takes readme text as input and divides it into excerpts
## Returns the extracted excerpts
def create_excerpts(text):
divisions = text.splitlines()
divisions = [i for i in divisions if i]
return divisions
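# e.g. create_excerpts("Title\n\nFirst line.") returns ['Title', 'First line.']
# (blank lines are dropped, so each remaining line becomes one excerpt).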

## Function takes readme text as input and runs the provided classifiers on it
## Returns the dictionary containing scores for each excerpt.
def run_classifiers(text):
score_dict={}
for category in categories:
excerpts = create_excerpts(text)
file_name = file_paths[category]
        if file_name == "":
            print("No model configured for the category", category, "- skipping.")
            continue
if not path.exists(file_name):
sys.exit("Error: File/Directory does not exist")
print("Classifying excerpts for the catgory",category)
classifier = pickle.load(open(file_name, 'rb'))
scores = classifier.predict_proba(excerpts)
score_dict[category]={'excerpt': excerpts, 'confidence': scores[:,1]}
print("Excerpt Classification Successful for the Category",category)
return score_dict

## Function takes scores dictionary and a threshold as input
## Returns predictions containing excerpts with a confidence above the given threshold.
def classify(scores, threshold):
print("Checking Thresholds for Excerpt Classification.")
predictions = {}
for ele in scores.keys():
print("Running for",ele)
flag = False
predictions[ele] = []
excerpt=""
confid=[]
        for i in range(len(scores[ele]['confidence'])):
            if scores[ele]['confidence'][i] >= threshold:
                ## consecutive excerpts above the threshold are merged into one prediction
                excerpt = excerpt + scores[ele]['excerpt'][i] + ' \n'
                confid.append(scores[ele]['confidence'][i])
                flag = True
            elif flag:
                predictions[ele].append({'excerpt': excerpt, 'confidence': confid})
                excerpt = ""
                confid = []
                flag = False
        ## flush a group that is still open when the excerpts run out
        if flag:
            predictions[ele].append({'excerpt': excerpt, 'confidence': confid})
        print("Run completed.")
print("All Excerpts below the given Threshold Removed.")
return predictions

## Function takes readme text as input and runs a regex parser on it
## Returns a list of bibtex citations
def extract_bibtex(readme_text):
print("Extracting bibtex citation from readme")
    regex = r'\@[a-zA-Z]+\{[\S\s]+?(?:author|title)[\S\s]+?(?:author|title)[\S\s]+?\n\}'
excerpts = readme_text
citations = re.findall(regex,excerpts)
print("Extracting bibtex citation from readme completed.")
    print("Found", len(citations), "bibtex citations.")
return citations
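## An entry the regex above is meant to match looks like this, with the closing
## brace at the start of a line (illustrative example, not from a real readme):
##   @article{examplekey,
##     author = {Doe, Jane},
##     title  = {An Example Title},
##     year   = {2020}
##   }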

## Function takes metadata, readme text predictions, bibtex citations and path to the output file
## Performs some combinations and saves the final json Object in the file
def save_json(git_data, repo_data, citations, outfile):

for i in git_data.keys():
if i == 'description':
if 'description' not in repo_data.keys():
repo_data['description'] = []
repo_data['description'].append(git_data[i])
else:
repo_data[i] = git_data[i]

for i in range(len(citations)):
if 'citation' not in repo_data.keys():
repo_data['citation'] = []
repo_data['citation'].append({'excerpt': citations[i]})

print("Saving json data to",outfile)
with open(outfile, 'w') as output:
json.dump(repo_data, output)

header = {}
with open('config.json') as fh:
file_paths = json.load(fh)
header['Authorization'] = file_paths['Authorization']
header['accept'] = 'application/vnd.github.v3+json'

argparser = argparse.ArgumentParser(description="Fetch Github README, split paragraphs, run classifiers and output json containing repository information, classified excerpts and confidence.")
src = argparser.add_mutually_exclusive_group(required=True)
src.add_argument('-r', '--repo_url', help="URL of the Github repository")
src.add_argument('-d', '--doc_src', help='path to documentation file')
argparser.add_argument('-o', '--output', help="path for output json", required=True)
argparser.add_argument('-t','--threshold', help="threshold score", type=restricted_float, default=0.5)
argv = argparser.parse_args()

github_data = {}
if (argv.repo_url):
text, github_data = load_repository_metadata(argv.repo_url)
elif (argv.doc_src):
# Documentation from already downloaded Markdown file.
with open(argv.doc_src, 'r') as doc_fh:
text = unmark(doc_fh.read())

score_dict = run_classifiers(text)

predictions = classify(score_dict, argv.threshold)

citations = extract_bibtex(text)

save_json(github_data, predictions, citations, argv.output)
7 changes: 7 additions & 0 deletions config.json
@@ -0,0 +1,7 @@
{
"Authorization" : "token PersonalAccessToken",
"description" : "./models/description.sk",
"citation" : "./models/citation.sk",
"installation" : "./models/installation.sk",
"invocation" : "./models/invocation.sk"
}