From 34ee1fb814b773b1fa09a71fbc72b6ddf655fbbe Mon Sep 17 00:00:00 2001
From: Balaji Alwar <balajialwar@berkeley.edu>
Date: Wed, 8 May 2024 19:08:29 -0700
Subject: [PATCH] Add a github action to convert qmd files to html and render
 it in Github pages

---
 .github/workflows/qmd_tohtml.yml         |  42 +
 nbgitpuller_processing_visualization.qmd | 957 -----------------------
 2 files changed, 42 insertions(+), 957 deletions(-)
 create mode 100644 .github/workflows/qmd_tohtml.yml
 delete mode 100644 nbgitpuller_processing_visualization.qmd

diff --git a/.github/workflows/qmd_tohtml.yml b/.github/workflows/qmd_tohtml.yml
new file mode 100644
index 0000000..1a5d8b4
--- /dev/null
+++ b/.github/workflows/qmd_tohtml.yml
@@ -0,0 +1,42 @@
+name: Convert qmd to html
+
+on:
+  push:
+
+jobs:
+  convert:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+
+      - name: Install dependencies
+        run: |
+          pip install quarto
+
+      - name: Convert qmd to html
+        run: |
+          mkdir -p dashboard/visualization_output
+          for qmd in $(find . -name "*.qmd"); do
+            quarto render $qmd --to html --output-dir dashboard/visualization_output
+          done
+
+      - name: Publish to GitHub Pages
+        uses: quarto-dev/quarto-actions/publish@v2
+        with:
+          path: dashboard/visualization_output
+          render: false
+
+      - name: Push changes
+        uses: ad-m/github-push-action@master
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          branch: ${{ github.ref }}
diff --git a/nbgitpuller_processing_visualization.qmd b/nbgitpuller_processing_visualization.qmd
deleted file mode 100644
index 0317e5a..0000000
--- a/nbgitpuller_processing_visualization.qmd
+++ /dev/null
@@ -1,957 +0,0 @@
----
-title: All File Types
-jupyter: python3
-format: dashboard
-server: shiny
----
-
-```{r}
-library(shiny)
-```
-
-```{python}
-#!pip install pandas
-#!pip install matplotlib
-#!pip install shiny
-```
-
-```{python}
-#| editable: true
-#| slideshow: {slide_type: ''}
-#| tags: []
-# importing all packages
-import pandas as pd
-import gzip
-import re
-from urllib.parse import urlparse, parse_qsl, unquote
-import os
-import matplotlib.pyplot as plt
-import matplotlib.dates as mdates
-import math
-import numpy as np
-import datetime 
-#%matplotlib inline
-```
-
-```{python}
-#!conda env list
-```
-
-```{python}
-#| editable: true
-#| slideshow: {slide_type: ''}
-#| tags: []
-# opens up the nbgitpuller file - uses gzip to open so need to ensure that file is zipped
-nbgitpuller_filename = 'nbgitpuller-clicks-fall-2023.jsonl.gz'
-nbgitpuller_df = pd.read_json(gzip.open(nbgitpuller_filename), lines = True)
-#nbgitpuller_df = pd.read_json("nbgitpuller-clicks-fall-2023.jsonl")
-nbgitpuller_df.head()
-```
-
-```{python}
-# obtaining substring after GET and before the redirection
-urls_all = nbgitpuller_df.textPayload.apply(lambda x: x[x.find('GET')+3:x.find('->')].strip())
-
-# uses urllib.parse to parse the url into path and query
-urls_parsed_all = urls_all.apply(lambda x: urlparse(x))
-
-# uses the parsed urls to obtain the action from the path
-nbgitpuller_df['actions'] = urls_parsed_all.apply(lambda x: os.path.basename(x.path))
-```
-
-```{python}
-nbgitpuller_df.actions.unique()
-```
-
-## For git-pull
-
-```{python}
-# function to determine the filetypes
-def path_extension_puller(row):
-    """
-    pandas row function; uses apply
-    function to pull out select file extensions and urlpaths
-    """
-    row_dict = dict(row)
-    if 'urlpath' in row_dict:
-        key = 'urlpath'
-    elif 'subPath' in row_dict:
-        key = 'subPath'
-    else: 
-        return 'NaN', 'NaN'
-    
-    # files that the analysis is interested in 
-    file_extension_list = ['ipyn[b]?', 'Rmd', 'pdf', 'txt', 'xml', 'ini', 'csv', 'py', 'R', 'md']
-    if len(row_dict[key].split('.')) > 1:
-        file_extension_split_string = row_dict[key].split('.')[-1]
-        for file_extension in file_extension_list:
-            if (len(re.findall(file_extension, file_extension_split_string)) > 0):
-                return row_dict[key], re.findall(file_extension, file_extension_split_string)[-1]
-        else:
-            return row_dict[key], 'NaN'
-    else:
-            return row_dict[key], 'NaN'
-
-def get_repo(row):
-    """
-    pandas row function; uses apply
-    returns repo url from parsed url
-    """
-    for item in row:
-        key, value = item
-        if 'repo' in key:
-            return unquote(value)
-    return 'NaN'
-
-def repo_parsing(row):
-    """
-    pandas row function; uses apply
-    parses the repo url so that it obtains the user and folder/content user is accessing
-    """
-    if row:
-        if len(row[0].split('/')) > 2:
-            return row[0].split('/')[1]
-        else:
-            return row[0].split('/')[-1]
-    else:
-        return 'NaN'
-```
-
-```{python}
-# makes a new dataframe that only contains git-pull and resets index
-nbgitpuller_df_pull = nbgitpuller_df[nbgitpuller_df.actions == 'git-pull'].reset_index()
-
-# obtains all the log info
-log_info_pull = nbgitpuller_df_pull.textPayload.apply(lambda x: ''.join(re.findall("\[.*\]", x)).replace('[', '').replace(']', '').split(' '))
-
-# retreives the hubs for each textpayload
-hub_source_pull = nbgitpuller_df_pull.resource.apply(lambda x: x['labels']['namespace_name'])
-
-# obtains substring after GET and before the redirection
-urls_pull = nbgitpuller_df_pull.textPayload.apply(lambda x: x[x.find('GET')+3:x.find('->')].strip())
-
-# uses urllib.parse to parse the url into path and query
-urls_parsed_pull = urls_pull.apply(lambda x: urlparse(x))
-
-# uses parsed urls to obtain the action as a quality check
-actions_pull = urls_parsed_pull.apply(lambda x: os.path.basename(x.path))
-
-# breaks apart the parsed query into repo/urlpath
-urls_queries_pull = urls_parsed_pull.apply(lambda x: parse_qsl(x.query))
-
-# getting the file type from urlpath
-path_extension_pull = urls_queries_pull.apply(path_extension_puller)
-
-# gets repo urls from the parsed url
-repos_pull = urls_queries_pull.apply(get_repo)
-
-# extract ones that have github.com in the repo url or else its a null value
-repos_parsed_pull = repos_pull.apply(lambda x: re.findall("github\.com/+(.+)", x) if x else 'NaN')
-
-# obtains the user and git content from github.com repo urls
-git_user_pull = repos_parsed_pull.apply(lambda x: x[0].split('/')[0] if x else 'NaN')
-git_user_repo_pull = repos_parsed_pull.apply(repo_parsing)
-
-# adds it all into a dataframe
-nbgitpuller_textPayload_df_pull = pd.DataFrame({'log_info_type': log_info_pull.apply(lambda x: x[0]),
-                                           'timestamp_date': log_info_pull.apply(lambda x: x[1]),
-                                           'timestamp_time': log_info_pull.apply(lambda x: x[2]),
-                                           'action': actions_pull,
-                                           'git_query': urls_queries_pull,
-                                           'repo': repos_pull,
-                                           'git_user_content': repos_parsed_pull,
-                                           'git_user': git_user_pull,
-                                           'git_content': git_user_repo_pull,
-                                           'git_path': path_extension_pull.apply(lambda x: x[0]),
-                                           'file_extension': path_extension_pull.apply(lambda x: x[1]),
-                                           'hub': hub_source_pull})
-```
-
-```{python}
-nbgitpuller_textPayload_df_pull['git_user_content_path'] = nbgitpuller_textPayload_df_pull.apply(lambda x: ''.join(x['git_user_content']) + '/' + ''.join(x['git_path']), axis = 1)
-```
-
-```{python}
-def course_assigner_regex(row):
-    """
-    pandas row function; uses apply
-    determines which classes and semesters are for each github repo
-    """
-    courses = {'(data8|ds8)': 'data8', '(ds100|data100)': 'data100', '(prob140)': 'data140', #data
-               '(caldataeng|data101|ds101)': 'data101', '(data6|ds6)': 'data6', '(data102|ds102)': 'data102', #data
-               '(data4ac|ds4ac)': 'data4ac', '(data198|ds198)': 'data198',
-               '(cs189|compsci189)': 'compsci189', '(cs170|compsci170)': 'compsci170', #compsci 
-               '(ee16a|eecs16a)': 'eecs16a', '(ee16b|eecs16b)': 'eecs16b', '(eecs127)': 'eecs127',#eecs
-               '(ee120|eleng120)': 'eleng120', #electrical engineering
-               '(physics111b)': 'physics111b', '(physics88)': 'physics88', # physics
-               '(polsci3|ps3|polisci3)': 'polsci3', '(polsci5|ps5)': 'polsci5', '(polsci88|ps88)': 'polsci88', '(ps109|polsci109)': 'polsci109', # polisci
-               '(ce190|civeng90)': 'civgeng190', '(ce93|civeng93)': 'civeng93', '(ce200b|civeng200b)': 'civeng200b', '(ce110|civeng110)': 'civeng110', #civileng
-               '(envecon118|eep118)': 'envecon118', '(eep147|envecon147)': 'envecon147', '(eep153|envecon153)': 'envecon153', #environmental
-               'ph[w]?142': 'pbhlth142', 'ph[w]?251': 'pbhlth251', 'ph[w]?290': 'pbhlth290', 'ph[w]?252': 'pbhlth252', 'ph[w]?253': 'pbhlth253', 'pbhlth250c': 'pbhlth250c',
-               'ph[w]?196': 'pbhlth196', # public health
-               'mcb163l': 'mcellbi163l', 'mcb280': 'mcellbi280', 'mcbc117': 'mcellbic117', 'mcb32': 'mcellbi32', 'mcb288': 'mcellbi288', #molecular cell bio
-               '(bio1b|biology1b)': 'biology1b', # biology
-               'stat88': 'stat88', 'stat157': 'stat157', 'stat159': 'stat159', 'stat131': 'stat131', 'stat135': 'stat135', 'stat20': 'stat20', 
-               'stat150': 'stat150', #stat
-               'math124': 'math124', #math
-               '(demog180)': 'demog180', 'demog[c]?175': 'demog175', #demography
-               '(eps130)': 'eps130', '(eps88)': 'eps88', 'eps256': 'eps256', 'eps24': 'eps24',
-               '(econ140)': 'econ140', '(econ148)': 'econ148', 'econ141': 'econ141', 'econ172': 'econ172', 'econ151': 'econ151', #econ
-               'econ157': 'econ157', 'econ130': 'econ130', 'econ143': 'econ143', 'econ135': 'econ135',
-               '(rbridge)': 'datasci_rbridge', '(midsw241)': 'datasci241', '(midsw203)': 'datasci203', #datasci
-               '(legal123|legalst123)': 'legalst123', '(legalst190|legal190)': 'legalst190', # legal
-               '(es22ac|ethstd22ac)': 'ethstd22ac', '(esc164a|ethstdc164a)': 'ethstdc164a', '(es21ac|ethstd21ac)': 'ethstd21ac',  # ethnic studies
-               'cp201b': 'cyplan201b', '(cityplanning88|cp88)': 'cyplan88', 
-               'ib120': 'integbi120', 'ibc32': 'integbi32', 'ib134l': 'integbi134l',
-               'mse104l': 'matsci104l',
-               'are212': 'aresec212',
-               'educw142': 'educw142',
-               '(cogscic131|psych123)': 'cogscic131', 'psych198': 'psych198',
-               'anth[ro]?115': 'anthro115',
-               'espmc167': 'espmc167', '(ibespm105)': 'espmc105',
-               'ls88': 'ls88',
-               'dighum101': 'dighum101', 'dighum160': 'dighum160',
-               'plantbi135': 'plantbi135',
-               'hist160': 'history160',
-               'soc88': 'sociol88', 'sw282': 'socwel282',
-               'music30': 'music30', 'artw23ac': 'artw23ac'} 
-    # hard coded
-    git_content_user = {'danielabrahamgit120': 'eleng120', 'evalencialopezw142': 'educw142', 'charismasacey[A-Za-z0-9]+cp201': 'cp201a'}
-
-    #strips anything thats not a letter or number
-    git_string_cleaned = re.sub(r'[^a-zA-Z0-9]', '', ''.join(row)).lower()
-    for key in courses:
-        if re.findall(key, git_string_cleaned):
-            return courses[key]
-    for key in git_content_user:
-        if re.findall(key, git_string_cleaned):
-            return git_content_user[key]
-    else:
-        return 'unknown'
-```
-
-```{python}
-# assigns classes/courses to each log
-nbgitpuller_textPayload_df_pull['course'] = nbgitpuller_textPayload_df_pull.git_user_content_path.apply(course_assigner_regex)
-```
-
-```{python}
-def semester_assigner_regex(row):
-    """
-    pandas row function; uses apply
-    returns the semester of the course material if known
-    """
-    semester = [r'fa[ll]*\d{1,4}', r'su[mmer]*\d{1,4}', r'sp[ring]*\d{1,4}', r'\d{1,4}fa[ll]', r'\d{1,4}su[mmer]*', r'\d{1,4}sp[ring]*']
-    sem_match_dict = {'sp': 'spring', 'fa': 'fall', 'su':'summer'}
-
-    git_string_cleaned = re.sub(r'[^a-zA-Z0-9]', '', ''.join(row)).lower()
-
-    year_range = [2018, datetime.datetime.now().year]
-
-    for sem in semester:
-        try:
-            if re.findall(sem, git_string_cleaned):
-                sem_match = re.findall(sem, git_string_cleaned)[-1]
-                sem_match_split = re.split('(\d+)', sem_match)
-                sem_char = re.findall('[a-z]+', sem_match)[-1]
-                sem_year = re.findall('[0-9]+', sem_match)[-1]
-                for key, value in sem_match_dict.items():
-                    if key in sem_char and sem_match_split[-1] == '':
-                        if len(sem_year) < 4:
-                            if year_range[0] <= int(f'20{sem_year[-2:]}') <= year_range[1]:
-                                return f'{value}20{sem_year[-2:]}'
-                            else:
-                                return 
-                        elif len(sem_year) == 4:
-                            if year_range[0] <= int(sem_year) <= year_range[1]:
-                                return f'{value}{sem_year}'
-                            else:
-                                return 'unknown'
-                    elif key in sem_char and sem_match_split[-1] != '':
-                        if year_range[0] <= int(sem_year) <= year_range[1]:
-                            return f'{value}{sem_year}'
-                        else:
-                            return 'unknown'
-        except Exception as e:
-            print(f"Failed findall: {e=} {sem=} {git_string_cleaned=}")
-            continue
-    else:
-        return 'unknown'
-```
-
-```{python}
-#| scrolled: true
-# assigns a semester to each log
-nbgitpuller_textPayload_df_pull['semester'] = nbgitpuller_textPayload_df_pull.git_user_content_path.apply(semester_assigner_regex)
-```
-
-```{python}
-# transforms timestamp into one and converts from UTC to PST
-nbgitpuller_textPayload_df_pull['timestamp_date_time_pst'] = pd.to_datetime(nbgitpuller_textPayload_df_pull.timestamp_date + ' ' + nbgitpuller_textPayload_df_pull.timestamp_time) - pd.Timedelta(8, unit = 'h')
-```
-
-```{python}
-# for ones that have NaN as their filetype, check if git_path contains r_studio
-nbgitpuller_textPayload_df_pull['file_extension'] = nbgitpuller_textPayload_df_pull.apply(lambda x: 'rstudio' if 'rstudio' in x['git_path'] else x['file_extension'], axis = 1)
-```
-
-```{python}
-# determines if the links are github or non-github
-nbgitpuller_textPayload_df_pull['abnormal'] = nbgitpuller_textPayload_df_pull.repo.apply(lambda x: 'N' if 'github.com' in x else 'Y')
-```
-
-```{python}
-nbgitpuller_textPayload_df_pull.head()
-```
-
-```{python}
-# separates abnormal repos 
-nbgitpuller_textPayload_df_pull_abnormal = nbgitpuller_textPayload_df_pull[nbgitpuller_textPayload_df_pull.abnormal == 'Y']
-nbgitpuller_textPayload_df_pull_normal = nbgitpuller_textPayload_df_pull[nbgitpuller_textPayload_df_pull.abnormal == 'N']
-```
-
-## Graph and Visualizations
-
-### Ideas for Visualizations
-1. Clicks by hub
-2. What times people are accessing the hubs
-3. File types across hubs
-4. Semester usage of hubs
-
-### Github Repos
-
-```{python}
-# clicks by hub for normal
-clicks_by_hub = nbgitpuller_textPayload_df_pull_normal.hub.value_counts()
-fig = plt.figure(figsize = (8,4))
-plt.bar(clicks_by_hub.index, clicks_by_hub)
-plt.xlabel('Hub')
-plt.ylabel('Count')
-plt.xticks(rotation = 90)
-plt.title('Clicks by Hub (Github Repos)');
-```
-
-```{python}
-# clicks by hub pie chart
-# only take the top 8 and then add the rest
-clicks_by_hub_top8 = dict(clicks_by_hub[:8])
-fig = plt.figure()
-cm = plt.get_cmap('tab20')
-num_colors = 9
-clicks_by_hub_top8['others'] = clicks_by_hub[8:].sum()
-clicks_by_hub_top8_df = pd.DataFrame(clicks_by_hub_top8.items(), columns = ['hub', 'counts'])
-plt.pie(clicks_by_hub_top8_df.counts, labels = clicks_by_hub_top8_df.hub, autopct='%1.0f%%', textprops={'fontsize': 12, 'color': 'k'}, labeldistance= 1.05, colors= [cm(i) for i in range(num_colors)])
-plt.title('Clicks By Hub (Percentage) - Github Repos', fontsize = 14, color = 'k')
-```
-
-```{python}
-fig.savefig("images/hub_pie.png", transparent = True)
-```
-
-```{python}
-# filetypes; removed NaN counts; github repos
-file_types_count = nbgitpuller_textPayload_df_pull_normal.file_extension[nbgitpuller_textPayload_df_pull_normal.file_extension != 'NaN'].value_counts()
-fig = plt.figure(figsize=(8,4))
-plt.bar(file_types_count.index, file_types_count)
-plt.xticks(rotation=90)
-plt.xlabel('File Type')
-plt.ylabel('Counts')
-plt.title('File Type By Counts (Github Repos)');
-```
-
-```{python}
-# filetypes across hubs
-hubs = nbgitpuller_textPayload_df_pull_normal.hub.unique()
-total_plots = len(hubs)
-total_columns = 4
-total_rows = math.ceil(total_plots/total_columns)
-
-for k in range(total_plots):
-    hub_file_filter_count = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.hub == hubs[k]].file_extension.value_counts().tolist()
-    hub_file_filter_proportion = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.hub == hubs[k]].file_extension.value_counts(normalize = True).mul(100).round(2).tolist()
-    hub_file_filter_key = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.hub == hubs[k]].file_extension.value_counts().keys().tolist()
-    hub_file_filter_df = pd.DataFrame(data = {'file_type': hub_file_filter_key, 'counts': hub_file_filter_count, 'proportions': hub_file_filter_proportion})
-    hub_file_filter_df['count_proportion'] = hub_file_filter_df['counts'].astype(str) + ' (' + hub_file_filter_df['proportions'].astype(str) + ')'
-    hub_file_filter_df.drop(columns = ['counts', 'proportions'], inplace = True)
-    print(hubs[k])
-    print(hub_file_filter_df)
-
-# for ax in axes.flatten():
-#     if not ax.get_visible():
-#         ax.set_axis_off()
-```
-
-```{python}
-# looking at the classes
-course_count = nbgitpuller_textPayload_df_pull_normal.course.value_counts()
-figure = plt.figure(figsize = (10,8))
-plt.bar(course_count.index, course_count)
-plt.xticks(rotation = 90, fontsize = 6)
-plt.xlabel('Course')
-plt.ylabel('Count')
-plt.title('Count of Clicks by Course');
-```
-
-```{python}
-# top 10 of the courses
-course_count.head(10)
-```
-
-```{python}
-fig = plt.figure()
-num_colors = 11
-cm = plt.get_cmap('tab20')
-courses_top10 = dict(course_count[:10])
-courses_top10['others'] = course_count[10:].sum()
-courses_top10_df = pd.DataFrame(courses_top10.items(), columns = ['course', 'counts'])
-plt.pie(courses_top10_df.counts, labels = courses_top10_df.course, autopct='%1.0f%%', textprops={'fontsize': 8}, labeldistance= 1.05, 
-        colors= [cm(i) for i in range(num_colors)])
-plt.title('Counts of Clicks by Course - Github Repos', fontsize = 14);
-```
-
-```{python}
-fig.savefig("images/course_pie.png",bbox_inches='tight', transparent = True)
-```
-
-```{python}
-courses_count_by_hub = nbgitpuller_textPayload_df_pull_normal.groupby('hub').course.value_counts()
-courses_count_by_hub.to_csv('courses_count_by_hub.csv')
-```
-
-```{python}
-# looking at the semester usage 
-sem_count = nbgitpuller_textPayload_df_pull_normal.semester.value_counts()
-figure = plt.figure(figsize = (8,4))
-plt.bar(sem_count.index, sem_count)
-plt.xticks(rotation = 90, fontsize = 6)
-plt.xlabel('Semester')
-plt.ylabel('Count')
-plt.title('Semester Content Count');
-```
-
-```{python}
-# looking at what times people access the hubs
-fig = plt.figure()
-num_colors = 4
-cm = plt.get_cmap('tab10')
-sem_top7 = dict(sem_count[:3])
-sem_top7['others'] = sem_count[3:].sum()
-sem_top7_df = pd.DataFrame(sem_top7.items(), columns = ['semester', 'counts'])
-plt.pie(sem_top7_df.counts, labels = sem_top7_df.semester, autopct='%1.0f%%', textprops={'fontsize': 12}, labeldistance= 1.05, 
-        colors= [cm(i) for i in range(num_colors)])
-plt.title('Semester Content - Github Repos', fontsize = 14);
-```
-
-```{python}
-# looks at date of usage
-date_usage = pd.DataFrame(nbgitpuller_textPayload_df_pull_normal.groupby(by = 'timestamp_date').timestamp_date.count())
-date_usage.rename(columns = {'timestamp_date': 'count'}, inplace = True)
-date_usage = date_usage.reset_index()
-fig,ax = plt.subplots()
-plt.bar(date_usage.timestamp_date, date_usage['count'], color = 'w')
-ax.xaxis.set_major_locator(mdates.MonthLocator())
-ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-ax.spines['bottom'].set_color('w')
-ax.spines['top'].set_color('w')
-ax.spines['left'].set_color('w')
-ax.spines['right'].set_color('w')
-ax.tick_params(axis ='x', colors = 'w')
-ax.tick_params(axis ='y', colors = 'w')
-plt.title('Usage by Date', color = 'w', fontsize = 12)
-plt.xlabel('Date', color = 'w', fontsize = 12)
-plt.ylabel('Frequency', color = 'w', fontsize = 12);
-```
-
-```{python}
-fig.savefig("images/usage_bar.png",bbox_inches='tight', transparent = True)
-```
-
-```{python}
-date_usage['timestamp_date'] = date_usage.timestamp_date.astype('datetime64[ns]')
-date_usage['month'] = date_usage['timestamp_date'].apply(lambda x: x.month)
-date_usage['day'] = date_usage['timestamp_date'].apply(lambda x: x.day)
-unique_months = date_usage['month'].unique()
-month_list = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
-
-total_plots = len(unique_months)
-total_columns = 2
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(10, 10))
-
-for k in range(total_plots):
-    month = date_usage[date_usage['month'] == unique_months[k]].reset_index()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(month['day'], month['count'])
-
-    ax.set_title(f'{month_list[unique_months[k]]} Usage Frequency')
-    ax.set_xlabel('Date')
-    ax.set_ylabel('Frequency')
-
-#axes[2,1].set_axis_off()
-plt.tight_layout()
-```
-
-```{python}
-nbgitpuller_textPayload_df_pull_normal.head()
-```
-
-```{python}
-# looks at date of usage
-nbgitpuller_textPayload_df_pull_normal['timestamp_time'] = pd.to_datetime(nbgitpuller_textPayload_df_pull_normal['timestamp_time'], format = '%H:%M:%S.%f')
-nbgitpuller_textPayload_df_pull_normal['timestamp_time_hour'] = nbgitpuller_textPayload_df_pull_normal['timestamp_time'].apply(lambda x: x.hour)
-usage_time = nbgitpuller_textPayload_df_pull_normal.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count()
-usage_time = pd.DataFrame(usage_time)
-usage_time.rename(columns = {'timestamp_time_hour': 'count'}, inplace = True)
-usage_time = usage_time.reset_index()
-fig,ax = plt.subplots()
-plt.bar(usage_time.timestamp_time_hour, usage_time['count'], color = '#003262')
-ax.spines['bottom'].set_color('k')
-ax.spines['top'].set_color('k')
-ax.spines['left'].set_color('k')
-ax.spines['right'].set_color('k')
-ax.tick_params(axis ='x', colors = 'k')
-ax.tick_params(axis ='y', colors = 'k')
-plt.title('Usage by Time (UTC)', color = 'k')
-plt.xlabel('Hour', color = 'k')
-plt.ylabel('Frequency', color = 'k');
-```
-
-```{python}
-fig.savefig("images/time_usage_bar.png",bbox_inches='tight', transparent = True)
-```
-
-##### Making Graphs of Date of Usage Per Course
-
-```{python}
-# look at usage by courses 1-15
-unique_courses = nbgitpuller_textPayload_df_pull_normal.course.unique()
-
-# length of unique_courses is 93, plotting 15 plots at a time
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k]].reset_index()
-    courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_date.index, courses_date)
-
-    ax.set_title(f'{unique_courses[k]} Usage Frequency')
-    ax.set_xlabel('Date')
-    ax.set_ylabel('Frequency')
-    ax.xaxis.set_major_locator(mdates.MonthLocator())
-    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-plt.tight_layout()
-```
-
-```{python}
-# look at plots 16-30
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+15]].reset_index()
-    courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_date.index, courses_date)
-
-    ax.set_title(f'{unique_courses[k+15]} Usage Frequency')
-    ax.set_xlabel('Date')
-    ax.set_ylabel('Frequency')
-    ax.xaxis.set_major_locator(mdates.MonthLocator())
-    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-# axes[2,1].set_axis_off()
-plt.tight_layout()
-```
-
-```{python}
-# look at plots 31-45
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+30]].reset_index()
-    courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_date.index, courses_date)
-
-    ax.set_title(f'{unique_courses[k+30]} Usage Frequency')
-    ax.set_xlabel('Date')
-    ax.set_ylabel('Frequency')
-    ax.xaxis.set_major_locator(mdates.MonthLocator())
-    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-# axes[2,1].set_axis_off()
-plt.tight_layout()
-```
-
-```{python}
-#| editable: true
-#| slideshow: {slide_type: ''}
-#| tags: []
-# look at plots 45-60
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+45]].reset_index()
-    courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_date.index, courses_date)
-
-    ax.set_title(f'{unique_courses[k+45]} Usage Frequency')
-    ax.set_xlabel('Date')
-    ax.set_ylabel('Frequency')
-    ax.xaxis.set_major_locator(mdates.MonthLocator())
-    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-# axes[2,1].set_axis_off()
-plt.tight_layout()
-```
-
-# look at plots 61-75
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+60]].reset_index()
-    courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_date.index, courses_date)
-
-    ax.set_title(f'{unique_courses[k+60]} Usage Frequency')
-    ax.set_xlabel('Date')
-    ax.set_ylabel('Frequency')
-    ax.xaxis.set_major_locator(mdates.MonthLocator())
-    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-# axes[2,1].set_axis_off()
-plt.tight_layout()
-
-# look at plots 76-90
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+75]].reset_index()
-    courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_date.index, courses_date)
-
-    ax.set_title(f'{unique_courses[k+75]} Usage Frequency')
-    ax.set_xlabel('Date')
-    ax.set_ylabel('Frequency')
-    ax.xaxis.set_major_locator(mdates.MonthLocator())
-    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-# axes[2,1].set_axis_off()
-plt.tight_layout()
-
-# look at plots 91-93
-total_plots = 3
-total_columns = 3
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(10, 3))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+90]].reset_index()
-    courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_date.index, courses_date)
-
-    ax.set_title(f'{unique_courses[k+90]} Usage Frequency')
-    ax.set_xlabel('Date')
-    ax.set_ylabel('Frequency')
-    ax.xaxis.set_major_locator(mdates.MonthLocator())
-    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-# axes[2,1].set_axis_off()
-plt.tight_layout()
-
-##### Making Graphs of Time of Usage Per Course
-
-# look at usage time by courses 1-15
-unique_courses = nbgitpuller_textPayload_df_pull_normal.course.unique()
-
-
-# length of unique_courses is 93, plotting 15 plots at a time
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15), sharex = False)
-plt.setp(axes, xlim=(-1,24))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k]].reset_index()
-    course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f')
-    course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour)
-    courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_time.index, courses_time)
-
-    ax.set_title(f'{unique_courses[k]} Time Usage Frequency')
-    ax.set_xlabel('Time')
-    ax.set_ylabel('Frequency')
-    # ax.xaxis.set_major_locator(mdates.MonthLocator())
-    # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-plt.tight_layout()
-
-# look at usage time by course 16-30
-# length of unique_courses is 93, plotting 15 plots at a time
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15))
-plt.setp(axes, xlim=(-1,24))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+15]].reset_index()
-    course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f')
-    course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour)
-    courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_time.index, courses_time)
-
-    ax.set_title(f'{unique_courses[k+15]} Time Usage Frequency')
-    ax.set_xlabel('Time')
-    ax.set_ylabel('Frequency')
-    # ax.xaxis.set_major_locator(mdates.MonthLocator())
-    # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-plt.tight_layout()
-
-# look at usage time by course 31 - 45
-# length of unique_courses is 93, plotting 15 plots at a time
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15))
-plt.setp(axes, xlim=(-1,24))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+30]].reset_index()
-    course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f')
-    course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour)
-    courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_time.index, courses_time)
-
-    ax.set_title(f'{unique_courses[k+30]} Time Usage Frequency')
-    ax.set_xlabel('Time')
-    ax.set_ylabel('Frequency')
-    # ax.xaxis.set_major_locator(mdates.MonthLocator())
-    # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-plt.tight_layout()
-
-# look at usage time by course 46 - 60
-# length of unique_courses is 93, plotting 15 plots at a time
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15))
-plt.setp(axes, xlim=(-1,24))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+45]].reset_index()
-    course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f')
-    course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour)
-    courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_time.index, courses_time)
-
-    ax.set_title(f'{unique_courses[k+45]} Time Usage Frequency')
-    ax.set_xlabel('Time')
-    ax.set_ylabel('Frequency')
-    # ax.xaxis.set_major_locator(mdates.MonthLocator())
-    # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-plt.tight_layout()
-
-# look at usage time by course 61 - 75
-# length of unique_courses is 93, plotting 15 plots at a time
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15))
-plt.setp(axes, xlim=(-1,24))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+60]].reset_index()
-    course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f')
-    course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour)
-    courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_time.index, courses_time)
-
-    ax.set_title(f'{unique_courses[k+60]} Time Usage Frequency')
-    ax.set_xlabel('Time')
-    ax.set_ylabel('Frequency')
-    # ax.xaxis.set_major_locator(mdates.MonthLocator())
-    # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-plt.tight_layout()
-
-# look at usage time by course 76 - 90
-# length of unique_courses is 93, plotting 15 plots at a time
-total_plots = 15
-total_columns = 5
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15))
-plt.setp(axes, xlim=(-1,24))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+75]].reset_index()
-    course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f')
-    course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour)
-    courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_time.index, courses_time)
-
-    ax.set_title(f'{unique_courses[k+75]} Time Usage Frequency')
-    ax.set_xlabel('Time')
-    ax.set_ylabel('Frequency')
-    # ax.xaxis.set_major_locator(mdates.MonthLocator())
-    # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-plt.tight_layout()
-
-# look at usage time by course 91-83
-# length of unique_courses is 93, plotting 15 plots at a time
-total_plots = 3
-total_columns = 3
-total_rows = int(np.ceil(total_plots/total_columns))
-
-fig, axes = plt.subplots(total_rows, total_columns, figsize=(10, 5))
-plt.setp(axes, xlim=(-1,24))
-
-for k in range(total_plots):
-    course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+90]].reset_index()
-    course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f')
-    course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour)
-    courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count()
-    ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases
-    ax.bar(courses_time.index, courses_time)
-
-    ax.set_title(f'{unique_courses[k+90]} Time Usage Frequency')
-    ax.set_xlabel('Time')
-    ax.set_ylabel('Frequency')
-    # ax.xaxis.set_major_locator(mdates.MonthLocator())
-    # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
-
-plt.tight_layout()
-
-### Non-Github Repos
-
-```{python}
-#| editable: true
-#| slideshow: {slide_type: ''}
-#| tags: []
-# clicks by hub for abnormal
-clicks_by_hub = nbgitpuller_textPayload_df_pull_abnormal.hub.value_counts()
-fig = plt.figure(figsize = (8,4))
-plt.bar(clicks_by_hub.index, clicks_by_hub)
-plt.xlabel('Hub')
-plt.ylabel('Count')
-plt.xticks(rotation = 90)
-plt.title('Clicks By Hub (Non-Github Repos)');
-```
-
-```{python}
-len(nbgitpuller_textPayload_df_pull.hub.unique())
-```
-
-```{python}
-clicks_by_hub_top6 = dict(clicks_by_hub[:6])
-clicks_by_hub_top6['others'] = clicks_by_hub[6:].sum()
-clicks_by_hub_top6_df = pd.DataFrame(clicks_by_hub_top6.items(), columns = ['hub', 'counts'])
-plt.pie(clicks_by_hub_top6_df.counts, labels = clicks_by_hub_top6_df.hub, autopct='%1.0f%%', textprops={'fontsize': 9}, labeldistance= 1.05)
-plt.title('Clicks By Hub (Percentage) - Non-Github Repos');
-```
-
-```{python}
-file_types_count = nbgitpuller_textPayload_df_pull_abnormal.file_extension[nbgitpuller_textPayload_df_pull_abnormal.file_extension != 'NaN'].value_counts()
-fig = plt.figure(figsize=(8,4))
-plt.bar(file_types_count.index, file_types_count)
-plt.xticks(rotation=90)
-plt.xlabel('File Type')
-plt.ylabel('Counts')
-plt.title('File Type By Counts (Non-Github Repos)');
-```
-
-```{python}
-# filetypes across hubs
-hubs = nbgitpuller_textPayload_df_pull_abnormal.hub.unique()
-total_plots = len(hubs)
-total_columns = 4
-total_rows = math.ceil(total_plots/total_columns)
-
-for k in range(total_plots):
-    hub_file_filter_count = nbgitpuller_textPayload_df_pull_abnormal[nbgitpuller_textPayload_df_pull_abnormal.hub == hubs[k]].file_extension.value_counts().tolist()
-    hub_file_filter_proportion = nbgitpuller_textPayload_df_pull_abnormal[nbgitpuller_textPayload_df_pull_abnormal.hub == hubs[k]].file_extension.value_counts(normalize = True).mul(100).round(2).tolist()
-    hub_file_filter_key = nbgitpuller_textPayload_df_pull_abnormal[nbgitpuller_textPayload_df_pull_abnormal.hub == hubs[k]].file_extension.value_counts().keys().tolist()
-    hub_file_filter_df = pd.DataFrame(data = {'file_type': hub_file_filter_key, 'counts': hub_file_filter_count, 'proportions': hub_file_filter_proportion})
-    hub_file_filter_df['count_proportion'] = hub_file_filter_df['counts'].astype(str) + ' (' + hub_file_filter_df['proportions'].astype(str) + ')'
-    hub_file_filter_df.drop(columns = ['counts', 'proportions'], inplace = True)
-    print(hubs[k])
-    print(hub_file_filter_df)
-
-# for ax in axes.flatten():
-#     if not ax.get_visible():
-#         ax.set_axis_off()
-```
-
-```{python}
-# looking at the classes
-course_count = nbgitpuller_textPayload_df_pull_abnormal.course.value_counts()
-figure = plt.figure(figsize = (8,4))
-plt.bar(course_count.index, course_count)
-plt.xticks(rotation = 90, fontsize = 6)
-plt.xlabel('Course')
-plt.ylabel('Count')
-plt.title('Count of Clicks by Course');
-```
-
-```{python}
-#| editable: true
-#| slideshow: {slide_type: ''}
-#| tags: []
-# looking at the semester usage 
-sem_count = nbgitpuller_textPayload_df_pull_abnormal.semester.value_counts()
-figure = plt.figure(figsize = (8,4))
-plt.bar(sem_count.index, sem_count)
-plt.xticks(rotation = 90, fontsize = 6)
-plt.xlabel('Semester')
-plt.ylabel('Count')
-plt.title('Semester Content Count');
-```
-