From 34ee1fb814b773b1fa09a71fbc72b6ddf655fbbe Mon Sep 17 00:00:00 2001
From: Balaji Alwar
Date: Wed, 8 May 2024 19:08:29 -0700
Subject: [PATCH] Add a github action to convert qmd files to html and render it in Github pages

---
 .github/workflows/qmd_tohtml.yml         |  57 +
 nbgitpuller_processing_visualization.qmd | 957 -----------------------
 2 files changed, 57 insertions(+), 957 deletions(-)
 create mode 100644 .github/workflows/qmd_tohtml.yml
 delete mode 100644 nbgitpuller_processing_visualization.qmd

diff --git a/.github/workflows/qmd_tohtml.yml b/.github/workflows/qmd_tohtml.yml
new file mode 100644
index 0000000..1a5d8b4
--- /dev/null
+++ b/.github/workflows/qmd_tohtml.yml
@@ -0,0 +1,57 @@
+name: Convert qmd to html
+
+on:
+  push:
+
+jobs:
+  convert:
+    runs-on: ubuntu-latest
+
+    # The publish step pushes the rendered site to the gh-pages branch.
+    permissions:
+      contents: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      # Installs the Quarto CLI used by the render and publish steps; the
+      # PyPI `quarto` package is only a Python wrapper around that CLI.
+      - name: Setup Quarto
+        uses: quarto-dev/quarto-actions/setup@v2
+
+      - name: Install dependencies
+        run: |
+          pip install quarto jupyter
+
+      - name: Convert qmd to html
+        run: |
+          mkdir -p dashboard/visualization_output
+          # NUL-delimited so paths containing spaces are not word-split.
+          find . -name "*.qmd" -print0 | while IFS= read -r -d '' qmd; do
+            quarto render "$qmd" --to html --output-dir dashboard/visualization_output
+          done
+
+      - name: Publish to GitHub Pages
+        uses: quarto-dev/quarto-actions/publish@v2
+        with:
+          target: gh-pages
+          path: dashboard/visualization_output
+          render: false
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      # NOTE(review): no earlier step creates a commit, so this push looks
+      # like a no-op - confirm whether it is still needed.
+      - name: Push changes
+        uses: ad-m/github-push-action@master
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          branch: ${{ github.ref }}
diff --git a/nbgitpuller_processing_visualization.qmd b/nbgitpuller_processing_visualization.qmd
deleted file mode 100644
index 0317e5a..0000000
--- a/nbgitpuller_processing_visualization.qmd
+++ /dev/null
@@ -1,957 +0,0 @@
----
-title: All File Types
-jupyter: python3
-format: dashboard
-server: shiny
----
-
-```{r}
-library(shiny)
-```
-
-```{python}
-#!pip install pandas
-#!pip install matplotlib
-#!pip install shiny
-```
-
-```{python}
-#| editable: true
-#| slideshow: {slide_type: ''}
-#| tags: []
-# importing all packages
-import pandas as pd
-import gzip
-import re
-from urllib.parse import urlparse, parse_qsl, unquote
-import os
-import matplotlib.pyplot as plt
-import matplotlib.dates as mdates
-import math
-import numpy as np
-import datetime
-#%matplotlib inline
-```
-
-```{python}
-#!conda env list
-```
-
-```{python}
-#| editable: true
-#| slideshow: {slide_type: ''}
-#| tags: []
-# opens up the nbgitpuller file - uses gzip to open so need to ensure that file is zipped
-nbgitpuller_filename = 'nbgitpuller-clicks-fall-2023.jsonl.gz'
-nbgitpuller_df = pd.read_json(gzip.open(nbgitpuller_filename), lines = True)
-#nbgitpuller_df = pd.read_json("nbgitpuller-clicks-fall-2023.jsonl")
-nbgitpuller_df.head()
-```
-
-```{python}
-# obtaining substring after GET and before the redirection
-urls_all = nbgitpuller_df.textPayload.apply(lambda x: x[x.find('GET')+3:x.find('->')].strip())
-
-# uses urllib.parse to parse the url into path and query
-urls_parsed_all = urls_all.apply(lambda x: urlparse(x))
-
-# uses the parsed urls to obtain the action from the path
-nbgitpuller_df['actions'] = urls_parsed_all.apply(lambda x: os.path.basename(x.path)) -``` - -```{python} -nbgitpuller_df.actions.unique() -``` - -## For git-pull - -```{python} -# function to determine the filetypes -def path_extension_puller(row): - """ - pandas row function; uses apply - function to pull out select file extensions and urlpaths - """ - row_dict = dict(row) - if 'urlpath' in row_dict: - key = 'urlpath' - elif 'subPath' in row_dict: - key = 'subPath' - else: - return 'NaN', 'NaN' - - # files that the analysis is interested in - file_extension_list = ['ipyn[b]?', 'Rmd', 'pdf', 'txt', 'xml', 'ini', 'csv', 'py', 'R', 'md'] - if len(row_dict[key].split('.')) > 1: - file_extension_split_string = row_dict[key].split('.')[-1] - for file_extension in file_extension_list: - if (len(re.findall(file_extension, file_extension_split_string)) > 0): - return row_dict[key], re.findall(file_extension, file_extension_split_string)[-1] - else: - return row_dict[key], 'NaN' - else: - return row_dict[key], 'NaN' - -def get_repo(row): - """ - pandas row function; uses apply - returns repo url from parsed url - """ - for item in row: - key, value = item - if 'repo' in key: - return unquote(value) - return 'NaN' - -def repo_parsing(row): - """ - pandas row function; uses apply - parses the repo url so that it obtains the user and folder/content user is accessing - """ - if row: - if len(row[0].split('/')) > 2: - return row[0].split('/')[1] - else: - return row[0].split('/')[-1] - else: - return 'NaN' -``` - -```{python} -# makes a new dataframe that only contains git-pull and resets index -nbgitpuller_df_pull = nbgitpuller_df[nbgitpuller_df.actions == 'git-pull'].reset_index() - -# obtains all the log info -log_info_pull = nbgitpuller_df_pull.textPayload.apply(lambda x: ''.join(re.findall("\[.*\]", x)).replace('[', '').replace(']', '').split(' ')) - -# retreives the hubs for each textpayload -hub_source_pull = nbgitpuller_df_pull.resource.apply(lambda x: 
x['labels']['namespace_name']) - -# obtains substring after GET and before the redirection -urls_pull = nbgitpuller_df_pull.textPayload.apply(lambda x: x[x.find('GET')+3:x.find('->')].strip()) - -# uses urllib.parse to parse the url into path and query -urls_parsed_pull = urls_pull.apply(lambda x: urlparse(x)) - -# uses parsed urls to obtain the action as a quality check -actions_pull = urls_parsed_pull.apply(lambda x: os.path.basename(x.path)) - -# breaks apart the parsed query into repo/urlpath -urls_queries_pull = urls_parsed_pull.apply(lambda x: parse_qsl(x.query)) - -# getting the file type from urlpath -path_extension_pull = urls_queries_pull.apply(path_extension_puller) - -# gets repo urls from the parsed url -repos_pull = urls_queries_pull.apply(get_repo) - -# extract ones that have github.com in the repo url or else its a null value -repos_parsed_pull = repos_pull.apply(lambda x: re.findall("github\.com/+(.+)", x) if x else 'NaN') - -# obtains the user and git content from github.com repo urls -git_user_pull = repos_parsed_pull.apply(lambda x: x[0].split('/')[0] if x else 'NaN') -git_user_repo_pull = repos_parsed_pull.apply(repo_parsing) - -# adds it all into a dataframe -nbgitpuller_textPayload_df_pull = pd.DataFrame({'log_info_type': log_info_pull.apply(lambda x: x[0]), - 'timestamp_date': log_info_pull.apply(lambda x: x[1]), - 'timestamp_time': log_info_pull.apply(lambda x: x[2]), - 'action': actions_pull, - 'git_query': urls_queries_pull, - 'repo': repos_pull, - 'git_user_content': repos_parsed_pull, - 'git_user': git_user_pull, - 'git_content': git_user_repo_pull, - 'git_path': path_extension_pull.apply(lambda x: x[0]), - 'file_extension': path_extension_pull.apply(lambda x: x[1]), - 'hub': hub_source_pull}) -``` - -```{python} -nbgitpuller_textPayload_df_pull['git_user_content_path'] = nbgitpuller_textPayload_df_pull.apply(lambda x: ''.join(x['git_user_content']) + '/' + ''.join(x['git_path']), axis = 1) -``` - -```{python} -def 
course_assigner_regex(row): - """ - pandas row function; uses apply - determines which classes and semesters are for each github repo - """ - courses = {'(data8|ds8)': 'data8', '(ds100|data100)': 'data100', '(prob140)': 'data140', #data - '(caldataeng|data101|ds101)': 'data101', '(data6|ds6)': 'data6', '(data102|ds102)': 'data102', #data - '(data4ac|ds4ac)': 'data4ac', '(data198|ds198)': 'data198', - '(cs189|compsci189)': 'compsci189', '(cs170|compsci170)': 'compsci170', #compsci - '(ee16a|eecs16a)': 'eecs16a', '(ee16b|eecs16b)': 'eecs16b', '(eecs127)': 'eecs127',#eecs - '(ee120|eleng120)': 'eleng120', #electrical engineering - '(physics111b)': 'physics111b', '(physics88)': 'physics88', # physics - '(polsci3|ps3|polisci3)': 'polsci3', '(polsci5|ps5)': 'polsci5', '(polsci88|ps88)': 'polsci88', '(ps109|polsci109)': 'polsci109', # polisci - '(ce190|civeng90)': 'civgeng190', '(ce93|civeng93)': 'civeng93', '(ce200b|civeng200b)': 'civeng200b', '(ce110|civeng110)': 'civeng110', #civileng - '(envecon118|eep118)': 'envecon118', '(eep147|envecon147)': 'envecon147', '(eep153|envecon153)': 'envecon153', #environmental - 'ph[w]?142': 'pbhlth142', 'ph[w]?251': 'pbhlth251', 'ph[w]?290': 'pbhlth290', 'ph[w]?252': 'pbhlth252', 'ph[w]?253': 'pbhlth253', 'pbhlth250c': 'pbhlth250c', - 'ph[w]?196': 'pbhlth196', # public health - 'mcb163l': 'mcellbi163l', 'mcb280': 'mcellbi280', 'mcbc117': 'mcellbic117', 'mcb32': 'mcellbi32', 'mcb288': 'mcellbi288', #molecular cell bio - '(bio1b|biology1b)': 'biology1b', # biology - 'stat88': 'stat88', 'stat157': 'stat157', 'stat159': 'stat159', 'stat131': 'stat131', 'stat135': 'stat135', 'stat20': 'stat20', - 'stat150': 'stat150', #stat - 'math124': 'math124', #math - '(demog180)': 'demog180', 'demog[c]?175': 'demog175', #demography - '(eps130)': 'eps130', '(eps88)': 'eps88', 'eps256': 'eps256', 'eps24': 'eps24', - '(econ140)': 'econ140', '(econ148)': 'econ148', 'econ141': 'econ141', 'econ172': 'econ172', 'econ151': 'econ151', #econ - 'econ157': 
'econ157', 'econ130': 'econ130', 'econ143': 'econ143', 'econ135': 'econ135', - '(rbridge)': 'datasci_rbridge', '(midsw241)': 'datasci241', '(midsw203)': 'datasci203', #datasci - '(legal123|legalst123)': 'legalst123', '(legalst190|legal190)': 'legalst190', # legal - '(es22ac|ethstd22ac)': 'ethstd22ac', '(esc164a|ethstdc164a)': 'ethstdc164a', '(es21ac|ethstd21ac)': 'ethstd21ac', # ethnic studies - 'cp201b': 'cyplan201b', '(cityplanning88|cp88)': 'cyplan88', - 'ib120': 'integbi120', 'ibc32': 'integbi32', 'ib134l': 'integbi134l', - 'mse104l': 'matsci104l', - 'are212': 'aresec212', - 'educw142': 'educw142', - '(cogscic131|psych123)': 'cogscic131', 'psych198': 'psych198', - 'anth[ro]?115': 'anthro115', - 'espmc167': 'espmc167', '(ibespm105)': 'espmc105', - 'ls88': 'ls88', - 'dighum101': 'dighum101', 'dighum160': 'dighum160', - 'plantbi135': 'plantbi135', - 'hist160': 'history160', - 'soc88': 'sociol88', 'sw282': 'socwel282', - 'music30': 'music30', 'artw23ac': 'artw23ac'} - # hard coded - git_content_user = {'danielabrahamgit120': 'eleng120', 'evalencialopezw142': 'educw142', 'charismasacey[A-Za-z0-9]+cp201': 'cp201a'} - - #strips anything thats not a letter or number - git_string_cleaned = re.sub(r'[^a-zA-Z0-9]', '', ''.join(row)).lower() - for key in courses: - if re.findall(key, git_string_cleaned): - return courses[key] - for key in git_content_user: - if re.findall(key, git_string_cleaned): - return git_content_user[key] - else: - return 'unknown' -``` - -```{python} -# assigns classes/courses to each log -nbgitpuller_textPayload_df_pull['course'] = nbgitpuller_textPayload_df_pull.git_user_content_path.apply(course_assigner_regex) -``` - -```{python} -def semester_assigner_regex(row): - """ - pandas row function; uses apply - returns the semester of the course material if known - """ - semester = [r'fa[ll]*\d{1,4}', r'su[mmer]*\d{1,4}', r'sp[ring]*\d{1,4}', r'\d{1,4}fa[ll]', r'\d{1,4}su[mmer]*', r'\d{1,4}sp[ring]*'] - sem_match_dict = {'sp': 'spring', 'fa': 'fall', 
'su':'summer'} - - git_string_cleaned = re.sub(r'[^a-zA-Z0-9]', '', ''.join(row)).lower() - - year_range = [2018, datetime.datetime.now().year] - - for sem in semester: - try: - if re.findall(sem, git_string_cleaned): - sem_match = re.findall(sem, git_string_cleaned)[-1] - sem_match_split = re.split('(\d+)', sem_match) - sem_char = re.findall('[a-z]+', sem_match)[-1] - sem_year = re.findall('[0-9]+', sem_match)[-1] - for key, value in sem_match_dict.items(): - if key in sem_char and sem_match_split[-1] == '': - if len(sem_year) < 4: - if year_range[0] <= int(f'20{sem_year[-2:]}') <= year_range[1]: - return f'{value}20{sem_year[-2:]}' - else: - return - elif len(sem_year) == 4: - if year_range[0] <= int(sem_year) <= year_range[1]: - return f'{value}{sem_year}' - else: - return 'unknown' - elif key in sem_char and sem_match_split[-1] != '': - if year_range[0] <= int(sem_year) <= year_range[1]: - return f'{value}{sem_year}' - else: - return 'unknown' - except Exception as e: - print(f"Failed findall: {e=} {sem=} {git_string_cleaned=}") - continue - else: - return 'unknown' -``` - -```{python} -#| scrolled: true -# assigns a semester to each log -nbgitpuller_textPayload_df_pull['semester'] = nbgitpuller_textPayload_df_pull.git_user_content_path.apply(semester_assigner_regex) -``` - -```{python} -# transforms timestamp into one and converts from UTC to PST -nbgitpuller_textPayload_df_pull['timestamp_date_time_pst'] = pd.to_datetime(nbgitpuller_textPayload_df_pull.timestamp_date + ' ' + nbgitpuller_textPayload_df_pull.timestamp_time) - pd.Timedelta(8, unit = 'h') -``` - -```{python} -# for ones that have NaN as their filetype, check if git_path contains r_studio -nbgitpuller_textPayload_df_pull['file_extension'] = nbgitpuller_textPayload_df_pull.apply(lambda x: 'rstudio' if 'rstudio' in x['git_path'] else x['file_extension'], axis = 1) -``` - -```{python} -# determines if the links are github or non-github -nbgitpuller_textPayload_df_pull['abnormal'] = 
nbgitpuller_textPayload_df_pull.repo.apply(lambda x: 'N' if 'github.com' in x else 'Y') -``` - -```{python} -nbgitpuller_textPayload_df_pull.head() -``` - -```{python} -# separates abnormal repos -nbgitpuller_textPayload_df_pull_abnormal = nbgitpuller_textPayload_df_pull[nbgitpuller_textPayload_df_pull.abnormal == 'Y'] -nbgitpuller_textPayload_df_pull_normal = nbgitpuller_textPayload_df_pull[nbgitpuller_textPayload_df_pull.abnormal == 'N'] -``` - -## Graph and Visualizations - -### Ideas for Visualizations -1. Clicks by hub -2. What times people are accessing the hubs -3. File types across hubs -4. Semester usage of hubs - -### Github Repos - -```{python} -# clicks by hub for normal -clicks_by_hub = nbgitpuller_textPayload_df_pull_normal.hub.value_counts() -fig = plt.figure(figsize = (8,4)) -plt.bar(clicks_by_hub.index, clicks_by_hub) -plt.xlabel('Hub') -plt.ylabel('Count') -plt.xticks(rotation = 90) -plt.title('Clicks by Hub (Github Repos)'); -``` - -```{python} -# clicks by hub pie chart -# only take the top 8 and then add the rest -clicks_by_hub_top8 = dict(clicks_by_hub[:8]) -fig = plt.figure() -cm = plt.get_cmap('tab20') -num_colors = 9 -clicks_by_hub_top8['others'] = clicks_by_hub[8:].sum() -clicks_by_hub_top8_df = pd.DataFrame(clicks_by_hub_top8.items(), columns = ['hub', 'counts']) -plt.pie(clicks_by_hub_top8_df.counts, labels = clicks_by_hub_top8_df.hub, autopct='%1.0f%%', textprops={'fontsize': 12, 'color': 'k'}, labeldistance= 1.05, colors= [cm(i) for i in range(num_colors)]) -plt.title('Clicks By Hub (Percentage) - Github Repos', fontsize = 14, color = 'k') -``` - -```{python} -fig.savefig("images/hub_pie.png", transparent = True) -``` - -```{python} -# filetypes; removed NaN counts; github repos -file_types_count = nbgitpuller_textPayload_df_pull_normal.file_extension[nbgitpuller_textPayload_df_pull_normal.file_extension != 'NaN'].value_counts() -fig = plt.figure(figsize=(8,4)) -plt.bar(file_types_count.index, file_types_count) -plt.xticks(rotation=90) 
-plt.xlabel('File Type') -plt.ylabel('Counts') -plt.title('File Type By Counts (Github Repos)'); -``` - -```{python} -# filetypes across hubs -hubs = nbgitpuller_textPayload_df_pull_normal.hub.unique() -total_plots = len(hubs) -total_columns = 4 -total_rows = math.ceil(total_plots/total_columns) - -for k in range(total_plots): - hub_file_filter_count = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.hub == hubs[k]].file_extension.value_counts().tolist() - hub_file_filter_proportion = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.hub == hubs[k]].file_extension.value_counts(normalize = True).mul(100).round(2).tolist() - hub_file_filter_key = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.hub == hubs[k]].file_extension.value_counts().keys().tolist() - hub_file_filter_df = pd.DataFrame(data = {'file_type': hub_file_filter_key, 'counts': hub_file_filter_count, 'proportions': hub_file_filter_proportion}) - hub_file_filter_df['count_proportion'] = hub_file_filter_df['counts'].astype(str) + ' (' + hub_file_filter_df['proportions'].astype(str) + ')' - hub_file_filter_df.drop(columns = ['counts', 'proportions'], inplace = True) - print(hubs[k]) - print(hub_file_filter_df) - -# for ax in axes.flatten(): -# if not ax.get_visible(): -# ax.set_axis_off() -``` - -```{python} -# looking at the classes -course_count = nbgitpuller_textPayload_df_pull_normal.course.value_counts() -figure = plt.figure(figsize = (10,8)) -plt.bar(course_count.index, course_count) -plt.xticks(rotation = 90, fontsize = 6) -plt.xlabel('Course') -plt.ylabel('Count') -plt.title('Count of Clicks by Course'); -``` - -```{python} -# top 10 of the courses -course_count.head(10) -``` - -```{python} -fig = plt.figure() -num_colors = 11 -cm = plt.get_cmap('tab20') -courses_top10 = dict(course_count[:10]) -courses_top10['others'] = course_count[10:].sum() -courses_top10_df = pd.DataFrame(courses_top10.items(), columns = 
['course', 'counts']) -plt.pie(courses_top10_df.counts, labels = courses_top10_df.course, autopct='%1.0f%%', textprops={'fontsize': 8}, labeldistance= 1.05, - colors= [cm(i) for i in range(num_colors)]) -plt.title('Counts of Clicks by Course - Github Repos', fontsize = 14); -``` - -```{python} -fig.savefig("images/course_pie.png",bbox_inches='tight', transparent = True) -``` - -```{python} -courses_count_by_hub = nbgitpuller_textPayload_df_pull_normal.groupby('hub').course.value_counts() -courses_count_by_hub.to_csv('courses_count_by_hub.csv') -``` - -```{python} -# looking at the semester usage -sem_count = nbgitpuller_textPayload_df_pull_normal.semester.value_counts() -figure = plt.figure(figsize = (8,4)) -plt.bar(sem_count.index, sem_count) -plt.xticks(rotation = 90, fontsize = 6) -plt.xlabel('Semester') -plt.ylabel('Count') -plt.title('Semester Content Count'); -``` - -```{python} -# looking at what times people access the hubs -fig = plt.figure() -num_colors = 4 -cm = plt.get_cmap('tab10') -sem_top7 = dict(sem_count[:3]) -sem_top7['others'] = sem_count[3:].sum() -sem_top7_df = pd.DataFrame(sem_top7.items(), columns = ['semester', 'counts']) -plt.pie(sem_top7_df.counts, labels = sem_top7_df.semester, autopct='%1.0f%%', textprops={'fontsize': 12}, labeldistance= 1.05, - colors= [cm(i) for i in range(num_colors)]) -plt.title('Semester Content - Github Repos', fontsize = 14); -``` - -```{python} -# looks at date of usage -date_usage = pd.DataFrame(nbgitpuller_textPayload_df_pull_normal.groupby(by = 'timestamp_date').timestamp_date.count()) -date_usage.rename(columns = {'timestamp_date': 'count'}, inplace = True) -date_usage = date_usage.reset_index() -fig,ax = plt.subplots() -plt.bar(date_usage.timestamp_date, date_usage['count'], color = 'w') -ax.xaxis.set_major_locator(mdates.MonthLocator()) -ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) -ax.spines['bottom'].set_color('w') -ax.spines['top'].set_color('w') -ax.spines['left'].set_color('w') 
-ax.spines['right'].set_color('w') -ax.tick_params(axis ='x', colors = 'w') -ax.tick_params(axis ='y', colors = 'w') -plt.title('Usage by Date', color = 'w', fontsize = 12) -plt.xlabel('Date', color = 'w', fontsize = 12) -plt.ylabel('Frequency', color = 'w', fontsize = 12); -``` - -```{python} -fig.savefig("images/usage_bar.png",bbox_inches='tight', transparent = True) -``` - -```{python} -date_usage['timestamp_date'] = date_usage.timestamp_date.astype('datetime64[ns]') -date_usage['month'] = date_usage['timestamp_date'].apply(lambda x: x.month) -date_usage['day'] = date_usage['timestamp_date'].apply(lambda x: x.day) -unique_months = date_usage['month'].unique() -month_list = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'} - -total_plots = len(unique_months) -total_columns = 2 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(10, 10)) - -for k in range(total_plots): - month = date_usage[date_usage['month'] == unique_months[k]].reset_index() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(month['day'], month['count']) - - ax.set_title(f'{month_list[unique_months[k]]} Usage Frequency') - ax.set_xlabel('Date') - ax.set_ylabel('Frequency') - -#axes[2,1].set_axis_off() -plt.tight_layout() -``` - -```{python} -nbgitpuller_textPayload_df_pull_normal.head() -``` - -```{python} -# looks at date of usage -nbgitpuller_textPayload_df_pull_normal['timestamp_time'] = pd.to_datetime(nbgitpuller_textPayload_df_pull_normal['timestamp_time'], format = '%H:%M:%S.%f') -nbgitpuller_textPayload_df_pull_normal['timestamp_time_hour'] = nbgitpuller_textPayload_df_pull_normal['timestamp_time'].apply(lambda x: x.hour) -usage_time = nbgitpuller_textPayload_df_pull_normal.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count() -usage_time = pd.DataFrame(usage_time) -usage_time.rename(columns = 
{'timestamp_time_hour': 'count'}, inplace = True) -usage_time = usage_time.reset_index() -fig,ax = plt.subplots() -plt.bar(usage_time.timestamp_time_hour, usage_time['count'], color = '#003262') -ax.spines['bottom'].set_color('k') -ax.spines['top'].set_color('k') -ax.spines['left'].set_color('k') -ax.spines['right'].set_color('k') -ax.tick_params(axis ='x', colors = 'k') -ax.tick_params(axis ='y', colors = 'k') -plt.title('Usage by Time (UTC)', color = 'k') -plt.xlabel('Hour', color = 'k') -plt.ylabel('Frequency', color = 'k'); -``` - -```{python} -fig.savefig("images/time_usage_bar.png",bbox_inches='tight', transparent = True) -``` - -##### Making Graphs of Date of Usage Per Course - -```{python} -# look at usage by courses 1-15 -unique_courses = nbgitpuller_textPayload_df_pull_normal.course.unique() - -# length of unique_courses is 93, plotting 15 plots at a time -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k]].reset_index() - courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_date.index, courses_date) - - ax.set_title(f'{unique_courses[k]} Usage Frequency') - ax.set_xlabel('Date') - ax.set_ylabel('Frequency') - ax.xaxis.set_major_locator(mdates.MonthLocator()) - ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -plt.tight_layout() -``` - -```{python} -# look at plots 16-30 -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == 
unique_courses[k+15]].reset_index() - courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_date.index, courses_date) - - ax.set_title(f'{unique_courses[k+15]} Usage Frequency') - ax.set_xlabel('Date') - ax.set_ylabel('Frequency') - ax.xaxis.set_major_locator(mdates.MonthLocator()) - ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -# axes[2,1].set_axis_off() -plt.tight_layout() -``` - -```{python} -# look at plots 31-45 -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+30]].reset_index() - courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_date.index, courses_date) - - ax.set_title(f'{unique_courses[k+30]} Usage Frequency') - ax.set_xlabel('Date') - ax.set_ylabel('Frequency') - ax.xaxis.set_major_locator(mdates.MonthLocator()) - ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -# axes[2,1].set_axis_off() -plt.tight_layout() -``` - -```{python} -#| editable: true -#| slideshow: {slide_type: ''} -#| tags: [] -# look at plots 45-60 -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+45]].reset_index() - courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_date.index, 
courses_date) - - ax.set_title(f'{unique_courses[k+45]} Usage Frequency') - ax.set_xlabel('Date') - ax.set_ylabel('Frequency') - ax.xaxis.set_major_locator(mdates.MonthLocator()) - ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -# axes[2,1].set_axis_off() -plt.tight_layout() -``` - -# look at plots 61-75 -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+60]].reset_index() - courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_date.index, courses_date) - - ax.set_title(f'{unique_courses[k+60]} Usage Frequency') - ax.set_xlabel('Date') - ax.set_ylabel('Frequency') - ax.xaxis.set_major_locator(mdates.MonthLocator()) - ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -# axes[2,1].set_axis_off() -plt.tight_layout() - -# look at plots 76-90 -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+75]].reset_index() - courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_date.index, courses_date) - - ax.set_title(f'{unique_courses[k+75]} Usage Frequency') - ax.set_xlabel('Date') - ax.set_ylabel('Frequency') - ax.xaxis.set_major_locator(mdates.MonthLocator()) - ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -# axes[2,1].set_axis_off() -plt.tight_layout() - -# look at plots 91-93 -total_plots = 
3 -total_columns = 3 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(10, 3)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+90]].reset_index() - courses_date = course.groupby(by = 'timestamp_date').timestamp_date.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_date.index, courses_date) - - ax.set_title(f'{unique_courses[k+90]} Usage Frequency') - ax.set_xlabel('Date') - ax.set_ylabel('Frequency') - ax.xaxis.set_major_locator(mdates.MonthLocator()) - ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -# axes[2,1].set_axis_off() -plt.tight_layout() - -##### Making Graphs of Time of Usage Per Course - -# look at usage time by courses 1-15 -unique_courses = nbgitpuller_textPayload_df_pull_normal.course.unique() - - -# length of unique_courses is 93, plotting 15 plots at a time -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15), sharex = False) -plt.setp(axes, xlim=(-1,24)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k]].reset_index() - course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f') - course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour) - courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_time.index, courses_time) - - ax.set_title(f'{unique_courses[k]} Time Usage Frequency') - ax.set_xlabel('Time') - ax.set_ylabel('Frequency') - # ax.xaxis.set_major_locator(mdates.MonthLocator()) - # 
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -plt.tight_layout() - -# look at usage time by course 16-30 -# length of unique_courses is 93, plotting 15 plots at a time -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15)) -plt.setp(axes, xlim=(-1,24)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+15]].reset_index() - course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f') - course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour) - courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_time.index, courses_time) - - ax.set_title(f'{unique_courses[k+15]} Time Usage Frequency') - ax.set_xlabel('Time') - ax.set_ylabel('Frequency') - # ax.xaxis.set_major_locator(mdates.MonthLocator()) - # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -plt.tight_layout() - -# look at usage time by course 31 - 45 -# length of unique_courses is 93, plotting 15 plots at a time -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15)) -plt.setp(axes, xlim=(-1,24)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+30]].reset_index() - course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f') - course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour) - courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot 
cases - ax.bar(courses_time.index, courses_time) - - ax.set_title(f'{unique_courses[k+30]} Time Usage Frequency') - ax.set_xlabel('Time') - ax.set_ylabel('Frequency') - # ax.xaxis.set_major_locator(mdates.MonthLocator()) - # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -plt.tight_layout() - -# look at usage time by course 46 - 60 -# length of unique_courses is 93, plotting 15 plots at a time -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15)) -plt.setp(axes, xlim=(-1,24)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+45]].reset_index() - course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f') - course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour) - courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_time.index, courses_time) - - ax.set_title(f'{unique_courses[k+45]} Time Usage Frequency') - ax.set_xlabel('Time') - ax.set_ylabel('Frequency') - # ax.xaxis.set_major_locator(mdates.MonthLocator()) - # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -plt.tight_layout() - -# look at usage time by course 61 - 75 -# length of unique_courses is 93, plotting 15 plots at a time -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15)) -plt.setp(axes, xlim=(-1,24)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+60]].reset_index() - course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f') - 
course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour) - courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_time.index, courses_time) - - ax.set_title(f'{unique_courses[k+60]} Time Usage Frequency') - ax.set_xlabel('Time') - ax.set_ylabel('Frequency') - # ax.xaxis.set_major_locator(mdates.MonthLocator()) - # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -plt.tight_layout() - -# look at usage time by course 76 - 90 -# length of unique_courses is 93, plotting 15 plots at a time -total_plots = 15 -total_columns = 5 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(25, 15)) -plt.setp(axes, xlim=(-1,24)) - -for k in range(total_plots): - course = nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+75]].reset_index() - course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f') - course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour) - courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_time.index, courses_time) - - ax.set_title(f'{unique_courses[k+75]} Time Usage Frequency') - ax.set_xlabel('Time') - ax.set_ylabel('Frequency') - # ax.xaxis.set_major_locator(mdates.MonthLocator()) - # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -plt.tight_layout() - -# look at usage time by course 91-83 -# length of unique_courses is 93, plotting 15 plots at a time -total_plots = 3 -total_columns = 3 -total_rows = int(np.ceil(total_plots/total_columns)) - -fig, axes = plt.subplots(total_rows, total_columns, figsize=(10, 5)) -plt.setp(axes, xlim=(-1,24)) - -for k in range(total_plots): - course = 
nbgitpuller_textPayload_df_pull_normal[nbgitpuller_textPayload_df_pull_normal.course == unique_courses[k+90]].reset_index() - course['timestamp_time'] = pd.to_datetime(course['timestamp_time'], format = '%H:%M:%S.%f') - course['timestamp_time_hour'] = course['timestamp_time'].apply(lambda x: x.hour) - courses_time = course.groupby(by = 'timestamp_time_hour').timestamp_time_hour.count() - ax = axes.flatten()[k] if total_plots > 1 else axes # handles single plot cases - ax.bar(courses_time.index, courses_time) - - ax.set_title(f'{unique_courses[k+90]} Time Usage Frequency') - ax.set_xlabel('Time') - ax.set_ylabel('Frequency') - # ax.xaxis.set_major_locator(mdates.MonthLocator()) - # ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) - -plt.tight_layout() - -### Non-Github Repos - -```{python} -#| editable: true -#| slideshow: {slide_type: ''} -#| tags: [] -# clicks by hub for abnormal -clicks_by_hub = nbgitpuller_textPayload_df_pull_abnormal.hub.value_counts() -fig = plt.figure(figsize = (8,4)) -plt.bar(clicks_by_hub.index, clicks_by_hub) -plt.xlabel('Hub') -plt.ylabel('Count') -plt.xticks(rotation = 90) -plt.title('Clicks By Hub (Non-Github Repos)'); -``` - -```{python} -len(nbgitpuller_textPayload_df_pull.hub.unique()) -``` - -```{python} -clicks_by_hub_top6 = dict(clicks_by_hub[:6]) -clicks_by_hub_top6['others'] = clicks_by_hub[6:].sum() -clicks_by_hub_top6_df = pd.DataFrame(clicks_by_hub_top6.items(), columns = ['hub', 'counts']) -plt.pie(clicks_by_hub_top6_df.counts, labels = clicks_by_hub_top6_df.hub, autopct='%1.0f%%', textprops={'fontsize': 9}, labeldistance= 1.05) -plt.title('Clicks By Hub (Percentage) - Non-Github Repos'); -``` - -```{python} -file_types_count = nbgitpuller_textPayload_df_pull_abnormal.file_extension[nbgitpuller_textPayload_df_pull_abnormal.file_extension != 'NaN'].value_counts() -fig = plt.figure(figsize=(8,4)) -plt.bar(file_types_count.index, file_types_count) -plt.xticks(rotation=90) -plt.xlabel('File Type') -plt.ylabel('Counts') 
-plt.title('File Type By Counts (Non-Github Repos)'); -``` - -```{python} -# filetypes across hubs -hubs = nbgitpuller_textPayload_df_pull_abnormal.hub.unique() -total_plots = len(hubs) -total_columns = 4 -total_rows = math.ceil(total_plots/total_columns) - -for k in range(total_plots): - hub_file_filter_count = nbgitpuller_textPayload_df_pull_abnormal[nbgitpuller_textPayload_df_pull_abnormal.hub == hubs[k]].file_extension.value_counts().tolist() - hub_file_filter_proportion = nbgitpuller_textPayload_df_pull_abnormal[nbgitpuller_textPayload_df_pull_abnormal.hub == hubs[k]].file_extension.value_counts(normalize = True).mul(100).round(2).tolist() - hub_file_filter_key = nbgitpuller_textPayload_df_pull_abnormal[nbgitpuller_textPayload_df_pull_abnormal.hub == hubs[k]].file_extension.value_counts().keys().tolist() - hub_file_filter_df = pd.DataFrame(data = {'file_type': hub_file_filter_key, 'counts': hub_file_filter_count, 'proportions': hub_file_filter_proportion}) - hub_file_filter_df['count_proportion'] = hub_file_filter_df['counts'].astype(str) + ' (' + hub_file_filter_df['proportions'].astype(str) + ')' - hub_file_filter_df.drop(columns = ['counts', 'proportions'], inplace = True) - print(hubs[k]) - print(hub_file_filter_df) - -# for ax in axes.flatten(): -# if not ax.get_visible(): -# ax.set_axis_off() -``` - -```{python} -# looking at the classes -course_count = nbgitpuller_textPayload_df_pull_abnormal.course.value_counts() -figure = plt.figure(figsize = (8,4)) -plt.bar(course_count.index, course_count) -plt.xticks(rotation = 90, fontsize = 6) -plt.xlabel('Course') -plt.ylabel('Count') -plt.title('Count of Clicks by Course'); -``` - -```{python} -#| editable: true -#| slideshow: {slide_type: ''} -#| tags: [] -# looking at the semester usage -sem_count = nbgitpuller_textPayload_df_pull_abnormal.semester.value_counts() -figure = plt.figure(figsize = (8,4)) -plt.bar(sem_count.index, sem_count) -plt.xticks(rotation = 90, fontsize = 6) -plt.xlabel('Semester') 
-plt.ylabel('Count') -plt.title('Semester Content Count'); -``` -