From 7931fa3a75939fb71ebb2508669fecff9af92acf Mon Sep 17 00:00:00 2001
From: Jolene <jolenechong7@gmail.com>
Date: Mon, 23 Oct 2023 16:25:17 +0800
Subject: [PATCH] feat: added script to task 03

---
 03AnalysisApplicationData/main.py | 139 ++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 03AnalysisApplicationData/main.py
diff --git a/03AnalysisApplicationData/main.py b/03AnalysisApplicationData/main.py
new file mode 100644
index 0000000..384dfc3
--- /dev/null
+++ b/03AnalysisApplicationData/main.py
@@ -0,0 +1,139 @@
+# 
+# Script to process mentorship data from CSV files and save to CSV
+# TODO: NOT TESTED YET (still waiting on credentials)
+# 
+# 1. Set up /data with the application CSV files for the different waves
+# 2. Create env file with your credentials for Elastic Cloud
+# - CLOUD_ID=<CLOUD_ID>
+# - PASSWORD=<PASSWORD>
+# - USER=<USER>
+# 3. Run script, which produces mentors.csv in /data (the mentors_per_application file is not written by this script yet)
+# 
+# Written by: Jolene
+# 
+
+
+import pandas as pd
+import numpy as np
+from elasticsearch import Elasticsearch
+import re
+
+class ProcessApplicationData:
+    """Look up mentor details in Elasticsearch and merge them into CSV data."""
+
+    # Elasticsearch index that holds the mentor profiles.
+    INDEX = 'enterprise-search-engine-mentorship-page'
+    # Placeholder value that is filtered out of the mentor name column.
+    PLACEHOLDER = '[INSERT NAME LIST OF WAVE 3 MENTORS]'
+
+    def __init__(self, user, cloud_id, password):
+        """Create the Elastic Cloud client and the per-run lookup state."""
+        self.es = Elasticsearch(cloud_id=cloud_id, http_auth=(user, password))
+        # lower-cased mentor name -> profile dict, or None when not found
+        self.mentors = {}
+        # lower-cased names that produced no search hit (each listed once)
+        self.unknown_mentors = []
+
+    def search_documents(self, index, query_body):
+        """Run ``query_body`` against ``index`` and return the raw response."""
+        return self.es.search(index=index, body=query_body)
+
+    def get_mentor_info_by_name(self, name):
+        """Return the top search hit for ``name`` as a dict, or None.
+
+        When ``name`` contains text in brackets, e.g. "Jane (Acme)", that
+        text is also matched against the ``organisation`` field. The result
+        has the keys name, industries and organisation; a field missing
+        from the hit is returned as None instead of raising KeyError.
+        """
+        should_clauses = [{'match': {'name': name}}]
+
+        # Search by organization if name includes it in brackets
+        organization = re.search(r'\((.*?)\)', name)
+        if organization:
+            should_clauses.append({'match': {'organisation': organization.group(1)}})
+
+        search_options = {'query': {'bool': {'should': should_clauses}}}
+        result = self.search_documents(self.INDEX, search_options)
+        hits = result['hits']['hits']
+        if not hits:
+            return None
+
+        source = hits[0]['_source']
+        return {
+            'name': source.get('name'),
+            'industries': source.get('industries'),
+            'organisation': source.get('organisation'),
+        }
+
+    def check_same_name(self, name):
+        """Return True when more than one indexed document matches ``name``."""
+        search_options = {'query': {'match': {'name': name}}}
+        result = self.search_documents(self.INDEX, search_options)
+        return len(result['hits']['hits']) > 1
+
+    def process_mentors_data(self, dataframes):
+        """Combine the per-wave mentor lists and add Elasticsearch details.
+
+        ``dataframes`` is a list of ``(DataFrame, year)`` pairs; a bare
+        DataFrame is also accepted and gets a year of None. Only the first
+        column of each frame is read, as the mentor name. Missing names and
+        the placeholder row are dropped.
+
+        Returns a new DataFrame with the columns mentor_name, year,
+        industries and organisation. Mentors without a search hit keep ""
+        in both detail columns and are recorded in ``self.unknown_mentors``.
+        """
+        frames = []
+        for item in dataframes:
+            # Accept (frame, year) pairs as well as bare frames.
+            frame, year = item if isinstance(item, (tuple, list)) else (item, None)
+            frames.append(
+                pd.DataFrame({'mentor_name': frame.iloc[:, 0], 'year': year})
+            )
+        if not frames:
+            return pd.DataFrame(
+                columns=['mentor_name', 'year', 'industries', 'organisation']
+            )
+
+        df = pd.concat(frames, ignore_index=True)
+        keep = df['mentor_name'].notna() & (df['mentor_name'] != self.PLACEHOLDER)
+        df = df[keep].reset_index(drop=True)
+
+        industries, organisations = [], []
+        for raw_name in df['mentor_name']:
+            name = str(raw_name).lower()
+            if name not in self.mentors:
+                # One search per distinct name; the result is cached.
+                self.mentors[name] = self.get_mentor_info_by_name(name)
+                if self.mentors[name] is None:
+                    self.unknown_mentors.append(name)
+            mentor = self.mentors[name] or {'industries': "", 'organisation': ""}
+            industries.append(mentor['industries'])
+            organisations.append(mentor['organisation'])
+
+        # Whole-column assignment: writes to iterrows() rows never reach df.
+        df['industries'] = industries
+        df['organisation'] = organisations
+        return df
+
+    def save_data_to_csv(self, df, filename):
+        """Write ``df`` to ``filename`` as CSV, leaving out the index."""
+        df.to_csv(filename, index=False)
+
+if __name__ == '__main__':
+    # Elastic Cloud credentials; replace the placeholders before running.
+    CLOUD_ID = "<CLOUD_ID>"
+    PASSWORD = '<PASSWORD>'
+    USER = "jolene"
+    mentorship_system = ProcessApplicationData(USER, CLOUD_ID, PASSWORD)
+
+    # One CSV of mentor names per application wave.
+    wave_files = ['data/2020w3.csv', 'data/2021w1.csv', 'data/2022.csv']
+    dataframes = [pd.read_csv(path) for path in wave_files]
+
+    # Look up every mentor and write the combined table to disk.
+    mentorship_system.save_data_to_csv(
+        mentorship_system.process_mentors_data(dataframes),
+        'data/mentors.csv',
+    )
\ No newline at end of file