From 9b2735e93cb5a9298dba71fd511e03ffeb34a508 Mon Sep 17 00:00:00 2001
From: Restia
Date: Thu, 4 Jan 2024 21:40:40 +0800
Subject: [PATCH 1/2] issue resolved

Created a class, similar to what I was asked for, and a script showing how
I would do it (without a class).
---
 04URLsInPostgreSQL/parsing urls class.py  | 73 +++++++++++++++++++++++
 04URLsInPostgreSQL/parsing urls script.py | 63 +++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 04URLsInPostgreSQL/parsing urls class.py
 create mode 100644 04URLsInPostgreSQL/parsing urls script.py

diff --git a/04URLsInPostgreSQL/parsing urls class.py b/04URLsInPostgreSQL/parsing urls class.py
new file mode 100644
index 0000000..bf2e45d
--- /dev/null
+++ b/04URLsInPostgreSQL/parsing urls class.py
@@ -0,0 +1,73 @@
+import pandas as pd
+import numpy as np
+from urllib.parse import urlparse, parse_qs, unquote
+
+
+class parsing_urls:
+    def __init__(self, input_csv_file):
+        self.original_df = pd.read_csv(input_csv_file)
+
+    def helper_extract_query_params(self, url):
+        url = unquote(url)  # decode percent-encoding so the URL is human readable
+        query_params = parse_qs(urlparse(url).query)
+
+        return query_params
+
+    def extract_query_params(self):
+        self.df = self.original_df.copy(deep=True)
+        self.df['url'] = self.df['url'].astype(str)
+        self.df['query_params'] = self.df['url'].apply(self.helper_extract_query_params)  # parse each URL's query string into a dict of parameter lists
+
+    def helper_process_query_params(self, params):
+        '''
+        Parameters:
+        params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']}
+
+        Returns:
+        {
+            "search_query": "search term",
+            "filter_name": ["filter value 1", "filter value 2"]
+        }
+
+        Note:
+        - The filter_name is the name of the filter, e.g. industries, school, etc.
+        - size and type are ignored
+        '''
+        result = {}
+        current_field = ''
+
+        for key, value in params.items():
+            if 'filters' in key:
+                parts = key.split('[')
+                field_or_value = parts[2].strip(']')
+
+                if field_or_value == 'field':
+                    # if it's a field, use it as the key for the values that follow
+                    current_field = value[0]
+                    result[current_field] = []
+                elif field_or_value == 'type':
+                    # every query has type=all; not sure what that is or whether it's relevant
+                    pass
+                else:
+                    # otherwise it's a value: append it under the last saved field
+                    result[current_field].extend(value)
+            elif key == "q":
+                result["search_query"] = value[0]
+
+        result = dict(result)
+        return result
+
+    def process_query_params(self):
+        self.df_processed = self.df.copy(deep=True)
+        self.df_processed['query_params'] = self.df_processed['query_params'].apply(self.helper_process_query_params)  # flatten the params in a single pass
+        self.df_processed = pd.DataFrame(self.df_processed['query_params'].values.tolist())
+
+    def drop_unused_columns(self):
+        # drop unused columns ['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of']
+        self.df_processed = self.df_processed.drop(['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'], axis=1)
+        self.df_processed.info()
+
+    def to_csv(self, output_csv_file):
+        self.df_processed.to_csv(output_csv_file, index=False)
+
+
diff --git a/04URLsInPostgreSQL/parsing urls script.py b/04URLsInPostgreSQL/parsing urls script.py
new file mode 100644
index 0000000..fb667ad
--- /dev/null
+++ b/04URLsInPostgreSQL/parsing urls script.py
@@ -0,0 +1,63 @@
+import pandas as pd
+import numpy as np
+from urllib.parse import urlparse, parse_qs, unquote
+
+
+
+def helper_extract_query_params(url):
+    url = unquote(url)  # decode percent-encoding so the URL is human readable
+    query_params = parse_qs(urlparse(url).query)
+
+    return query_params
+
+def helper_process_query_params(params):
+    '''
+    Parameters:
+    params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']}
+
+    Returns:
+    {
+        "search_query": "search term",
+        "filter_name": ["filter value 1", "filter value 2"]
+    }
+
+    Note:
+    - The filter_name is the name of the filter, e.g. industries, school, etc.
+    - size and type are ignored
+    '''
+    result = {}
+    current_field = ''
+
+    for key, value in params.items():
+        if 'filters' in key:
+            parts = key.split('[')
+            field_or_value = parts[2].strip(']')
+
+            if field_or_value == 'field':
+                # if it's a field, use it as the key for the values that follow
+                current_field = value[0]
+                result[current_field] = []
+            elif field_or_value == 'type':
+                # every query has type=all; not sure what that is or whether it's relevant
+                pass
+            else:
+                # otherwise it's a value: append it under the last saved field
+                result[current_field].extend(value)
+        elif key == "q":
+            result["search_query"] = value[0]
+
+    result = dict(result)
+    return result
+
+input_csv_file = input('enter the input csv file name')
+output_csv_file = input('enter the output csv file name')
+
+original_df = pd.read_csv(input_csv_file)
+df = original_df.copy(deep=True)
+df['url'] = df['url'].astype(str)
+df['query_params'] = df['url'].apply(extract_query_params)  # parse each URL's query string into a dict of parameter lists
+df_processed = df.copy(deep=True)
+df_processed['query_params'] = df_processed['query_params'].apply(process_query_params)  # flatten the params in a single pass
+df_processed = pd.DataFrame(df_processed['query_params'].values.tolist())
+df_processed = df_processed.drop(['i', 'ind', 'cou', 'indust', 'o', 'sch', 'course_of'], axis=1)  # drop unused columns
+df_processed.to_csv(output_csv_file, index=False)
\ No newline at end of file

From 160805ac481fdcbf242a4a82c525a68ad729f350 Mon Sep 17 00:00:00 2001
From: fallendynasty <33785667+fallendynasty@users.noreply.github.com>
Date: Sat, 6 Jan 2024 17:56:10 +0800
Subject: [PATCH 2/2] issue resolved

Renamed the helper functions to match their call sites and added ': ' to the
input prompts.
---
 04URLsInPostgreSQL/parsing urls script.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/04URLsInPostgreSQL/parsing urls script.py b/04URLsInPostgreSQL/parsing urls script.py
index fb667ad..25c416d 100644
--- a/04URLsInPostgreSQL/parsing urls script.py
+++ b/04URLsInPostgreSQL/parsing urls script.py
@@ -4,13 +4,13 @@
 
 
 
-def helper_extract_query_params(url):
+def extract_query_params(url):
     url = unquote(url)  # decode percent-encoding so the URL is human readable
     query_params = parse_qs(urlparse(url).query)
 
     return query_params
 
-def helper_process_query_params(params):
+def process_query_params(params):
     '''
     Parameters:
     params -> {'size': ['n_20_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Information and Communications Technology'], 'filters[0][type]': ['all']}
@@ -49,8 +49,8 @@ def helper_process_query_params(params):
     result = dict(result)
     return result
 
-input_csv_file = input('enter the input csv file name')
-output_csv_file = input('enter the output csv file name')
+input_csv_file = input('enter the input csv file name: ')
+output_csv_file = input('enter the output csv file name: ')
 
 original_df = pd.read_csv(input_csv_file)
 df = original_df.copy(deep=True)
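
For reference, a minimal standalone sketch of the query-parameter flattening
that both files implement, applied to a single URL. The example URL and its
filter values below are made up for illustration (they are not taken from the
repository's data), and the functions mirror the renamed versions from
PATCH 2/2:

from urllib.parse import urlparse, parse_qs, unquote

def extract_query_params(url):
    # decode percent-escapes, then split the query string into a dict of lists
    return parse_qs(urlparse(unquote(url)).query)

def process_query_params(params):
    # flatten 'filters[i][...]' params into {"search_query": ..., "<field>": [values]}
    result = {}
    current_field = ''
    for key, value in params.items():
        if 'filters' in key:
            field_or_value = key.split('[')[2].strip(']')
            if field_or_value == 'field':
                current_field = value[0]   # e.g. 'industries'
                result[current_field] = []
            elif field_or_value == 'type':
                pass                       # 'type' is ignored, as in the patch
            else:
                result[current_field].extend(value)
        elif key == 'q':
            result['search_query'] = value[0]
    return result

# hypothetical example URL for illustration only
url = ('https://example.com/search?q=data%20engineer'
       '&filters[0][field]=industries'
       '&filters[0][values][0]=Information%20and%20Communications%20Technology'
       '&filters[0][type]=all&size=n_20_n')
print(process_query_params(extract_query_params(url)))
# -> {'search_query': 'data engineer',
#     'industries': ['Information and Communications Technology']}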