"""Frequency tables and descriptive statistics, plus a runnable example.

This module is the post-change side of a flattened multi-file patch
(FrequencyTable.py, Main.py, Statistic_Calculation.py) reassembled as one
formatted module.
"""
import math
from collections import Counter

import numpy as np


class StatisticalCalculations:
    """Stateless descriptive-statistics helpers for a sequence of numbers."""

    @staticmethod
    def mean(dataset):
        """Return the arithmetic mean of *dataset*."""
        return sum(dataset) / len(dataset)

    @staticmethod
    def variance(dataset, mean):
        """Return the mean squared distance from *mean* (divides by n)."""
        return sum((x - mean) ** 2 for x in dataset) / len(dataset)

    @staticmethod
    def standard_deviation(variance):
        """Return the square root of *variance*."""
        return variance ** 0.5

    @staticmethod
    def skewness(dataset, mean, deviation):
        """Return n / ((n-1)(n-2)) * sum(((x - mean) / deviation) ** 3).

        Raises ZeroDivisionError when n is 1 or 2, or when *deviation* is a
        plain-float zero.
        """
        n = len(dataset)
        return (n / ((n - 1) * (n - 2))) * sum(((x - mean) / deviation) ** 3 for x in dataset)

    @staticmethod
    def kurtosis(dataset, mean, deviation):
        """Return the bias-corrected excess kurtosis of *dataset*.

        Raises ZeroDivisionError when n is 1, 2 or 3, or when *deviation* is
        a plain-float zero.
        """
        n = len(dataset)
        fourth_moment = sum(((x - mean) / deviation) ** 4 for x in dataset)
        return (n * (n + 1) * fourth_moment /
                ((n - 1) * (n - 2) * (n - 3))) - (3 * (n - 1) ** 2) / ((n - 2) * (n - 3))

    @staticmethod
    def median(dataset):
        """Return the middle value of *dataset* (mean of the two middle values if n is even)."""
        sorted_data = sorted(dataset)
        n = len(sorted_data)
        mid = n // 2

        if n % 2 == 0:
            return (sorted_data[mid - 1] + sorted_data[mid]) / 2
        return sorted_data[mid]


# The patch added Statistic_Calculation.py with a verbatim copy of the class
# above; the old name is kept as an alias instead of a second implementation.
Statistic_Calculation = StatisticalCalculations


class ProcessedData:
    """Plain container for the columns of one populated frequency table.

    Parameter abbreviations: limit (L), frequency (F), ranges (R),
    midpoint (M), cumulative frequency (CF), relative frequency (RF).
    """

    def __init__(self, data, bot, top, bot_L, top_L, F, R, L, M, bot_CF, top_CF, RF, mode):
        self.top = top
        self.limit = L
        self.ranges = R
        self.bottom = bot
        self.midpoint = M
        self.frequency = F
        self.classval = data
        self.top_limit = top_L
        self.bottom_limit = bot_L
        self.mode = mode
        self.bottom_cumulative_frequency = bot_CF
        self.top_cumulative_frequency = top_CF
        self.relative_frequency = RF
        # Same values as relative_frequency, rendered as "NN.NN%" strings.
        self.percentage_relative_frequency = [f"{rf:.2f}%" for rf in self.relative_frequency]


class FrequencyTable:
    """Build simple and grouped frequency tables for one dataset.

    Numeric datasets also get summary statistics and class parameters at
    construction time. Text-only datasets support ``populate_simple`` only.
    """

    # Coefficient of log10(n) in the class-count formula.
    # NOTE(review): Sturges' rule is normally written with 3.322
    # (1 / log10(2)); 3.222 is the value the original code used - confirm.
    CLASS_COUNT_COEFFICIENT = 3.222

    def __init__(self, dataset):
        """Store *dataset* and, for numeric data, derive statistics and classes.

        Raises:
            ValueError: if *dataset* mixes strings with ints/floats.
        """
        has_text = any(isinstance(item, str) for item in dataset)
        has_numbers = any(isinstance(item, (int, float)) for item in dataset)
        if has_text and has_numbers:
            raise ValueError("Data is corrupted: contains both numeric and string values.")

        # NOTE(review): the patch context hid four lines at this point; the
        # assignments to dataset, length and lowest are reconstructed from the
        # attributes the rest of the class reads - confirm against the file.
        is_numeric = isinstance(dataset[0], (int, float))
        self.dataset = dataset
        self.length = len(dataset)
        self.lowest = min(dataset) if is_numeric else None
        self.highest = max(dataset) if is_numeric else None

        if self.lowest is not None:  # statistics and classes need numbers
            self.calculate_statistics()
            self.calculate_classes()

    def calculate_statistics(self):
        """Compute sum, mean, median, variance, deviation, skewness and kurtosis.

        Skewness and kurtosis are set to NaN when their formula divides by
        zero (too few values, or zero deviation), so small or constant
        datasets can still be tabulated.
        """
        stats = StatisticalCalculations
        self.sum = sum(self.dataset)
        self.mean = stats.mean(self.dataset)
        self.median = stats.median(self.dataset)
        self.variance = stats.variance(self.dataset, self.mean)
        self.deviation = stats.standard_deviation(self.variance)
        self.skewness = self._shape_or_nan(stats.skewness)
        self.kurtosis = self._shape_or_nan(stats.kurtosis)

    def _shape_or_nan(self, shape_statistic):
        """Return shape_statistic(dataset, mean, deviation), or NaN if it divides by zero."""
        try:
            return shape_statistic(self.dataset, self.mean, self.deviation)
        except ZeroDivisionError:
            return math.nan

    def calculate_classes(self):
        """Derive class count, data range, class interval and first class bottom."""
        raw_classes = 1 + (self.CLASS_COUNT_COEFFICIENT * np.log10(self.length))
        # round(x - 0.5) is the original "round down"; at least one class is
        # needed or the interval division below fails for a single value.
        self.classes = max(1, int(round(raw_classes - 0.5)))
        self.range = self.highest - self.lowest
        # round(x + 0.5) is the original "round up"; a zero interval would
        # make every class empty and populate_grouped would never advance.
        self.interval = max(1, round((self.range / self.classes) + 0.5))
        self.base = self.roundy(self.lowest - 3)

    # Base 5 Rounding
    def roundy(self, x, base=5):
        """Round *x* to the nearest multiple of *base*."""
        return base * round(x / base)

    def reset(self):
        """Rebind every working column to a fresh empty list.

        New lists are created rather than cleared, so a ProcessedData object
        built earlier keeps its own columns.
        """
        self.bottom = []
        self.top = []
        self.top_limit = []
        self.bottom_limit = []
        self.frequency = []
        self.data_range = []
        self.data_limit = []
        self.data_midpoint = []
        self.bot_cumulative_frequency = []
        self.top_cumulative_frequency = []
        self.relative_frequency = []
        self.mode = []

    def find_frequency(self, bot, top):
        """Return how many values x in the dataset satisfy bot <= x < top."""
        return sum(1 for x in self.dataset if bot <= x < top)

    def populate_grouped(self):
        """Build the grouped table and store it in ``self.grouped``.

        Classes start at ``self.base`` and are ``self.interval`` wide. Classes
        are added until no value lies above the current one, so empty classes
        inside a gap in the data are kept and no empty class is appended
        after the last value.

        Raises:
            ValueError: if the dataset is text-only.
        """
        if self.lowest is None:
            raise ValueError("Text data is not allowed for grouped frequency tables. "
                             "Please provide numeric data only.")

        self.reset()
        class_bottom = self.base

        while True:
            class_top = class_bottom + self.interval - 1
            self.bottom.append(class_bottom)
            self.top.append(class_top)
            self.bottom_limit.append(class_bottom - 0.5)
            self.top_limit.append(class_top + 0.5)

            # A class owns [bottom, top + 1) so fractional values between two
            # printed class bounds are still counted exactly once.
            class_frequency = self.find_frequency(class_bottom, class_top + 1)
            self.frequency.append(class_frequency)

            self.data_range.append(f"{class_bottom:.2f} ~ {class_top:.2f}")
            self.data_limit.append(f"{class_bottom - 0.5:.2f} ~ {class_top + 0.5:.2f}")
            self.data_midpoint.append((class_bottom + class_top) / 2)

            self.bot_cumulative_frequency.append(
                self.find_frequency(self.lowest - 1, class_bottom))
            remaining_above = self.find_frequency(class_top + 1, self.highest + 1)
            self.top_cumulative_frequency.append(remaining_above)

            self.relative_frequency.append(np.round((class_frequency / self.length) * 100))

            if remaining_above == 0:  # every value has been assigned a class
                break
            class_bottom = class_top + 1

        # Mode: every class range that reaches the highest frequency.
        peak = max(self.frequency)
        self.mode = [label for label, count in zip(self.data_range, self.frequency)
                     if count == peak]

        self.grouped = ProcessedData(
            None, self.bottom, self.top, self.bottom_limit, self.top_limit,
            self.frequency, self.data_range, self.data_limit, self.data_midpoint,
            self.bot_cumulative_frequency, self.top_cumulative_frequency,
            self.relative_frequency, self.mode,
        )

    def populate_simple(self):
        """Build the one-class-per-distinct-value table in ``self.simple``.

        Works for numeric and text-only datasets. The bottom cumulative
        frequency counts values sorting strictly before the class, the top
        one counts values sorting strictly after it. The +/- 0.5 class limits
        are filled for numeric data only and stay empty for text.
        """
        self.reset()
        counts = Counter(self.dataset)  # one pass instead of count() per class
        unique_data = sorted(counts)
        is_numeric = self.lowest is not None
        seen = 0  # number of values in classes already emitted

        for current_class in unique_data:
            current_frequency = counts[current_class]
            self.frequency.append(current_frequency)
            self.relative_frequency.append(np.round((current_frequency / self.length) * 100))

            if is_numeric:
                self.bottom_limit.append(current_class - 0.5)
                self.top_limit.append(current_class + 0.5)

            self.bot_cumulative_frequency.append(seen)
            seen += current_frequency
            self.top_cumulative_frequency.append(self.length - seen)

        # Mode: every class that reaches the highest frequency.
        peak = max(self.frequency)
        self.mode = [value for value, count in zip(unique_data, self.frequency)
                     if count == peak]

        self.simple = ProcessedData(
            unique_data, None, None, self.bottom_limit, self.top_limit,
            self.frequency, None, None, None,
            self.bot_cumulative_frequency, self.top_cumulative_frequency,
            self.relative_frequency, self.mode,
        )


def main():
    """Example program: print the simple and grouped tables for sample data."""
    # Imported here so importing this module needs only numpy.
    import pandas as pd
    import tabulate
    # NOTE(review): the original example imported Transform without using
    # it; kept in case the import has side effects - confirm and drop.
    import Transform as tf  # noqa: F401

    # Raw Data
    dataset = [
        12.5, 12.5, 12.1, 12.6, 12.7, 12.5, 43.2, 43.2, 43.2, 43.5,
        34.2, 34.1, 34.3, 34.2, 34.0, 34.5, 56.7, 56.8, 56.5, 56.6,
        56.9, 57.0, 67.9, 67.8, 67.5, 67.6, 67.7, 78.4, 78.1, 78.3,
        78.2, 78.9, 78.8, 89.0, 89.1, 89.2, 90.5, 91.2, 90.3, 90.0,
        98.3, 98.1, 98.0, 99.5, 99.4, 99.6, 99.1, 99.2, 99.3, 99.0,
        22.4, 22.3, 22.5, 22.2, 22.1, 22.0, 25.4, 25.5, 25.6, 25.0,
        32.4, 32.5, 32.3, 32.2, 32.1, 32.0, 45.6, 45.5, 45.4, 45.0,
        56.3, 56.4, 56.2, 56.1, 56.0, 60.5, 64.0, 64.1, 64.2, 64.3,
        71.3, 71.4, 71.5, 71.6, 71.7, 71.8, 84.2, 84.3, 84.1, 84.0,
        12.9, 12.8, 12.7, 12.6, 12.5, 12.4,
    ]

    # Initiate Object From The Raw Data
    data = FrequencyTable(dataset)
    data.populate_simple()  # Simple Data

    # Simple Populated Data
    dfs = pd.DataFrame(
        {
            "Class": data.simple.classval,
            "Frequency": data.simple.frequency,
            "Relative Frequency": data.simple.percentage_relative_frequency,
        }
    )

    # Converting Pandas Data Into Tabulate
    tablesimple = tabulate.tabulate(dfs, headers='keys', tablefmt='pipe')

    # Print The Processed Data
    print(tablesimple)
    print(data.simple.mode)
    print(data.mean)
    print(data.median)

    data.populate_grouped()  # Grouped Data
    dfg = pd.DataFrame(
        {
            "Class Interval": data.grouped.ranges,
            "Class Limit": data.grouped.limit,
            "Frequency": data.grouped.frequency,
            "Midpoint": data.grouped.midpoint,

            "C <": data.grouped.bottom_limit,
            "CF <": data.grouped.bottom_cumulative_frequency,
            "C >": data.grouped.top_limit,
            "CF >": data.grouped.top_cumulative_frequency,
            "Relative Frequency": data.grouped.percentage_relative_frequency,
        }
    )

    tablegrouped = tabulate.tabulate(dfg, headers='keys', tablefmt='pipe')
    print(tablegrouped)
    print(data.grouped.mode)
    print(data.mean)


if __name__ == "__main__":
    main()