diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..46ca101 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) [2024] [BrotherZhafif] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/pythistic/Chart.py b/pythistic/Chart.py
new file mode 100644
index 0000000..7e708a2
--- /dev/null
+++ b/pythistic/Chart.py
@@ -0,0 +1,155 @@
+# Chart.py
+"""Matplotlib-based chart helpers for Pythistic."""
+import itertools
+
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib_venn import venn2, venn3
+
+
+class Chart:
+    """Prepare one matplotlib figure at a time and display it on demand.
+
+    Every chart method opens a new figure, stores it in ``self.figure`` and
+    applies the title and axis labels given to the constructor. Nothing is
+    displayed until ``show()`` is called.
+    """
+
+    def __init__(self, title="", xlabel="", ylabel=""):
+        self.title = title
+        self.xlabel = xlabel
+        self.ylabel = ylabel
+        self.figure = None  # Set by the chart methods; checked by show()
+
+    def _apply_common_properties(self):
+        """Apply the non-empty title and axis labels to the current axes."""
+        if self.title:
+            plt.title(self.title)
+        if self.xlabel:
+            plt.xlabel(self.xlabel)
+        if self.ylabel:
+            plt.ylabel(self.ylabel)
+
+    def box(self, x_values, y_values, is_range=False):
+        """Prepare a bar chart of ``y_values`` with ``x_values`` as tick labels.
+
+        With ``is_range=True`` the labels are used as given; otherwise each
+        label is converted with ``str``.
+        """
+        self.figure = plt.figure(figsize=(10, 6))
+        bar_width = 0.5
+        indices = range(len(y_values))
+
+        plt.bar(indices, y_values, width=bar_width, alpha=0.7, color='b')
+
+        if is_range:
+            plt.xticks(indices, x_values)  # Use ranges as labels
+        else:
+            plt.xticks(indices, [str(x) for x in x_values])
+
+        self._apply_common_properties()
+        plt.grid(axis='y')
+
+    def line(self, x_values, y_values, is_range=False):
+        """Prepare a line plot with a marker on every point.
+
+        ``is_range`` only copies ``x_values`` into a list; the values are
+        plotted as given, so pass class midpoints for grouped data.
+        """
+        self.figure = plt.figure(figsize=(10, 6))
+        if is_range:
+            x_values = list(x_values)
+
+        plt.plot(x_values, y_values, marker='o')
+        self._apply_common_properties()
+        plt.grid()
+
+    def scatter(self, x_values, y_values, is_range=False):
+        """Prepare a scatter plot.
+
+        ``is_range`` only copies ``x_values`` into a list; the values are
+        plotted as given, so pass class midpoints for grouped data.
+        """
+        self.figure = plt.figure(figsize=(10, 6))
+        if is_range:
+            x_values = list(x_values)
+
+        plt.scatter(x_values, y_values, alpha=0.6, edgecolors='w', s=100)
+        self._apply_common_properties()
+        plt.grid()
+
+    def pie(self, data, labels):
+        """Prepare a pie chart showing each value's share as a percentage."""
+        self.figure = plt.figure(figsize=(8, 8))
+        plt.pie(data, labels=labels, autopct='%1.1f%%', startangle=140)
+        if self.title:
+            plt.title(self.title)
+
+    def heatmap(self, data, annot=True, cmap='viridis'):
+        """Prepare a heatmap of a 2D matrix, optionally annotating each cell."""
+        self.figure = plt.figure(figsize=(12, 8))
+        plt.imshow(data, cmap=cmap, aspect='auto')
+        if annot:
+            # ndenumerate yields ((row, column), value); text() expects x, y.
+            for (i, j), val in np.ndenumerate(data):
+                plt.text(j, i, f'{val}', ha='center', va='center', color='white')
+        self._apply_common_properties()
+        plt.colorbar()  # Show color scale.
+
+    def venn(self, sets, set_labels):
+        """Prepare a Venn diagram of two or three sets.
+
+        Raises:
+            ValueError: If ``sets`` does not hold exactly 2 or 3 items.
+        """
+        # Validate first so a rejected call does not leave an empty figure behind.
+        if len(sets) not in (2, 3):
+            raise ValueError("Only 2 or 3 sets can be displayed in a Venn diagram.")
+
+        self.figure = plt.figure(figsize=(8, 8))
+        if len(sets) == 2:
+            venn2(sets, set_labels)
+        else:
+            venn3(sets, set_labels)
+        if self.title:
+            plt.title(self.title)
+
+    def pareto(self, data, labels):
+        """Prepare a Pareto chart: sorted bars plus a cumulative-percentage line.
+
+        Raises:
+            ValueError: If ``data`` is empty or sums to zero, because the
+                cumulative percentage is undefined in both cases.
+        """
+        if len(data) == 0:
+            raise ValueError("Pareto chart requires at least one data value.")
+        total = sum(data)
+        if total == 0:
+            raise ValueError("Pareto chart requires data with a non-zero total.")
+
+        # Sort by value only (descending) so labels never need to be comparable.
+        sorted_data = sorted(zip(data, labels), key=lambda pair: pair[0], reverse=True)
+        data, labels = zip(*sorted_data)
+        # A running sum keeps this linear instead of re-summing every prefix.
+        cumulative_percentage = [running / total * 100 for running in itertools.accumulate(data)]
+
+        self.figure, ax1 = plt.subplots(figsize=(12, 8))
+        ax1.bar(labels, data, color='b', alpha=0.6)
+        ax1.set_xlabel(self.xlabel)
+        ax1.set_ylabel(self.ylabel)
+
+        # Plot cumulative percentage line.
+        ax2 = ax1.twinx()
+        ax2.plot(labels, cumulative_percentage, color='r', marker='D', linestyle='-', linewidth=2)
+        ax2.set_ylabel('Cumulative Percentage')
+        ax2.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0f}%'))
+
+        if self.title:
+            ax1.set_title(self.title)
+
+    def show(self):
+        """Display the prepared chart, or print a hint if none exists yet."""
+        if self.figure:
+            plt.show()
+        else:
+            print("No chart has been prepared. Please call a chart method first.")
diff --git a/pythistic/FrequencyTable.py b/pythistic/FrequencyTable.py
new file mode 100644
index 0000000..781bb8d
--- /dev/null
+++ b/pythistic/FrequencyTable.py
@@ -0,0 +1,266 @@
+"""Frequency tables and descriptive statistics for a one-dimensional dataset."""
+import math
+from bisect import bisect_left, bisect_right
+
+import numpy as np
+
+# Global Variable Used in Frequency Table Data Processing.
+# They mirror the most recently populated table and stay at module level
+# because the package's star-import exports them.
+top = []
+bottom = []
+top_limit = []
+bottom_limit = []
+frequency = []
+data_range = []
+data_limit = []
+data_midpoint = []
+bot_cumulative_frequency = []
+top_cumulative_frequency = []
+relative_frequency = []
+mode = []
+
+
+# Frequency Table Class
+class FrequencyTable:
+    """Build simple and grouped frequency tables for a dataset.
+
+    Numeric datasets also get descriptive statistics (mean, variance,
+    deviation, skewness, kurtosis) and the class count and interval used by
+    ``PopulateGrouped``. Text datasets only support ``PopulateSimple``.
+    """
+
+    def __init__(self, dataset):
+        """Validate ``dataset`` and pre-compute its statistics.
+
+        Raises:
+            ValueError: If the dataset is empty or mixes numbers and strings.
+        """
+        if len(dataset) == 0:
+            raise ValueError("Dataset must contain at least one value.")
+
+        # Check for mixed data types (both numeric and string)
+        if any(isinstance(item, str) for item in dataset) and any(isinstance(item, (int, float)) for item in dataset):
+            raise ValueError("Data is corrupted: contains both numeric and string values.")
+
+        # Data Initiation
+        self.dataset = sorted(dataset)
+        self.length = len(dataset)
+        self.lowest = min(dataset) if isinstance(dataset[0], (int, float)) else None
+        self.highest = max(dataset) if isinstance(dataset[0], (int, float)) else None
+
+        if self.lowest is not None:  # Only calculate classes for numeric data
+            # Classes is Rounding Down. math.floor avoids round()'s half-to-even
+            # behaviour, and the minimum of 1 keeps a single-value dataset
+            # (log10(1) == 0) from producing zero classes.
+            # NOTE(review): Sturges' rule is usually written with 3.322 - confirm 3.222.
+            self.classes = max(1, math.floor(1 + (3.222 * np.log10(self.length))))
+
+            # Sum of the data and range
+            self.sum = sum(dataset)
+            self.range = self.highest - self.lowest
+
+            # Interval is Rounding Up; at least 1 so PopulateGrouped always advances.
+            self.interval = max(1, math.ceil(self.range / self.classes))
+
+            # Rounding Both Limits So The Data Would Be Simple And Easier To Read
+            self.base = self.roundy(self.lowest - 3)
+            self.top = self.roundy(self.highest + 3)
+
+            # Mean or Average
+            self.mean = self.sum / self.length
+
+            # Formula for Variance
+            self.variance = sum((x - self.mean) ** 2 for x in dataset) / self.length
+
+            # Formula for Standard Deviation
+            self.deviation = self.variance ** 0.5
+
+            # Skewness and kurtosis divide by the deviation and by (n - 2) or
+            # (n - 3), so they are NaN for constant or very small datasets.
+            n = self.length
+            has_spread = self.deviation > 0
+
+            # Formula to find Dataset Skewness
+            if has_spread and n > 2:
+                self.skewness = (n / ((n - 1) * (n - 2))) * \
+                    sum(((x - self.mean) / self.deviation) ** 3 for x in self.dataset)
+            else:
+                self.skewness = float("nan")
+
+            # Formula to find Dataset Kurtosis
+            if has_spread and n > 3:
+                self.kurtosis = (n * (n + 1) * sum(((x - self.mean) / self.deviation) ** 4 for x in self.dataset) /
+                                 ((n - 1) * (n - 2) * (n - 3))) - \
+                    (3 * (n - 1) ** 2) / ((n - 2) * (n - 3))
+            else:
+                self.kurtosis = float("nan")
+
+    # Base 5 Rounding
+    def roundy(self, x, base=5):
+        """Round ``x`` to the nearest multiple of ``base``."""
+        return base * round(x / base)
+
+    # Function to Reset Frequency Table Data
+    def reset(self):
+        """Empty every module-level table list in place."""
+        for values in (top, bottom, top_limit, bottom_limit, frequency,
+                       data_range, data_limit, data_midpoint,
+                       bot_cumulative_frequency, top_cumulative_frequency,
+                       relative_frequency, mode):
+            values.clear()
+
+    # Function To Find Frequency in Dataset with Desired Range (Top and Down Limit)
+    def find_frequency(self, bot, top):
+        """Count the dataset values ``v`` with ``bot <= v < top``.
+
+        ``self.dataset`` is sorted, so two binary searches replace the
+        value-by-value scan. Decimal values are counted exactly rather than
+        only when they fall on a 0.01 grid.
+        """
+        return max(0, bisect_left(self.dataset, top) - bisect_left(self.dataset, bot))
+
+    # Populate Grouped Table Frequency Data Method
+    def PopulateGrouped(self):
+        """Build the grouped table and store it in ``self.grouped``.
+
+        Text datasets are rejected with a printed error message and leave
+        ``self.grouped`` untouched.
+        """
+        # Check if the dataset contains text
+        if any(isinstance(item, str) for item in self.dataset):
+            print("Error: Text data is not allowed for grouped frequency tables. Please provide numeric data only.")
+            return
+
+        self.reset()  # Reset the frequency table data before processing
+
+        # Initiating Used Parameter for Frequency Table
+        is_integer_data = all(isinstance(x, int) for x in self.dataset)
+        interval = self.interval
+        current_number = self.base - 1
+        current_top_cumulative_frequency = 1
+
+        # Processing the Frequency Table Data until no value lies above the class
+        while current_top_cumulative_frequency != 0:
+            # Finding Class Lowest Value
+            old_number = current_number + 1
+            bottom.append(old_number)
+
+            # Finding Class Highest Value
+            current_number = current_number + interval
+            top.append(current_number)
+
+            # Append Class Bottom Limit
+            current_bottom_limit = old_number - 0.5
+            bottom_limit.append(current_bottom_limit)
+
+            # Append Class Top Limit
+            current_top_limit = current_number + 0.5
+            top_limit.append(current_top_limit)
+
+            # Finding The Frequency That Range
+            current_frequency = self.find_frequency(old_number, current_number + 1)
+            frequency.append(current_frequency)
+
+            # Adding The Number Range From Both Frequency
+            if is_integer_data:
+                data_range.append(f"{old_number} ~ {current_number}")
+                data_limit.append(f"{current_bottom_limit} ~ {current_top_limit}")
+            else:
+                data_range.append(f"{old_number:.2f} ~ {current_number:.2f}")
+                data_limit.append(f"{current_bottom_limit:.2f} ~ {current_top_limit:.2f}")
+
+            # Adding Data Midpoint of The Class Frequency
+            data_midpoint.append((old_number + current_number) / 2)
+
+            # Adding Bottom Cumulative Frequency of The Class
+            bot_cumulative_frequency.append(self.find_frequency(self.lowest - 1, old_number))
+
+            # Adding Top Cumulative Frequency of The Class
+            current_top_cumulative_frequency = self.find_frequency(current_number + 1, self.highest + 1)
+            top_cumulative_frequency.append(current_top_cumulative_frequency)
+
+            # Counting the Relative Frequency in Percentage
+            relative_frequency.append(np.round((current_frequency / self.length) * 100))
+
+        # Find Mode or Data that appears most frequently
+        highest_frequency = max(frequency)
+        mode.extend(label for label, count in zip(data_range, frequency) if count == highest_frequency)
+
+        # Store copies: the module-level lists are cleared by the next reset(),
+        # which would otherwise empty this table as well.
+        self.grouped = ProcessedData(None, list(bottom), list(top), list(bottom_limit), list(top_limit),
+                                     list(frequency), list(data_range), list(data_limit), list(data_midpoint),
+                                     list(bot_cumulative_frequency), list(top_cumulative_frequency),
+                                     list(relative_frequency), list(mode))
+
+    # Populate Simple Table Frequency Data Method
+    def PopulateSimple(self):
+        """Build the per-value table and store it in ``self.simple``.
+
+        Works for numeric and text data. For both, the bottom/top cumulative
+        frequencies count the items sorted before/after each class; class
+        limits are only produced for numeric data.
+        """
+        self.reset()  # Reset the frequency table data before processing
+
+        # Initialize general variables
+        data = sorted(set(self.dataset))  # Remove duplicates and sort the data
+        is_text_data = all(isinstance(item, str) for item in self.dataset)
+
+        # Single loop to process both numeric and string data
+        for current_class in data:
+            # The class occupies one contiguous run of the sorted dataset, so its
+            # two edges give the frequency and both cumulative counts exactly.
+            items_below = bisect_left(self.dataset, current_class)
+            items_above = self.length - bisect_right(self.dataset, current_class)
+
+            # Calculate the frequency of the current class
+            current_frequency = self.length - items_below - items_above
+            frequency.append(current_frequency)
+
+            # Calculate the relative frequency for the current class
+            relative_frequency.append(np.round((current_frequency / self.length) * 100))
+
+            # Calculate top and bottom limits for numeric data
+            if not is_text_data:
+                top_limit.append(current_class + 0.5)
+                bottom_limit.append(current_class - 0.5)
+
+            bot_cumulative_frequency.append(items_below)
+            top_cumulative_frequency.append(items_above)
+
+        # Find the mode (the class with the highest frequency)
+        highest_frequency = max(frequency)
+        mode.extend(value for value, count in zip(data, frequency) if count == highest_frequency)
+
+        # Store copies so a later reset() cannot empty this table.
+        self.simple = ProcessedData(
+            data, None, None, list(bottom_limit), list(top_limit),
+            list(frequency), None, None, None,
+            list(bot_cumulative_frequency), list(top_cumulative_frequency),
+            list(relative_frequency), list(mode)
+        )
+
+
+# Processed Data Assignment
+class ProcessedData:
+    """Plain container for the columns of one populated frequency table."""
+
+    # Limit (L), Frequency (F), Ranges (R), Midpoint (M), Cumulative (C), Relative (R)
+    def __init__(self, data, bot, top, bot_L, top_L, F, R, L, M, bot_CF, top_CF, RF, mode):
+        self.classval = data
+        self.bottom = bot
+        self.top = top
+        self.bottom_limit = bot_L
+        self.top_limit = top_L
+        self.midpoint = M
+        self.ranges = R
+        self.limit = L
+        self.frequency = F
+        self.bottom_cumulative_frequency = bot_CF
+        self.top_cumulative_frequency = top_CF
+        self.relative_frequency = RF
+        # Relative frequencies rendered as text with two decimals, e.g. "25.00%"
+        self.percentage_relative_frequency = [f"{rf:.2f}%" for rf in self.relative_frequency]
+        self.mode = mode
diff --git a/pythistic/Transform.py b/pythistic/Transform.py
new file mode 100644
index 0000000..6e5da0b
--- /dev/null
+++ b/pythistic/Transform.py
@@ -0,0 +1,110 @@
+# Transform.py
+"""Common statistical data transformations built on NumPy and SciPy."""
+import numpy as np
+from scipy.stats import boxcox, yeojohnson
+
+
+class Transform:
+    """Apply transformations to a dataset held as a NumPy array.
+
+    Every method returns new values and leaves ``self.data`` unchanged.
+    """
+
+    def __init__(self, data):
+        # Initialize the Transform class with the raw data.
+        self.data = np.array(data)
+
+    def log_transform(self):
+        """Apply the natural logarithm; requires strictly positive data."""
+        if np.any(self.data <= 0):
+            raise ValueError("Log transformation requires positive data values.")
+        return np.log(self.data)
+
+    def square_root_transform(self):
+        """Apply the square root; requires non-negative data."""
+        if np.any(self.data < 0):
+            raise ValueError("Square root transformation requires non-negative data values.")
+        return np.sqrt(self.data)
+
+    def cube_root_transform(self):
+        """Apply the cube root, which is defined for every real value."""
+        return np.cbrt(self.data)
+
+    def reciprocal_transform(self):
+        """Apply ``1 / x``; the data must not contain zero."""
+        if np.any(self.data == 0):
+            raise ValueError("Reciprocal transformation cannot be applied to data containing zero.")
+        return 1 / self.data
+
+    def box_cox_transform(self, lmbda=None):
+        """Apply the Box-Cox transformation and return ``(data, lambda)``.
+
+        With ``lmbda=None`` SciPy estimates the lambda. When a lambda is
+        supplied SciPy returns only the transformed array, so the supplied
+        value is returned alongside it to keep the result a pair.
+        """
+        if np.any(self.data <= 0):
+            raise ValueError("Box-Cox transformation requires positive data values.")
+        if lmbda is None:
+            transformed_data, best_lambda = boxcox(self.data)
+            return transformed_data, best_lambda
+        return boxcox(self.data, lmbda), lmbda
+
+    def yeo_johnson_transform(self, lmbda=None):
+        """Apply the Yeo-Johnson transformation and return ``(data, lambda)``.
+
+        As with Box-Cox, SciPy returns only the array when ``lmbda`` is given,
+        so the supplied value is returned as the lambda in that case.
+        """
+        if lmbda is None:
+            transformed_data, best_lambda = yeojohnson(self.data)
+            return transformed_data, best_lambda
+        return yeojohnson(self.data, lmbda), lmbda
+
+    def z_score_standardization(self):
+        """Standardize the data to zero mean and unit (population) deviation.
+
+        Raises:
+            ValueError: If the standard deviation is zero.
+        """
+        mean = np.mean(self.data)
+        std_dev = np.std(self.data)
+        if std_dev == 0:
+            raise ValueError("Z-score standardization requires data with non-zero standard deviation.")
+        return (self.data - mean) / std_dev
+
+    def min_max_scaling(self, feature_range=(0, 1)):
+        """Scale the data linearly so it spans ``feature_range``.
+
+        Raises:
+            ValueError: If every value is identical, leaving no range to scale.
+        """
+        min_val = np.min(self.data)
+        max_val = np.max(self.data)
+        if max_val == min_val:
+            raise ValueError("Min-Max scaling requires data with more than one distinct value.")
+        scale = feature_range[1] - feature_range[0]
+        return feature_range[0] + ((self.data - min_val) / (max_val - min_val)) * scale
+
+    def rank_transform(self):
+        """Return each value's 0-based position in sorted order.
+
+        Tied values receive distinct consecutive ranks.
+        """
+        return np.argsort(np.argsort(self.data))
+
+    def arcsine_transform(self):
+        """Apply ``arcsin(sqrt(x))``; requires data in the range [0, 1]."""
+        if np.any((self.data < 0) | (self.data > 1)):
+            raise ValueError("Arcsine transformation requires data in the range [0, 1].")
+        return np.arcsin(np.sqrt(self.data))
+
+    def exponential_transform(self, base=np.e):
+        """Raise ``base`` to the power of each value (``e`` by default)."""
+        return np.power(base, self.data)
+
+    def logit_transform(self):
+        """Apply ``log(x / (1 - x))``; requires data strictly between 0 and 1."""
+        if np.any((self.data <= 0) | (self.data >= 1)):
+            raise ValueError("Logit transformation requires data between 0 and 1 (exclusive).")
+        return np.log(self.data / (1 - self.data))
diff --git a/pythistic/__init__.py b/pythistic/__init__.py
new file mode 100644
index 0000000..a6dfa4f
--- /dev/null
+++ b/pythistic/__init__.py
@@ -0,0 +1,3 @@
+from .FrequencyTable import *
+from .Transform import *
+from .Chart import *
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..f39550f
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,34 @@
+from pathlib import Path
+
+from setuptools import setup, find_packages
+
+# Read the README with an explicit encoding (and without leaving the file open)
+# so the build does not depend on the platform's default encoding.
+long_description = Path('README.md').read_text(encoding='utf-8')
+
+setup(
+    name='Pythistic',
+    version='1.1.1',
+    description='A Python library for statistical data processing',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    author='BrotherZhafif',
+    author_email='bangz1504@gmail.com',
+    url='https://github.com/brotherzhafif/Pythistic',  # Replace with your GitHub repository URL
+    license='MIT',
+    packages=find_packages(),
+    install_requires=[
+        'matplotlib',
+        'matplotlib-venn',
+        'numpy',
+        'tabulate',
+        'pandas',
+        'scipy',
+    ],
+    classifiers=[
+        'Programming Language :: Python :: 3',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+    ],
+    python_requires='>=3.6',
+)