4 changes: 4 additions & 0 deletions README.md
@@ -1,3 +1,7 @@
+===== OUTPUT / SCREENSHOT =====
+https://github.com/abhipsa14/breaking-bugsabhipsa
+
+
# Breaking Bug - Machine Learning Repository

<img src="https://images.prismic.io/ieeemuj/Zqu58B5LeNNTxuyE_BreakingBugBanner.png?auto=format,compress" alt="Breaking Bug Poster">
123 changes: 61 additions & 62 deletions breakingbug.py
@@ -21,7 +21,7 @@
from sklearn.impute import IterativeImputer

# 5. Machine Learning
-from sklearn.model import train_test_split,GridSearch, cross_val
+from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# 6. For Classification task.
from sklearn.linear_model import LogisticRegression
@@ -30,7 +30,7 @@
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
-from lightgbm import LGBM
+from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB

# 7. Metrics
@@ -137,7 +137,7 @@
print("___________________________________________________________")
print("Median of the dataset: ", df['age'].median())
print("___________________________________________________________")
-print ("Mode of the dataset: ",df('data')['age'].(pd.Series.mode))
+print("Mode of the dataset: ", df['age'].mode()[0])
print("___________________________________________________________")

# value count of cp column
@@ -185,10 +185,10 @@
imputer2 = IterativeImputer(max_iter=10, random_state=42)

# fit transform on the ca, oldpeak, chol and thalch columns
-df['ca'] = imputer_transform(ca)
-df['oldpeak']= imputer_transform(oldpeak)
-df['chol'] = imputer_transform(chol)
-df['thalch'] = imputer_transform(thalch)
+df['ca'] = imputer2.fit_transform(df[['ca']]).ravel()
+df['oldpeak'] = imputer2.fit_transform(df[['oldpeak']]).ravel()
+df['chol'] = imputer2.fit_transform(df[['chol']]).ravel()
+df['thalch'] = imputer2.fit_transform(df[['thalch']]).ravel()
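Note: IterativeImputer is a multivariate imputer, so fitting it one column at a time degenerates to a univariate fill. If the intent is for these columns to inform one another, a joint fit would look like this (a sketch, assuming df contains all four columns):

```python
# Sketch: impute the numeric columns jointly so each one informs the others.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- must precede the next import
from sklearn.impute import IterativeImputer

num_cols = ['ca', 'oldpeak', 'chol', 'thalch']
imputer2 = IterativeImputer(max_iter=10, random_state=42)
df[num_cols] = imputer2.fit_transform(df[num_cols])
```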



@@ -203,7 +203,7 @@
df.tail()

# find missing values.
-df.null().sum()[df.null()()<0].values(ascending=true)
+df.isnull().sum()[df.isnull().sum() > 0].sort_values(ascending=False)



@@ -240,24 +240,24 @@ def impute_categorical_missing_data(passed_col):
other_missing_cols = [col for col in missing_data_cols if col != passed_col]

label_encoder = LabelEncoder()
-for cols in Y.columns:
-    if Y[col].dtype == 'object' :
-        Y[col] = onehotencoder.fit_transform(Y[col].astype(str))
+for col in X.columns:
+    if X[col].dtype == 'object':
+        X[col] = label_encoder.fit_transform(X[col].astype(str))

if passed_col in bool_cols:
y = label_encoder.fit_transform(y)

-imputer = Imputer(estimator=RandomForestRegressor(random_state=16), add_indicator=True)
+imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=16), add_indicator=True)
for col in other_missing_cols:
-    cols_with_missing_value = Y[col].value.reshape(-100, 100)
-    imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
+    col_with_missing_values = X[col].values.reshape(-1, 1)
+    imputed_values = imputer.fit_transform(col_with_missing_values)
    X[col] = imputed_values[:, 0]
else:
pass
else:
pass

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier()  # keep a classifier: the column being imputed here is categorical

rf_classifier.fit(X_train, y_train)

@@ -269,19 +269,19 @@ def impute_categorical_missing_data(passed_col):

X = df_null.drop(passed_col, axis=1)

-for cols in Y.columns:
-    if Y[col].dtype == 'object' :
-        Y[col] = onehotencoder.fit_transform(Y[col].astype(str))
+for col in X.columns:
+    if X[col].dtype == 'object':
+        X[col] = label_encoder.fit_transform(X[col].astype(str))

for col in other_missing_cols:
-    cols_with_missing_value = Y[col].value.reshape(-100, 100)
-    imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
+    col_with_missing_values = X[col].values.reshape(-1, 1)
+    imputed_values = imputer.fit_transform(col_with_missing_values)
    X[col] = imputed_values[:, 0]

if len(df_null) > 0:
-    df[passed] = classifier.predict(X)
-    if passed in cols:
-        df[passed] = df[passed].map({0: False, 1: True})
+    df[passed_col] = rf_classifier.predict(X)
+    if passed_col in bool_cols:
+        df[passed_col] = df[passed_col].map({0: False, 1: True})
else:
pass
else:
@@ -303,15 +303,15 @@ def impute_continuous_missing_data(passed_col):

label_encoder = LabelEncoder()

-for cols in Y.columns:
-    if Y[col].dtype == 'object' :
-        Y[col] = onehotencoder.fit_transform(Y[col].astype(str))
+for col in X.columns:
+    if X[col].dtype == 'object':
+        X[col] = label_encoder.fit_transform(X[col].astype(str))

-imputer = Imputer(estimator=RandomForestRegressor(random_state=16), add_indicator=True)
+imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=16), add_indicator=True)

for col in other_missing_cols:
-    cols_with_missing_value = Y[col].value.reshape(-100, 100)
+    col_with_missing_values = X[col].values.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

@@ -323,23 +323,23 @@ def impute_continuous_missing_data(passed_col):

print("MAE =", mean_absolute_error(y_test, y_pred), "\n")
print("RMSE =", mean_squared_error(y_test, y_pred, squared=False), "\n")
print("R2 =", r2_score(y_test, y_pred), "\n")

X = df_null.drop(passed_col, axis=1)

-for cols in Y.columns:
-    if Y[col].dtype == 'object' :
-        Y[col] = onehotencoder.fit_transform(Y[col].astype(str))
+for col in X.columns:
+    if X[col].dtype == 'object':
+        X[col] = label_encoder.fit_transform(X[col].astype(str))

for col in other_missing_cols:
-    cols_with_missing_value = Y[col].value.reshape(-100, 100)
-    imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
+    col_with_missing_values = X[col].values.reshape(-1, 1)
+    imputed_values = imputer.fit_transform(col_with_missing_values)
    X[col] = imputed_values[:, 0]
else:
pass
else:
pass

if len(df_null) > 0:
-df_not_null[wrong_col] = rf_classifer.predict(X_train)
+df[passed_col] = rf_regressor.predict(X)
else:
pass

@@ -358,7 +358,7 @@ def impute_continuous_missing_data(passed_col):
print("Missing Values", col, ":", str(round((df[col].isnull().sum() / len(df)) * 100, 2))+"%")
if col in categorical_cols:
df[col] = impute_categorical_missing_data(col)
-elif col in numeric_cols:
+elif col in numerical_cols:
df[col] = impute_continuous_missing_data(col)
else:
pass
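Both helper functions above implement the same pattern: train a random forest on the rows where the column is observed, then predict it for the rows where it is missing. A condensed sketch of that pattern (the impute_with_rf name is mine, and it assumes the feature columns are already numerically encoded and NaN-free):

```python
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

def impute_with_rf(df, col, categorical):
    """Fit on rows where `col` is observed, predict it where it is missing."""
    features = [c for c in df.columns if c != col]  # assumed already numeric/encoded
    known = df[df[col].notnull()]
    model = (RandomForestClassifier(random_state=16) if categorical
             else RandomForestRegressor(random_state=16))
    model.fit(known[features], known[col])
    missing = df[col].isnull()
    if missing.any():
        df.loc[missing, col] = model.predict(df.loc[missing, features])
    return df[col]
```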
@@ -375,7 +375,7 @@ def impute_continuous_missing_data(passed_col):

plt.figure(figsize=(10,8))

for i, col in enumerate(cols):
    plt.subplot(3, 2, i + 1)
    sns.boxenplot(x=df[col], color=palette[i % len(palette)])  # Use modulo to cycle through colors
    plt.title(col)
@@ -398,7 +398,7 @@ def impute_continuous_missing_data(passed_col):



for i, col in enumerate(cols):
    plt.subplot(3, 2, i + 1)
    sns.boxenplot(x=df[col], color=palette[i % len(palette)])  # Use modulo to cycle through colors
plt.title(col)
@@ -419,7 +419,7 @@ def impute_continuous_missing_data(passed_col):

# Use the "night vision" palette for the plots
plt.figure(figsize=(10, 8))
for i, col in enumerate(cols):
    plt.subplot(3, 2, i + 1)
    sns.boxenplot(x=df[col], color=palette[i % len(palette)])  # Use modulo to cycle through colors
plt.title(col)
@@ -461,9 +461,9 @@ def impute_continuous_missing_data(passed_col):
"""encode X data using separate label encoder for all categorical columns and save it for inverse transform"""
# Task: Separate Encoder for all categorical and object columns and inverse transform at the end.
Label_Encoder = LabelEncoder()
-for cols in Y.columns:
-    if Y[col].dtype == 'object' :
-        Y[col] = onehotencoder.fit_transform(Y[col].astype(str))
+for col in X.columns:
+    if X[col].dtype == 'object':
+        X[col] = Label_Encoder.fit_transform(X[col].astype(str))
else:
pass
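The task comment above asks for a separate encoder per categorical column, saved for a later inverse transform. A sketch of that bookkeeping (the label_encoders dict is an assumed name, not the PR's):

```python
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()                       # one encoder per column
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le                  # keep it for inverse_transform
# later: X[col] = label_encoders[col].inverse_transform(X[col])
```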

@@ -474,13 +474,12 @@ def impute_continuous_missing_data(passed_col):


# import all models.
-from sklearn. import LogisticRegressions
+from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
-from lightgbm import LGBM
from sklearn.naive_bayes import GaussianNB

#importing pipeline
@@ -502,15 +501,15 @@ def impute_continuous_missing_data(passed_col):
# create a list of models to evaluate

models = [
-('Logistic Regression', LogisticReggression(random=42)),
+('Logistic Regression', LogisticRegression(random_state=42)),
('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
('KNeighbors Classifier', KNeighborsClassifier()),
('Decision Tree Classifier', DecisionTreeClassifier(random_state=42)),
('AdaBoost Classifier', AdaBoostClassifier(random_state=42)),
-('Random Forest', RandomForest(random=42)),
-('XGboost Classifier', XGB(random=42)),
+('Random Forest', RandomForestClassifier(random_state=42)),
+('XGBoost Classifier', XGBClassifier(random_state=42)),
-('Support Vector Machine', SVC(random=42)),
+('Support Vector Machine', SVC(random_state=42)),
('Naive Bayes Classifier', GaussianNB())

@@ -523,13 +522,13 @@ def impute_continuous_missing_data(passed_col):
#Iterate over the models and evaluate their performance
for name, model in models:
#create a pipeline for each model
-pipeline = Pip([
+pipeline = Pipeline([
    # ('imputer', SimpleImputer(strategy='most_frequent')),
    # ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('model', model)
])
# perform cross validation
-scores = val_score(pipeline, X_test, y_trest, cv=5)
+scores = cross_val_score(pipeline, X_train, y_train, cv=5)
# Calculate mean accuracy
mean_accuracy = scores.mean()
#fit the pipeline on the training data
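The rest of this loop is collapsed in the diff view; a self-contained sketch of the corrected evaluation loop, assuming the models list and the train/test split above (best_model and best_accuracy are assumed bookkeeping names):

```python
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

best_model, best_accuracy = None, 0.0
for name, model in models:
    # Wrap each estimator in a pipeline so preprocessing steps can be added later.
    pipe = Pipeline([('model', model)])
    # Cross-validate on the training split only; the test split stays held out.
    cv_accuracy = cross_val_score(pipe, X_train, y_train, cv=5).mean()
    pipe.fit(X_train, y_train)
    test_accuracy = pipe.score(X_test, y_test)
    print(f"{name}: CV accuracy={cv_accuracy:.3f}, test accuracy={test_accuracy:.3f}")
    if test_accuracy > best_accuracy:
        best_model, best_accuracy = pipe, test_accuracy
```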
@@ -565,22 +564,22 @@ def evaluate_classification_models(X, y, categorical_columns):
X_encoded = X.copy()
label_encoders = {}
for col in categorical_columns:
-    X_encoded[col] = onehotencoder().fit_transform(Y[col])
+    le = LabelEncoder()
+    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
+    label_encoders[col] = le

# Split data into train and test sets
-X_train, X_val, y_val, y_val = train_test_split(Y_encoded, y, val_size=0.2, random_state=42)
+X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "NB": GaussianNB(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

# Train and evaluate models
@@ -615,10 +614,10 @@ def hyperparameter_tuning(X, y, categorical_columns, models):
# Encode categorical columns
X_encoded = X.copy()
for col in categorical_columns:
-    X_encoded[col] = onehotencoder().fit_transform(Y[col])
+    X_encoded[col] = LabelEncoder().fit_transform(X_encoded[col].astype(str))

# Split data into train and test sets
-X_train, X_val, y_val, y_val = train_test_split(Y_encoded, y, val_size=0.2, random_state=42)
+X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Perform hyperparameter tuning for each model
for model_name, model in models.items():
@@ -661,15 +660,15 @@ def hyperparameter_tuning(X, y, categorical_columns, models):

# Define models dictionary
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "NB": GaussianNB(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier()
}
# Example usage:
results = hyperparameter_tuning(X, y, categorical_cols, models)
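The body of hyperparameter_tuning is collapsed in this view; for illustration, a hedged sketch of what a GridSearchCV-based tuning loop typically looks like (the parameter grids shown are placeholder assumptions, not the PR's values):

```python
from sklearn.model_selection import GridSearchCV

# Placeholder grids -- illustrative only; models without an entry fall back to defaults.
param_grids = {
    "Random Forest": {"n_estimators": [100, 300], "max_depth": [None, 10]},
    "KNN": {"n_neighbors": [3, 5, 7]},
}

best_estimators = {}
for model_name, model in models.items():
    grid = GridSearchCV(model, param_grids.get(model_name, {}), cv=5, scoring="accuracy")
    grid.fit(X_train, y_train)
    best_estimators[model_name] = grid.best_estimator_
    print(model_name, "best params:", grid.best_params_, "best CV score:", round(grid.best_score_, 3))
```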
3 changes: 3 additions & 0 deletions bug.py
@@ -0,0 +1,3 @@
+---- SORRY, NO CHANGES HERE ----
+---- ALL CHANGES WERE MADE IN THE "breakingbug.py" FILE ----
+--- Thank you ---