Skip to content

Commit 44a81ff

Browse files
committed
Add automatic phase assignment, which uses a random forest algorithm to analyse feature importance
1 parent 3feb3de commit 44a81ff

File tree

5 files changed

+365
-29
lines changed

5 files changed

+365
-29
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,5 @@ traj.*
1515
*.h5
1616
*.vtu
1717
*.png
18-
./output/
18+
./output/
19+
./tests/data/

compositionspace/datautils.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ def get_apt_dataframe(self):
230230

231231
if filename.endswith(".RRNG"):
232232
path = os.path.join(self.params["input_path"], filename)
233-
ions,rrngs = self.get_rrng(rrange_file)
233+
ions,rrngs = self.get_rrng(path)
234234

235235
return (df_Mass_POS_lst, file_name_lst, ions, rrngs)
236236

@@ -278,6 +278,7 @@ def get_big_slices(self):
278278
group1 = hdf.create_group("group_xyz_Da_spec")
279279
group1.attrs["columns"] = ["x","y","z","Da","spec"]
280280
group1.attrs["spec_name_order"] = list(c)
281+
self.chemical_species = list(c) # Added A.S. 2024.06.25
281282
sublength_x= abs((max(sorted_df['z'])-min(sorted_df['z']))/self.params["n_big_slices"])
282283

283284
start = min(sorted_df['z'])

compositionspace/segmentation.py

+93
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
from pyevtk.hl import gridToVTK#, pointsToVTKAsTIN
1414
import yaml
1515
import pyvista as pv
16+
from sklearn.mixture import GaussianMixture
17+
from sklearn.ensemble import RandomForestClassifier
1618

1719
class CompositionClustering():
1820

@@ -273,4 +275,95 @@ def plot3d(self, **kwargs):
273275
grid.plot(**kwargs, jupyter_backend="panel")
274276

275277

278+
def plot_relative_importance(self, feature_importances, feature_names, sorted_idx):
    """
    Show a vertical bar chart of feature importances.

    Bars are drawn in the order given by ``sorted_idx``; each bar is
    labelled with the matching entry of ``feature_names``.

    Parameters
    ----------
    feature_importances
        Importance values; indexed with ``sorted_idx`` as a whole, so it
        must support array-style (fancy) indexing.
    feature_names
        Sequence of labels, indexed element-wise by the entries of
        ``sorted_idx``.
    sorted_idx
        Sequence of integer positions defining the bar order.
    """
    plt.figure(figsize=(10, 12))
    plt.title("Feature Importances")

    # One bar slot per entry in sorted_idx.
    bar_positions = range(len(sorted_idx))
    bar_heights = feature_importances[sorted_idx]
    plt.bar(bar_positions, bar_heights, align="center")

    tick_labels = [feature_names[position] for position in sorted_idx]
    plt.xticks(bar_positions, tick_labels, rotation=45)

    plt.ylabel("Relative Importance")
    plt.xlabel("Features")
    plt.show()
288+
289+
def auto_phase_assign(self, Slices_file, Vox_ratios_file,
                      initial_guess_phases, plot=False,
                      print_importance=False,
                      modified_comp_analysis=None,
                      n_trunc_spec=None):
    """
    Rank chemical species by how strongly they separate the phases.

    The voxel composition ratios are clustered with a Gaussian mixture
    model into ``initial_guess_phases`` components; a random forest
    classifier is then trained to reproduce those cluster labels and its
    feature importances are used to rank the species.

    Parameters
    ----------
    Slices_file
        Path to an HDF5 file containing a ``group_xyz_Da_spec`` group whose
        attributes hold the species names (``spec_name_order``).
    Vox_ratios_file
        Path to an HDF5 file containing a ``vox_ratios`` dataset. The column
        names are read from the first file-level attribute and must include
        ``Total_no`` and ``vox``, which are dropped before clustering.
    initial_guess_phases
        Number of Gaussian mixture components used for the initial labelling.
    plot
        If truthy, show a bar chart of the feature importances.
    print_importance
        If truthy, print each species with its importance, most important
        first.
    modified_comp_analysis
        If truthy, refit Gaussian mixtures with 1..10 components on the
        ``n_trunc_spec`` most important species and plot the BIC curve.
    n_trunc_spec
        Number of top-ranked species kept for the BIC analysis; ``None``
        keeps all of them.

    Returns
    -------
    Array of feature indices sorted by descending importance.

    Raises
    ------
    ValueError
        If the number of ratio columns does not match the number of species
        names read from ``Slices_file``.
    """
    with h5py.File(Slices_file, "r") as hdfr:
        slice_attrs = hdfr.get("group_xyz_Da_spec").attrs
        if "spec_name_order" in slice_attrs:
            # Look the attribute up by name so the result does not depend on
            # the order in which h5py happens to iterate the attributes.
            chem_list = list(slice_attrs["spec_name_order"])
        else:
            # Fallback for files without the named attribute: keep the
            # original positional lookup.
            chem_list = list(list(slice_attrs.values())[1])

    with h5py.File(Vox_ratios_file, "r") as hdfr:
        ratios = np.array(hdfr.get("vox_ratios"))
        # NOTE(review): assumes the first file-level attribute holds the
        # column names -- confirm against the code that writes this file.
        ratio_columns = list(list(hdfr.attrs.values())[0])

    ratios_df = pd.DataFrame(data=ratios, columns=ratio_columns)
    X = ratios_df.drop(['Total_no', 'vox'], axis=1)

    # The importances are reported against chem_list, so both must describe
    # the same number of features.
    # NOTE(review): column order is assumed to match chem_list -- confirm.
    if X.shape[1] != len(chem_list):
        raise ValueError(
            f"Number of ratio columns ({X.shape[1]}) does not match the "
            f"number of chemical species ({len(chem_list)})"
        )

    # Unsupervised labelling of voxels into the guessed number of phases.
    gm = GaussianMixture(n_components=initial_guess_phases, max_iter=100000, verbose=0)
    gm.fit(X)
    y = gm.predict(X)

    # Supervised model trained on the cluster labels; its feature importances
    # tell which species drive the phase separation.
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X.values, y)
    feature_importances = rf.feature_importances_

    # Most important feature first.
    sorted_idx = feature_importances.argsort()[::-1]
    feature_names = chem_list

    if plot:
        self.plot_relative_importance(feature_importances, feature_names, sorted_idx)

    if print_importance:
        for index in sorted_idx:
            print(f" {feature_names[index]} - Importance: {feature_importances[index]}")

    # BIC analysis on compositions truncated to the most important species.
    if modified_comp_analysis:
        X_modified = X.values[:, sorted_idx][:, 0:n_trunc_spec]

        # A mixture cannot have more components than samples, so cap the
        # sweep for very small inputs instead of failing mid-loop.
        max_clusters = min(10, X_modified.shape[0])
        n_clusters = list(range(1, max_clusters + 1))

        bics = []
        for n_cluster in tqdm(n_clusters):
            gm = GaussianMixture(n_components=n_cluster, verbose=0)
            gm.fit(X_modified)
            bics.append(gm.bic(X_modified))

        plt.plot(n_clusters, bics, "-o", label="BIC")
        plt.xlabel("Number of clusters")
        plt.ylabel("BIC")
        plt.legend()
        plt.show()

    return sorted_idx
368+
369+

tests/20230303_Workflow.ipynb

+266-25
Large diffs are not rendered by default.

tests/experiment_params.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,5 @@ ml_models:
1414
max_depth: 0
1515
n_estimators: 0
1616
DBScan:
17-
eps: 3
18-
min_samples: 5
17+
eps: 5
18+
min_samples: 25

0 commit comments

Comments
 (0)