Merge pull request #61 from llegregam/56-utilisation-de-la-colonne-no…

…rmalized-area-pour-les-ratios-c12c13-skyline 56 utilisation de la colonne normalized area pour les ratios c12c13 skyline
llegregam · Sep 9, 2024 · 58aed0b · 58aed0b
2 parents 90613c1 + 0821176
commit 58aed0b
Show file tree

Hide file tree

Showing 5 changed files with 185 additions and 75 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,21 @@
 # Changelog
 
-## [1.7.0] - 2024-04-05
+## [1.7.0] - 2024-09-09
 
 ### Added
 
 - Changes introduced in new versions are now explicitly detailed in the graphical user interface
 
+### Changed
+
+- Ratios are no longer calculated; they are read directly from the "Response Ratio" column (Skyline "Normalized Area") of the input data
+
+### Fixed
+
+- Fixed bug on launch where the program wouldn't start because of multiple 
+  calls to set_config
+- Removed deprecated pandas functions
+
 ## [1.6.2] - 2024-03-27
 
 ### Added

diff --git a/ms_reader/app.py b/ms_reader/app.py
@@ -75,7 +75,7 @@ def df_format(x):
     return x.astype(str)
 
 
-@st.cache
+@st.cache_data
 def convert_df(df):
     """
     Convert dataframe to excel file stored in RAM and return it
@@ -88,6 +88,7 @@ def convert_df(df):
     df.to_excel(buffer)
     return buffer.getvalue()
 
+# BEGINNING OF APP
 
 st.set_page_config(page_title=f"MS_Reader (v{__version__})")
 st.title(f"Welcome to MS_Reader (v{__version__})")

diff --git a/ms_reader/extract.py b/ms_reader/extract.py
@@ -63,7 +63,7 @@ def __init__(self, data, calrep=None, metadata=None, met_class="CM"):
 
         self.data.drop("Filename", axis=1, inplace=True)
         columns = [
-            "Compound", "Sample_Name", "Area", "Sample Type",
+            "Compound", "Sample_Name", "Area", "Response Ratio", "Sample Type",
             "Calculated Amt", "Theoretical Amt",
             "Excluded", "%Diff"
         ]
@@ -274,7 +274,10 @@ def _replace_nf(self):
         Replace initial N/F with 0
         :return: None
         """
-        self.data["Area"] = self.data["Area"].replace("N/F", 0)
+        # Opt in to the future pandas behaviour: replace() no longer silently downcasts object columns
+        pd.set_option('future.no_silent_downcasting', True)
+
+        self.data["Area"] = self.data["Area"].replace("N/F", 0).infer_objects(copy=False)
         self.data["Calculated Amt"] = self.data["Calculated Amt"].replace("N/F", 0).copy()
 
     def _split_dataframes(self):
@@ -325,6 +328,8 @@ def _get_excluded(self):
         self.calib_data["Calculated Amt"] = pd.to_numeric(
             self.calib_data["Calculated Amt"]
         )
+
+        self.calib_data["Calculated Amt"] = self.calib_data["Calculated Amt"].astype(object)
         self.calib_data.loc[
             self.calib_data["Excluded"] == "True", "Calculated Amt"
         ] = "Excluded"
@@ -708,12 +713,21 @@ def generate_concentrations_table(self, loq_export, base_unit=None):
                 )
 
     def generate_ratios(self):
+        """
+        Generate the table of ratios between c12 and c13.
+        - Check that every c12 metabolite has a c13 counterpart; drop those that do not
+        - Read the ratios from the "Response Ratio" column of the c12 rows
+            (ratio c12/c13 taking into account the subrogation of the c13 signal).
+        - Then convert the ratios to numeric and format the tables for export.
+
+        """
 
-        # Isolate missing c13 compounds
+        # Isolate c12 and c13 data
         c12 = self.sample_data[
             ~self.sample_data["Compound"].str.contains("C13")].copy()
         c13 = self.sample_data[
-            self.sample_data["Compound"].str.contains("C13")].copy()
+            self.sample_data["Compound"].str.contains("C13")].copy() 
+        # Use the _check_if_std method to check for missing c13 compounds   
         c12_compounds, missing_c13_std = self._check_if_std(
             list(c12["Compound"].unique()), list(c13["Compound"].unique())
         )
@@ -722,102 +736,175 @@ def generate_ratios(self):
                 f"Metabolites missing from IDMS: \n{missing_c13_std}")
         else:
             self.logger.info("All metabolites are present in the IDMS")
-
-        # Drop missing compounds from c12 df
+
         c13.loc[:, "Compound"] = c13.loc[:, "Compound"].str.slice(0, -4)
+        # Set indexes and sort them for both c12 and c13 dataframes to ensure they are the same
         c12.set_index(["Compound", "Sample_Name"], inplace=True)
         c13.set_index(["Compound", "Sample_Name"], inplace=True)
         c12.sort_index(level=['Compound', 'Sample_Name'], inplace=True)
         c13.sort_index(level=['Compound', 'Sample_Name'], inplace=True)
+        # Drop missing compounds from c12 df
         if missing_c13_std:
             c12.drop(missing_c13_std, inplace=True)
-        to_log = pd.pivot_table(
-            c13[c13['Area'] == 0], 'Area', 'Compound', 'Sample_Name'
-        )
-        self.logger.warning(
-            f"\nMetabolites with null areas in c13 data:\n"
-            f"{to_log}\n")
-
-        # Ensure that c12 and C13 have same indexes. Check both ways and
-        # isolate missing indexes. Compute ratios
-        if c12.index.difference(c13.index).levshape != (
-                0, 0) and c13.index.difference(c12.index).levshape != (0, 0):
-            c12_diff = c12.index.difference(c13.index)
-            c13_diff = c13.index.difference(c12.index)
-            intercept = c12.index.intersection(c13.index)
-            self.ratios = c12.loc[intercept, "Area"].divide(
-                c13.loc[intercept, "Area"])
-            self.ratios.name = "Ratios"
-            self.no_ratio = {
-                "c12": c12.loc[c12_diff, :],
-                "c13": c13.loc[c13_diff, :]
-            }
-            self.logger.debug(
-                f"Some index levels are in C12 data and not in C13 data. "
-                f"Differences:\n{self.no_ratio['c12']} "
-                f"\n Some index levels are in C13 data and not in C12 data. "
-                f"Differences: \n{self.no_ratio['c13']}")
-        else:
-            if c12.index.difference(c13.index).levshape != (0, 0):
-                c12_c13_diff = c12.index.difference(c13.index)
-                self.ratios = c12.drop(c12_c13_diff).loc[
-                    c12_c13_diff, "Area"].divide(c13.loc[:, "Area"])
-                self.ratios.name = "Ratios"
-                self.no_ratio = c12.loc[c12_c13_diff, :]
-                self.logger.info(
-                    f"Some index levels are in C12 data and not in C13 data. "
-                    f"Differences:\n{c12_c13_diff}"
-                )
-                print(f"Ratios calculated:\n{self.ratios}")
-            elif c13.index.difference(c12.index).levshape != (0, 0):
-                c13_c12_diff = c13.index.difference(c12.index)
-                self.ratios = c12.loc[:, "Area"].divide(
-                    c13.drop(c13_c12_diff).loc[:, "Area"])
-                self.ratios.name = "Ratios"
-                self.no_ratio = c13.loc[c13_c12_diff, :]
-                self.logger.info(
-                    f"Some index levels are in C13 data and not in C12 data. "
-                    f"Differences:\n{c13_c12_diff}"
-                )
-                print(f"Ratios calculated:\n{self.ratios}")
-            else:
-                self.ratios = c12.loc[:, "Area"].divide(c13.loc[:, "Area"])
-                self.ratios.name = "Ratios"
-                print(
-                    f"Ratios calculated with no differences detected between "
-                    f"c12 and c13 indexes. Ratios:\n{self.ratios}"
-                )
+
+        # "Response Ratio" contains the c12/c13 ratio (taking c13 signal subrogation into account) for each compound and sample
+        self.ratios = c12["Response Ratio"]
+        self.ratios.name = "Ratios"
+
         self.ratios = self.ratios.reset_index(level="Sample_Name")
+        # Values in the ratios column are in scientific notation, convert them to numeric
+        self.ratios["Ratios"] = pd.to_numeric(self.ratios["Ratios"], errors="coerce")
         self.ratios = pd.pivot_table(self.ratios, "Ratios", "Compound",
-                                     "Sample_Name")
+                                    "Sample_Name")
+
+        # Add a unit column to the ratios table
         base_unit = "12C/13C"
         new_cols = natsorted(self.ratios.columns)
         new_cols.insert(0, "unit")
+
         if self.metadata is not None:
             self.normalised_ratios = self.normalise(
                 self.ratios.copy(),
                 multiply=False
             )
-            self.normalised_ratios = self.normalised_ratios.applymap(format)
+            self.normalised_ratios = self.normalised_ratios.map(format)
             self.normalised_ratios["unit"] = f"{base_unit}/{self.norm_unit}"
             self.normalised_ratios = self.normalised_ratios[new_cols]
             self.normalised_ratios = self._replace(
                 self.normalised_ratios, [np.inf, np.nan], "NA", "dataframe"
             )
-        self.ratios = self.ratios.applymap(format)
+
+
+        self.ratios = self.ratios.map(format)
         self.ratios["unit"] = base_unit
         self.ratios = self.ratios[new_cols]
         self.ratios = self._replace(
             self.ratios, [np.inf, np.nan], "NA", "dataframe"
         )
+
         self.excel_tables.append(
             ("Ratios", self.ratios)
         )
+
         if self.metadata is not None:
             self.excel_tables.append(
                 ("Normalised_Ratios", self.normalised_ratios)
             )
-
+
+    # def generate_ratios(self):
+
+    #     # Isolate missing c13 compounds
+    #     c12 = self.sample_data[
+    #         ~self.sample_data["Compound"].str.contains("C13")].copy()
+    #     c13 = self.sample_data[
+    #         self.sample_data["Compound"].str.contains("C13")].copy()
+    #     c12_compounds, missing_c13_std = self._check_if_std(
+    #         list(c12["Compound"].unique()), list(c13["Compound"].unique())
+    #     )
+    #     if missing_c13_std:
+    #         self.logger.info(
+    #             f"Metabolites missing from IDMS: \n{missing_c13_std}")
+    #     else:
+    #         self.logger.info("All metabolites are present in the IDMS")
+
+    #     # Drop missing compounds from c12 df
+    #     c13.loc[:, "Compound"] = c13.loc[:, "Compound"].str.slice(0, -4)
+    #     c12.set_index(["Compound", "Sample_Name"], inplace=True)
+    #     c13.set_index(["Compound", "Sample_Name"], inplace=True)
+    #     c12.sort_index(level=['Compound', 'Sample_Name'], inplace=True)
+    #     c13.sort_index(level=['Compound', 'Sample_Name'], inplace=True)
+    #     if missing_c13_std:
+    #         c12.drop(missing_c13_std, inplace=True)
+    #     to_log = pd.pivot_table(
+    #         c13[c13['Area'] == 0], 'Area', 'Compound', 'Sample_Name'
+    #     )
+    #     self.logger.warning(
+    #         f"\nMetabolites with null areas in c13 data:\n"
+    #         f"{to_log}\n")
+
+    #     # Ensure that c12 and C13 have same indexes. Check both ways and
+    #     # isolate missing indexes. Compute ratios
+    #     if c12.index.difference(c13.index).levshape != (
+    #             0, 0) and c13.index.difference(c12.index).levshape != (0, 0):
+    #         c12_diff = c12.index.difference(c13.index)
+    #         c13_diff = c13.index.difference(c12.index)
+    #         intercept = c12.index.intersection(c13.index)
+    #         self.ratios = c12.loc[intercept, "Area"].divide(
+    #             c13.loc[intercept, "Area"])
+    #         self.ratios.name = "Ratios"
+    #         self.no_ratio = {
+    #             "c12": c12.loc[c12_diff, :],
+    #             "c13": c13.loc[c13_diff, :]
+    #         }
+
+    #         self.logger.debug(
+    #             f"Some index levels are in C12 data and not in C13 data. "
+    #             f"Differences:\n{self.no_ratio['c12']} "
+    #             f"\n Some index levels are in C13 data and not in C12 data. "
+    #             f"Differences: \n{self.no_ratio['c13']}")
+
+    #     else:
+    #         if c12.index.difference(c13.index).levshape != (0, 0):
+    #             c12_c13_diff = c12.index.difference(c13.index)
+    #             self.ratios = c12.drop(c12_c13_diff).loc[
+    #                 c12_c13_diff, "Area"].divide(c13.loc[:, "Area"])
+    #             self.ratios.name = "Ratios"
+    #             self.no_ratio = c12.loc[c12_c13_diff, :]
+    #             self.logger.info(
+    #                 f"Some index levels are in C12 data and not in C13 data. "
+    #                 f"Differences:\n{c12_c13_diff}"
+    #             )
+    #             print(f"Ratios calculated:\n{self.ratios}")
+    #         elif c13.index.difference(c12.index).levshape != (0, 0):
+    #             c13_c12_diff = c13.index.difference(c12.index)
+    #             self.ratios = c12.loc[:, "Area"].divide(
+    #                 c13.drop(c13_c12_diff).loc[:, "Area"])
+    #             self.ratios.name = "Ratios"
+    #             self.no_ratio = c13.loc[c13_c12_diff, :]
+    #             self.logger.info(
+    #                 f"Some index levels are in C13 data and not in C12 data. "
+    #                 f"Differences:\n{c13_c12_diff}"
+    #             )
+    #             print(f"Ratios calculated:\n{self.ratios}")
+    #         else:
+    #             self.ratios = c12.loc[:, "Area"].divide(c13.loc[:, "Area"])
+    #             self.ratios.name = "Ratios"
+    #             print(
+    #                 f"Ratios calculated with no differences detected between "
+    #                 f"c12 and c13 indexes. Ratios:\n{self.ratios}"
+    #             )
+    #     self.ratios = self.ratios.reset_index(level="Sample_Name")
+    #     self.ratios = pd.pivot_table(self.ratios, "Ratios", "Compound",
+    #                                  "Sample_Name")
+
+    #     base_unit = "12C/13C"
+    #     new_cols = natsorted(self.ratios.columns)
+    #     new_cols.insert(0, "unit")
+    #     if self.metadata is not None:
+    #         self.normalised_ratios = self.normalise(
+    #             self.ratios.copy(),
+    #             multiply=False
+    #         )
+    #         self.normalised_ratios = self.normalised_ratios.applymap(format)
+    #         self.normalised_ratios["unit"] = f"{base_unit}/{self.norm_unit}"
+    #         self.normalised_ratios = self.normalised_ratios[new_cols]
+    #         self.normalised_ratios = self._replace(
+    #             self.normalised_ratios, [np.inf, np.nan], "NA", "dataframe"
+    #         )
+    #     self.ratios = self.ratios.map(format)
+    #     self.ratios["unit"] = base_unit
+    #     self.ratios = self.ratios[new_cols]
+    #     self.ratios = self._replace(
+    #         self.ratios, [np.inf, np.nan], "NA", "dataframe"
+    #     )
+    #     self.excel_tables.append(
+    #         ("Ratios", self.ratios)
+    #     )
+    #     if self.metadata is not None:
+    #         self.excel_tables.append(
+    #             ("Normalised_Ratios", self.normalised_ratios)
+    #         )
+
     @staticmethod
     def _check_if_std(c12_compounds, c13_compounds):
         """
@@ -1162,3 +1249,13 @@ class QCError(Error):
 
     def __init__(self, message):
         self.message = message
+
+# if __name__ == "__main__":
+#     from ms_reader.skyline_convert import import_skyline_dataset
+#     with open(r"C:\Users\kouakou\Documents\MSREADER\data\20240715_GUILLOT_HILIC-POSNEG_QUANT_sansAA-NEG.tsv", "rb") as file:
+#         data = import_skyline_dataset(file)
+#         # data.to_excel(r"C:\Users\kouakou\Documents\MSREADER\data\test2.xlsx")
+
+#         extract = Extractor(data)
+#         extract.generate_ratios()
+
diff --git a/ms_reader/skyline_convert.py b/ms_reader/skyline_convert.py
@@ -14,7 +14,8 @@
     "Quantification": "Calculated Amt",
     "Explicit Analyte Concentration": "Theoretical Amt",
     "Exclude From Calibration": "Excluded",
-    "Accuracy": "%Diff"
+    "Accuracy": "%Diff",
+    "Normalized Area": "Response Ratio"
 }
 
 SAMPLE_TYPE_MAPPING = {
@@ -155,8 +156,9 @@ def import_skyline_dataset(skyline_file):
     return data
 
 
-if __name__ == "__main__":
+# if __name__ == "__main__":
 
-
-    data = import_skyline_dataset(r"C:\Users\legregam\PycharmProjects\MSReader\tests\data\skyline\Quantif-MC.csv")
-    data.to_excel(r"C:\Users\legregam\Desktop\test\test2.xlsx")
+#     with open(r"C:\Users\kouakou\Documents\MSREADER\data\20240715_GUILLOT_HILIC-POSNEG_QUANT_sansAA-NEG.tsv", "rb") as file:
+#         data = import_skyline_dataset(file)
+#         print(data["Response Ratio"])
+#         # data.to_excel(r"C:\Users\kouakou\Documents\MSREADER\data\test2.xlsx")
diff --git a/setup.cfg b/setup.cfg
@@ -18,11 +18,11 @@ classifiers =
 packages = find:
 python_requires = >=3.8
 install_requires =
-    pandas >= 1.3.4
+    pandas >= 2.0.3
     numpy >= 1.21.4
     natsort >= 8.0.0
     streamlit>=1.8.0
-    openpyxl >= 3.0.9
+    openpyxl >= 3.1.0
 
 [options.entry_points]
 console_scripts =