Skip to content

Commit

Permalink
Analysis: Improve
Browse files (browse the repository at this point in the history)
  • Loading branch information
lrolando committed Jun 14, 2024
1 parent ef911b6 commit 2fd3ebf
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 49 deletions.
18 changes: 12 additions & 6 deletions ceruleo/dataset/analysis/sample_rate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,25 @@
import numpy as np
import pandas as pd
from pydantic import BaseModel

from typing import List
from ceruleo.dataset.ts_dataset import AbstractPDMDataset
from ceruleo.utils import pydantic_to_dict

logger = logging.getLogger(__name__)


class SampleRateAnalysis(BaseModel):
mode: float
median: float
mean: float
std: float
unit: str

def to_pandas(self) -> pd.Series:
return pd.Series(pydantic_to_dict(self)).to_frame().T

def __repr__(self) -> str:
return f"Mode: {self.median} | {self.mean} +- {self.std} [{self.unit}]"


def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:
"""Obtain an array of time differences between consecutive samples
Expand All @@ -33,9 +37,10 @@ def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:
Array of time differences
"""
time_diff = []
time_diff : List[float ]= []
for life in ds:
diff = np.diff(life.index.values)
diff = diff[diff <= np.median(diff)]
if pd.api.types.is_timedelta64_ns_dtype(diff.dtype):
diff = diff / np.timedelta64(1, unit)
time_diff.extend(diff)
Expand All @@ -44,10 +49,10 @@ def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:


def sample_rate_summary(
ds: AbstractPDMDataset, unit: Optional[str] = "s"
ds: AbstractPDMDataset, unit: str = "s"
) -> SampleRateAnalysis:
"""
Obtain the mean, mode and standard deviation of the sample rate of the dataset
Obtain the mean, median and standard deviation of the sample rate of the dataset
Parameters:
ds: The dataset
Expand All @@ -60,5 +65,6 @@ def sample_rate_summary(
return SampleRateAnalysis(
mean=np.mean(sr),
std=np.std(sr),
mode=pd.Series(sr).mode().values[0],
median=np.median(sr),
unit=unit
)
18 changes: 8 additions & 10 deletions ceruleo/dataset/ts_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,6 @@ def number_of_samples_of_time_series(self, i: int) -> int:
def rul_column(self) -> str:
raise NotImplementedError

def duration(self, life: pd.DataFrame) -> float:
return life[self.rul_column].max()

def number_of_samples(self) -> List[int]:
return [
self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self)))
]

def duration(self, life: pd.DataFrame) -> float:
"""Obtain the duration of the time-series
Expand All @@ -82,8 +74,14 @@ def duration(self, life: pd.DataFrame) -> float:
Returns:
Duration of the life
"""
v = life.index
return v.max() - v.min()
return life[self.rul_column].max()

def number_of_samples(self) -> List[int]:
return [
self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self)))
]



def durations(self, show_progress: bool = False) -> List[float]:
"""
Expand Down
78 changes: 45 additions & 33 deletions ceruleo/graphics/duration.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
import numpy as np
import seaborn as sns
from ceruleo.dataset.ts_dataset import AbstractPDMDataset

from datetime import timedelta
from typing import Iterable
import pandas as pd

def add_vertical_line(ax, v_x, label, color, line, n_lines):

Expand All @@ -29,8 +31,8 @@ def durations_histogram(
label: Union[str, List[str]] = "1",
bins: int = 15,
units: str = "m",
vlines: Tuple[float, str] = [],
ax:matplotlib.axes.Axes=None,
vlines: List[Tuple[float, str]] = [],
ax:Optional[matplotlib.axes.Axes]=None,
add_mean: bool = True,
add_median: bool = True,
transform: Callable[[float], float] = lambda x: x,
Expand Down Expand Up @@ -68,10 +70,13 @@ def durations_histogram(
"""
if isinstance(datasets, list):
assert isinstance(label,list)
assert len(datasets) == len(label)
label_list = label
else:
assert isinstance(label, str)
datasets = [datasets]
label = [label]
label_list = [label]

durations = []
for ds in datasets:
Expand All @@ -80,7 +85,7 @@ def durations_histogram(
return histogram_from_durations(
durations,
xlabel=xlabel,
label=label,
label=label_list,
bins=bins,
units=units,
vlines=vlines,
Expand All @@ -93,50 +98,57 @@ def durations_histogram(
)



def histogram_from_durations(
durations: Union[List[float], List[List[float]]],
durations: List[List[float]],
xlabel: str,
label: Union[str, List[str]] = "",
label: List[str],
bins: int = 15,
units: str = "m",
vlines: List[Tuple[float, str]] = [],
ax=None,
add_mean: bool = True,
add_median: bool = True,
threshold: float = np.inf,
threshold: float = np.inf,
color=None,
alpha=1.0,
**kwargs,
**kwargs
) -> matplotlib.axes.Axes:

if ax is None:
_, ax = plt.subplots(1, 1, **kwargs)

if isinstance(durations[0], list):
assert isinstance(label, list)
assert len(durations) == len(label)
else:
durations = [durations]
label = [label]

assert isinstance(label, list)
assert len(durations) == len(label)


elem_is_timedelta = isinstance(durations[0][0], timedelta)




for l, dur in zip(label, durations):
if len(l) > 0:
l += " "
vlines = copy(vlines)
durations_array = np.array(dur)
if elem_is_timedelta:
durations_array = durations / pd.Timedelta(1, units)
if add_mean:
vlines.append((np.mean(dur), l + "Mean"))
vlines.append((float(np.mean(durations_array)), l + "Mean"))
if add_median:
vlines.append((np.median(dur), l + "Median"))
dur = [d for d in dur if d < threshold]
ax.hist(dur, bins, color=color, alpha=alpha, label=l)
vlines.append((float(np.median(durations_array)), l + "Median"))
durations_array = durations_array[durations_array<threshold]
ax.hist(durations_array, bins, color=color, alpha=alpha, label=l)

ax.set_xlabel(xlabel)
ax.set_ylabel("Number of run-to-failure cycles")

colors = sns.color_palette("hls", len(vlines))
for i, (v_x, l) in enumerate(vlines):
label = f"{l}: {v_x:.2f} {units}"
add_vertical_line(ax, v_x, label, colors[i], i, len(vlines))
vertical_label = f"{l}: {v_x:.2f} {units}"
add_vertical_line(ax, v_x, vertical_label, colors[i], i, len(vlines))
ax.legend()

return ax
Expand Down Expand Up @@ -178,18 +190,22 @@ def durations_boxplot(
"""
if isinstance(datasets, list):
assert isinstance(xlabel, list)
assert isinstance(datasets, list)
assert len(datasets) == len(xlabel)
xlabel_list = xlabel
datasets_list = datasets
else:
datasets = [datasets]
xlabel = [xlabel]
assert isinstance(xlabel, str)
datasets_list = [datasets]
xlabel_list = [xlabel]

durations = []
for ds in datasets:
for ds in datasets_list:
durations.append([transform(duration) for duration in ds.durations()])

return boxplot_from_durations(
durations,
xlabel=xlabel,
xlabel=xlabel_list,
ylabel=ylabel,
ax=ax,
hlines=hlines,
Expand All @@ -200,8 +216,8 @@ def durations_boxplot(


def boxplot_from_durations(
durations: Union[List[float], List[List[float]]],
xlabel: Union[str, List[str]],
durations: List[List[float]],
xlabel: List[str],
ylabel: str,
ax=None,
hlines: List[Tuple[float, str]] = [],
Expand All @@ -210,12 +226,8 @@ def boxplot_from_durations(
**kwargs,
)-> matplotlib.axes.Axes:

if isinstance(durations[0], list):
assert isinstance(xlabel, list)
assert len(durations) == len(xlabel)
else:
durations = [durations]
xlabel = [xlabel]
assert isinstance(xlabel, list)
assert len(durations) == len(xlabel)

if ax is None:
fig, ax = plt.subplots(**kwargs)
Expand Down

0 comments on commit 2fd3ebf

Please sign in to comment.