Skip to content

Commit

Permalink
🐛🔥 remove old code, document and fix bug in scikit-learn interface
Browse files Browse the repository at this point in the history
- the scikit-learn interface for collaborative filtering (CF) did not specify the validation split correctly
  • Loading branch information
Henry committed Jun 2, 2024
1 parent 0874f5f commit 077d305
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 32 deletions.
31 changes: 12 additions & 19 deletions vaep/models/collab.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import pandas as pd
# import explicit objects for functional annotations
from fastai.collab import *
from fastai.collab import (Categorify, CollabDataLoaders, IndexSplitter,
TabularCollab, TransformBlock)
from fastai.collab import (Categorify, IndexSplitter, TabularCollab,
TransformBlock)
from fastai.tabular.all import *

import vaep.io.dataloaders
Expand Down Expand Up @@ -49,37 +49,30 @@ def __init__(self,
item_column: str = 'peptide',
target_column: str = 'intensity',
model_kwargs: dict = None,
batch_size: int = 64):
batch_size: int = 1_024):
if datasplits.val_y is not None:
self.X, self.frac = combine_data(datasplits.train_X,
datasplits.val_y)
self.X, _ = combine_data(datasplits.train_X,
datasplits.val_y)
else:
self.X, self.frac = datasplits.train_X.reset_index(), 0.0
self.X, _ = datasplits.train_X.reset_index(), 0.0
self.batch_size = batch_size
self.dls = CollabDataLoaders.from_df(self.X, valid_pct=self.frac,
seed=42,
user_name=sample_column,
item_name=item_column,
rating_name=target_column,
bs=self.batch_size)
user_name = sample_column
item_name = item_column
rating_name = target_column
cat_names = [user_name, item_name]
ratings = self.X
splits = None
if datasplits.val_y is not None:
idx_splitter = IndexSplitter(
list(range(len(datasplits.train_X), len(datasplits.train_X) + len(datasplits.val_y))))
list(range(len(datasplits.train_X), len(self.X))))
splits = idx_splitter(self.X)
to = TabularCollab(
ratings,
[Categorify],
cat_names,
self.to = TabularCollab(
self.X,
procs=[Categorify],
cat_names=cat_names,
y_names=[rating_name],
y_block=TransformBlock(),
splits=splits)
self.dls = to.dataloaders(path='.', bs=self.batch_size)
self.dls = self.to.dataloaders(path='.', bs=self.batch_size)
self.params = {}
if model_kwargs is None:
model_kwargs = {}
Expand Down
22 changes: 9 additions & 13 deletions vaep/sklearn/cf_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
from fastai import learner
from fastai.callback.tracker import EarlyStoppingCallback
from fastai.collab import *
from fastai.collab import CollabDataLoaders, EmbeddingDotBias, TabularCollab
from fastai.collab import EmbeddingDotBias, TabularCollab
from fastai.data.block import TransformBlock
from fastai.data.transforms import IndexSplitter
from fastai.learner import Learner
from fastai.losses import MSELossFlat
from fastai.tabular.all import *
from fastai.tabular.all import TransformBlock
from fastai.tabular.core import Categorify
from fastai.torch_core import default_device
from sklearn.base import BaseEstimator, TransformerMixin
Expand Down Expand Up @@ -89,21 +90,16 @@ def fit(self, X: pd.Series, y: pd.Series = None,
if not cuda:
default_device(use=False) # set to cpu
if y is not None:
X, frac = collab.combine_data(X, y)
# Concatenate train and validation observations into one dataframe
first_N_train = len(X)
X, _ = collab.combine_data(X, y)
else:
X, frac = X.reset_index(), 0.0

self.dls = CollabDataLoaders.from_df(
X,
valid_pct=frac,
seed=42,
user_name=self.sample_column,
item_name=self.item_column,
rating_name=self.target_column,
bs=self.batch_size)
X, _ = X.reset_index(), 0.0

splits = None
if y is not None:
idx_splitter = IndexSplitter(list(range(len(X), len(X) + len(y))))
# specify positional indices of validation data
idx_splitter = IndexSplitter(list(range(first_N_train, len(X))))
splits = idx_splitter(X)

self.cat_names = [self.sample_column, self.item_column]
Expand Down

0 comments on commit 077d305

Please sign in to comment.