Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Command line and QIIME2 -- refactoring #16

Merged
merged 24 commits into from
Aug 28, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added Icon
Empty file.
44 changes: 44 additions & 0 deletions gemelli/_ctf_defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Configuration file where you can set the parameter default values and
# descriptions.
DEFAULT_COMP = 3
DEFAULT_MSC = 0
DEFAULT_MFC = 5
DEFAULT_MAXITER = 50
DEFAULT_FMETA = None
DEFAULT_COND = None

DESC_INIT = ("The number of initialization vectors. Larger values will"
"give more accurate factorization but will be more "
"computationally expensive (suggested to be below 100; beware of"
" overfitting) [minimum 1]")
DESC_ITERATIONSALS = ("Max number of Alternating Least Square (ALS)"
" optimization iterations (suggested to be below 100;"
" beware of overfitting) [minimum 1]")
DESC_ITERATIONSRTPM = ("Max number of Robust Tensor Power Method (RTPM)"
" optimization iterations (suggested to be below 100;"
" beware of overfitting) [minimum 1]")
DESC_COMP = ("The underlying low-rank structure (suggested: 1 < rank < 10)"
" [minimum 2]")
DESC_MSC = "Minimum sum cutoff of sample across all features"
DESC_MFC = "Minimum sum cutoff of features across all samples"
DESC_OUT = "Location of output files."
DESC_FMETA = "Feature metadata file in QIIME2 formatting."
DESC_BIN = "Input table in biom format."
DESC_SMETA = "Sample metadata file in QIIME2 formatting."
DESC_SUBJ = ("Metadata column containing subject IDs to"
" use for pairing samples. WARNING: if"
" replicates exist for an individual ID at"
" either state_1 to state_N, that subject will"
" be mean grouped by default.")
DESC_COND = ("Metadata column containing state (e.g.,Time, BodySite)"
" across which samples are paired."
" At least one is required but up to four are allowed"
" by other state inputs.")
QORD = ("Compositional biplot of subjects as points and features as arrows."
" Where the variation between subject groupings is explained by the"
" log-ratio between opposing arrows. WARNING: The % variance explained"
" is spread over n_components and can be inflated.")
QDIST = (
"A sample-sample distance matrix generated from the euclidean distance"
" of the subject-state ordinations and itself.")
QLOAD = "An ordinational loading of subjects over states."
33 changes: 15 additions & 18 deletions gemelli/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,10 @@ class _BaseImpute(object):
def fit(self):
""" Placeholder for fit this
should be implemetned by sub-method"""

def transform(self):
""" return loadings
"""
return self.sample_loading, \
self.feature_loading, \
self.conditional_loading
@abstractmethod
def label(self):
""" Placeholder for fit this
should be implemetned by sub-method"""


class _BaseConstruct(object):
Expand All @@ -36,15 +33,15 @@ class _BaseConstruct(object):
"""
@abstractmethod
def construct(self):
"""
conditional_loading : array-like or list of array-like
The conditional loading vectors
of shape (conditions, r) if there is 1 type
of condition, and a list of such matrices if
there are more than 1 type of condition
feature_loading : array-like
The feature loading vectors
of shape (features, r)
sample_loading : array-like
The sample loading vectors
"""
conditional_loading : array-like or list of array-like
The conditional loading vectors
of shape (conditions, r) if there is 1 type
of condition, and a list of such matrices if
there are more than 1 type of condition
feature_loading : array-like
The feature loading vectors
of shape (features, r)
sample_loading : array-like
The sample loading vectors
of shape (samples, r) """
14 changes: 14 additions & 0 deletions gemelli/citations.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
@article {Martino2019,
author = {Martino, Cameron and Morton, James T. and Marotz, Clarisse A. and Thompson, Luke R. and Tripathi, Anupriya and Knight, Rob and Zengler, Karsten},
editor = {Neufeld, Josh D.},
title = {A Novel Sparse Compositional Technique Reveals Microbial Perturbations},
volume = {4},
number = {1},
elocation-id = {e00016-19},
year = {2019},
doi = {10.1128/mSystems.00016-19},
publisher = {American Society for Microbiology Journals},
URL = {https://msystems.asm.org/content/4/1/e00016-19},
eprint = {https://msystems.asm.org/content/4/1/e00016-19.full.pdf},
journal = {mSystems}
}
179 changes: 179 additions & 0 deletions gemelli/ctf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import biom
import skbio
import qiime2
from pandas import DataFrame
from qiime2 import Metadata
from skbio import OrdinationResults, DistanceMatrix
from gemelli.factorization import TensorFactorization
from gemelli.preprocessing import build, rclr
from gemelli._ctf_defaults import (DEFAULT_COMP, DEFAULT_MSC,
DEFAULT_MFC, DEFAULT_MAXITER,
DEFAULT_FMETA as DEFFM)


def ctf(table: biom.Table,
sample_metadata: qiime2.Metadata,
individual_id_column: str,
state_column: str,
n_components: int = DEFAULT_COMP,
min_sample_count: int = DEFAULT_MSC,
min_feature_count: int = DEFAULT_MFC,
max_iterations_als: int = DEFAULT_MAXITER,
max_iterations_rptm: int = DEFAULT_MAXITER,
n_initializations: int = DEFAULT_MAXITER,
feature_metadata: Metadata = DEFFM) -> (OrdinationResults,
DistanceMatrix,
DataFrame,
DataFrame):
# run CTF helper and parse output for QIIME
ord_res, dists, straj, ftraj = ctf_helper(table,
gwarmstrong marked this conversation as resolved.
Show resolved Hide resolved
sample_metadata,
individual_id_column,
[state_column],
n_components,
min_sample_count,
min_feature_count,
max_iterations_als,
max_iterations_rptm,
n_initializations,
feature_metadata)

PC_cols = ["PC%i" % (i + 1) for i in range(n_components)]
dists = list(dists.values())[0]
straj = list(straj.values())[0]
ftraj = list(ftraj.values())[0]
return ord_res, dists, straj, ftraj


def ctf_helper(table: biom.Table,
sample_metadata: DataFrame,
individual_id_column: str,
state_columns: list,
n_components: int = DEFAULT_COMP,
min_sample_count: int = DEFAULT_MSC,
min_feature_count: int = DEFAULT_MFC,
max_iterations_als: int = DEFAULT_MAXITER,
max_iterations_rptm: int = DEFAULT_MAXITER,
n_initializations: int = DEFAULT_MAXITER,
feature_metadata: DataFrame = DEFFM) -> (OrdinationResults,
dict,
tuple):
""" Runs Compositional Tensor Factorization CTF.
"""

# validate the metadata using q2 as a wrapper
if isinstance(sample_metadata, DataFrame):
sample_metadata = sample_metadata[state_columns +
[individual_id_column]]
sample_metadata = qiime2.Metadata(sample_metadata).to_dataframe()
else:
sample_metadata = sample_metadata.to_dataframe()[
state_columns + [individual_id_column]]
# validate the metadata using q2 as a wrapper
if isinstance(feature_metadata, DataFrame):
feature_metadata = qiime2.Metadata(feature_metadata).to_dataframe()
elif feature_metadata is not None:
feature_metadata = feature_metadata.to_dataframe()

# match the data (borrowed in part from gneiss.util.match)
subtablefids = table.ids('observation')
subtablesids = table.ids('sample')
if len(subtablesids) != len(set(subtablesids)):
raise ValueError('Data-table contains duplicate sample IDs')
if len(subtablefids) != len(set(subtablefids)):
raise ValueError('Data-table contains duplicate feature IDs')
submetadataids = set(sample_metadata.index)
subtablesids = set(subtablesids)
subtablefids = set(subtablefids)
if feature_metadata is not None:
submetadatafeat = set(feature_metadata.index)
fidx = subtablefids & submetadatafeat
if len(fidx) == 0:
raise ValueError(("No more features left. Check to make "
"sure that the sample names between "
"`feature-metadata` and `table` are "
"consistent"))
feature_metadata = feature_metadata.loc[fidx]
sidx = subtablesids & submetadataids
if len(sidx) == 0:
raise ValueError(("No more features left. Check to make sure that "
"the sample names between `sample-metadata` and"
" `table` are consistent"))
if feature_metadata is not None:
table.filter(list(fidx), axis='observation', inplace=True)
table.filter(list(sidx), axis='sample', inplace=True)
sample_metadata = sample_metadata.loc[sidx]

# filter and import table
for axis, min_sum in zip(['sample',
'observation'],
[min_sample_count,
min_feature_count]):
table = table.filter(table.ids(axis)[table.sum(axis) >= min_sum],
axis=axis, inplace=True)

# table to dataframe
table = DataFrame(table.matrix_data.toarray(),
table.ids('observation'),
table.ids('sample'))

# tensor building
tensor = build()
tensor.construct(table, sample_metadata,
individual_id_column, state_columns)

# factorize
TF = TensorFactorization(
n_components=n_components,
max_als_iterations=max_iterations_als,
max_rtpm_iterations=max_iterations_rptm,
n_initializations=n_initializations).fit(rclr(tensor.counts))
# label tensor loadings
TF.label(tensor, taxonomy=feature_metadata)

# pickle the
gwarmstrong marked this conversation as resolved.
Show resolved Hide resolved

# if the n_components is two add PC3 of zeros
# this is referenced as in issue in
# <https://github.com/biocore/emperor/commit
# /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
if n_components == 2:
TF.subjects['PC3'] = [0] * len(TF.subjects.index)
TF.features['PC3'] = [0] * len(TF.features.index)
TF.proportion_explained.loc['PC3'] = 0
TF.eigvals.loc['PC3'] = 0

# save ordination results
short_method_name = 'CTF_Biplot'
long_method_name = 'Compositional Tensor Factorization Biplot'
# only keep PC -- other tools merge metadata
keep_PC = [col for col in TF.features.columns if 'PC' in col]
ordination = OrdinationResults(
short_method_name,
long_method_name,
TF.eigvals,
samples=TF.subjects[keep_PC],
features=TF.features[keep_PC],
proportion_explained=TF.proportion_explained)

# save distance matrix for each condition
distances = {}
subject_trajectories = {}
feature_trajectories = {}
for condition, dist, straj, ftraj in zip(tensor.conditions,
TF.subject_distances,
TF.subject_trajectory,
TF.feature_trajectory):
# match distances to metadata
ids = straj.index
ind_dict = dict((ind, ind_i) for ind_i, ind in enumerate(ids))
inter = set(ind_dict).intersection(sample_metadata.index)
indices = sorted([ind_dict[ind] for ind in inter])
dist = dist[indices, :][:, indices]
distances[condition] = skbio.stats.distance.DistanceMatrix(
dist, ids=ids[indices])
# save traj.
subject_trajectories[condition] = straj
ftraj.index = ftraj.index.astype(str)
feature_trajectories[condition] = ftraj
return ordination, distances, subject_trajectories, feature_trajectories
Loading