Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added activate score and minor bug fixes #17

Merged
merged 2 commits into from
Sep 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions R/IDconverter.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ names(data) <- sub("^X", "", names(data)) # drop "X" string in columns name

### Warning ###
# Entrez ID might duplicate for Ensemble ID
data$entrez = mapIds(org.Hs.eg.db, keys=row.names(data), column="ENTREZID", keytype="ENSEMBL", multiVals="first")
row.names(data)<-make.names(data$entrez, unique=TRUE)
entrez = mapIds(org.Hs.eg.db, keys=row.names(data), column="ENTREZID", keytype="ENSEMBL", multiVals="first")
row.names(data)<-make.names(entrez, unique=TRUE)
row.names(data) <- sub("^X", "", row.names(data)) # drop "X" string in index name
write.table(data, outputFile, sep=',', row.names = TRUE, col.names = TRUE) # Write result
Empty file.
33 changes: 33 additions & 0 deletions notebook/notebook_lib/gene_zscore/standard_gzscore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
__author__ = "Junhee Yoon"
__version__ = "1.0.0"
__maintainer__ = "Junhee Yoon"
__email__ = "[email protected]"

"""
Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93
Description: calculating activation score by using z score
"""

import pandas as pd
import numpy as np

class calculator(object):

def __init__(self, df):
if df.empty:
raise ValueError("Input Dataframe is empty, please try with different one.")
else:
self.df = df

# function structure
def gs_zscore(self, names='Zscore', gene_set=[]):
zscore=[]
arr1_index = self.df.index.tolist()
inter = list(set(arr1_index).intersection(gene_set))

diff_mean = self.df.loc[inter].mean(axis=0).subtract(self.df.mean(axis=0))
len_norm = self.df.std(ddof=1, axis=0).apply(lambda x: np.sqrt(len(inter))/x)
zscore = diff_mean*len_norm
zscore = zscore.to_frame()
zscore.columns = [names]
return zscore
11 changes: 8 additions & 3 deletions notebook/notebook_lib/gene_zscore/threaded_gzscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
__email__ = "[email protected]"

"""
EXPERIMENTAL CODE
Manual: https://github.com/swiri021/Threaded_gsZscore
Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93
Description: calculating activation score by using threaded z score
Expand All @@ -13,6 +14,7 @@
import threading
import functools
import itertools
import math

class funcThread(object):
def __init__(self):
Expand All @@ -29,6 +31,9 @@ def run(*args, **kwargs):
####Divide Samples by number of threads
i_col = len(args[1].columns.tolist())
contents_numb = i_col/kwargs['nthread']
#contents_numb = math.ceil(contents_numb)
contents_numb = round(contents_numb) # round for matching thread number

split_columns = [args[1].columns.tolist()[i:i+contents_numb] for i in range(0, len(args[1].columns.tolist()), contents_numb)]
if len(split_columns)>kwargs['nthread']:
split_columns = split_columns[:kwargs['nthread']-1] + [list(itertools.chain(*split_columns[kwargs['nthread']-1:]))]
Expand All @@ -37,7 +42,7 @@ def run(*args, **kwargs):

####Running threads
for i, item in enumerate(split_columns):
threads[i] = threading.Thread(target = func, args=(args[0], args[1].ix[:,item], container, i), kwargs=kwargs)
threads[i] = threading.Thread(target = func, args=(args[0], args[1][item], container, i), kwargs=kwargs)
threads[i].start()
for i in range(len(threads)):
threads[i].join()
Expand All @@ -56,7 +61,7 @@ def __init__(self, df):
self.df = df

# Wrapper for controlling Threads
def gs_zscore(self, nthread=5, gene_set=[]):
def gs_zscore(self, nthread=4, gene_set=[]):
arr1 = self.df
container = None
i = None
Expand All @@ -66,7 +71,7 @@ def gs_zscore(self, nthread=5, gene_set=[]):
# function structure
# args(input, container, thread_index , **kwargs)
@funcThread()
def _calculating(self, arr1, container, i, nthread=5, gene_set=[]):
def _calculating(self, arr1, container, i, nthread=4, gene_set=[]):
zscore=[]
arr1_index = arr1.index.tolist()
inter = list(set(arr1_index).intersection(gene_set))
Expand Down
8 changes: 7 additions & 1 deletion utils/cleanup_normalized_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import argparse
import pandas as pd
import os
import numpy as np

parser = argparse.ArgumentParser(prog='cleanup_normalized_matrix.py')
# Input data
Expand All @@ -19,6 +19,9 @@
# Output path
parser.add_argument('-o','--output', type=str, dest='output', required=True,\
help='Output file name including path')

parser.add_argument('-v','--vst', dest='vst', action='store_true',default=False,\
help='Input data is vst normalized or not, default = False')
args = parser.parse_args()

if __name__ == "__main__":
Expand All @@ -35,6 +38,9 @@
df.columns = [x.split(".")[0] for x in df.columns.tolist()] # New column names
df = df[~df.index.duplicated(keep='first')] # Taking first values in duplicated index

if args.vst==False:
df=df.applymap(lambda x: np.log2(x+1)) # Apply log2 for non-vst normalized data

## Need to add file name handler
if '.csv' in args.input_df:
df.to_csv(args.output)
Expand Down
15 changes: 0 additions & 15 deletions utils/convert_ensembl_to_entrez.py

This file was deleted.

45 changes: 40 additions & 5 deletions utils/get_all_acitivation_scores.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,50 @@
import glob
import os
__author__ = "Junhee Yoon"
__version__ = "1.0.0"
__maintainer__ = "Junhee Yoon"
__email__ = "[email protected]"

"""
Description: This code generates activation scores by using MsigDB. This code needs expression matrix by EntrezID index
"""

## Move to setting.env in the future
MSIGDB_PATH = "/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/data/MsigDB_list/msigdb.v7.4.entrez.gmt"

## sys import path for library calling
import sys; sys.path.append('/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/notebook_lib');
import argparse
import pandas as pd
from gene_zscore.standard_gzscore import calculator

parser = argparse.ArgumentParser(prog='get_all_activattion_scores.py')
# Input data
parser.add_argument('-i','--input', type=str, dest='input_df', required=True,\
help='Input data matrix')
# Output path
parser.add_argument('-o','--output', type=str, dest='filepath', required=True, default='./',\
help='Output directory')
parser.add_argument('-o','--output', type=str, dest='output', required=True, default='./',\
help='Output file')
args = parser.parse_args()

MSIGDB_PATH = "data/MsigDB_list/msigdb.v7.4.entrez.gmt"
if __name__ == "__main__":
# .gmt parsing
count = 0
gmt_arr = [] # gmt parsing array
with open(MSIGDB_PATH, 'r') as infile:
for line in infile:
gmt_value = line.strip().split("\t") # splitting line
sig_names = gmt_value[0] # signature name
gene_list = gmt_value[2:] # gene list
gmt_arr.append([sig_names]+gene_list)

# Sample loading, and some entrezIDs are duplicated in the matrix
gexpr = pd.read_csv(args.input_df, index_col=0)
gexpr.index = [x.split(".")[0] for x in gexpr.index.tolist()] # remove effect from R make.names
gexpr = gexpr.groupby(gexpr.index).max() # keeping max for duplicated index

zscore_arr = [] # result array
zscore_calculator = calculator(gexpr) # Set activation calculator
for sig in gmt_arr:
zscore_value = zscore_calculator.gs_zscore(names=sig[0], gene_set=sig[1:]) # using standard, threaded version has an error
zscore_arr.append(zscore_value)
zscore_df = pd.concat(zscore_arr, axis=1) # make dataframe
zscore_df.to_csv(args.output)