-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #30 from OpenKBC/engineering_dev
new feature extraction and features analysis result including pipeline update
- Loading branch information
Showing
17 changed files
with
1,633 additions
and
56 deletions.
There are no files selected for viewing
File renamed without changes.
38 changes: 9 additions & 29 deletions
38
notebook/notebook_archive/Jun09182021/step2_RFECV_ActScore.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
237 changes: 237 additions & 0 deletions
237
notebook/notebook_archive/Jun09232021/ACT_actExt_healthy_ver1.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
240 changes: 240 additions & 0 deletions
240
notebook/notebook_archive/Jun09232021/ACT_actExt_long_ver1.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
24 changes: 24 additions & 0 deletions
24
notebook/notebook_archive/Jun09232021/DAVID_result_with_genes.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR | ||
UP_KEYWORDS Signal-anchor 4 18.181818181818183 0.010618235535230702 9435, 92370, 84750, 482 22 455 20581 8.224175824175823 0.644942699379905 1.0 1.0 | ||
GOTERM_BP_DIRECT GO:0030007~cellular potassium ion homeostasis 2 9.090909090909092 0.014203886862870552 3778, 482 21 12 16792 133.26984126984127 0.8550364897890121 1.0 1.0 | ||
GOTERM_BP_DIRECT GO:0006970~response to osmotic stress 2 9.090909090909092 0.021233751409806934 1831, 3778 21 18 16792 88.84656084656085 0.9448350841481213 1.0 1.0 | ||
UP_SEQ_FEATURE splice variant 13 59.09090909090909 0.02672379874332258 9435, 25960, 92370, 2273, 84937, 9098, 29893, 1831, 27152, 60385, 84750, 3778, 11326 20 7760 20063 1.680534793814433 0.8915172884093386 1.0 1.0 | ||
GOTERM_CC_DIRECT GO:0000139~Golgi membrane 4 18.181818181818183 0.029224941783175234 9435, 84342, 92370, 84750 22 591 18224 5.60652207352715 0.7122723462224878 1.0 1.0 | ||
GOTERM_BP_DIRECT GO:0009790~embryo development 2 9.090909090909092 0.044324271083796926 10361, 84750 21 38 16792 42.08521303258145 0.9978025190277713 1.0 1.0 | ||
INTERPRO IPR000340:Dual specificity phosphatase, catalytic domain 2 9.090909090909092 0.04535550105440483 1844, 7803 21 43 18559 41.10520487264673 0.9662369069907797 1.0 1.0 | ||
GOTERM_MF_DIRECT GO:0016791~phosphatase activity 2 9.090909090909092 0.04636430022502955 92370, 7803 21 40 16881 40.19285714285714 0.8873853134098133 1.0 1.0 | ||
UP_KEYWORDS Golgi apparatus 4 18.181818181818183 0.047948316929936494 9435, 84342, 92370, 84750 22 812 20581 4.608374384236454 0.991487229254269 1.0 1.0 | ||
KEGG_PATHWAY hsa04911:Insulin secretion 2 9.090909090909092 0.048527607447158325 3778, 482 5 85 6879 32.371764705882356 0.7116600702180332 0.4416846321392304 0.4416846321392304 | ||
UP_SEQ_FEATURE mutagenesis site 6 27.27272727272727 0.04876123266351168 9435, 4953, 84937, 9098, 3778, 7803 20 2191 20063 2.747101780009128 0.9834139717878374 1.0 1.0 | ||
KEGG_PATHWAY hsa04970:Salivary secretion 2 9.090909090909092 0.049087791369392356 3778, 482 5 86 6879 31.995348837209303 0.7158742585745337 0.4416846321392304 0.4416846321392304 | ||
KEGG_PATHWAY hsa04972:Pancreatic secretion 2 9.090909090909092 0.053002155856707645 3778, 482 5 93 6879 29.58709677419355 0.7437140735976389 0.4416846321392304 0.4416846321392304 | ||
SMART SM00404:PTPc_motif 2 9.090909090909092 0.055007744369135844 1844, 7803 10 63 10057 31.926984126984127 0.658699645983222 1.0 1.0 | ||
UP_SEQ_FEATURE domain:Tyrosine-protein phosphatase 2 9.090909090909092 0.05803089125516087 1844, 7803 20 63 20063 31.84603174603175 0.9925697030377879 1.0 1.0 | ||
INTERPRO IPR003595:Protein-tyrosine phosphatase, catalytic 2 9.090909090909092 0.0657787434224818 1844, 7803 21 63 18559 28.055933484504912 0.9930364393651362 1.0 1.0 | ||
UP_KEYWORDS Developmental protein 4 18.181818181818183 0.07005658477911464 27152, 10361, 2273, 7803 22 949 20581 3.9430979978925187 0.9991284691002745 1.0 1.0 | ||
UP_SEQ_FEATURE active site:Phosphocysteine intermediate 2 9.090909090909092 0.0784103517723536 1844, 7803 20 86 20063 23.32906976744186 0.9987638185946439 1.0 1.0 | ||
INTERPRO IPR000387:Protein-tyrosine/Dual specificity phosphatase 2 9.090909090909092 0.08479581942452594 1844, 7803 21 82 18559 21.555168408826948 0.9984482789379427 1.0 1.0 | ||
UP_KEYWORDS Endosome 3 13.636363636363635 0.08549711150290609 84937, 9098, 7803 22 481 20581 5.834719334719335 0.9998282277870996 1.0 1.0 | ||
KEGG_PATHWAY hsa04022:cGMP-PKG signaling pathway 2 9.090909090909092 0.08877539992187655 3778, 482 5 158 6879 17.415189873417724 0.902133086037035 0.5548462495117285 0.5548462495117285 | ||
GOTERM_BP_DIRECT GO:0006813~potassium ion transport 2 9.090909090909092 0.09331516173239449 3778, 482 21 82 16792 19.502903600464574 0.9999981944575009 1.0 1.0 | ||
UP_KEYWORDS Membrane 12 54.54545454545454 0.0993540921976912 9435, 84342, 92370, 25960, 100526664, 84750, 84937, 9098, 3778, 7803, 482, 11326 22 7494 20581 1.497998398718975 0.9999609387831477 1.0 1.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 74, | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import seaborn as sns\n", | ||
"import numpy as np\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"\n", | ||
"resultPath = \"resultFiles/featureExtractionV3_by_Jun/\"\n", | ||
"\n", | ||
"intersected_genes = []\n", | ||
"for x in ['CD4', 'CD8', 'CD14']:\n", | ||
" #df_healthy = pd.read_csv(resultPath+\"HealthyPatients/\"+x+\".Ranksum.RFECV.gene.result\", index_col=0)\n", | ||
" df_long = pd.read_csv(resultPath+\"LongDiseaseDuration/\"+x+\".Ranksum.RFECV.gene.result\", index_col=0)\n", | ||
" print(x)\n", | ||
" print(len(df_long))\n" | ||
], | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"CD4\n", | ||
"43\n", | ||
"CD8\n", | ||
"149\n", | ||
"CD14\n", | ||
"637\n" | ||
] | ||
} | ||
], | ||
"metadata": {} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 77, | ||
"source": [ | ||
"## Utils and Library for notebook\n", | ||
"from notebook_utils.OpenKbcMSToolkit import ExtractionToolkit as exttoolkit\n", | ||
"import itertools\n", | ||
"def _LoadDiseaseDuration(df, meta_data, returntype='long'):\n", | ||
" \"\"\"\n", | ||
" df : Expression or activation score matrix\n", | ||
" meta_data : meta data which contains duration and sample ID\n", | ||
" output: long DD samples and short DD samples by list, or healthy samples and short DD samples by list\n", | ||
" \"\"\"\n", | ||
" # Sample by disease category\n", | ||
" sample_list, sample_category = exttoolkit.get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')\n", | ||
" \n", | ||
"    # Sort by disease category and exclude unknown samples\n", | |
" patient_samples = [] # patient samples\n", | ||
" healthy_samples = [] # healthy samples\n", | ||
" for samples, category in zip(sample_list, sample_category):\n", | ||
" if category=='Healthy':\n", | ||
" healthy_samples = samples\n", | ||
" else:\n", | ||
" if category!='Unknown':# Excluding unknown samples\n", | ||
" patient_samples.append(samples)\n", | ||
"\n", | ||
" patient_samples = list(itertools.chain(*patient_samples)) # flatten\n", | ||
" patient_samples = list(set(patient_samples).intersection(df.columns.tolist())) # intersected with act score matrix\n", | ||
" healthy_samples = list(set(healthy_samples).intersection(df.columns.tolist())) # intersected with act score matrix\n", | ||
" patient_meta = meta_data.loc[meta_data['HCVB_ID'].isin(patient_samples)] # Make patient metadata\n", | ||
"\n", | ||
" longDD_samples, shortDD_samples = exttoolkit.get_sample_name_by_contValues(patient_meta, 'HCVB_ID', 'DiseaseDuration', 50)\n", | ||
" longDD_samples = list(set(longDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix\n", | ||
" shortDD_samples = list(set(shortDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix\n", | ||
"\n", | ||
" if returntype=='long':\n", | ||
" return longDD_samples, shortDD_samples\n", | ||
" elif returntype=='healthy':\n", | ||
" return healthy_samples, shortDD_samples" | ||
], | ||
"outputs": [], | ||
"metadata": {} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 84, | ||
"source": [ | ||
"#df_cd4 = pd.read_csv(resultPath+\"LongDiseaseDuration/CD4.Ranksum.RFECV.act.csv\", index_col=0)\n", | ||
"df_cd4 = pd.read_csv(resultPath+\"LongDiseaseDuration/CD8.Ranksum.RFECV.gene.result\", index_col=0)\n", | ||
"#df_cd4 = pd.read_csv(\"../data/counts_normalized/IDConvertedFiles/counts_vst_CD4.converted.csv\", index_col=0)\n", | ||
"meta_data = pd.read_csv('../data/annotation_metadata/EPIC_HCvB_metadata_baseline_updated-share.csv')\n", | ||
"long_samples, shortDD_samples = _LoadDiseaseDuration(df_cd4, meta_data, 'long')\n", | ||
"df_cd4 = df_cd4[long_samples+shortDD_samples]\n", | ||
"df_cd4 = df_cd4.subtract(df_cd4.median(axis=1), axis=0)\n", | ||
"\n", | ||
"\n", | ||
"fold_change = (df_cd4[long_samples].mean(axis=1) - df_cd4[shortDD_samples].mean(axis=1)).apply(abs)\n", | ||
"fold_change = fold_change.sort_values(ascending=False)[:40].index.tolist()\n", | ||
"df_cd4 = df_cd4.loc[fold_change]\n", | ||
"\n", | ||
"# Build the feature matrix (one row per sample) and the class labels\n", | |
"X = df_cd4.T.values\n", | ||
"y = [0]*len(long_samples)+[1]*len(shortDD_samples) # Labels: 0 = long DD sample, 1 = short DD sample\n", | |
"y = np.array(y)" | ||
], | ||
"outputs": [], | ||
"metadata": {} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 85, | ||
"source": [ | ||
"from sklearn.ensemble import RandomForestClassifier\n", | ||
"from sklearn.model_selection import train_test_split\n", | ||
"from sklearn import metrics\n", | ||
"\n", | ||
"\n", | ||
"for t in list(range(45,50)):\n", | ||
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=t)\n", | ||
" #X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=t)\n", | ||
"\n", | ||
" randomState = list(range(40,60))\n", | ||
"\n", | ||
" test_auc = []\n", | ||
" val_auc = []\n", | ||
" for i in randomState:\n", | ||
" clf = RandomForestClassifier(max_depth=10, random_state=i)\n", | ||
" clf.fit(X_train, y_train)\n", | ||
"\n", | ||
" y_pred = clf.predict(X_test)\n", | ||
" fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)\n", | ||
" #print(\"test auc: \",metrics.auc(fpr, tpr))\n", | ||
" test_auc.append(metrics.auc(fpr, tpr))\n", | ||
" \n", | ||
" #y_val_pred = clf.predict(X_val)\n", | ||
" #fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_pred, pos_label=1)\n", | ||
" #print(\"val auc: \",metrics.auc(fpr, tpr))\n", | ||
" #val_auc.append(metrics.auc(fpr, tpr))\n", | ||
" #print(np.mean(test_auc), np.mean(val_auc))\n", | ||
" print(np.mean(test_auc))" | ||
], | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"0.6107080419580418\n", | ||
"0.5788095238095239\n", | ||
"0.6720274390243903\n", | ||
"0.5882142857142857\n", | ||
"0.6155594405594405\n" | ||
] | ||
} | ||
], | ||
"metadata": {} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"source": [], | ||
"outputs": [], | ||
"metadata": {} | ||
} | ||
], | ||
"metadata": { | ||
"orig_nbformat": 4, | ||
"language_info": { | ||
"name": "python", | ||
"version": "3.8.2", | ||
"mimetype": "text/x-python", | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"pygments_lexer": "ipython3", | ||
"nbconvert_exporter": "python", | ||
"file_extension": ".py" | ||
}, | ||
"kernelspec": { | ||
"name": "python3", | ||
"display_name": "Python 3.8.2 64-bit ('r-py-test': conda)" | ||
}, | ||
"interpreter": { | ||
"hash": "7508a6b53ffb04362d156591e4bfb20c197555e37f3cce3b1ec90fd899bbfe63" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
251 changes: 251 additions & 0 deletions
251
notebook/notebook_archive/Jun09232021/vis_testing.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.