In [5]:
Copied!
import glc
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import glc
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
GGM estimation¶
GLC uses the GeneNet R package for GGM estimation.
The classglc.EstGGM() can call the GeneNet package provided R is installed and configured on your system.
Alternatively, glc.GGM can accept a CSV file of the GGM adjacency matrix computed elsewhere provided the columns and index column correspond to the unique identifier of each feature (peak_id).
`
In [2]:
Copied!
# load example data
ds = glc.LoadExampleData()
feature_table = ds.feat_table
feature_table
# load example data
ds = glc.LoadExampleData()
feature_table = ds.feat_table
feature_table
Out[2]:
| peak_id | mz | mzmin | mzmax | rt | rtmin | rtmax | ALZ_LPOS_ToF05_S10W01 | ALZ_LPOS_ToF05_S10W02 | ALZ_LPOS_ToF05_S10W03 | ... | ALZ_LPOS_ToF05_S18W85 | ALZ_LPOS_ToF05_S18W86 | ALZ_LPOS_ToF05_S18W87 | ALZ_LPOS_ToF05_S18W88 | ALZ_LPOS_ToF05_S18W89 | ALZ_LPOS_ToF05_S18W90 | ALZ_LPOS_ToF05_S18W91 | ALZ_LPOS_ToF05_S18W92 | ALZ_LPOS_ToF05_S18W95_LTR | ALZ_LPOS_ToF05_S18W96_SR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 149.020788 | 149.019306 | 149.023645 | 216.244505 | 214.252996 | 219.310999 | 1090.250052 | 4490.669602 | 29455.387448 | ... | 58331.194080 | 18272.786685 | 36686.550679 | 40101.094219 | 13895.999509 | 18124.479888 | 66918.355458 | 33292.263065 | 6.997904e+06 | 28084.760086 |
| 1 | 2 | 184.070977 | 184.069395 | 184.074130 | 361.097002 | 356.983996 | 365.917997 | 111695.867233 | 125453.901289 | 178557.859139 | ... | 144635.427267 | 167737.194017 | 197555.712231 | 179264.621233 | 197252.251353 | 199957.914864 | 161728.923879 | 167901.055349 | 2.503246e+05 | 187805.589293 |
| 2 | 3 | 184.070739 | 184.069179 | 184.074066 | 473.921499 | 470.621996 | 479.506988 | 487520.978430 | 507689.908146 | 549142.660961 | ... | 552094.714656 | 572367.493929 | 585021.811458 | 607610.827728 | 548663.568528 | 608063.451880 | 587394.901075 | 551502.965857 | 5.248675e+05 | 563743.690681 |
| 3 | 4 | 184.070894 | 184.069490 | 184.074038 | 314.846992 | 310.094003 | 321.786003 | 241696.633518 | 243125.770031 | 338378.917343 | ... | 335716.656628 | 355086.755620 | 360378.519575 | 383559.519936 | 321654.963278 | 381142.385594 | 309565.494361 | 376864.941987 | 2.855071e+05 | 325279.628676 |
| 4 | 5 | 184.070681 | 184.069138 | 184.074102 | 445.146489 | 440.267000 | 448.554010 | 446536.156909 | 475157.552228 | 424077.154410 | ... | 327425.355258 | 454101.046639 | 392653.722571 | 424944.098846 | 360710.931921 | 618157.565794 | 341301.898479 | 526253.164215 | 5.093509e+05 | 525105.873510 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4881 | 4882 | 1632.409295 | 1632.405696 | 1632.412634 | 511.584521 | 508.135987 | 513.663025 | 11508.898981 | 89877.340981 | 76318.706706 | ... | 62417.600720 | 120379.717270 | 140937.089989 | 152739.053336 | 103865.067842 | 80509.476186 | 188957.833626 | 99366.085575 | 1.052733e+05 | 86738.921162 |
| 4882 | 4883 | 1634.124167 | 1634.120678 | 1634.127650 | 358.619013 | 354.560995 | 361.315012 | 33005.592010 | 46732.204993 | 53219.386404 | ... | 309096.469963 | 127210.101263 | 373049.763004 | 194427.254489 | 104790.876856 | 123118.831657 | 646751.427851 | 88808.180273 | 9.947741e+01 | 138854.237328 |
| 4883 | 4884 | 1635.128556 | 1635.125229 | 1635.132142 | 358.721995 | 354.699011 | 361.216993 | 1445.506325 | 1405.187955 | 25057.227229 | ... | 371567.285730 | 125806.958707 | 425259.418670 | 142788.950246 | 70218.766545 | 119030.375763 | 716938.797357 | 95171.573845 | 1.460314e+02 | 136818.076346 |
| 4884 | 4885 | 1648.351823 | 1648.348204 | 1648.355172 | 473.966503 | 470.331001 | 484.285984 | 57808.785112 | 23589.240458 | 106255.474515 | ... | 121497.892978 | 126829.091651 | 101291.995816 | 117553.697023 | 116148.910678 | 101294.542149 | 113278.757900 | 104079.367673 | 1.207888e+05 | 111775.863329 |
| 4885 | 4886 | 1649.356294 | 1649.352730 | 1649.359663 | 474.193511 | 471.252995 | 485.140000 | 283621.024184 | 8230.595415 | 3800.933475 | ... | 140585.420007 | 107097.016886 | 122525.915733 | 143853.409775 | 127977.500142 | 117553.300300 | 102634.254575 | 86916.283972 | 7.833669e+04 | 124108.528527 |
4886 rows × 763 columns
Here, we are going to preprocess the dataset for GGM estimation.
For this dataset this includes:
- retaining only study samples
- replacing missing values with np.nan
- removing features with more than 10% missing values in this case.
- log transformation
- imputation of missing values
In [3]:
Copied!
# extract just the intensity data
int_df = feature_table.iloc[:, 7:].copy()
# replace zeros with NaN for imputation
int_df.replace(0, np.nan, inplace=True)
# keep only the study samples and remove QC samples
keep_columns = int_df.columns[~int_df.columns.str.endswith(('_LTR', '_SR'))]
int_df = int_df[keep_columns]
int_df.index = feature_table['peak_id'].astype(int)
# log scale intensity data
int_df = np.log1p(int_df)
# drop features with more than 10% missing values
int_df = int_df.T
cols_drop = int_df.columns[int_df.isna().mean() > 0.1]
int_df.drop(columns=cols_drop, inplace=True)
# impute missing values using KNN imputer
imputer = KNNImputer()
int_arr = imputer.fit_transform(int_df).T
# get retained feature IDs that correspond to imputed data
feat_labels = int_df.columns.tolist()
# extract just the intensity data
int_df = feature_table.iloc[:, 7:].copy()
# replace zeros with NaN for imputation
int_df.replace(0, np.nan, inplace=True)
# keep only the study samples and remove QC samples
keep_columns = int_df.columns[~int_df.columns.str.endswith(('_LTR', '_SR'))]
int_df = int_df[keep_columns]
int_df.index = feature_table['peak_id'].astype(int)
# log scale intensity data
int_df = np.log1p(int_df)
# drop features with more than 10% missing values
int_df = int_df.T
cols_drop = int_df.columns[int_df.isna().mean() > 0.1]
int_df.drop(columns=cols_drop, inplace=True)
# impute missing values using KNN imputer
imputer = KNNImputer()
int_arr = imputer.fit_transform(int_df).T
# get retained feature IDs that correspond to imputed data
feat_labels = int_df.columns.tolist()
We will then estimate the GGM with glc.EstGGM which calls the GeneNet package.
In [4]:
Copied!
ggm = glc.EstGGM(int_array=int_arr, feat_labels=feat_labels).run_ggm()
ggm
ggm = glc.EstGGM(int_array=int_arr, feat_labels=feat_labels).run_ggm()
ggm
R callback write-console: Loading required package: corpcor R callback write-console: Loading required package: longitudinal R callback write-console: Loading required package: fdrtool
Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0407
Estimate (local) false discovery rates (partial correlations):
Step 1... determine cutoff point
Step 2... estimate parameters of null distribution and eta0
Step 3... compute p-values and estimate empirical PDF/CDF
Step 4... compute q-values and local fdr
Significant edges: 591708
Corresponding to 5.02 % of possible edges
Out[4]:
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 4877 | 4878 | 4879 | 4880 | 4881 | 4882 | 4883 | 4884 | 4885 | 4886 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | ... | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 0.0 | 0.000000 | 0.000000 | -0.021190 | 0.011132 | 0.000000 | 0.018480 | 0.000000 | 0.000000 | 0.0 | ... | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.014624 | -0.012490 | 0.000000 | 0.000000 |
| 3 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.023160 | 0.011105 | -0.013053 | -0.025270 | 0.012876 | 0.0 | ... | 0.0000 | 0.000000 | 0.012601 | 0.000000 | 0.000000 | 0.000000 | -0.024599 | 0.000000 | 0.010968 | 0.000000 |
| 4 | 0.0 | -0.021190 | 0.000000 | 0.000000 | 0.017182 | 0.000000 | 0.022713 | 0.000000 | 0.017155 | 0.0 | ... | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 5 | 0.0 | 0.011132 | 0.023160 | 0.017182 | 0.000000 | 0.000000 | 0.029097 | 0.020614 | 0.000000 | 0.0 | ... | 0.0000 | 0.011869 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4882 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.021935 | 0.000000 | 0.000000 | 0.000000 | 0.0 | ... | 0.0000 | 0.022951 | 0.011828 | 0.040779 | 0.034884 | 0.000000 | 0.000000 | 0.000000 | 0.019560 | 0.012583 |
| 4883 | 0.0 | 0.014624 | -0.024599 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | ... | 0.0248 | 0.036600 | 0.000000 | 0.012540 | 0.000000 | 0.000000 | 0.000000 | 0.062018 | 0.024195 | 0.000000 |
| 4884 | 0.0 | -0.012490 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.014622 | -0.012539 | 0.000000 | 0.0 | ... | 0.0000 | 0.012590 | -0.013891 | 0.000000 | 0.000000 | 0.000000 | 0.062018 | 0.000000 | 0.000000 | 0.000000 |
| 4885 | 0.0 | 0.000000 | 0.010968 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | ... | 0.0000 | 0.042083 | 0.028542 | 0.029287 | 0.012553 | 0.019560 | 0.024195 | 0.000000 | 0.000000 | 0.038776 |
| 4886 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | ... | 0.0000 | 0.015186 | 0.034283 | 0.014448 | 0.016365 | 0.012583 | 0.000000 | 0.000000 | 0.038776 | 0.000000 |
4858 rows × 4858 columns