In [5]:

Copied!





import glc
import pandas as pd 
import numpy as np
from sklearn.impute import KNNImputer
import glc
import pandas as pd 
import numpy as np
from sklearn.impute import KNNImputer

GGM estimation¶

GLC uses the GeneNet R package for GGM estimation.

The classglc.EstGGM() can call the GeneNet package provided R is installed and configured on your system.

Alternatively, glc.GGM can accept a CSV file of the GGM adjacency matrix computed elsewhere provided the columns and index column correspond to the unique identifier of each feature (peak_id). `

In [2]:

Copied!





# load example data
ds = glc.LoadExampleData()
feature_table = ds.feat_table
feature_table
# load example data
ds = glc.LoadExampleData()
feature_table = ds.feat_table
feature_table

Out[2]:

	peak_id	mz	mzmin	mzmax	rt	rtmin	rtmax	ALZ_LPOS_ToF05_S10W01	ALZ_LPOS_ToF05_S10W02	ALZ_LPOS_ToF05_S10W03	...	ALZ_LPOS_ToF05_S18W85	ALZ_LPOS_ToF05_S18W86	ALZ_LPOS_ToF05_S18W87	ALZ_LPOS_ToF05_S18W88	ALZ_LPOS_ToF05_S18W89	ALZ_LPOS_ToF05_S18W90	ALZ_LPOS_ToF05_S18W91	ALZ_LPOS_ToF05_S18W92	ALZ_LPOS_ToF05_S18W95_LTR	ALZ_LPOS_ToF05_S18W96_SR
0	1	149.020788	149.019306	149.023645	216.244505	214.252996	219.310999	1090.250052	4490.669602	29455.387448	...	58331.194080	18272.786685	36686.550679	40101.094219	13895.999509	18124.479888	66918.355458	33292.263065	6.997904e+06	28084.760086
1	2	184.070977	184.069395	184.074130	361.097002	356.983996	365.917997	111695.867233	125453.901289	178557.859139	...	144635.427267	167737.194017	197555.712231	179264.621233	197252.251353	199957.914864	161728.923879	167901.055349	2.503246e+05	187805.589293
2	3	184.070739	184.069179	184.074066	473.921499	470.621996	479.506988	487520.978430	507689.908146	549142.660961	...	552094.714656	572367.493929	585021.811458	607610.827728	548663.568528	608063.451880	587394.901075	551502.965857	5.248675e+05	563743.690681
3	4	184.070894	184.069490	184.074038	314.846992	310.094003	321.786003	241696.633518	243125.770031	338378.917343	...	335716.656628	355086.755620	360378.519575	383559.519936	321654.963278	381142.385594	309565.494361	376864.941987	2.855071e+05	325279.628676
4	5	184.070681	184.069138	184.074102	445.146489	440.267000	448.554010	446536.156909	475157.552228	424077.154410	...	327425.355258	454101.046639	392653.722571	424944.098846	360710.931921	618157.565794	341301.898479	526253.164215	5.093509e+05	525105.873510
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4881	4882	1632.409295	1632.405696	1632.412634	511.584521	508.135987	513.663025	11508.898981	89877.340981	76318.706706	...	62417.600720	120379.717270	140937.089989	152739.053336	103865.067842	80509.476186	188957.833626	99366.085575	1.052733e+05	86738.921162
4882	4883	1634.124167	1634.120678	1634.127650	358.619013	354.560995	361.315012	33005.592010	46732.204993	53219.386404	...	309096.469963	127210.101263	373049.763004	194427.254489	104790.876856	123118.831657	646751.427851	88808.180273	9.947741e+01	138854.237328
4883	4884	1635.128556	1635.125229	1635.132142	358.721995	354.699011	361.216993	1445.506325	1405.187955	25057.227229	...	371567.285730	125806.958707	425259.418670	142788.950246	70218.766545	119030.375763	716938.797357	95171.573845	1.460314e+02	136818.076346
4884	4885	1648.351823	1648.348204	1648.355172	473.966503	470.331001	484.285984	57808.785112	23589.240458	106255.474515	...	121497.892978	126829.091651	101291.995816	117553.697023	116148.910678	101294.542149	113278.757900	104079.367673	1.207888e+05	111775.863329
4885	4886	1649.356294	1649.352730	1649.359663	474.193511	471.252995	485.140000	283621.024184	8230.595415	3800.933475	...	140585.420007	107097.016886	122525.915733	143853.409775	127977.500142	117553.300300	102634.254575	86916.283972	7.833669e+04	124108.528527

4886 rows × 763 columns

Here, we are going to preprocess the dataset for GGM estimation.
For this dataset this includes:

retaining only study samples
replacing missing values with np.nan
removing features with more than 10% missing values in this case.
log transformation
imputation of missing values

In [3]:

Copied!





# extract just the intensity data
int_df = feature_table.iloc[:, 7:].copy()

# replace zeros with NaN for imputation
int_df.replace(0, np.nan, inplace=True)

# keep only the study samples and remove QC samples
keep_columns = int_df.columns[~int_df.columns.str.endswith(('_LTR', '_SR'))]
int_df = int_df[keep_columns]

int_df.index = feature_table['peak_id'].astype(int)

# log scale intensity data
int_df = np.log1p(int_df)

# drop features with more than 10% missing values
int_df = int_df.T
cols_drop = int_df.columns[int_df.isna().mean() > 0.1]
int_df.drop(columns=cols_drop, inplace=True)

# impute missing values using KNN imputer
imputer = KNNImputer()
int_arr = imputer.fit_transform(int_df).T


# get retained feature IDs that correspond to imputed data
feat_labels = int_df.columns.tolist()
# extract just the intensity data
int_df = feature_table.iloc[:, 7:].copy()

# replace zeros with NaN for imputation
int_df.replace(0, np.nan, inplace=True)

# keep only the study samples and remove QC samples
keep_columns = int_df.columns[~int_df.columns.str.endswith(('_LTR', '_SR'))]
int_df = int_df[keep_columns]

int_df.index = feature_table['peak_id'].astype(int)

# log scale intensity data
int_df = np.log1p(int_df)

# drop features with more than 10% missing values
int_df = int_df.T
cols_drop = int_df.columns[int_df.isna().mean() > 0.1]
int_df.drop(columns=cols_drop, inplace=True)

# impute missing values using KNN imputer
imputer = KNNImputer()
int_arr = imputer.fit_transform(int_df).T


# get retained feature IDs that correspond to imputed data
feat_labels = int_df.columns.tolist()

We will then estimate the GGM with glc.EstGGM which calls the GeneNet package.

In [4]:

Copied!

ggm = glc.EstGGM(int_array=int_arr, feat_labels=feat_labels).run_ggm()
ggm
ggm = glc.EstGGM(int_array=int_arr, feat_labels=feat_labels).run_ggm()
ggm

R callback write-console: Loading required package: corpcor
  
R callback write-console: Loading required package: longitudinal
  
R callback write-console: Loading required package: fdrtool

Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0407 

Estimate (local) false discovery rates (partial correlations):
Step 1... determine cutoff point
Step 2... estimate parameters of null distribution and eta0
Step 3... compute p-values and estimate empirical PDF/CDF
Step 4... compute q-values and local fdr


Significant edges:  591708 
    Corresponding to  5.02 %  of possible edges

Out[4]:

	1	2	3	4	5	6	7	8	9	10	...	4877	4878	4879	4880	4881	4882	4883	4884	4885	4886
1	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	...	0.0000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
2	0.0	0.000000	0.000000	-0.021190	0.011132	0.000000	0.018480	0.000000	0.000000	0.0	...	0.0000	0.000000	0.000000	0.000000	0.000000	0.000000	0.014624	-0.012490	0.000000	0.000000
3	0.0	0.000000	0.000000	0.000000	0.023160	0.011105	-0.013053	-0.025270	0.012876	0.0	...	0.0000	0.000000	0.012601	0.000000	0.000000	0.000000	-0.024599	0.000000	0.010968	0.000000
4	0.0	-0.021190	0.000000	0.000000	0.017182	0.000000	0.022713	0.000000	0.017155	0.0	...	0.0000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
5	0.0	0.011132	0.023160	0.017182	0.000000	0.000000	0.029097	0.020614	0.000000	0.0	...	0.0000	0.011869	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4882	0.0	0.000000	0.000000	0.000000	0.000000	0.021935	0.000000	0.000000	0.000000	0.0	...	0.0000	0.022951	0.011828	0.040779	0.034884	0.000000	0.000000	0.000000	0.019560	0.012583
4883	0.0	0.014624	-0.024599	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	...	0.0248	0.036600	0.000000	0.012540	0.000000	0.000000	0.000000	0.062018	0.024195	0.000000
4884	0.0	-0.012490	0.000000	0.000000	0.000000	0.000000	-0.014622	-0.012539	0.000000	0.0	...	0.0000	0.012590	-0.013891	0.000000	0.000000	0.000000	0.062018	0.000000	0.000000	0.000000
4885	0.0	0.000000	0.010968	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	...	0.0000	0.042083	0.028542	0.029287	0.012553	0.019560	0.024195	0.000000	0.000000	0.038776
4886	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	...	0.0000	0.015186	0.034283	0.014448	0.016365	0.012583	0.000000	0.000000	0.038776	0.000000

4858 rows × 4858 columns