Skip to content

GLC

glc.GLCModel

Graph-based lipid classifier (GLC).

This class predicts lipid classes using a Gaussian Graphical Model (GGM), feature metadata, and accurate mass matches to database (LMSD). Scoring uses first-hop and second-hop neighbor information, weighted by partial correlations and with retention-time filtering.

Source code in src/glc/glc_model.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
class GLCModel:
    """Graph-based lipid classifier (GLC).

    This class predicts lipid classes using a Gaussian Graphical Model (GGM),
    feature metadata, and accurate mass matches to database (LMSD). Scoring uses first-hop and
    second-hop neighbor information, weighted by partial correlations and with
    retention-time filtering.
    """

    def __init__(
        self,
        ggm: GGM,
        feat_dicts: FeatDicts,
        node_ids: Dict[int, List[str]],
        db_df: pd.DataFrame,
        db_id_col: str = "LM_ID",
        class_level: str = "SUB_CLASS",
        rt_thresh: float = 50.0,
        feat_weight: float = 5.0,
        label_propogation: bool = False,
    ) -> None:
        """Initialize the GLC predictor.

        Args:
            ggm: Object containing the graph (`G`) and partial correlations (`pcor_dict`). Object intended to be from ```glc.GGM```
            feat_dicts: Object providing metadata (e.g., retention times via `feat_dicts.rt`). Intended to be from ```glc.FeatDicts```
            node_ids: Mapping of feature/node IDs to lists of database IDs.
            db_df: Database dataframe containing lipid annotations.
            db_id_col: Column in `db_df` containing the database ID (e.g., "LM_ID").
            class_level: Level of lipid classification to predict (e.g., "SUB_CLASS").
            rt_thresh: Maximum retention-time difference allowed for neighbor filtering.
            feat_weight: Weight multiplier for the feature itself when scoring.
            label_propogation: Whether to use harmonic label propagation for nodes with
                no assigned class. (Default: ``False``). A very small handful of nodes in the subgraph
                may have no annotated neighbors; enabling this option assigns them classes
                based on the labels of other nodes in the graph. Typically, less than 10 nodes in a dataset. 
        """
        self.ggm = ggm
        self.feat_dicts = feat_dicts
        self.node_ids = node_ids
        self.db_df = db_df
        self.db_id_col = db_id_col
        self.class_level = class_level
        self.rt_thresh = rt_thresh
        self.feat_weight = feat_weight
        self.label_propogation = label_propogation
        self.lmid2class = db_df.set_index(db_id_col)[class_level].to_dict()

    @lru_cache(maxsize=None)
    def _get_neighbors(self, feature_id: int) -> List[int]:
        """Return immediate neighbors of a node in the GGM graph.

        Args:
            feature_id: Node/feature ID.

        Returns:
            List of neighboring node IDs.
        """
        return list(nx.neighbors(self.ggm.G, feature_id))

    def _filter_neighbors(
        self, feature_id: int, neighbors: List[int]
    ) -> Tuple[Iterable[int], Iterable[float]]:
        """Filter neighbors by retention time & compute absolute partial correlations.

        Args:
            feature_id: Feature ID whose neighbors are being filtered.
            neighbors: List of immediate neighbors.

        Returns:
            A tuple `(filtered_neighbor_ids, weights)` where:
                - filtered_neighbor_ids: Iterable of node IDs.
                - weights: Iterable of absolute partial correlation coefficients.
        """
        rt = self.feat_dicts.rt[feature_id]
        filtered = [
            (node, self.ggm.pcor_dict[f"{feature_id}::{node}"])
            for node in neighbors
            if abs(self.feat_dicts.rt[node] - rt) <= self.rt_thresh
        ]
        return zip(*filtered) if filtered else ([], [])

    def score_node(self, feature_id: int) -> List[Tuple[str, float]]:
        """Score lipid classes for a single feature ID.

        Scoring is based on:
            - filtered first-hop neighbors,
            - second-hop neighbors,
            - partial correlations as weights,
            - retention-time constraints.
            - feature itself weighted by `feat_weight`.

        Args:
            feature_id: Node/feature ID to classify.

        Returns:
            A list of `(class_name, score)` tuples sorted by score (top 10).
        """
        try:
            neighbors = self._get_neighbors(feature_id)
            neighbors, weights = self._filter_neighbors(feature_id, neighbors)

            neighbors = [feature_id] + list(neighbors)
            weights = [max(weights, default=0) * self.feat_weight] + list(weights)

            counter = Counter()
            for node, weight in zip(neighbors, weights):
                categories = {self.lmid2class[lmid] for lmid in self.node_ids[node]}
                for category in categories:
                    counter[category] += weight

            counter = self._second_hop_score(neighbors[1:], weights[1:], counter)
            return counter.most_common(10)

        except KeyError as e:
            raise KeyError(
                f"Feature ID {feature_id} not in GGM subgraph. Missing feature: {e}"
            )
        except Exception as e:
            raise RuntimeError(f"Error scoring feature {feature_id}: {e}")

    def _second_hop_score(
        self,
        neighbors: List[int],
        weights: List[float],
        counter: Counter,
    ) -> Counter:
        """Compute scores from second-hop neighbors.

        Uses only maximal scoring path for each second-hop feature to avoid
        double counting.

        Args:
            neighbors: First-hop neighbor IDs.
            weights: Weights for each first-hop neighbor.
            counter: Score accumulator.

        Returns:
            Counter updated with second-hop class contributions.
        """
        max_weights: Dict[int, float] = {}

        for feature_id, weight_1hop in zip(neighbors, weights):
            second_neighbors = self._get_neighbors(feature_id)
            second_neighbors, weights_2hop = self._filter_neighbors(
                feature_id, second_neighbors
            )

            for node, weight_2hop in zip(second_neighbors, weights_2hop):
                score = weight_1hop * weight_2hop
                if node not in max_weights or score > max_weights[node]:
                    max_weights[node] = score

        for node, score in max_weights.items():
            categories = {self.lmid2class[lmid] for lmid in self.node_ids[node]}
            for category in categories:
                counter[category] += score

        return counter

    def predict_all(self) -> Dict[int, List[Tuple[str, float]]]:
        """Predict classes for all features in the main GGM subgraph.

        Returns:
            A dict mapping feature IDs → list of `(class, score)` predictions.
        """
        predictions: Dict[int, List[Tuple[str, float]]] = {}
        missing_feats: List[int] = []

        for feature_id in tqdm(self.ggm.G.nodes, desc="Predicting feature classes"):
            try:
                predictions[feature_id] = self.score_node(feature_id)
                if predictions[feature_id] == []:
                    missing_feats.append(feature_id)
            except Exception:
                missing_feats.append(feature_id)

        if not self.label_propogation:
            return predictions

        nx.set_node_attributes(
            self.ggm.G,
            {node: preds[0][0] for node, preds in predictions.items() if preds},
            "label",
        )

        prop_labels = nx.algorithms.node_classification.harmonic_function(self.ggm.G)

        for feature_id in missing_feats:
            predictions[feature_id] = [(prop_labels[feature_id], np.nan)]

        return predictions

    def convert2mainclass(
        self, predictions: Dict[int, List[Tuple[str, float]]]
    ) -> Dict[int, List[Tuple[str, float]]]:
        """Convert predicted SUB_CLASS values to MAIN_CLASS values.

        Args:
            predictions: Output from `predict_all()` using SUB_CLASS predictions.

        Returns:
            A dict mapping feature IDs -> list of `(MAIN_CLASS, score)` pairs.
        """
        mainclass_converter_dict = dict(
            zip(self.db_df["SUB_CLASS"], self.db_df["MAIN_CLASS"])
        )

        output_dict: Dict[int, List[Tuple[str, float]]] = {}

        for key, value_list in predictions.items():
            unique_classes = set()
            output_dict[key] = [
                (mainclass_converter_dict[item], score)
                for item, score in value_list
                if mainclass_converter_dict.get(item)
                and mainclass_converter_dict[item] not in unique_classes
                and not unique_classes.add(mainclass_converter_dict[item])
            ]

        return output_dict

__init__(ggm, feat_dicts, node_ids, db_df, db_id_col='LM_ID', class_level='SUB_CLASS', rt_thresh=50.0, feat_weight=5.0, label_propogation=False)

Initialize the GLC predictor.

Parameters:

Name Type Description Default
ggm GGM

Object containing the graph (G) and partial correlations (pcor_dict). Object intended to be from glc.GGM

required
feat_dicts FeatDicts

Object providing metadata (e.g., retention times via feat_dicts.rt). Intended to be from glc.FeatDicts

required
node_ids Dict[int, List[str]]

Mapping of feature/node IDs to lists of database IDs.

required
db_df DataFrame

Database dataframe containing lipid annotations.

required
db_id_col str

Column in db_df containing the database ID (e.g., "LM_ID").

'LM_ID'
class_level str

Level of lipid classification to predict (e.g., "SUB_CLASS").

'SUB_CLASS'
rt_thresh float

Maximum retention-time difference allowed for neighbor filtering.

50.0
feat_weight float

Weight multiplier for the feature itself when scoring.

5.0
label_propogation bool

Whether to use harmonic label propagation for nodes with no assigned class. (Default: False). A very small handful of nodes in the subgraph may have no annotated neighbors; enabling this option assigns them classes based on the labels of other nodes in the graph. Typically, less than 10 nodes in a dataset.

False
Source code in src/glc/glc_model.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def __init__(
    self,
    ggm: GGM,
    feat_dicts: FeatDicts,
    node_ids: Dict[int, List[str]],
    db_df: pd.DataFrame,
    db_id_col: str = "LM_ID",
    class_level: str = "SUB_CLASS",
    rt_thresh: float = 50.0,
    feat_weight: float = 5.0,
    label_propogation: bool = False,
) -> None:
    """Initialize the GLC predictor.

    Args:
        ggm: Object containing the graph (`G`) and partial correlations (`pcor_dict`). Object intended to be from ```glc.GGM```
        feat_dicts: Object providing metadata (e.g., retention times via `feat_dicts.rt`). Intended to be from ```glc.FeatDicts```
        node_ids: Mapping of feature/node IDs to lists of database IDs.
        db_df: Database dataframe containing lipid annotations.
        db_id_col: Column in `db_df` containing the database ID (e.g., "LM_ID").
        class_level: Level of lipid classification to predict (e.g., "SUB_CLASS").
        rt_thresh: Maximum retention-time difference allowed for neighbor filtering.
        feat_weight: Weight multiplier for the feature itself when scoring.
        label_propogation: Whether to use harmonic label propagation for nodes with
            no assigned class. (Default: ``False``). A very small handful of nodes in the subgraph
            may have no annotated neighbors; enabling this option assigns them classes
            based on the labels of other nodes in the graph. Typically, less than 10 nodes in a dataset. 
    """
    self.ggm = ggm
    self.feat_dicts = feat_dicts
    self.node_ids = node_ids
    self.db_df = db_df
    self.db_id_col = db_id_col
    self.class_level = class_level
    self.rt_thresh = rt_thresh
    self.feat_weight = feat_weight
    self.label_propogation = label_propogation
    self.lmid2class = db_df.set_index(db_id_col)[class_level].to_dict()

convert2mainclass(predictions)

Convert predicted SUB_CLASS values to MAIN_CLASS values.

Parameters:

Name Type Description Default
predictions Dict[int, List[Tuple[str, float]]]

Output from predict_all() using SUB_CLASS predictions.

required

Returns:

Type Description
Dict[int, List[Tuple[str, float]]]

A dict mapping feature IDs -> list of (MAIN_CLASS, score) pairs.

Source code in src/glc/glc_model.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
def convert2mainclass(
    self, predictions: Dict[int, List[Tuple[str, float]]]
) -> Dict[int, List[Tuple[str, float]]]:
    """Convert predicted SUB_CLASS values to MAIN_CLASS values.

    Args:
        predictions: Output from `predict_all()` using SUB_CLASS predictions.

    Returns:
        A dict mapping feature IDs -> list of `(MAIN_CLASS, score)` pairs.
    """
    mainclass_converter_dict = dict(
        zip(self.db_df["SUB_CLASS"], self.db_df["MAIN_CLASS"])
    )

    output_dict: Dict[int, List[Tuple[str, float]]] = {}

    for key, value_list in predictions.items():
        unique_classes = set()
        output_dict[key] = [
            (mainclass_converter_dict[item], score)
            for item, score in value_list
            if mainclass_converter_dict.get(item)
            and mainclass_converter_dict[item] not in unique_classes
            and not unique_classes.add(mainclass_converter_dict[item])
        ]

    return output_dict

predict_all()

Predict classes for all features in the main GGM subgraph.

Returns:

Type Description
Dict[int, List[Tuple[str, float]]]

A dict mapping feature IDs → list of (class, score) predictions.

Source code in src/glc/glc_model.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def predict_all(self) -> Dict[int, List[Tuple[str, float]]]:
    """Predict classes for all features in the main GGM subgraph.

    Returns:
        A dict mapping feature IDs → list of `(class, score)` predictions.
    """
    predictions: Dict[int, List[Tuple[str, float]]] = {}
    missing_feats: List[int] = []

    for feature_id in tqdm(self.ggm.G.nodes, desc="Predicting feature classes"):
        try:
            predictions[feature_id] = self.score_node(feature_id)
            if predictions[feature_id] == []:
                missing_feats.append(feature_id)
        except Exception:
            missing_feats.append(feature_id)

    if not self.label_propogation:
        return predictions

    nx.set_node_attributes(
        self.ggm.G,
        {node: preds[0][0] for node, preds in predictions.items() if preds},
        "label",
    )

    prop_labels = nx.algorithms.node_classification.harmonic_function(self.ggm.G)

    for feature_id in missing_feats:
        predictions[feature_id] = [(prop_labels[feature_id], np.nan)]

    return predictions

score_node(feature_id)

Score lipid classes for a single feature ID.

Scoring is based on
  • filtered first-hop neighbors,
  • second-hop neighbors,
  • partial correlations as weights,
  • retention-time constraints.
  • feature itself weighted by feat_weight.

Parameters:

Name Type Description Default
feature_id int

Node/feature ID to classify.

required

Returns:

Type Description
List[Tuple[str, float]]

A list of (class_name, score) tuples sorted by score (top 10).

Source code in src/glc/glc_model.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def score_node(self, feature_id: int) -> List[Tuple[str, float]]:
    """Score lipid classes for a single feature ID.

    Scoring is based on:
        - filtered first-hop neighbors,
        - second-hop neighbors,
        - partial correlations as weights,
        - retention-time constraints.
        - feature itself weighted by `feat_weight`.

    Args:
        feature_id: Node/feature ID to classify.

    Returns:
        A list of `(class_name, score)` tuples sorted by score (top 10).
    """
    try:
        neighbors = self._get_neighbors(feature_id)
        neighbors, weights = self._filter_neighbors(feature_id, neighbors)

        neighbors = [feature_id] + list(neighbors)
        weights = [max(weights, default=0) * self.feat_weight] + list(weights)

        counter = Counter()
        for node, weight in zip(neighbors, weights):
            categories = {self.lmid2class[lmid] for lmid in self.node_ids[node]}
            for category in categories:
                counter[category] += weight

        counter = self._second_hop_score(neighbors[1:], weights[1:], counter)
        return counter.most_common(10)

    except KeyError as e:
        raise KeyError(
            f"Feature ID {feature_id} not in GGM subgraph. Missing feature: {e}"
        )
    except Exception as e:
        raise RuntimeError(f"Error scoring feature {feature_id}: {e}")