How to make HDBSCAN an inductive clustering method

There is a large difference between inductive and transductive clustering methods. Inductive methods are closer to supervised learning: once trained on N examples, they can generalize to M unseen samples. Transductive methods, by contrast, need to see all the data at once and cannot generalize to new samples.

HDBSCAN is largely a transductive method. To make it generalize to new data, we can wrap the approximate_predict function offered by the library in a new class.

from hdbscan import HDBSCAN
from hdbscan import approximate_predict

class HDBSCANPredict(HDBSCAN):
    """HDBSCAN wrapper exposing an estimator-style inference API.

    The clusterer is always constructed with ``prediction_data=True`` so that
    ``approximate_predict`` can later assign unseen samples to the clusters
    found during ``fit``.

    Parameters
    ----------
    min_cluster_size : int, default=10
        Forwarded unchanged to ``HDBSCAN``.
    alpha : float, default=1.0
        Forwarded unchanged to ``HDBSCAN``.
    cluster_selection_epsilon : float, default=0.0
        Forwarded unchanged to ``HDBSCAN``.
    """

    def __init__(self, min_cluster_size=10, alpha=1.0, cluster_selection_epsilon=0.0):
        super().__init__(
            min_cluster_size=min_cluster_size,
            alpha=alpha,
            cluster_selection_epsilon=cluster_selection_epsilon,
            prediction_data=True,
        )

    def _ensure_prediction_data(self):
        """Build the prediction data only when the model does not have it yet.

        Regenerating it on every inference call is wasted work: the data
        depends only on the fitted model, not on the samples being predicted.
        """
        # ``getattr`` with a default covers both a missing/raising attribute
        # and an attribute that is present but set to ``None``.
        if getattr(self, "prediction_data_", None) is None:
            self.generate_prediction_data()

    def fit(self, X, y=None):
        """Cluster ``X`` and make sure prediction data is available.

        Returns
        -------
        self
        """
        super().fit(X, y)
        # The parent ``fit`` builds prediction data itself when the
        # ``prediction_data`` flag is on (set in ``__init__``). Build it
        # explicitly only if the flag was switched off after construction.
        if not self.prediction_data:
            self.generate_prediction_data()
        return self

    def predict(self, X, y=None):
        """Return the approximate cluster label of each sample in ``X``.

        ``y`` is accepted for API compatibility and ignored.
        """
        self._ensure_prediction_data()
        labels, _ = approximate_predict(self, X)
        return labels

    def predict_proba(self, X):
        """Return the membership strength of each sample in ``X``.

        This is the second array returned by ``approximate_predict``: one
        value per sample for its assigned cluster, not a
        ``(n_samples, n_clusters)`` probability matrix.
        """
        self._ensure_prediction_data()
        _, strengths = approximate_predict(self, X)
        return strengths

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its approximate labels as a column vector."""
        return self.fit(X, y).transform(X)

    def transform(self, X, y=None):
        """Return approximate labels for ``X`` with shape ``(n_samples, 1)``.

        ``y`` is accepted for API compatibility and ignored.
        """
        self._ensure_prediction_data()
        labels, _ = approximate_predict(self, X)
        # Add a trailing axis so the labels can be used as a single feature.
        return labels[:, None]