# Imports reconstructed from how names are used in this listing; the evalml
# module paths below are assumed from the package layout, not shown in the
# original.
import copy

import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
from sklearn.preprocessing import OrdinalEncoder
from skopt.space import Integer, Real

from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.pipelines.components.transformers import LabelEncoder
from evalml.problem_types import ProblemTypes
from evalml.utils import (
    SEED_BOUNDS,
    _rename_column_names_to_numeric,
    import_or_raise,
    infer_feature_types,
)


class LightGBMClassifier(Estimator):
    """LightGBM Classifier.

    Args:
        boosting_type (string): Type of boosting to use. Defaults to "gbdt".
            - "gbdt" uses traditional Gradient Boosting Decision Tree
            - "dart" uses Dropouts meet Multiple Additive Regression Trees
            - "goss" uses Gradient-based One-Side Sampling
            - "rf" uses Random Forest
        learning_rate (float): Boosting learning rate. Defaults to 0.1.
        n_estimators (int): Number of boosted trees to fit. Defaults to 100.
        max_depth (int): Maximum tree depth for base learners, <=0 means no limit. Defaults to 0.
        num_leaves (int): Maximum tree leaves for base learners. Defaults to 31.
        min_child_samples (int): Minimum number of samples needed in a child (leaf). Defaults to 20.
        bagging_fraction (float): LightGBM will randomly select this fraction of the data,
            without resampling, on each iteration (tree) if this is smaller than 1.0. For
            example, if set to 0.8, LightGBM will select 80% of the data before training
            each tree. This can be used to speed up training and to reduce overfitting.
            Defaults to 0.9.
        bagging_freq (int): Frequency for bagging. 0 means bagging is disabled. k means
            perform bagging at every k-th iteration: LightGBM will randomly select
            bagging_fraction * 100% of the data to use for the next k iterations.
            Defaults to 0.
        n_jobs (int or None): Number of threads to run in parallel. -1 uses all threads.
            Defaults to -1.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "LightGBM Classifier"
    hyperparameter_ranges = {
        "learning_rate": Real(0.000001, 1),
        "boosting_type": ["gbdt", "dart", "goss", "rf"],
        "n_estimators": Integer(10, 100),
        "max_depth": Integer(0, 10),
        "num_leaves": Integer(2, 100),
        "min_child_samples": Integer(1, 100),
        "bagging_fraction": Real(0.000001, 1),
        "bagging_freq": Integer(0, 1),
    }
    model_family = ModelFamily.LIGHTGBM
    supported_problem_types = [
        ProblemTypes.BINARY,
        ProblemTypes.MULTICLASS,
        ProblemTypes.TIME_SERIES_BINARY,
        ProblemTypes.TIME_SERIES_MULTICLASS,
    ]
    SEED_MIN = 0
    SEED_MAX = SEED_BOUNDS.max_bound

    def __init__(
        self,
        boosting_type="gbdt",
        learning_rate=0.1,
        n_estimators=100,
        max_depth=0,
        num_leaves=31,
        min_child_samples=20,
        bagging_fraction=0.9,
        bagging_freq=0,
        n_jobs=-1,
        random_seed=0,
        **kwargs,
    ):
        parameters = {
            "boosting_type": boosting_type,
            "learning_rate": learning_rate,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "num_leaves": num_leaves,
            "min_child_samples": min_child_samples,
            "n_jobs": n_jobs,
            "bagging_freq": bagging_freq,
            "bagging_fraction": bagging_fraction,
            "verbose": -1,
        }
        parameters.update(kwargs)
        lg_parameters = copy.copy(parameters)
        # when boosting type is random forest (rf), LightGBM requires
        # bagging_freq == 1 and 0 < bagging_fraction < 1.0
        if boosting_type == "rf":
            lg_parameters["bagging_freq"] = 1
        # when boosting type is goss, LightGBM requires bagging_fraction == 1
        elif boosting_type == "goss":
            lg_parameters["bagging_fraction"] = 1
        # avoid lightgbm warnings having to do with parameter aliases
        if (
            lg_parameters["bagging_freq"] is not None
            or lg_parameters["bagging_fraction"] is not None
        ):
            lg_parameters.update({"subsample": None, "subsample_freq": None})

        lgbm_error_msg = (
            "LightGBM is not installed. Please install using `pip install lightgbm`."
        )
        lgbm = import_or_raise("lightgbm", error_msg=lgbm_error_msg)
        self._ordinal_encoder = None
        self._label_encoder = None

        lgbm_classifier = lgbm.sklearn.LGBMClassifier(
            random_state=random_seed, **lg_parameters
        )

        super().__init__(
            parameters=parameters,
            component_obj=lgbm_classifier,
            random_seed=random_seed,
        )

    def _encode_categories(self, X, fit=False):
        """Encodes each categorical feature using ordinal encoding."""
        X = infer_feature_types(X)
        cat_cols = list(X.ww.select("category", return_schema=True).columns)
        if fit:
            self.input_feature_names = list(X.columns)
        X_encoded = _rename_column_names_to_numeric(X)
        rename_cols_dict = dict(zip(X.columns, X_encoded.columns))
        cat_cols = [rename_cols_dict[col] for col in cat_cols]

        if len(cat_cols) == 0:
            return X_encoded
        if fit:
            self._ordinal_encoder = OrdinalEncoder()
            encoder_output = self._ordinal_encoder.fit_transform(X_encoded[cat_cols])
        else:
            encoder_output = self._ordinal_encoder.transform(X_encoded[cat_cols])
        X_encoded[cat_cols] = pd.DataFrame(encoder_output)
        X_encoded[cat_cols] = X_encoded[cat_cols].astype("category")
        return X_encoded

    def _encode_labels(self, y):
        y_encoded = infer_feature_types(y)
        # change only if dtype isn't int
        if not is_integer_dtype(y_encoded):
            self._label_encoder = LabelEncoder()
            y_encoded = pd.Series(
                self._label_encoder.fit_transform(None, y_encoded)[1],
                dtype="int64",
            )
        return y_encoded
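
    # Example of the two encodings above (values illustrative): a categorical
    # column ["a", "b", "a"] is ordinal-encoded to [0.0, 1.0, 0.0] and re-cast
    # to "category" dtype so LightGBM consumes it natively; a non-integer
    # target ["no", "yes", "no"] becomes [0, 1, 0], and predict() restores the
    # original labels through self._label_encoder.inverse_transform.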
    def fit(self, X, y=None):
        """Fits LightGBM classifier component to data.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series): The target training data of length [n_samples].

        Returns:
            self
        """
        X = infer_feature_types(X)
        if y is not None:
            y = infer_feature_types(y)
        X_encoded = self._encode_categories(X, fit=True)
        y_encoded = self._encode_labels(y)
        self._component_obj.fit(X_encoded, y_encoded)
        return self
    def predict(self, X):
        """Make predictions using the fitted LightGBM classifier.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].

        Returns:
            pd.Series: Predicted values.
        """
        X_encoded = self._encode_categories(X)
        predictions = super().predict(X_encoded)
        if not self._label_encoder:
            return predictions
        predictions = self._label_encoder.inverse_transform(
            predictions.astype(np.int64),
        )
        return predictions
    def predict_proba(self, X):
        """Make prediction probabilities using the fitted LightGBM classifier.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].

        Returns:
            pd.DataFrame: Predicted probability values.
        """
        X_encoded = self._encode_categories(X)
        return super().predict_proba(X_encoded)
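

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the class above). Assumes
# lightgbm is installed; the toy feature and target data here is hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    X = pd.DataFrame(
        {
            "age": [25, 32, 47, 51, 62, 23, 44, 36],
            "color": pd.Series(
                ["red", "blue", "red", "green", "blue", "green", "red", "blue"],
                dtype="category",
            ),
        }
    )
    y = pd.Series(["no", "yes", "no", "yes", "yes", "no", "yes", "no"])

    # min_child_samples is lowered so trees can split on this tiny dataset
    clf = LightGBMClassifier(n_estimators=10, min_child_samples=1, random_seed=0)
    clf.fit(X, y)
    print(clf.predict(X))        # original string labels, restored by the label encoder
    print(clf.predict_proba(X))  # one probability column per class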