⚠️ filtro is under active development; breaking changes may occur.
filtro provides tidy tools for applying filter-based supervised feature selection methods. These methods score and rank feature relevance using metrics such as p-values, correlation, feature importance, information gain, and more.
The package provides functions to rank and select a top proportion or number of features using built-in methods and the desirability2 package, and supports streamlined preprocessing, either standalone or within tidymodels workflows such as the recipes package.
For a detailed introduction, please see vignette("filtro").
Install the released version of filtro from CRAN with:
install.packages("filtro")
Install the development version from GitHub with:
# install.packages("pak")
pak::pak("tidymodels/filtro")
Currently, the implemented filters include:
ANOVA F-test
Correlation
Random forest feature importance
Information gain
Area under the ROC curve
Cross tabulation (Chi-squared test and Fisher’s exact test)
library(filtro)
library(desirability2)
library(dplyr)
library(modeldata)
ames_subset <- modeldata::ames |>
  # Use a subset of data for demonstration
  dplyr::select(
    Sale_Price,
    MS_SubClass,
    MS_Zoning,
    Lot_Frontage,
    Lot_Area,
    Street
  )
# Put the outcome on the log10 scale
ames_subset <- ames_subset |>
  dplyr::mutate(Sale_Price = log10(Sale_Price))
# ANOVA p-value
ames_aov_pval_res <-
  score_aov_pval |>
  fit(Sale_Price ~ ., data = ames_subset)
# Print the per-predictor scores held in the `results` slot
ames_aov_pval_res@results
#> # A tibble: 5 × 4
#> name score outcome predictor
#> <chr> <dbl> <chr> <chr>
#> 1 aov_pval 237. Sale_Price MS_SubClass
#> 2 aov_pval 130. Sale_Price MS_Zoning
#> 3 aov_pval NA Sale_Price Lot_Frontage
#> 4 aov_pval NA Sale_Price Lot_Area
#> 5 aov_pval 5.75 Sale_Price Street
# Pearson correlation
ames_cor_pearson_res <-
  score_cor_pearson |>
  fit(Sale_Price ~ ., data = ames_subset)
# Print the per-predictor scores held in the `results` slot
ames_cor_pearson_res@results
#> # A tibble: 5 × 4
#> name score outcome predictor
#> <chr> <dbl> <chr> <chr>
#> 1 cor_pearson NA Sale_Price MS_SubClass
#> 2 cor_pearson NA Sale_Price MS_Zoning
#> 3 cor_pearson 0.165 Sale_Price Lot_Frontage
#> 4 cor_pearson 0.255 Sale_Price Lot_Area
#> 5 cor_pearson NA Sale_Price Street
# Forest importance
ames_imp_rf_reg_res <-
  score_imp_rf |>
  fit(Sale_Price ~ ., data = ames_subset, seed = 42)
# Print the per-predictor scores held in the `results` slot
ames_imp_rf_reg_res@results
#> # A tibble: 5 × 4
#> name score outcome predictor
#> <chr> <dbl> <chr> <chr>
#> 1 imp_rf 0.0144 Sale_Price MS_SubClass
#> 2 imp_rf 0.0102 Sale_Price MS_Zoning
#> 3 imp_rf 0.00693 Sale_Price Lot_Frontage
#> 4 imp_rf 0.0144 Sale_Price Lot_Area
#> 5 imp_rf 0.0000308 Sale_Price Street
# Information gain
ames_info_gain_reg_res <-
  score_info_gain |>
  fit(Sale_Price ~ ., data = ames_subset)
# Print the per-predictor scores held in the `results` slot
ames_info_gain_reg_res@results
#> # A tibble: 5 × 4
#> name score outcome predictor
#> <chr> <dbl> <chr> <chr>
#> 1 infogain 0.266 Sale_Price MS_SubClass
#> 2 infogain 0.113 Sale_Price MS_Zoning
#> 3 infogain 0.146 Sale_Price Lot_Frontage
#> 4 infogain 0.140 Sale_Price Lot_Area
#> 5 infogain 0.00365 Sale_Price Street
# ANOVA scores before any filtering or filling
ames_aov_pval_res@results
#> # A tibble: 5 × 4
#> name score outcome predictor
#> <chr> <dbl> <chr> <chr>
#> 1 aov_pval 237. Sale_Price MS_SubClass
#> 2 aov_pval 130. Sale_Price MS_Zoning
#> 3 aov_pval NA Sale_Price Lot_Frontage
#> 4 aov_pval NA Sale_Price Lot_Area
#> 5 aov_pval 5.75 Sale_Price Street
# Show best score, based on proportion of predictors
ames_aov_pval_res |> show_best_score_prop(prop_terms = 0.2)
#> # A tibble: 1 × 4
#> name score outcome predictor
#> <chr> <dbl> <chr> <chr>
#> 1 aov_pval 237. Sale_Price MS_SubClass
# Fill safe value, then show best score
ames_aov_pval_res <- ames_aov_pval_res |> fill_safe_value()
ames_aov_pval_res |> show_best_score_prop(prop_terms = 0.2)
#> # A tibble: 2 × 4
#> name score outcome predictor
#> <chr> <dbl> <chr> <chr>
#> 1 aov_pval Inf Sale_Price Lot_Frontage
#> 2 aov_pval Inf Sale_Price Lot_Area
# Create a list
class_score_list <- list(
  ames_cor_pearson_res,
  ames_imp_rf_reg_res,
  ames_info_gain_reg_res
)
# Fill safe values
ames_scores_results <- class_score_list |>
  fill_safe_values() |>
  # Remove outcome
  dplyr::select(-outcome)
ames_scores_results
#> # A tibble: 5 × 4
#> predictor cor_pearson imp_rf infogain
#> <chr> <dbl> <dbl> <dbl>
#> 1 MS_SubClass 1 0.0144 0.266
#> 2 MS_Zoning 1 0.0102 0.113
#> 3 Lot_Frontage 0.165 0.00693 0.146
#> 4 Lot_Area 0.255 0.0144 0.140
#> 5 Street 1 0.0000308 0.00365
# Single and multi-parameter optimization using desirability functions
# Optimize correlation alone
ames_scores_results |>
  show_best_desirability_prop(
    maximize(cor_pearson, low = 0, high = 1)
  )
#> # A tibble: 5 × 6
#> predictor cor_pearson imp_rf infogain .d_max_cor_pearson .d_overall
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 MS_SubClass 1 0.0144 0.266 1 1
#> 2 MS_Zoning 1 0.0102 0.113 1 1
#> 3 Street 1 0.0000308 0.00365 1 1
#> 4 Lot_Area 0.255 0.0144 0.140 0.255 0.255
#> 5 Lot_Frontage 0.165 0.00693 0.146 0.165 0.165
# Optimize correlation and forest importance
ames_scores_results |>
  show_best_desirability_prop(
    maximize(cor_pearson, low = 0, high = 1),
    maximize(imp_rf)
  )
#> # A tibble: 5 × 7
#> predictor cor_pearson imp_rf infogain .d_max_cor_pearson .d_max_imp_rf
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 MS_SubClass 1 0.0144 0.266 1 1
#> 2 MS_Zoning 1 0.0102 0.113 1 0.705
#> 3 Lot_Area 0.255 0.0144 0.140 0.255 0.994
#> 4 Lot_Frontage 0.165 0.00693 0.146 0.165 0.479
#> 5 Street 1 0.0000308 0.00365 1 0
#> # ℹ 1 more variable: .d_overall <dbl>
# Optimize correlation, forest importance and information gain
ames_scores_results |>
  show_best_desirability_prop(
    maximize(cor_pearson, low = 0, high = 1),
    maximize(imp_rf),
    maximize(infogain)
  )
#> # A tibble: 5 × 8
#> predictor cor_pearson imp_rf infogain .d_max_cor_pearson .d_max_imp_rf
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 MS_SubClass 1 0.0144 0.266 1 1
#> 2 MS_Zoning 1 0.0102 0.113 1 0.705
#> 3 Lot_Area 0.255 0.0144 0.140 0.255 0.994
#> 4 Lot_Frontage 0.165 0.00693 0.146 0.165 0.479
#> 5 Street 1 0.0000308 0.00365 1 0
#> # ℹ 2 more variables: .d_max_infogain <dbl>, .d_overall <dbl>
# Same as above, but retain only a proportion of predictors
ames_scores_results |>
  show_best_desirability_prop(
    maximize(cor_pearson, low = 0, high = 1),
    maximize(imp_rf),
    maximize(infogain),
    prop_terms = 0.2
  )
#> # A tibble: 1 × 8
#> predictor cor_pearson imp_rf infogain .d_max_cor_pearson .d_max_imp_rf
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 MS_SubClass 1 0.0144 0.266 1 1
#> # ℹ 2 more variables: .d_max_infogain <dbl>, .d_overall <dbl>
# Optimize toward a target
ames_scores_results |>
  show_best_desirability_prop(
    target(cor_pearson, low = 0.2, target = 0.255, high = 0.9)
  )
#> # A tibble: 5 × 6
#> predictor cor_pearson imp_rf infogain .d_target_cor_pearson .d_overall
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Lot_Area 0.255 0.0144 0.140 1.00 1.00
#> 2 MS_SubClass 1 0.0144 0.266 0 0
#> 3 MS_Zoning 1 0.0102 0.113 0 0
#> 4 Lot_Frontage 0.165 0.00693 0.146 0 0
#> 5 Street 1 0.0000308 0.00365 0 0
# Optimize with box constraints
ames_scores_results |>
  show_best_desirability_prop(
    constrain(cor_pearson, low = 0.2, high = 1)
  )
#> # A tibble: 5 × 6
#> predictor cor_pearson imp_rf infogain .d_box_cor_pearson .d_overall
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 MS_SubClass 1 0.0144 0.266 1 1
#> 2 MS_Zoning 1 0.0102 0.113 1 1
#> 3 Lot_Area 0.255 0.0144 0.140 1 1
#> 4 Street 1 0.0000308 0.00365 1 1
#> 5 Lot_Frontage 0.165 0.00693 0.146 0 0
Please note that the filtro project is released with a Contributor Code of Conduct. By contributing to this project, you agree to abide by its terms.
For questions and discussions about tidymodels packages, modeling, and machine learning, please post on Posit Community.
If you think you have encountered a bug, please submit an issue.
Either way, learn how to create and share a reprex (a minimal, reproducible example), to clearly communicate about your code.
Check out further details on contributing guidelines for tidymodels packages and how to get help.